In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
from datetime import datetime
import numpy as np
from time import sleep
import requests
from tqdm.notebook import tqdm
import time
import random

In [2]:
# Let pandas render up to 100 rows before truncating displayed frames.
pd.set_option('display.max_rows', 100)

#### Website to scrape: https://comicbookrealm.com/publisher/3/

In [2]:
def get_publisher(num):
    """Return the comicbookrealm.com publisher page URL for publisher number ``num``."""
    return f'https://comicbookrealm.com/publisher/{num}/'

In [6]:
# Fetch one sample publisher page (id 3) to explore its markup.
headers = {'User-agent': ''}
# Without a timeout a stalled connection would block the kernel indefinitely;
# give up if the server sends nothing for 30 seconds.
response = requests.get(get_publisher(3), headers=headers, timeout=30)
response.status_code

200

In [7]:
# Parse the raw response bytes with Python's built-in HTML parser.
soup = BeautifulSoup(markup=response.content, features='html.parser')

In [10]:
# The publisher name is the text of the first <h2> inside the first "module" div.
module_div = soup.find('div', class_='module')
pub_name = module_div.find('h2').text

In [47]:
# The links of interest live in a "publisher-list" div nested inside another
# div carrying the same class, so the lookup is done twice.
outer_list_div = soup.find('div', class_='publisher-list')
inner_list_div = outer_list_div.find('div', class_='publisher-list')
pub_list = [
    {'link': anchor['href'], 'info': anchor['title'], 'letter': anchor.text}
    for anchor in inner_list_div.find_all('a')
]

In [23]:
# Text of every <td class="item"> cell; used as the keys of the publisher data mapping.
pub_data_labels = [cell.get_text() for cell in soup.find_all('td', class_='item')]

In [27]:
# Keep only as many <td class="data"> cells as there are labels, then take their text.
data_cells = soup.find_all('td', class_='data')
pub_data_items = [cell.text for cell in data_cells[:len(pub_data_labels)]]

In [50]:
# Pair each label with the data value at the same position.
pub_data = dict(zip(pub_data_labels, pub_data_items))

In [51]:
# Show the label -> value mapping assembled from the td.item / td.data cell texts.
pub_data

{'Titles Tracking': '2,303',
 'Comics Tracking': '21,472',
 'Most Valuable': 'Spawn  Issue #185e $4,200.00',
 'Most Owned': '6463 - Spawn Issue #1 $30.00',
 'Most Wanted': '152 - Walking Dead Issue #1 $1,800.00',
 'New Additions': ''}

In [45]:
# Locate the <img> inside the "logo required" cell and keep its source path and alt text.
logo_img = soup.find('td', class_='logo required').find('img')
logo = dict(src=logo_img['src'], title=logo_img['alt'])

In [44]:
# Show the logo dict. NOTE(review): the saved output shows a 'descr' key while the
# preceding cell builds the dict with a 'title' key, so that output predates the
# current code (execution counts 44 vs 45) — re-run the cells to refresh it.
logo

{'src': '/images/publishers/images/3.gif', 'descr': "Image Comics's logo"}

In [3]:
def publisher_scraper(id=3):
    """Scrape one publisher page and return its data as a flat list.

    Parameters
    ----------
    id : int, default 3
        Publisher number passed to ``get_publisher`` to build the page URL.
        (The name shadows the builtin ``id``; it is kept because callers pass
        it by keyword.)

    Returns
    -------
    list or bool
        ``False`` when the page yields no publisher name, no data cells, or a
        first data value of ``'0'``. Otherwise
        ``[id, name, logo, *items, title_lst]`` where ``logo`` is the logo
        image ``src`` (or ``'not provided'``), ``items`` are the texts of the
        ``td.data`` cells (at most one per ``td.item`` label) and ``title_lst``
        holds ``[href, title, stripped text]`` for every link in the nested
        ``publisher-list`` div.

    Raises
    ------
    requests.exceptions.RequestException
        If the request fails or the server stays silent past the timeout.
    """
    headers = {'User-agent': ''}
    # A timeout keeps one stalled connection from hanging a long scrape forever.
    response = requests.get(get_publisher(id), headers=headers, timeout=30)
    soup = BeautifulSoup(response.content, 'html.parser')

    # find() returns None for missing markup; treat that as "no name" instead of
    # letting an AttributeError escape before the emptiness check below.
    module_div = soup.find('div', class_='module')
    heading = module_div.find('h2') if module_div is not None else None
    name = heading.text if heading is not None else ''

    labels = [td.text for td in soup.find_all('td', class_='item')]
    items = [td.text for td in soup.find_all('td', class_='data')][:len(labels)]

    # `not items` guards the items[0] lookup on pages without any data cells.
    if name == '' or not items or items[0] == '0':
        # no publisher data to scrape
        return False

    # The links sit in a "publisher-list" div nested inside a div of the same class;
    # fall back to an empty list when either level is absent.
    outer_list = soup.find('div', class_='publisher-list')
    inner_list = outer_list.find('div', class_='publisher-list') if outer_list is not None else None
    anchors = inner_list.find_all('a') if inner_list is not None else []
    title_lst = [[a.get('href', ''), a.get('title', ''), a.text.strip()] for a in anchors]

    # The logo is optional: a missing cell, a missing <img>, or a missing src
    # attribute all map to the same placeholder.
    logo_cell = soup.find('td', class_='logo required')
    logo_img = logo_cell.find('img') if logo_cell is not None else None
    logo = logo_img.get('src', 'not provided') if logo_img is not None else 'not provided'

    data_lst = [id, name, logo]
    data_lst.extend(items)
    data_lst.append(title_lst)

    return data_lst

In [4]:

# Probe publisher ids 1..9999 one by one and collect every page that yields data.
publishers = []
for i in tqdm(range(1, 10000)):
    res = publisher_scraper(id=i)
    if res:
        publishers.append(res)
    # Pause after every request, hits and misses alike: a miss still cost the
    # server a page load, so skipping the delay (as `continue` used to do) would
    # fire consecutive requests with no gap at all.
    time.sleep(0.5 + random.random())

  0%|          | 0/9999 [00:00<?, ?it/s]

In [5]:
# Column order follows the list layout returned by publisher_scraper:
# id, name, logo, the data-cell texts, then the list of title links.
# NOTE(review): this assumes every scraped page produced exactly six data cells — confirm.
publisher_columns = [
    'id',
    'name',
    'logo_link',
    'titles_tracking',
    'comics_tracking',
    'most_valuable',
    'most_owned',
    'most_wanted',
    'new_additions',
    'titles_AZ',
]
df = pd.DataFrame(publishers, columns=publisher_columns)

In [7]:
# Persist the scraped table; with the default index=True the row index is written
# as an unnamed first column.
df.to_csv(path_or_buf='publishers_data.csv')

In [8]:
# Column dtypes and non-null counts; memory_usage='deep' asks pandas to measure the
# real size of object-dtype values instead of a shallow estimate.
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6153 entries, 0 to 6152
Data columns (total 10 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   id               6153 non-null   int64 
 1   name             6153 non-null   object
 2   logo_link        6153 non-null   object
 3   titles_tracking  6153 non-null   object
 4   comics_tracking  6153 non-null   object
 5   most_valuable    6153 non-null   object
 6   most_owned       6153 non-null   object
 7   most_wanted      6153 non-null   object
 8   new_additions    6153 non-null   object
 9   titles_AZ        6153 non-null   object
dtypes: int64(1), object(9)
memory usage: 3.9 MB
