In [632]:
import re
import jsonlines
import pandas as pd
import urllib.request
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
import random
from time import sleep
from os import path

import comicvision.webscraper as webscraper

In [633]:
def strip_channel_from_title(title: str):
    channel = re.search(r"\[(.*?)\]", title)
    if channel is None:
        return title
    else:
        return title.split(channel.group())[0].strip()


def check_if_issue_is_variant(title: str, metadata_path: str) -> bool: 
    logged_metadata = []

    with jsonlines.open('./metadata/covers.jsonl', mode='r') as reader:
        for item in reader:
            logged_metadata.append(item)

    titles = list(pd.DataFrame(logged_metadata)['title'].values)
    
    return reduce(lambda x, y: x | y, [strip_channel_from_title(title) in x for x in titles])

In [634]:
# TODO: refactor this method to make it more functional and easier to test
# TODO: add tests!

def get_all_from_publisher_page(publisher_url: str, page: int):
    """
    Do a thing...
    """
    # global val
    URL = 'https://www.comics.org'

    # get publisher page
    publisher_html = webscraper.simple_get(publisher_url + '?page={}'.format(page))
    publisher_soup = webscraper.transform_simple_get_html(publisher_html)
    
    # parse series table from publisher page
    series_name = [result.find('a').contents[0] for result in publisher_soup.find_all('td', {'class': 'name'})]
    series_href = [result.find('a')['href'] for result in  publisher_soup.find_all('td', {'class': 'name'})]
    series_year = [result.contents[0] for result in publisher_soup.find_all('td', {'class': 'year'})]
    series_issue_count = [result.contents[0] for result in publisher_soup.find_all('td', {'class': 'issue_count'})]
    series_published = [result.contents[0] for result in publisher_soup.find_all('td', {'class': 'published'})]
    
    # create dataframe of publisher series (on page)
    series_df = pd.DataFrame(list(zip(series_name, series_href, series_year, series_issue_count, series_published)),
                             columns=['name', 'href', 'year', 'issue_count', 'published'])

    # parse issue count as int from issue_count column
    series_df['issue_count_int'] = series_df['issue_count'].apply(lambda x: int(re.search(r'\d+', x).group()))

    # iterate over series dataframe and get issue covers and metadata
    for series_name, series_page_href, issue_count in zip(series_df['name'], series_df['href'], series_df['issue_count_int']):
        
        if issue_count < 12:
            pass
        else:
            print(series_name, series_page_href, issue_count)
            
            # construct series page url
            series_page_url  = URL + series_page_href

            # get series page
            series_page_html = webscraper.simple_get(series_page_url)
            series_page_soup = webscraper.transform_simple_get_html(series_page_html)

            # get cover gallery url for series
            if series_page_soup.find('a',  href=True, text='Cover Gallery') is None:
                pass
            else:
                cover_gallery_href = series_page_soup.find('a',  href=True, text='Cover Gallery')['href']
                cover_gallery_base_url = URL + cover_gallery_href

                # get cover gallery page
                cover_gallery_html = webscraper.simple_get(cover_gallery_base_url)
                cover_gallery_soup = webscraper.transform_simple_get_html(cover_gallery_html)

                if len(cover_gallery_soup.find_all('a', {'class': "btn btn-default btn-sm"})) == 0:

                    # get issue hrefs from all linked issues on cover gallery
                    cover_gallery_hrefs = filter(lambda x: '/issue/' in x['href'] and '/cover/' not in x['href'], cover_gallery_soup.find_all('a',  href=True))
                    issue_hrefs = [x['href'] for x in cover_gallery_hrefs]

                    #  construct issue urls from issue hrefs
                    issue_urls = [URL + issue_href for issue_href in  issue_hrefs]

                    # scrape issues
                    for issue_url in issue_urls:

                        # get issue page
                        issue_html = webscraper.simple_get(issue_url)
                        issue_soup = webscraper.transform_simple_get_html(issue_html)

                        # metadata
                        metadata = {}
                        metadata['series_name'] = series_name.replace('/', '|')

                        # scrape metadata from issue page
                        # title, price, pages, color, dimension, paper_stock, binding, publishing_format
                        def get_issue_metadata(soup, name):
                            if len(soup.find_all('dd', id=name)) > 0:
                                if (name != 'issue_indicia_publisher') & (name != 'issue_brand'):
                                    return soup.find_all('dd', id=name)[0].contents[0].strip()
                                else:
                                    try:
                                        return soup.find_all('dd', id=name)[0].find('a').contents[0]
                                    except:
                                        return ""
                            else:
                                return ""

                        # post process the issue title removing extraneous characters
                        metadata['title'] = issue_soup.find('title').contents[0].replace('\n', '').strip().split(' :: ')[-1].replace('/', '|')

                        # check if issue is redundant to an issue  already we pulled (variant)
                        if path.exists('./metadata/covers.jsonl'):
                            is_variant = check_if_issue_is_variant(title=metadata['title'], metadata_path='./metadata/covers.jsonl')
                            if is_variant:
                                pass
                            else:
                                metadata['on_sale_date'] = get_issue_metadata(issue_soup, name='on_sale_date')
                                metadata['indicia_frequency'] = get_issue_metadata(issue_soup, name='indicia_frequency')
                                metadata['issue_indicia_publisher'] = get_issue_metadata(issue_soup, name='issue_indicia_publisher')
                                metadata['issue_brand'] = get_issue_metadata(issue_soup, name='issue_brand')
                                metadata['issue_price'] = get_issue_metadata(issue_soup, name='issue_price')
                                metadata['issue_pages'] = get_issue_metadata(issue_soup, name='issue_pages')
                                metadata['format_color'] = get_issue_metadata(issue_soup, name='format_color')
                                metadata['format_dimensions'] = get_issue_metadata(issue_soup, name='format_dimensions')
                                metadata['format_paper_stock'] = get_issue_metadata(issue_soup, name='format_paper_stock')
                                metadata['format_binding'] = get_issue_metadata(issue_soup, name='format_binding')
                                metadata['format_publishing_format'] = get_issue_metadata(issue_soup, name='format_publishing_format')
                                metadata['rating'] = get_issue_metadata(issue_soup, name='rating')
                                metadata['indexer_notes'] = " | ".join([x.contents[0].replace('\n', '').strip() for x in issue_soup.find_all('p')])

                                all_issue_credits = list(zip(
                                    issue_soup.find_all('span', {'class': 'credit_label'}),  
                                    issue_soup.find_all('span', {'class': 'credit_value'})))

                                metadata['synopsis'] = " | ".join(list(filter(lambda x: x != '', [x[1].contents[0] if x[0].contents[0] == 'Synopsis' else '' for x in all_issue_credits])))

                                # get cover section
                                cover = issue_soup.find("div", {"class": "cover"})

                                # cover credits: editing, script, pencils, inks, colors, letters, characters, etc...
                                cover_credits = list(zip(
                                    [result.contents[0] for result in cover.find_all('span', {'class': 'credit_label'})],
                                    [result.contents[0] for result in cover.find_all('span', {'class': 'credit_value'})]
                                ))

                                metadata.update({'cover_{}'.format(x.lower()): y for x, y in cover_credits})
                                metadata.pop('cover_reprints', None)

                                # get the cover url
                                cover_img_href = cover.find("div", {'coverImage'}).a['href']
                                cover_img_url = URL + cover_img_href

                                # get cover page
                                cover_img_html = webscraper.simple_get(cover_img_url)
                                cover_img_soup = webscraper.transform_simple_get_html(cover_img_html)

                                # get image urls from cover page
                                cover_images = cover_img_soup.select('img')

                                # get raw, highest res image
                                cover_images = list(filter(lambda x:'files1.comics.org//img/' in x['src'], cover_images))
                                cover = cover_images[0]['src']

                                # construct where to save the cover image
                                save_as = "{} -- {}".format(metadata['series_name'], metadata['title'])
                                save_to = './covers/' + save_as + '.jpg'

                                metadata["cover_image_file_name"] = save_as

                                # save cover image
                                urllib.request.urlretrieve(cover, save_to)

                                # TODO: reason about response of save; if successful, save metadata, else contine
                                # save metadata
                                with jsonlines.open('./metadata/covers.jsonl', mode='a') as writer:
                                    writer.write(metadata)

                                # TODO: write to log... timestamp/publisher/series/issue/
                                now = datetime.datetime.today().strftime("%d/%m/%Y %H:%M:%S")
                                publisher_int = publisher_url.split('/')[-2]

                                log = {'timestamp': now, 'publisher': publisher_int, 'series': metadata['series_name'],  'issue': metadata['title']}
                                with jsonlines.open('./metadata/log.jsonl', mode='a') as writer:
                                    writer.write(log)

                                # slow down the requests so we don't take too many resources and get blocked
                                sleep(random.uniform(5, 10))

                        else:
                            metadata['on_sale_date'] = get_issue_metadata(issue_soup, name='on_sale_date')
                            metadata['indicia_frequency'] = get_issue_metadata(issue_soup, name='indicia_frequency')
                            metadata['issue_indicia_publisher'] = get_issue_metadata(issue_soup, name='issue_indicia_publisher')
                            metadata['issue_brand'] = get_issue_metadata(issue_soup, name='issue_brand')
                            metadata['issue_price'] = get_issue_metadata(issue_soup, name='issue_price')
                            metadata['issue_pages'] = get_issue_metadata(issue_soup, name='issue_pages')
                            metadata['format_color'] = get_issue_metadata(issue_soup, name='format_color')
                            metadata['format_dimensions'] = get_issue_metadata(issue_soup, name='format_dimensions')
                            metadata['format_paper_stock'] = get_issue_metadata(issue_soup, name='format_paper_stock')
                            metadata['format_binding'] = get_issue_metadata(issue_soup, name='format_binding')
                            metadata['format_publishing_format'] = get_issue_metadata(issue_soup, name='format_publishing_format')
                            metadata['rating'] = get_issue_metadata(issue_soup, name='rating')
                            metadata['indexer_notes'] = " | ".join([x.contents[0].replace('\n', '').strip() for x in issue_soup.find_all('p')])

                            all_issue_credits = list(zip(
                                issue_soup.find_all('span', {'class': 'credit_label'}), 
                                issue_soup.find_all('span', {'class': 'credit_value'})))

                            metadata['synopsis'] = " | ".join(list(filter(lambda x: x != '', [x[1].contents[0] if x[0].contents[0] == 'Synopsis' else '' for x in all_issue_credits])))

                            # get cover section
                            cover = issue_soup.find("div", {"class": "cover"})

                            # cover credits: editing, script, pencils, inks, colors, letters, characters, etc...
                            cover_credits = list(zip(
                                [result.contents[0] for result in cover.find_all('span', {'class': 'credit_label'})],
                                [result.contents[0] for result in cover.find_all('span', {'class': 'credit_value'})]
                            ))

                            metadata.update({'cover_{}'.format(x.lower()): y for x, y in cover_credits})
                            metadata.pop('cover_reprints', None)

                            # get the cover url
                            cover_img_href = cover.find("div", {'coverImage'}).a['href']
                            cover_img_url = URL + cover_img_href

                            # get cover page
                            cover_img_html = webscraper.simple_get(cover_img_url)
                            cover_img_soup = webscraper.transform_simple_get_html(cover_img_html)

                            # get image urls from cover page
                            cover_images = cover_img_soup.select('img')

                            # get raw, highest res image
                            cover_images = list(filter(lambda x:'files1.comics.org//img/' in x['src'], cover_images))
                            cover = cover_images[0]['src']

                            # construct where to save the cover image
                            save_as = "{} -- {}".format(metadata['series_name'], metadata['title'])
                            save_to = './covers/' + save_as + '.jpg'

                            metadata["cover_image_file_name"] = save_as

                            # save cover image
                            urllib.request.urlretrieve(cover, save_to)

                            # TODO: reason about response of save; if successful, save metadata, else contine
                            # save metadata
                            with jsonlines.open('./metadata/covers.jsonl', mode='a') as writer:
                                writer.write(metadata)

                            # TODO: write to log... timestamp/publisher/series/issue/
                            now = datetime.datetime.today().strftime("%d/%m/%Y %H:%M:%S")
                            publisher_int = publisher_url.split('/')[-2]

                            log = {'timestamp': now, 'publisher': publisher_int, 'series': metadata['series_name'],  'issue': metadata['title']}
                            with jsonlines.open('./metadata/log.jsonl', mode='a') as writer:
                                writer.write(log)

                            # slow down the requests so we don't take too many resources and get blocked
                            sleep(random.uniform(5, 10))
                else:
                    cover_gallery_pages = list(filter(lambda x: x.isdigit(), [x.contents[0] for x in cover_gallery_soup.find_all('a', {'class': "btn btn-default btn-sm"})]))
                    cover_gallery_range = max([int(x) for x in cover_gallery_pages])

                    for i in range(1, cover_gallery_range + 1):
                        cover_gallery_url = str(cover_gallery_base_url + '/?page={}').format(i)
                        cover_gallery_html = webscraper.simple_get(cover_gallery_url)
                        cover_gallery_soup = webscraper.transform_simple_get_html(cover_gallery_html)

                        # get issue hrefs from all linked issues on cover gallery
                        cover_gallery_hrefs = filter(lambda x: '/issue/' in x['href'] and '/cover/' not in x['href'], cover_gallery_soup.find_all('a',  href=True))
                        issue_hrefs = [x['href'] for x in cover_gallery_hrefs]

                        #  construct issue urls from issue hrefs
                        issue_urls = [URL + issue_href for issue_href in  issue_hrefs]

                        # scrape issues
                        for issue_url in issue_urls:

                            # get issue page
                            issue_html = webscraper.simple_get(issue_url)
                            issue_soup = webscraper.transform_simple_get_html(issue_html)

                            # metadata
                            metadata = {}
                            metadata['series_name'] = series_name.replace('/', '|')

                            # scrape metadata from issue page
                            # title, price, pages, color, dimension, paper_stock, binding, publishing_format
                            def get_issue_metadata(soup, name):
                                if len(soup.find_all('dd', id=name)) > 0:
                                    if (name != 'issue_indicia_publisher') & (name != 'issue_brand'):
                                        return soup.find_all('dd', id=name)[0].contents[0].strip()
                                    else:
                                        try:
                                            return soup.find_all('dd', id=name)[0].find('a').contents[0]
                                        except:
                                            return ""
                                else:
                                    return ""

                            metadata['title'] = issue_soup.find('title').contents[0].replace('\n', '').strip().split(' :: ')[-1].replace('/', '|')

                            # check if issue is redundant to an issue  already we pulled (variant)
                            if path.exists('./metadata/covers.jsonl'):
                                is_variant = check_if_issue_is_variant(title=metadata['title'], metadata_path='./metadata/covers.jsonl')
                                if is_variant:
                                    pass
                                else:
                                    metadata['on_sale_date'] = get_issue_metadata(issue_soup, name='on_sale_date')
                                    metadata['indicia_frequency'] = get_issue_metadata(issue_soup, name='indicia_frequency')
                                    metadata['issue_indicia_publisher'] = get_issue_metadata(issue_soup, name='issue_indicia_publisher')
                                    metadata['issue_brand'] = get_issue_metadata(issue_soup, name='issue_brand')
                                    metadata['issue_price'] = get_issue_metadata(issue_soup, name='issue_price')
                                    metadata['issue_pages'] = get_issue_metadata(issue_soup, name='issue_pages')
                                    metadata['format_color'] = get_issue_metadata(issue_soup, name='format_color')
                                    metadata['format_dimensions'] = get_issue_metadata(issue_soup, name='format_dimensions')
                                    metadata['format_paper_stock'] = get_issue_metadata(issue_soup, name='format_paper_stock')
                                    metadata['format_binding'] = get_issue_metadata(issue_soup, name='format_binding')
                                    metadata['format_publishing_format'] = get_issue_metadata(issue_soup, name='format_publishing_format')
                                    metadata['rating'] = get_issue_metadata(issue_soup, name='rating')
                                    metadata['indexer_notes'] = " | ".join([x.contents[0].replace('\n', '').strip() for x in issue_soup.find_all('p')])

                                    all_issue_credits = list(zip(
                                        issue_soup.find_all('span', {'class': 'credit_label'}),  
                                        issue_soup.find_all('span', {'class': 'credit_value'})))

                                    metadata['synopsis'] = " | ".join(list(filter(lambda x: x != '', [x[1].contents[0] if x[0].contents[0] == 'Synopsis' else '' for x in all_issue_credits])))

                                    # get cover section
                                    cover = issue_soup.find("div", {"class": "cover"})

                                    # cover credits: editing, script, pencils, inks, colors, letters, characters, etc...
                                    cover_credits = list(zip(
                                        [result.contents[0] for result in cover.find_all('span', {'class': 'credit_label'})],
                                        [result.contents[0] for result in cover.find_all('span', {'class': 'credit_value'})]
                                    ))

                                    metadata.update({'cover_{}'.format(x.lower()): y for x, y in cover_credits})
                                    metadata.pop('cover_reprints', None)

                                    # get the cover url
                                    cover_img_href = cover.find("div", {'coverImage'}).a['href']
                                    cover_img_url = URL + cover_img_href

                                    # get cover page
                                    cover_img_html = webscraper.simple_get(cover_img_url)
                                    cover_img_soup = webscraper.transform_simple_get_html(cover_img_html)

                                    # get image urls from cover page
                                    cover_images = cover_img_soup.select('img')

                                    # get raw, highest res image
                                    cover_images = list(filter(lambda x:'files1.comics.org//img/' in x['src'], cover_images))
                                    cover = cover_images[0]['src']

                                    # construct where to save the cover image
                                    save_as = "{} -- {}".format(metadata['series_name'], metadata['title'])
                                    save_to = './covers/' + save_as + '.jpg'

                                    metadata["cover_image_file_name"] = save_as

                                    # save cover image
                                    urllib.request.urlretrieve(cover, save_to)

                                    # TODO: reason about response of save; if successful, save metadata, else contine
                                    # save metadata
                                    with jsonlines.open('./metadata/covers.jsonl', mode='a') as writer:
                                        writer.write(metadata)

                                    # TODO: write to log... timestamp/publisher/series/issue/
                                    now = datetime.datetime.today().strftime("%d/%m/%Y %H:%M:%S")
                                    publisher_int = publisher_url.split('/')[-2]

                                    log = {'timestamp': now, 'publisher': publisher_int, 'series': metadata['series_name'],  'issue': metadata['title']}
                                    with jsonlines.open('./metadata/log.jsonl', mode='a') as writer:
                                        writer.write(log)

                                    # slow down the requests so we don't take too many resources and get blocked
                                    sleep(random.uniform(5, 10))

                            else:
                                metadata['on_sale_date'] = get_issue_metadata(issue_soup, name='on_sale_date')
                                metadata['indicia_frequency'] = get_issue_metadata(issue_soup, name='indicia_frequency')
                                metadata['issue_indicia_publisher'] = get_issue_metadata(issue_soup, name='issue_indicia_publisher')
                                metadata['issue_brand'] = get_issue_metadata(issue_soup, name='issue_brand')
                                metadata['issue_price'] = get_issue_metadata(issue_soup, name='issue_price')
                                metadata['issue_pages'] = get_issue_metadata(issue_soup, name='issue_pages')
                                metadata['format_color'] = get_issue_metadata(issue_soup, name='format_color')
                                metadata['format_dimensions'] = get_issue_metadata(issue_soup, name='format_dimensions')
                                metadata['format_paper_stock'] = get_issue_metadata(issue_soup, name='format_paper_stock')
                                metadata['format_binding'] = get_issue_metadata(issue_soup, name='format_binding')
                                metadata['format_publishing_format'] = get_issue_metadata(issue_soup, name='format_publishing_format')
                                metadata['rating'] = get_issue_metadata(issue_soup, name='rating')
                                metadata['indexer_notes'] = " | ".join([x.contents[0].replace('\n', '').strip() for x in issue_soup.find_all('p')])

                                all_issue_credits = list(zip(
                                    issue_soup.find_all('span', {'class': 'credit_label'}), 
                                    issue_soup.find_all('span', {'class': 'credit_value'})))

                                metadata['synopsis'] = " | ".join(list(filter(lambda x: x != '', [x[1].contents[0] if x[0].contents[0] == 'Synopsis' else '' for x in all_issue_credits])))

                                # get cover section
                                cover = issue_soup.find("div", {"class": "cover"})

                                # cover credits: editing, script, pencils, inks, colors, letters, characters, etc...
                                cover_credits = list(zip(
                                    [result.contents[0] for result in cover.find_all('span', {'class': 'credit_label'})],
                                    [result.contents[0] for result in cover.find_all('span', {'class': 'credit_value'})]
                                ))

                                metadata.update({'cover_{}'.format(x.lower()): y for x, y in cover_credits})
                                metadata.pop('cover_reprints', None)

                                # get the cover url
                                cover_img_href = cover.find("div", {'coverImage'}).a['href']
                                cover_img_url = URL + cover_img_href

                                # get cover page
                                cover_img_html = webscraper.simple_get(cover_img_url)
                                cover_img_soup = webscraper.transform_simple_get_html(cover_img_html)

                                # get image urls from cover page
                                cover_images = cover_img_soup.select('img')

                                # get raw, highest res image
                                cover_images = list(filter(lambda x:'files1.comics.org//img/' in x['src'], cover_images))
                                cover = cover_images[0]['src']

                                # construct where to save the cover image
                                save_as = "{} -- {}".format(metadata['series_name'], metadata['title'])
                                save_to = './covers/' + save_as + '.jpg'

                                metadata["cover_image_file_name"] = save_as

                                # save cover image
                                urllib.request.urlretrieve(cover, save_to)

                                # TODO: reason about response of save; if successful, save metadata, else contine
                                # save metadata
                                with jsonlines.open('./metadata/covers.jsonl', mode='a') as writer:
                                    writer.write(metadata)

                                # TODO: write to log... timestamp/publisher/series/issue/
                                now = datetime.datetime.today().strftime("%d/%m/%Y %H:%M:%S")
                                publisher_int = publisher_url.split('/')[-2]

                                log = {'timestamp': now, 'publisher': publisher_int, 'series': metadata['series_name'],  'issue': metadata['title']}
                                with jsonlines.open('./metadata/log.jsonl', mode='a') as writer:
                                    writer.write(log)

                                # slow down the requests so we don't take too many resources and get blocked
                                sleep(random.uniform(5, 10))


In [635]:
get_all_from_publisher_page(publisher_url='https://www.comics.org/publisher/54/', page=3)

All-American Western /series/532/ 24
All-Flash /series/211/ 32
The All-New Batman: The Brave and the Bold /series/53270/ 16
All-Star Comics /series/140/ 57
https://www.comics.org/issue/877/
https://www.comics.org/issue/1037/
https://www.comics.org/issue/1205/
https://www.comics.org/issue/935246/
https://www.comics.org/issue/1291/
https://www.comics.org/issue/1443/
https://www.comics.org/issue/1551/
https://www.comics.org/issue/1681/
https://www.comics.org/issue/1816/
https://www.comics.org/issue/1980/
https://www.comics.org/issue/2094/
https://www.comics.org/issue/2206/
https://www.comics.org/issue/2322/
https://www.comics.org/issue/2422/
https://www.comics.org/issue/2545/
https://www.comics.org/issue/2703/
https://www.comics.org/issue/2822/
https://www.comics.org/issue/2934/
https://www.comics.org/issue/3160/
https://www.comics.org/issue/3361/
https://www.comics.org/issue/3530/
https://www.comics.org/issue/3691/
https://www.comics.org/issue/3862/
https://www.comics.org/issue/4110/
htt

In [636]:
get_all_from_publisher_page(publisher_url='https://www.comics.org/publisher/54/', page=4)

Animaniacs /series/14399/ 59
https://www.comics.org/issue/249264/
https://www.comics.org/issue/249265/
https://www.comics.org/issue/1844242/
https://www.comics.org/issue/249266/
https://www.comics.org/issue/1540444/
https://www.comics.org/issue/249267/
https://www.comics.org/issue/249268/
https://www.comics.org/issue/249269/
https://www.comics.org/issue/249270/
https://www.comics.org/issue/249271/
https://www.comics.org/issue/1971955/
https://www.comics.org/issue/1541115/
https://www.comics.org/issue/249273/
https://www.comics.org/issue/249274/
https://www.comics.org/issue/1980066/
https://www.comics.org/issue/249275/
https://www.comics.org/issue/249276/
https://www.comics.org/issue/249277/
https://www.comics.org/issue/249278/
https://www.comics.org/issue/249279/
https://www.comics.org/issue/249280/
https://www.comics.org/issue/249281/
https://www.comics.org/issue/1811997/
https://www.comics.org/issue/249282/
https://www.comics.org/issue/249283/
https://www.comics.org/issue/1979950/
ht

In [None]:
get_all_from_publisher_page(publisher_url='https://www.comics.org/publisher/54/', page=5)

Art Ops /series/93356/ 12
Astro City /series/74788/ 52
https://www.comics.org/issue/1119068/
https://www.comics.org/issue/1142945/
https://www.comics.org/issue/1127335/
https://www.comics.org/issue/1136759/
https://www.comics.org/issue/1140832/
https://www.comics.org/issue/1145504/
https://www.comics.org/issue/1158648/
https://www.comics.org/issue/1168285/
https://www.comics.org/issue/1173527/
https://www.comics.org/issue/1179220/
https://www.comics.org/issue/1186388/
https://www.comics.org/issue/1196969/
https://www.comics.org/issue/1206036/
https://www.comics.org/issue/1212824/
https://www.comics.org/issue/1246738/
https://www.comics.org/issue/1251362/
https://www.comics.org/issue/1261780/
https://www.comics.org/issue/1281095/
https://www.comics.org/issue/1288560/
https://www.comics.org/issue/1304992/
https://www.comics.org/issue/1318736/
https://www.comics.org/issue/1328030/
https://www.comics.org/issue/1338542/
https://www.comics.org/issue/1353467/
https://www.comics.org/issue/1420

In [610]:
# # TODO: load metadata and return aggregate / summary statistics
# # TODO: write method to display cover image w/ cover metadata and add annotations to image
# # TODO: consider ideate/innotater for annotating directly in Jupyter notebooks

logged_metadata = []

with jsonlines.open('./metadata/covers.jsonl', mode='r') as reader:
    for item in reader:
        logged_metadata.append(item)
        
df = pd.DataFrame(logged_metadata)

In [614]:
def get_issue_number_from_title(title):
    issue = re.search(r"([#?])(\d+)\b", title.replace(',', ''))
    if issue is None:
        return np.nan
    else:
        return np.int(issue.group().replace('#', ''))

df['issue_number'] = df['title'].apply(get_issue_number_from_title)

df.drop('issue_number', axis=1).describe().T

Unnamed: 0,count,unique,top,freq
cover_characters,2333,1324,Superman,227
cover_colors,2711,189,?,1721
cover_editing,47,4,Robert Greenberger (cover editor),33
cover_first line of dialogue or text,277,277,"A little more off the bottom, please!",1
cover_genre,2681,29,superhero,1915
cover_image_file_name,2770,2770,Adventures of Superman -- Adventures of Superm...,1
cover_inks,2712,481,Stan Kaye,220
cover_job number,312,311,C-422,2
cover_keywords,781,594,celebrity,27
cover_letters,2350,37,?,1654


In [602]:
# get list of unique characters across all  covers
def get_value_counts(df, column):
    """
    """
    return pd.Series("; ".join(df[column].dropna()).split('; ')).value_counts()

get_value_counts(df, 'cover_characters')[:20]

Superman                          635
Kal-El]                           208
Superman [Clark Kent              202
Superman [Clark Kent]             186
Lois Lane                         142
Superboy                          123
Bob Hope                          102
Lex Luthor                         83
Supergirl                          73
Jerry Lewis                        60
Superboy [Clark Kent]              52
Doiby Dickles                      49
Jimmy Olsen                        48
The Green Lantern [Alan Scott]     39
Saturn Girl                        36
Lana Lang                          33
Krypto                             30
Green Lantern                      29
Cosmic Boy                         29
Agent Graves                       29
dtype: int64

In [604]:
get_value_counts(df, 'cover_pencils')[:20]

Curt Swan                  349
Bob Oksner                 113
Gil Kane                    99
Dave Johnson (signed)       91
Win Mortimer                84
Owen Fitzgerald             78
Wayne Boring                68
Jerry Grandenetti           59
J. G. Jones (signed)        49
Al Plastino                 49
Ross Andru (signed)         48
Jack Burnley                41
Tom Grummett (signed)       37
Neal Adams                  35
Dan Jurgens (signed)        29
Stuart Immonen (signed)     29
Jerry Ordway (signed)       27
Nick Cardy                  27
Mort Drucker                21
Irwin Hasen                 21
dtype: int64

In [618]:
get_value_counts(df, 'series_name')[:20]

Action Comics                                  928
Adventure Comics                               499
Adventures of Superman                         241
All-American Men of War                        116
100 Bullets                                    113
The Adventures of Bob Hope                     109
All-American Comics                            102
All Star Western                                89
The Adventures of Jerry Lewis                   84
80 Page Giant Magazine                          56
52                                              52
The Adventures of Rex the Wonder Dog            46
Action Comics Weekly                            42
The Adventures of Dean Martin & Jerry Lewis     40
Advanced Dungeons & Dragons Comic Book          36
The All New Atom                                25
Air                                             24
All Funny Comics                                23
Adventures in the DC Universe                   19
All Star Batman                

In [625]:
def concat_values_by_series(df, concat_column, series_name):    

    list_of_synopsis = df[
        (df['series_name'] == series_name) & df['issue_number'] != 0.0].sort_values('issue_number')[concat_column]

    return " | ".join(list_of_synopsis.values)

print(concat_values_by_series(df, concat_column='synopsis', series_name='Adventure Comics')[:2000], '...')

Conner settles into his new life, following in the footsteps of Superman -- living with Ma Kent, going to Smallville High, joining a team of super-heroes, and helping anyone who needs help -- but he worries that he will also follow the path of his other "father." | Though the Legion has almost fully re-formed in the future, Starman and Tellus remain in the present, where Starman begs Tellus to help him fulfill R. J. Brande's final request so he can be reunited with his one true love, Dream Girl. | As Luthor aids Brainiac in recovering Kandor, in exchange for the Earth, Superboy meets with Wonder Girl for the first time since his return. | Lightning Lord summons Lightning Lad and sends him on a quest to find a lost relative. | When Krypto rounds up some of Superboy’s old enemies, Conner wonders if Krypto can sniff out where Lex Luthor is.  When that doesn’t work, Superboy turns to Red Robin for help.  While the two catch up in Paris, the object of Superboy’s search discovers that the Te

In [517]:
df.describe().T

Unnamed: 0,count,unique,top,freq
cover_characters,2333,1324,Superman,227
cover_colors,2711,189,?,1721
cover_editing,47,4,Robert Greenberger (cover editor),33
cover_first line of dialogue or text,277,277,"A little more off the bottom, please!",1
cover_genre,2681,29,superhero,1915
cover_image_file_name,2770,2770,Adventures of Superman -- Adventures of Superm...,1
cover_inks,2712,481,Stan Kaye,220
cover_job number,312,311,C-422,2
cover_keywords,781,594,celebrity,27
cover_letters,2350,37,?,1654


In [440]:
# dc US comics publisher page
url = 'https://www.comics.org/publisher/54/?page=1'

html = webscraper.simple_get(url)
soup = webscraper.transform_simple_get_html(html)

In [441]:
# parse series' metadata

name = soup.find_all('td', {'class': 'name'})
year = soup.find_all('td', {'class': 'year'})
issue_count = soup.find_all('td', {'class': 'issue_count'})
covers = soup.find_all('td', {'class': 'covers'})
published = soup.find_all('td', {'class': 'published'})

In [442]:
# TODO: get list of series names and urls from a publisher page (e.g. page=1)

n = [result.find('a').contents[0] for result in name]
href = [result.find('a')['href'] for result in  name]
y = [result.contents[0] for result in year]
i = [result.contents[0] for result in issue_count]
# c = [result.find('a').contents[0] for result in covers]
p = [result.contents[0] for result in published]

In [443]:
# TODO: reason about a series' metadata (# of issues, #  of covers)

# create series dataframe
series_df = pd.DataFrame(list(zip(n, href,  y, i, p)), columns=['name', 'href', 'year', 'issue_count', 'published'])

# parse issue count as int from issue_count  column
series_df['issue_count_int'] = series_df['issue_count'].apply(lambda x: int(re.search(r'\d+', x).group()))

In [444]:
series_df[series_df['issue_count_int'] > 12]

Unnamed: 0,name,href,year,issue_count,published,issue_count_int
3,100 Bullets,/series/6133/,1999,100 issues (100 indexed),August 1999 - April 2009,100
4,100 Bullets,/series/24535/,2000,13 issues (8 indexed),[January] 2000 - [July] 2009,13
16,1st Issue Special,/series/2212/,1975,13 issues (13 indexed),April 1975 - April 1976,13
22,52,/series/16626/,2006,52 issues (52 indexed),July 2006 - July 2007,52
34,80 Page Giant Magazine,/series/1620/,1964,56 issues (15 indexed),August 1964 - February-March 1969,56
98,Action Comics,/series/97/,1938,866 issues (866 indexed),June 1938 - October 2011,866
99,Action Comics,/series/59922/,2011,117 issues (116 indexed),November 2011 - Present,117


In [445]:
url = 'https://www.comics.org'
series_urls = url + series_df['href']

# take a series  with many issues...
series_page_url = series_urls[98]

series_page_html = webscraper.simple_get(series_page_url)
series_page_soup = webscraper.transform_simple_get_html(series_page_html)

In [484]:
# get 'series details cover gallery' url
cover_gallery_url = url + series_page_soup.find('a',  href=True, text='Cover Gallery')['href']

cover_gallery_html = webscraper.simple_get(cover_gallery_url)
cover_gallery_soup = webscraper.transform_simple_get_html(cover_gallery_html)

try:
    cover_gallery_pages = list(filter(lambda x: x.isdigit(), [x.contents[0] for x in cover_gallery_soup.find_all('a', {'class': "btn btn-default btn-sm"})]))
    cover_gallery_range = max([int(x) for x in cover_gallery_pages])

    for i in range(1, cover_gallery_range+1):
        cover_gallery_url = str(cover_gallery_url + '/?page={}').format(i)
        
except:
    continue

In [379]:
# # get issue hrefs from all linked issues on cover gallery
# issue_hrefs = list()
# for i in range(1, (84 + 1)):
#     if cover_gallery_soup.find('a',  href=True,  text=i) is not None:
#         issue_tag = cover_gallery_soup.find('a',  href=True,  text=i)
#         issue_hrefs.append(issue_hrefs.append(issue_tag['href']))
#         continue
#     elif cover_gallery_soup.find('a', href=True,  text='{} [Direct]'.format(i)) is not None:
#         issue_tag = cover_gallery_soup.find('a', href=True,  text='{} [Direct]'.format(i))
#         issue_hrefs.append(issue_hrefs.append(issue_tag['href']))
#         continue
#     else:
#         continue

# # filter out junk hrefs (I dunno... this is scraping... <shrug>)
# issue_hrefs = list(filter(lambda x: 'issue' in x, [i for i in issue_hrefs if i]))

# get issue hrefs from all linked issues on cover gallery
cover_gallery_hrefs = filter(lambda x: '/issue/' in x['href'] and '/cover/' not in x['href'], cover_gallery_soup.find_all('a',  href=True))
issue_hrefs = [x['href'] for x in cover_gallery_hrefs]

#  construct issue urls from issue hrefs
issue_urls = [url + issue_href for issue_href in  issue_hrefs]

In [380]:
issue_urls

['https://www.comics.org/issue/552706/',
 'https://www.comics.org/issue/625171/',
 'https://www.comics.org/issue/873728/',
 'https://www.comics.org/issue/625172/',
 'https://www.comics.org/issue/875543/',
 'https://www.comics.org/issue/625173/',
 'https://www.comics.org/issue/875546/',
 'https://www.comics.org/issue/671557/',
 'https://www.comics.org/issue/875550/',
 'https://www.comics.org/issue/680941/',
 'https://www.comics.org/issue/875549/',
 'https://www.comics.org/issue/703055/',
 'https://www.comics.org/issue/875498/',
 'https://www.comics.org/issue/716816/',
 'https://www.comics.org/issue/875586/',
 'https://www.comics.org/issue/731506/',
 'https://www.comics.org/issue/875587/',
 'https://www.comics.org/issue/740788/',
 'https://www.comics.org/issue/875588/',
 'https://www.comics.org/issue/746131/',
 'https://www.comics.org/issue/875193/',
 'https://www.comics.org/issue/756186/',
 'https://www.comics.org/issue/875338/',
 'https://www.comics.org/issue/760340/',
 'https://www.co

In [381]:
# take an issue url from the covers gallery..
issue_url = issue_urls[11]

issue_html = webscraper.simple_get(issue_url)
issue_soup = webscraper.transform_simple_get_html(issue_html)

In [382]:
issue_url

'https://www.comics.org/issue/703055/'

In [409]:
def get_issue_metadata(soup, name):
    if len(soup.find_all('dd', id=name)) > 0:
        if (name != 'issue_indicia_publisher') & (name != 'issue_brand'):
            return soup.find_all('dd', id=name)[0].contents[0].strip()
        else:
            return soup.find_all('dd', id=name)[0].find('a').contents[0]
    else:
        return ""
                    
get_issue_metadata(issue_soup, name='issue_brand')
# get_issue_metadata(issue_soup, name='issue_price')


# issue_soup.find_all('dd', id='issue_indicia_publisher')[0].find('a').contents[0]

'DC [swirl]'

In [357]:
# get metadata from issue url

# title, price, pages, color, dimension, paper_stock, binding, publishing_format
metadata = {}

metadata['title'] = soup.find('title').contents[0].replace('\n', '').strip().split(' :: ')[-1]
metadata['issue_price'] = soup.find_all('dd', id='issue_price')[0].contents[0].strip()
metadata['issue_pages'] = soup.find_all('dd', id='issue_pages')[0].contents[0].strip()
metadata['format_color'] = soup.find_all('dd', id='format_color')[0].contents[0].strip()
metadata['format_dimensions'] = soup.find_all('dd', id='format_dimensions')[0].contents[0].strip()
metadata['format_paper_stock'] = soup.find_all('dd', id='format_paper_stock')[0].contents[0].strip()
metadata['format_binding'] = soup.find_all('dd', id='format_binding')[0].contents[0].strip()
metadata['format_publishing_format'] = soup.find_all('dd', id='format_publishing_format')[0].contents[0].strip()


# get cover section
cover = soup.find("div", {"class": "cover"})
metadata['indexer_notes'] = " | ".join([x.contents[0].replace('\n', '').strip() for x in issue_soup.find_all('p')])

all_issue_credits = list(zip(issue_soup.find_all('span', {'class': 'credit_label'}),  issue_soup.find_all('span', {'class': 'credit_value'})))
metadata['synopsis'] = " | ".join(list(filter(lambda x: x != '', [x[1].contents[0] if x[0].contents[0] == 'Synopsis' else '' for x in all_issue_credits])))

# editing, script, pencils, inks, colors, letters, characters, etc...
cover_credits = list(zip(
    [result.contents[0] for result in cover.find_all('span', {'class': 'credit_label'})],
    [result.contents[0] for result in cover.find_all('span', {'class': 'credit_value'})]
))

metadata.update({'cover_{}'.format(x.lower()): y for x, y in cover_credits})
metadata

IndexError: list index out of range

In [352]:
" | ".join([x.contents[0].replace('\n', '').strip() for x in issue_soup.find_all('p')])

# issue_soup

'This issue has variants. | Indicia reads: Adventure Comics #6/Adventure Comics #509 (Variant Cover).'

In [353]:
all_issue_credits = list(zip(issue_soup.find_all('span', {'class': 'credit_label'}),  issue_soup.find_all('span', {'class': 'credit_value'})))
" | ".join(list(filter(lambda x: x != '', [x[1].contents[0] if x[0].contents[0] == 'Synopsis' else '' for x in all_issue_credits])))

'Luthor sends Superboy across time and space to retrieve the ingredients he needs to cure his sister Lena. But immediately after restoring Lena to health, Luthor shows his true colors. In anger, Superboy strikes out at Luthor, but is stopped by Brainiac. After a brief skirmish, Luthor and Brainiac teleport away, leaving Superboy to pick up the pieces. Later, in space, Luthor gazes on his next experiment.'

In [354]:
# issue_credit_labels = [x.contents[0] for x in  issue_soup.find_all('span', {'class': 'credit_label'})]
# issue_credit_values = [x.contents[0] for x in  issue_soup.find_all('span', {'class': 'credit_value'})]

# issue_credits = dict(zip(issue_credit_labels, issue_credit_values))
# issue_credits['Script']
# issue_credits['Synopsis']

In [336]:
# issue_credits

In [41]:
# href = cover.find("div", {'coverImage'}).a['href']
# cover_img_url = 'https://www.comics.org' + href
# cover_img_url

In [42]:
# raw_html = webscraper.simple_get(cover_img_url)
# html = BeautifulSoup(raw_html, 'html.parser')
# images = html.select('img')

# cover_images = list(filter(lambda x:'files1.comics.org//img/' in x['src'], images))

# cover_image = cover_images[0]['src']
# cover_image

In [43]:
# issue_title = metadata['title']
# issue_title

In [44]:
# save_to = './' + issue_title + '.jpg'

# urllib.request.urlretrieve(cover_image, save_to)