In [530]:
import difflib
import re

import jsonlines
import numpy as np
import pandas as pd

In [531]:
# # TODO: load metadata and return aggregate / summary statistics
# # TODO: write method to display cover image w/ cover metadata and add annotations to image
# # TODO: consider ideate/innotater for annotating directly in Jupyter notebooks

# Collect every record of the JSON-lines metadata log, then tabulate it.
logged_metadata = []

with jsonlines.open('../metadata/covers.jsonl', mode='r') as reader:
    # The reader is iterable, so the records can be appended in one call.
    logged_metadata.extend(reader)

# One row per logged record; columns come from the record keys.
df = pd.DataFrame(logged_metadata)

In [532]:
def get_issue_number_from_title(title):
    """Extract the issue number that follows a ``#`` (or ``?``) marker in a title.

    Commas are stripped from the title first, so a thousands separator such
    as ``#1,000`` is read as ``1000``.

    Parameters
    ----------
    title : str
        Title text to search.

    Returns
    -------
    int or float
        The digits following the first ``#``/``?`` marker as a built-in
        ``int``, or ``np.nan`` when the title contains no such marker.
    """
    issue = re.search(r"([#?])(\d+)\b", title.replace(',', ''))
    if issue is None:
        return np.nan
    # Convert only the digit group: the marker may be '?' as well as '#', and
    # the ``np.int`` alias no longer exists in current NumPy releases.
    return int(issue.group(2))

# Derive an issue number for every row from its title text.
df['issue_number'] = df['title'].apply(get_issue_number_from_title)

# Summarise the remaining columns (one row per column after transposing);
# the derived issue number and the nested 'covers' column are left out.
df.drop(columns=['issue_number', 'covers']).describe().transpose()

Unnamed: 0,count,unique,top,freq
format_binding,19606,52,Saddle-stitched,7872
format_color,19606,20,Color,10614
format_dimensions,19606,61,standard Modern Age US,4125
format_paper_stock,19606,62,,5722
format_publishing_format,19606,43,was ongoing series,7799
indexer_notes,19606,12484,,2940
indicia_frequency,19606,461,monthly,10263
issue_brand,19606,115,DC [bullet],3489
issue_indicia_publisher,19606,70,DC Comics,4913
issue_pages,19606,96,36,13739


In [533]:
def flatten(l):
    """Flatten one level of nesting.

    Parameters
    ----------
    l : iterable of iterables
        Outer collection whose elements are themselves iterable.

    Returns
    -------
    list
        The items of every inner iterable, concatenated in order. Only the
        first level is flattened; deeper nesting is left as-is.
    """
    return [item for sublist in l for item in sublist]

def match_brackets(characters: str):
    """Index every non-greedy ``[...]`` span found in ``characters``.

    Returns a dict keyed by the matched text (brackets included); each value
    is ``{"start": ..., "end": ...}`` with the match offsets. When the same
    bracketed text occurs more than once, the offsets of the last occurrence
    are kept.
    """
    bracket_pattern = re.compile(r"\[(.*?)\]")
    return {
        hit.group(): {"start": hit.start(), "end": hit.end()}
        for hit in bracket_pattern.finditer(characters)
    }


def replace_semicolons_in_brackets(characters: str):
    """Turn the ``;`` inside certain bracketed spans into ``/``.

    A bracketed span is rewritten only when it contains exactly one
    semicolon and mentions "Kal-El" or "Zor-El". Spans with no semicolon,
    or with more than one, are returned untouched.
    """
    # Offsets are taken from the input as given; swapping ';' for '/' keeps
    # the string length, so they stay valid after each rewrite.
    spans = [(hit.start(), hit.end()) for hit in re.finditer(r"\[(.*?)\]", characters)]
    for begin, finish in spans:
        bracketed = characters[begin:finish]
        if bracketed.count(";") != 1:
            continue
        if "Kal-El" in bracketed or "Zor-El" in bracketed:
            characters = characters.replace(bracketed, bracketed.replace(";", "/"))
    return characters


def look_behind(t, end_idx):
    """Return the stripped text that sits between the last ``;`` and ``end_idx``.

    Only ``t[:end_idx - 1]`` is inspected. The result starts right after the
    last semicolon in that prefix (or at the beginning when there is none)
    and has surrounding whitespace removed.
    """
    prefix = t[: (end_idx - 1)]
    # rfind gives -1 when no ';' is present, so +1 falls back to offset 0.
    begin = prefix.rfind(";") + 1
    return prefix[begin: end_idx].strip()

def convert_character_dict_to_str(character_dict):
    """Render ``character_dict["Teams"]`` as text with dict punctuation removed.

    The value's ``str()`` form is taken, braces and single quotes are
    deleted, and every comma becomes a semicolon.
    """
    rendered = str(character_dict["Teams"])
    # One pass: ',' -> ';' and the characters '{', '}', "'" are dropped.
    punctuation_table = str.maketrans(",", ";", "{}'")
    return rendered.translate(punctuation_table)

def diff_strings(string1,  string2):
    """Return the characters that ``string2`` adds relative to ``string1``.

    The two strings are compared character by character with
    ``difflib.ndiff``; every character reported as an addition (a ``"+ "``
    delta) is collected, in order, into the returned string. Characters
    that are unchanged or only present in ``string1`` are ignored.

    Requires ``difflib`` to be imported at module level.
    """
    # Each delta looks like "<code> <char>": code at index 0, payload at index 2.
    return "".join(
        delta[2] for delta in difflib.ndiff(string1, string2) if delta[0] == "+"
    )

In [534]:
def convert_characters_to_list(t):
    """Split a character string into a flat list of names.

    Steps, in order:

    1. ``replace_semicolons_in_brackets`` rewrites selected in-bracket
       semicolons so they are not treated as separators.
    2. The outermost ``[...]`` spans are located by tracking nesting depth.
    3. A span containing more than one ``;`` is treated as a team: the text
       returned by ``look_behind`` is the team name and the span, split on
       ``"; "``, gives its members.
    4. Everything in ``t`` that ``diff_strings`` reports as not belonging
       to the rendered team string is split on ``"; "`` and treated as the
       individual entries.

    Parameters
    ----------
    t : str
        Character text; entries are expected to be separated by ``"; "``.

    Returns
    -------
    list
        Team members (team by team, in order of appearance) followed by
        the individual entries.
    """
    t = replace_semicolons_in_brackets(t)

    # Locate outermost bracket spans as (start, end) offsets of their contents.
    depth = 0
    start_index = None
    matches = []
    for i, c in enumerate(t):
        if c == '[':
            if depth == 0:
                start_index = i + 1  # contents begin one past the bracket
            depth += 1
        elif c == ']':
            if depth == 0:
                # Stray closing bracket: ignore it so the depth never goes
                # negative and later, well-formed spans are still detected.
                continue
            depth -= 1
            if depth == 0:
                matches.append((start_index, i))

    character_dict = {"Teams": {}, "Individuals": {}}

    # Only spans with more than one ';' are registered as teams.
    for start, end in matches:
        entity = t[start:end]
        if entity.count(";") > 1:
            team_name = look_behind(t, start)
            team_members = list(filter(lambda x: x != "", entity.split("; ")))
            character_dict["Teams"][team_name] = team_members

    # Whatever the input has beyond the rendered teams is the individuals.
    team_string = convert_character_dict_to_str(character_dict)
    remainder = diff_strings(team_string, t)
    character_dict["Individuals"] = list(filter(lambda x: x != "", remainder.split("; ")))

    character_list = list(character_dict["Teams"].values())
    character_list.append(character_dict["Individuals"])

    return flatten(character_list)

In [535]:
# #  list of all characters
# characters = list(df_covers["cover_characters"].dropna().values)

# test = characters[11_150]

# print(test)
# convert_characters_to_list(test)

In [536]:
# create cover df
# Build the cover table: each entry of df['covers'] becomes a transposed
# frame, and all of them are stacked under a fresh 0..n-1 index.
df_covers = pd.concat(
    [pd.DataFrame(cover).T for cover in df['covers']],
    axis=0,
).reset_index(drop=True)

# Keep only the covers that list characters, re-index them, and attach the
# parsed character list as a new column.
df_cover_characters = (
    df_covers.loc[df_covers["cover_characters"].notna()]
    .copy()
    .reset_index(drop=True)
    .assign(
        cover_characters_list=lambda frame: frame["cover_characters"].apply(convert_characters_to_list)
    )
)

In [537]:
print("Cover Characters:")

# Flatten and count once, then reuse the result for both labels and counts
# (the pipeline was previously evaluated twice).
top_characters = pd.Series(
    flatten(df_cover_characters["cover_characters_list"].values)
).value_counts()[:100]

# (character, count) pairs for the 100 most frequent entries.
list(zip(top_characters.index, top_characters.values))

Cover Characters:


[('Superman', 1341),
 ('Batman [Bruce Wayne]', 1129),
 ('Iron Man [Tony Stark]', 1076),
 ('Spider-Man [Peter Parker]', 939),
 ('Thor', 864),
 ('Batman', 718),
 ('Captain America [Steve Rogers]', 671),
 ('Hulk [Bruce Banner]', 633),
 ('Human Torch [Johnny Storm]', 594),
 ('Superman [Clark Kent/ Kal-El]', 564),
 ('Wonder Woman', 518),
 ('Conan', 516),
 ('Cyclops', 505),
 ('Wolverine', 461),
 ('Superman [Clark Kent]', 348),
 ('Beast', 345),
 ('Robin [Dick Grayson]', 339),
 ('Vision', 317),
 ('Lois Lane', 315),
 ('Superboy', 313),
 ('Storm', 298),
 ('Iceman', 296),
 ('Mr. Fantastic', 279),
 ('Hulk', 249),
 ('Colossus', 249),
 ('Spider-Man', 249),
 ('Wonder Woman [Diana Prince]', 246),
 ('Hercules', 234),
 ('Aquaman', 231),
 ('The Thing [Ben Grimm]', 223),
 ('Punisher', 223),
 ('The Thing', 218),
 ('Mr. Fantastic [Reed Richards]', 217),
 ('Cable', 210),
 ('Thor [Donald Blake]', 207),
 ('Namor, the Sub-Mariner', 200),
 ('Angel', 198),
 ('Nightcrawler', 188),
 ('Joker', 188),
 ('Invisible Gir

In [538]:
print("Cover Artists:")

# Call the helper once and reuse the result for both labels and counts.
# NOTE(review): get_value_counts is not defined in the visible cells; it is
# assumed to return the same result for repeated calls - confirm.
top_cover_pencils = get_value_counts(df_covers, 'cover_pencils')[:100]

# (artist, count) pairs for the first 100 entries.
list(zip(top_cover_pencils.index, top_cover_pencils.values))

Cover Artists:


[('Gil Kane', 504),
 ('Jack Kirby', 484),
 ('Curt Swan', 420),
 ('?', 372),
 ('John Buscema', 363),
 ('John Byrne', 342),
 ('Alex Ross', 333),
 ('John Romita', 248),
 ('Rich Buckler', 238),
 ('Ross Andru', 235),
 ('Nick Cardy', 227),
 ('George Pérez', 224),
 ('Salvador Larroca', 194),
 ('Neal Adams', 191),
 ('Ron Frenz', 180),
 ('Herb Trimpe', 177),
 ('Brian Bolland', 174),
 ('Paul Ryan', 161),
 ('Terry Dodson', 136),
 ('Andy Kubert', 135),
 ('Dan Jurgens', 134),
 ('Win Mortimer', 123),
 ('Jim Aparo', 123),
 ('Ryan Ottley', 123),
 ('Greg Capullo', 122),
 ('Mike McKone', 119),
 ('Al Milgrom', 117),
 ('Tom Grummett', 117),
 ('Walt Simonson', 115),
 ('Wayne Boring', 113),
 ('Humberto Ramos', 108),
 ('Dave Cockrum', 108),
 ('Sheldon Mayer', 107),
 ('Ed McGuinness', 105),
 ('Bob Layton', 103),
 ('David Finch', 98),
 ('Paul Pelletier', 97),
 ('Amanda Conner', 97),
 ('Keith Pollard', 97),
 ('Adam Kubert', 96),
 ('Jim Lee', 95),
 ('Leinil Francis Yu', 95),
 ('Steve Ditko', 94),
 ('Gary Frank',

In [None]:
def create_training_dirs():
    """
    Given some args, create two compressed files: images.tar.gz & annos.tar.gz
    """