In [2]:
import pandas as pd
import numpy as np
import json

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [3]:
gotham_academy = pd.read_csv("data/gotham_academy.csv")

In [4]:
gotham_academy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   18 non-null     int64  
 1   aliases                      0 non-null      float64
 2   api_detail_url               18 non-null     object 
 3   associated_images            18 non-null     object 
 4   character_credits            18 non-null     object 
 5   character_died_in            18 non-null     object 
 6   concept_credits              18 non-null     object 
 7   cover_date                   18 non-null     object 
 8   date_added                   18 non-null     object 
 9   date_last_updated            18 non-null     object 
 10  deck                         0 non-null      float64
 11  description                  18 non-null     object 
 12  first_appearance_characters  0 non-null      float64
 13  first_appearance_conce

In [5]:
def clean_data(input_dataframe):
    df = input_dataframe.copy()
    df.drop(columns=["Unnamed: 0", "aliases", "deck", "first_appearance_characters", "first_appearance_concepts", 
                    "first_appearance_locations", "first_appearance_objects", "first_appearance_storyarcs",
                    "first_appearance_teams", "store_date"], inplace=True)
    df.dropna(subset=['name'], inplace=True)
    df['cover_date'] = pd.to_datetime(df['cover_date'], errors='coerce')
    df['date_added'] = pd.to_datetime(df['date_added'], errors='coerce')
    df['date_last_updated'] = pd.to_datetime(df['date_last_updated'], errors='coerce')
    df['description'] = df['description'].fillna('')
    return df

In [6]:
def clean_description(html_text):
    soup = BeautifulSoup(html_text, 'html.parser')
    cleaned_text = soup.get_text(separator='\n')
    return cleaned_text

def clean_character_credits(json_str):
    json_str = json_str.replace("'", '"')
    characters = json.loads(json_str)
    names = [character['name'] for character in characters]
    return names

def clean_character_died_in(json_str):
    json_str = json_str.replace("'",'"')
    characters = json.loads(json_str)
    names = [character['name'] for character in characters]
    return names

def clean_concept_credits(json_str):
    json_str = json_str.replace("'", '"')
    concepts = json.loads(json_str)
    names = [concept['name'] for concept in concepts]
    return names

def clean_location_credits(json_str):
    json_str = json_str.replace("'", '"')
    locations = json.loads(json_str)
    names = [location['name'] for location in locations]
    return names
    
def clean_object_credits(json_str):
    json_str = json_str.replace("'", '"')
    objects = json.loads(json_str)
    names = [object['name'] for object in objects]
    return names

def clean_person_credits(json_str):
    json_str = json_str.replace("'", '"')
    persons = json.loads(json_str)
    names = [person['name'] for person in persons]
    return names

def clean_story_arc_credits(json_str):
    json_str = json_str.replace("'", '"')
    story_arcs = json.loads(json_str)
    names = [story_arc['name'] for story_arc in story_arcs]
    return names

def clean_team_credits(json_str):
    json_str = json_str.replace("'",'"')
    team_credits = json.loads(json_str)
    names = [team_credit['name'] for team_credit in team_credits]
    return names

def clean_team_disbanded_in(json_str):
    json_str = json_str.replace("'",'"')
    lst = json.loads(json_str)
    names = [item['name'] for item in lst]
    return names
    
def clean_volume(json_str):
    json_str = json_str.replace("'",'"')
    lst = json.loads(json_str)
    names = [item["name"] for item in lst]
    return names
    
    



In [7]:
def clean_dataframe(df):
    df["description"] = df["description"].apply(clean_description)
    df["character_credits"] = df["character_credits"].apply(clean_character_credits)
    df["character_died_in"] = df["character_died_in"].apply(clean_character_died_in)
    df["concept_credits"] = df["concept_credits"].apply(clean_concept_credits)
    df["location_credits"] = df["location_credits"].apply(clean_location_credits)
    df["object_credits"] = df["object_credits"].apply(clean_object_credits)
    df["person_credits"] = df["person_credits"].apply(clean_person_credits)
    df["story_arc_credits"] = df["story_arc_credits"].apply(clean_story_arc_credits)
    df["team_credits"] = df["team_credits"].apply(clean_team_credits)
    df["team_disbanded_in"] = df["team_disbanded_in"].apply(clean_team_disbanded_in)
    df["volume"] = df["volume"].apply(clean_volume)

In [8]:
gotham_academy = clean_data(gotham_academy)

In [9]:
clean_dataframe(gotham_academy)

TypeError: string indices must be integers, not 'str'

In [10]:
gotham_academy["name"]

0                             Welcome to Gotham Academy
1                    The Diary of Millie Jane Cobblepot
2                           The Ghost in the North Hall
3                              The Secret of the Symbol
4                                   Save The Last Dance
5                                            Pizza Club
6                          Curse of the Inishtree Quill
7                                               Requiem
8                                              Calamity
9                                       The Cursed Play
10                                      Mission: Gotham
12                        Robins vs. Zombies: Robin War
13    Yearbook Part One; Animal Science 101; Queen G...
14    Yearbook Part Two; Staff Party; Serpents & Sec...
15    Yearbook Part Three; Maps' Day Out; Boring Sun...
16    Yearbook Part Four; This One's For You; A Fami...
17    Yearbook Part Five; Whatever Happened to Profe...
Name: name, dtype: object

In [11]:
tfidf = TfidfVectorizer(stop_words='english')

In [12]:
tfidf_matrix = tfidf.fit_transform(gotham_academy['description'])

In [13]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [15]:
def get_recommendations(df, title, cosine_sim=cosine_sim):
    # Get the index of the issue that matches the title
    idx = df[df['name'] == title].index[0]
    # Get the pairwise similarity scores of all issues with that issue
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort the issues based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores of the 10 most similar issues
    sim_scores = sim_scores[1:11]
    # Get the issue indices
    issue_indices = [i[0] for i in sim_scores]
    # Return the top 10 most similar issues
    return df['name'].iloc[issue_indices]



In [16]:
recommendations = get_recommendations(gotham_academy, "Pizza Club")
print(recommendations)

15    Yearbook Part Three; Maps' Day Out; Boring Sun...
10                                      Mission: Gotham
0                             Welcome to Gotham Academy
1                    The Diary of Millie Jane Cobblepot
17    Yearbook Part Five; Whatever Happened to Profe...
9                                       The Cursed Play
12                        Robins vs. Zombies: Robin War
16    Yearbook Part Four; This One's For You; A Fami...
13    Yearbook Part One; Animal Science 101; Queen G...
2                           The Ghost in the North Hall
Name: name, dtype: object


In [19]:
batman1 = pd.read_csv("data/batman_vol1.csv")

In [20]:
batman1 = clean_data(batman1)
clean_dataframe(batman1)

JSONDecodeError: Expecting ',' delimiter: line 1 column 676 (char 675)

In [43]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix2 = tfidf.fit_transform(batman1['description'])
cosine_sim2 = linear_kernel(tfidf_matrix2, tfidf_matrix2)

In [37]:
cosine_sim[500]

array([0.        , 0.01750024, 0.01911008, 0.        , 0.0217669 ,
       0.        , 0.02376147, 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.02247309, 0.01885381, 0.        , 0.        , 0.        ,
       0.02168544, 0.02067199, 0.02108084, 0.02272511, 0.        ,
       0.0188669 , 0.01718545, 0.03840262, 0.        , 0.        ,
       0.        , 0.        , 0.02818209, 0.02132388, 0.01693196,
       0.        , 0.05278448, 0.06691953, 0.        , 0.        ,
       0.04752175, 0.        , 0.        , 0.        , 0.        ,
       0.097014  , 0.01559735, 0.        , 0.        , 0.02328583,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.11397234, 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.        , 0.        , 0.     

In [44]:
recommendations = get_recommendations(batman1, "Batman in Bethlehem", cosine_sim=cosine_sim2)
print(recommendations)

624                    Batman & Son, Part 3: Wonderboys
550                Batman & Son, Part 4: Absent Fathers
688      A Battle Within (Battle For The Cowl Epilogue)
561    Batman & Son Part 1: Building a Better Batmobile
150                    "Bruce Wayne - - Rest In Peace"!
1                       The Gorilla Boss of Gotham City
441     Darkest Night of the Man-Bat, Part 1: Predation
233                                     Double Jeopardy
608                               Batman In The Future!
704                                    The Great Escape
Name: name, dtype: object


In [39]:
batman1["name"][555]

'Batman in Bethlehem'