In [9]:
import ast
import json

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [10]:
# Load the raw issue table; the path is relative to the notebook's working directory.
gotham_academy = pd.read_csv("data/gotham_academy.csv")

In [11]:
# Inspect column names, non-null counts and dtypes before any cleaning.
gotham_academy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18 entries, 0 to 17
Data columns (total 32 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Unnamed: 0                   18 non-null     int64  
 1   aliases                      0 non-null      float64
 2   api_detail_url               18 non-null     object 
 3   associated_images            18 non-null     object 
 4   character_credits            18 non-null     object 
 5   character_died_in            18 non-null     object 
 6   concept_credits              18 non-null     object 
 7   cover_date                   18 non-null     object 
 8   date_added                   18 non-null     object 
 9   date_last_updated            18 non-null     object 
 10  deck                         0 non-null      float64
 11  description                  18 non-null     object 
 12  first_appearance_characters  0 non-null      float64
 13  first_appearance_conce

In [12]:
def clean_data(input_dataframe):
    """Return a cleaned copy of the raw issue table; the input is not modified.

    The returned frame has:

    * the saved index column and the unused ``aliases`` / ``deck`` /
      ``first_appearance_*`` / ``store_date`` columns removed,
    * rows without a ``name`` discarded (surviving rows keep their original
      index labels, so the index may contain gaps),
    * ``cover_date``, ``date_added`` and ``date_last_updated`` parsed to
      datetimes, with unparseable values coerced to ``NaT``,
    * missing ``description`` values replaced by an empty string.
    """
    unused_columns = [
        "Unnamed: 0",
        "aliases",
        "deck",
        "first_appearance_characters",
        "first_appearance_concepts",
        "first_appearance_locations",
        "first_appearance_objects",
        "first_appearance_storyarcs",
        "first_appearance_teams",
        "store_date",
    ]
    # Every step returns a new frame, so the caller's frame is never touched.
    return (
        input_dataframe
        .drop(columns=unused_columns)
        .dropna(subset=['name'])
        .assign(
            cover_date=lambda frame: pd.to_datetime(frame['cover_date'], errors='coerce'),
            date_added=lambda frame: pd.to_datetime(frame['date_added'], errors='coerce'),
            date_last_updated=lambda frame: pd.to_datetime(frame['date_last_updated'], errors='coerce'),
            description=lambda frame: frame['description'].fillna(''),
        )
    )

In [13]:
def clean_description(html_text):
    """Strip HTML markup from a description and return its plain text.

    The markup is parsed with BeautifulSoup's ``html.parser`` backend and the
    text nodes are joined with a newline between them.
    """
    return BeautifulSoup(html_text, 'html.parser').get_text(separator='\n')

def _parse_records(text):
    """Deserialize one stringified credit cell into Python objects.

    The CSV stores these cells as the ``repr`` of Python lists/dicts, which is
    not JSON: it uses single quotes, switches to double quotes when a value
    contains an apostrophe, and spells null/booleans as None/True/False.
    Blindly swapping quote characters corrupts such cells, so real parsers are
    tried first and the quote swap is kept only as a last resort.
    """
    for parse in (json.loads, ast.literal_eval):
        try:
            return parse(text)
        except (ValueError, SyntaxError, TypeError):
            # Not in this format; try the next parser.
            pass
    # Legacy behaviour: single-quoted text that is otherwise JSON (e.g. it
    # contains ``null``). Raises json.JSONDecodeError if it still cannot be read.
    return json.loads(text.replace("'", '"'))


def _extract_names(raw):
    """Return the ``'name'`` of every record held in one credit cell.

    Args:
        raw: The cell value. Normally a string holding a serialized list of
            dicts; a serialized single dict, an already-parsed list/dict, and
            missing values (``None`` / ``NaN``) are accepted as well.

    Returns:
        A list with one name per record; empty for a missing cell or an
        empty list.

    Raises:
        json.JSONDecodeError: If a string cell cannot be parsed at all.
        KeyError: If a record has no ``'name'`` key.
    """
    # NaN is the only float that is not equal to itself.
    if raw is None or (isinstance(raw, float) and raw != raw):
        return []
    records = _parse_records(raw) if isinstance(raw, str) else raw
    # A lone record is treated as a one-element list.
    if isinstance(records, dict):
        records = [records]
    return [record['name'] for record in records]


def clean_character_credits(json_str):
    """Return the names listed in a serialized ``character_credits`` cell."""
    return _extract_names(json_str)


def clean_character_died_in(json_str):
    """Return the names listed in a serialized ``character_died_in`` cell."""
    return _extract_names(json_str)


def clean_concept_credits(json_str):
    """Return the names listed in a serialized ``concept_credits`` cell."""
    return _extract_names(json_str)


def clean_location_credits(json_str):
    """Return the names listed in a serialized ``location_credits`` cell."""
    return _extract_names(json_str)


def clean_object_credits(json_str):
    """Return the names listed in a serialized ``object_credits`` cell."""
    return _extract_names(json_str)


def clean_person_credits(json_str):
    """Return the names listed in a serialized ``person_credits`` cell."""
    return _extract_names(json_str)


def clean_story_arc_credits(json_str):
    """Return the names listed in a serialized ``story_arc_credits`` cell."""
    return _extract_names(json_str)


def clean_team_credits(json_str):
    """Return the names listed in a serialized ``team_credits`` cell."""
    return _extract_names(json_str)


def clean_team_disbanded_in(json_str):
    """Return the names listed in a serialized ``team_disbanded_in`` cell."""
    return _extract_names(json_str)


def clean_volume(json_str):
    """Return the volume name(s) from a serialized ``volume`` cell, as a list.

    A cell holding a single dict yields a one-element list.
    """
    return _extract_names(json_str)
    
    



In [14]:
def clean_dataframe(df):
    """Parse the description and credit columns of ``df`` in place.

    Each listed column is replaced by the result of applying its cleaner to
    every cell. ``df`` itself is mutated and nothing is returned.
    """
    # (column, cleaner) pairs, applied in this order.
    column_cleaners = (
        ("description", clean_description),
        ("character_credits", clean_character_credits),
        ("character_died_in", clean_character_died_in),
        ("concept_credits", clean_concept_credits),
        ("location_credits", clean_location_credits),
        ("object_credits", clean_object_credits),
        ("person_credits", clean_person_credits),
        ("story_arc_credits", clean_story_arc_credits),
        ("team_credits", clean_team_credits),
        ("team_disbanded_in", clean_team_disbanded_in),
    )
    for column, cleaner in column_cleaners:
        df[column] = df[column].apply(cleaner)
    # "volume" is intentionally left unparsed; its cleaner call stays disabled:
    #df["volume"] = df["volume"].apply(clean_volume)

In [15]:
# NOTE: this rebinds `gotham_academy` to the cleaned copy, so the raw frame is
# only recoverable by re-reading the CSV. Re-running this cell alone raises
# KeyError, because the columns clean_data drops are already gone.
gotham_academy = clean_data(gotham_academy)

In [16]:
# Parse the HTML description and the credit columns in place; clean_dataframe
# returns None, so this cell produces no output.
clean_dataframe(gotham_academy)

In [17]:
# Show the issue titles. Index label 11 is absent because clean_data drops rows
# with a missing name without resetting the index.
gotham_academy["name"]

0                             Welcome to Gotham Academy
1                    The Diary of Millie Jane Cobblepot
2                           The Ghost in the North Hall
3                              The Secret of the Symbol
4                                   Save The Last Dance
5                                            Pizza Club
6                          Curse of the Inishtree Quill
7                                               Requiem
8                                              Calamity
9                                       The Cursed Play
10                                      Mission: Gotham
12                        Robins vs. Zombies: Robin War
13    Yearbook Part One; Animal Science 101; Queen G...
14    Yearbook Part Two; Staff Party; Serpents & Sec...
15    Yearbook Part Three; Maps' Day Out; Boring Sun...
16    Yearbook Part Four; This One's For You; A Fami...
17    Yearbook Part Five; Whatever Happened to Profe...
Name: name, dtype: object

In [18]:
# TF-IDF vectorizer that removes scikit-learn's built-in English stop words.
tfidf = TfidfVectorizer(stop_words='english')

In [19]:
# Learn the vocabulary from the cleaned descriptions and build the document-term
# matrix: one row per issue, in the DataFrame's row order.
tfidf_matrix = tfidf.fit_transform(gotham_academy['description'])

In [20]:
# TfidfVectorizer L2-normalizes each row by default, so the plain dot product
# computed by linear_kernel equals the cosine similarity between issues.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [21]:
def get_recommendations(df, title, cosine_sim=cosine_sim, top_n=10):
    """Return the names of the issues most similar to the issue named ``title``.

    Args:
        df: Issue table with a ``name`` column. Its rows must be in the same
            order as the rows/columns of ``cosine_sim``.
        title: Exact issue name to look up. If several rows share the name,
            the first one is used.
        cosine_sim: Square pairwise similarity matrix, indexed by row
            *position* in ``df``.
        top_n: Maximum number of recommendations to return.

    Returns:
        A Series of issue names (keeping ``df``'s index labels) ordered from
        most to least similar. The queried issue itself is never included.

    Raises:
        IndexError: If no row of ``df`` has ``name == title``.
    """
    # cosine_sim is positional, while df's index labels can have gaps (rows
    # dropped during cleaning), so look up the row position, not the label.
    matches = np.flatnonzero((df['name'] == title).values)
    if matches.size == 0:
        raise IndexError("no issue named {!r} in df['name']".format(title))
    query_pos = int(matches[0])

    scores = cosine_sim[query_pos]
    # Exclude the query explicitly instead of assuming it sorts first: an empty
    # description scores 0 against itself, and a duplicate description ties
    # with it. sorted() is stable, so ties keep their original row order.
    candidates = [pos for pos in range(len(scores)) if pos != query_pos]
    ranked = sorted(candidates, key=lambda pos: scores[pos], reverse=True)
    return df['name'].iloc[ranked[:top_n]]

In [22]:
# Example query: the issues whose descriptions are most similar to "Pizza Club".
recommendations = get_recommendations(gotham_academy, "Pizza Club")
print(recommendations)

15    Yearbook Part Three; Maps' Day Out; Boring Sun...
10                                      Mission: Gotham
0                             Welcome to Gotham Academy
1                    The Diary of Millie Jane Cobblepot
17    Yearbook Part Five; Whatever Happened to Profe...
9                                       The Cursed Play
12                        Robins vs. Zombies: Robin War
16    Yearbook Part Four; This One's For You; A Fami...
13    Yearbook Part One; Animal Science 101; Queen G...
2                           The Ghost in the North Hall
Name: name, dtype: object


In [23]:
# Preview a few cleaned rows; the fixed seed keeps the displayed sample the same
# on every Restart & Run All.
gotham_academy.sample(5, random_state=42)

Unnamed: 0,api_detail_url,associated_images,character_credits,character_died_in,concept_credits,cover_date,date_added,date_last_updated,description,has_staff_review,...,issue_number,location_credits,name,object_credits,person_credits,site_detail_url,story_arc_credits,team_credits,team_disbanded_in,volume
17,https://comicvine.gamespot.com/api/issue/4000-...,[],"[Coach Humphreys, Colton Rivera, Damian Wayne,...",[],[],2016-07-31,2016-05-10 05:33:34,2016-05-22 01:07:38,As the “Gotham Academy Yearbook” storyline com...,False,...,18,[],Yearbook Part Five; Whatever Happened to Profe...,[],"[Adam Archer, Annie Wu, Brenden Fletcher, Fait...",https://comicvine.gamespot.com/gotham-academy-...,[],[Gotham Academy Student Body],[],{'api_detail_url': 'https://comicvine.gamespot...
3,https://comicvine.gamespot.com/api/issue/4000-...,[{'original_url': 'https://comicvine.gamespot....,"[Batman, Calamity, Coach Humphreys, Gray Ghost...",[],[Batman Villains],2015-03-25,2015-01-26 20:56:37,2015-09-09 19:42:42,The hunt for the Ghost of Gotham Academy begins!,False,...,4,"[Gotham Academy, Gotham City]",The Secret of the Symbol,[],"[Becky Cloonan, Brenden Fletcher, Dave McCaig,...",https://comicvine.gamespot.com/gotham-academy-...,[],[Gotham Academy Student Body],[],{'api_detail_url': 'https://comicvine.gamespot...
8,https://comicvine.gamespot.com/api/issue/4000-...,[],"[Calamity, Clayface (Karlo), Coach Humphreys, ...",[],[],2015-10-31,2015-08-11 00:54:48,2019-06-21 04:57:55,If the gang thought it was hard to keep up wit...,False,...,9,[Gotham Academy],Calamity,[],"[Becky Cloonan, Brenden Fletcher, Karl Kerschl...",https://comicvine.gamespot.com/gotham-academy-...,[],[Gotham Academy Student Body],[],{'api_detail_url': 'https://comicvine.gamespot...
0,https://comicvine.gamespot.com/api/issue/4000-...,[{'original_url': 'https://comicvine.gamespot....,"[Aunt Harriet, Batman, Calamity, Colton Rivera...",[],"[Batman Villains, The New 52]",2014-12-31,2014-09-29 20:58:22,2015-03-25 22:10:19,WELCOME TO GOTHAM ACADEMY! Gotham City’s most ...,{'api_detail_url': 'https://comicvine.gamespot...,...,1,"[Gotham Academy, Gotham City]",Welcome to Gotham Academy,"[Bat-Signal, Lipstick]","[Becky Cloonan, Brenden Fletcher, Dave McCaig,...",https://comicvine.gamespot.com/gotham-academy-...,[],[Gotham Academy Student Body],[],{'api_detail_url': 'https://comicvine.gamespot...
13,https://comicvine.gamespot.com/api/issue/4000-...,[],"[Clayface (Karlo), Colton Rivera, Dillyn, Head...",[],[],2016-03-31,2016-01-12 03:52:49,2016-01-17 22:21:06,An all-new era of GOTHAM ACADEMY begins here w...,False,...,14,[Gotham Academy],Yearbook Part One; Animal Science 101; Queen G...,[],"[Adam Archer, Brenden Fletcher, Derek Fridolfs...",https://comicvine.gamespot.com/gotham-academy-...,[],[Gotham Academy Student Body],[],{'api_detail_url': 'https://comicvine.gamespot...
