In [1]:
import pandas as pd
import numpy as np

from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import json

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.metrics.pairwise import linear_kernel

# LOADING CSV FILES

In [2]:
# Load the cleaned issue dataset; the path is relative to the working directory.
df = pd.read_csv("data/final/final_dataset_clean.csv")

In [3]:
# Print the column names, non-null counts and dtypes to spot missing data.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6767 entries, 0 to 6766
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   character_credits     6767 non-null   object
 1   character_died_in     6767 non-null   object
 2   concept_credits       6767 non-null   object
 3   cover_date            6767 non-null   object
 4   description           6669 non-null   object
 5   has_staff_review      6767 non-null   object
 6   id                    6767 non-null   int64 
 7   issue_number          6767 non-null   object
 8   location_credits      6767 non-null   object
 9   name                  6767 non-null   object
 10  object_credits        6767 non-null   object
 11  person_credits        6767 non-null   object
 12  story_arc_credits     6767 non-null   object
 13  team_credits          6767 non-null   object
 14  team_disbanded_in     6767 non-null   object
 15  volume                6767 non-null   

In [4]:
# Count the rows whose description is null and report the total.
missing_values = df['description'].isna().sum()
print(f"Missing Values: {missing_values}")

Missing Values: 98


**final_fillna** is the last preprocessing step: the vectorizers cannot be fitted on null values, so every text column has to be filled with an empty string first. The `description` column lost some values between being exported and re-imported (the 98 nulls counted above).

In [5]:
def final_fillna(df):
    """Replace null values with empty strings in the text columns of ``df``.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the 'description' and 'combined_description' columns.
        Any of the credit/volume columns listed below that are present are
        filled as well; absent ones are skipped.

    Raises
    ------
    KeyError
        If 'description' or 'combined_description' is missing from ``df``.
    """
    df['description'] = df['description'].fillna('')
    df['combined_description'] = df['combined_description'].fillna('')
    lst = ['character_credits', 'character_died_in', 'concept_credits',
           'location_credits', 'object_credits', 'person_credits', 'story_arc_credits',
           'team_credits', 'team_disbanded_in', 'volume']
    # Fill the credit columns too, so no null can reach a vectorizer.
    for column in lst:
        if column in df.columns:
            df[column] = df[column].fillna('')
# Apply the null-filling step; the function assigns into `df` directly and returns nothing.
final_fillna(df)

# TF-IDF Vectorizing
I vectorized both the combined_description and each of the individual features in preparation for the model. 

In [6]:
# One TF-IDF model over the pre-merged `combined_description` text,
# with English stop words removed.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_description'])

**Features within combined_description:** 'character_credits', 'concept_credits', 'location_credits', 'object_credits', 'person_credits', 'story_arc_credits', 'team_credits', 'volume'

In [7]:
# Separate TF-IDF model for the `description` column alone,
# so it can be weighted independently of the other features.
tfidf_des = TfidfVectorizer(stop_words='english')
tfidf_des_matrix = tfidf_des.fit_transform(df['description'])

In [8]:
# Separate TF-IDF model for the `character_credits` column.
tfidf_char = TfidfVectorizer(stop_words='english')
tfidf_char_matrix = tfidf_char.fit_transform(df['character_credits'])

In [9]:
# Separate TF-IDF model for the `concept_credits` column.
tfidf_concept = TfidfVectorizer(stop_words='english')
tfidf_concept_matrix = tfidf_concept.fit_transform(df['concept_credits'])

In [10]:
# Separate TF-IDF model for the `location_credits` column.
tfidf_location = TfidfVectorizer(stop_words='english')
tfidf_location_matrix = tfidf_location.fit_transform(df['location_credits'])

In [11]:
# Separate TF-IDF model for the `object_credits` column.
tfidf_object = TfidfVectorizer(stop_words='english')
tfidf_object_matrix = tfidf_object.fit_transform(df['object_credits'])

In [12]:
# Separate TF-IDF model for the `person_credits` column.
tfidf_person = TfidfVectorizer(stop_words='english')
tfidf_person_matrix = tfidf_person.fit_transform(df['person_credits'])

In [13]:
# Separate TF-IDF model for the `story_arc_credits` column.
tfidf_arc = TfidfVectorizer(stop_words='english')
tfidf_arc_matrix = tfidf_arc.fit_transform(df['story_arc_credits'])

In [14]:
# Separate TF-IDF model for the `team_credits` column.
tfidf_team = TfidfVectorizer(stop_words='english')
tfidf_team_matrix = tfidf_team.fit_transform(df['team_credits'])

In [15]:
# Separate TF-IDF model for the `volume` column.
tfidf_vol = TfidfVectorizer(stop_words='english')
tfidf_vol_matrix = tfidf_vol.fit_transform(df['volume'])

# Combining Matrices and Weighting Them

In [16]:
# The fitted per-feature vectorizers, in the same order as `all_matrices`.
# NOTE(review): not referenced again in the cells visible here — confirm it is still needed.
tfidf_vectorizers = [tfidf_des, tfidf_char, tfidf_concept, 
                    tfidf_location, tfidf_object, tfidf_person,
                    tfidf_arc, tfidf_team, tfidf_vol]

In [17]:
# Per-feature TF-IDF matrices; this order must line up with `weights`,
# which is zipped against this list position by position.
all_matrices = [tfidf_des_matrix, tfidf_char_matrix, tfidf_concept_matrix, 
               tfidf_location_matrix, tfidf_object_matrix, tfidf_person_matrix,
               tfidf_arc_matrix, tfidf_team_matrix, tfidf_vol_matrix]

In [18]:
# Multiplier for each matrix in `all_matrices`, matched by position.
# A weight of 0 zeroes out that feature's columns entirely.
weights = [3, 2, 2,   # description, character_credits, concept_credits
           1, 0, 1,   # location_credits, object_credits, person_credits
           0, 1, 1]   # story_arc_credits, team_credits, volume

In [19]:
# Multiply every per-feature TF-IDF matrix by the weight at the same position.
weighted_tfidf_matrices = [
    feature_matrix * feature_weight
    for feature_matrix, feature_weight in zip(all_matrices, weights)
]

In [20]:
# Concatenate the weighted matrices column-wise: one row per issue,
# one column block per feature.
combined_matrix = hstack(weighted_tfidf_matrices)

# COSINE_SIM & RECOMMENDATION FUNCTION

In [21]:
# Pairwise dot products between every pair of issues on the combined_description model.
# NOTE(review): this equals cosine similarity only if the TF-IDF rows are
# unit-length (the vectorizer's default normalization) — confirm.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [22]:
# Pairwise dot products between every pair of issues on the weighted, stacked features.
# NOTE(review): after weighting and hstack-ing, the rows of combined_matrix are
# presumably no longer unit-length, so these scores are weighted dot products
# rather than true cosine similarities, and each weight contributes as its
# square — confirm this is the intended scoring.
cosine_sim_combined = linear_kernel(combined_matrix, combined_matrix)

In [23]:
def get_recommendations(df, title, cosine_sim=cosine_sim, top_n=20):
    """Return the issues most similar to the issue named ``title``.

    Parameters
    ----------
    df : pandas.DataFrame
        Issue table. Must contain 'name' plus the columns returned below.
        Its row order must match the row/column order of ``cosine_sim``.
    title : str
        Exact value to look up in the 'name' column. If several rows share
        the name, the first one is used.
    cosine_sim : 2-D array-like
        Pairwise similarity matrix; row ``i`` holds the scores of the i-th
        row of ``df`` against every row. The default is bound when this
        function is defined.
    top_n : int, default 20
        Number of rows to return. The queried issue itself is part of the
        ranking (it normally comes first), so it counts toward ``top_n``.

    Returns
    -------
    pandas.DataFrame
        The ``top_n`` highest-scoring rows, best match first, restricted to
        the name, issue number, description, credit and volume columns.

    Raises
    ------
    IndexError
        If no row of ``df`` has ``name == title``.
    """
    # Locate the issue by row position, not index label: the position is what
    # both ``cosine_sim`` and ``.iloc`` expect, whatever index ``df`` carries.
    matches = (df['name'] == title).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"No issue named {title!r} found in df['name']")
    idx = matches[0]
    # Pair every row position with its similarity to the queried issue.
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Highest similarity first; the sort is stable, so ties keep row order.
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Keep the row positions of the top_n best matches.
    issue_indices = [i[0] for i in sim_scores[:top_n]]
    return df[['name', 'issue_number', 'description', 'character_credits','concept_credits', 'story_arc_credits','volume']].iloc[issue_indices]

In [24]:
# Run the same query against both similarity matrices to compare the rankings:
# `rec` uses the combined_description model, `rec_combined` the weighted features.
rec = get_recommendations(df, "Robins vs. Zombies: Robin War", cosine_sim)
rec_combined = get_recommendations(df, "Robins vs. Zombies: Robin War", cosine_sim_combined)

In [25]:
# Display the recommendations from the combined_description model.
# NOTE(review): get_recommendations keeps 20 rows, but the saved output shows 10 —
# the output may be stale or truncated; re-run to confirm.
rec

Unnamed: 0,name,issue_number,description,character_credits,concept_credits,story_arc_credits,volume
2373,Robins vs. Zombies: Robin War,13.0,A “Robin War” tie-in! With Robins fighting cop...,"['Colton_Rivera', 'Damian_Wayne', 'Efrem', 'He...",[],['Robin_War'],['Gotham_Academy']
2378,Yearbook Part Five; Whatever Happened to Profe...,18.0,As the “Gotham Academy Yearbook” storyline com...,"['Coach_Humphreys', 'Colton_Rivera', 'Damian_W...",[],[],['Gotham_Academy']
2366,Save The Last Dance,5.0,This month’s assignment: Uncover the hideous s...,"['Batman', 'Calamity', 'Colton_Rivera', 'Headm...",[],[],['Gotham_Academy']
2370,Calamity,9.0,If the gang thought it was hard to keep up wit...,"['Calamity', 'Clayface_(Karlo)', 'Coach_Humphr...",[],[],['Gotham_Academy']
2365,The Secret of the Symbol,4.0,The hunt for the Ghost of Gotham Academy begins!,"['Batman', 'Calamity', 'Coach_Humphreys', 'Gra...",['Batman_Villains'],[],['Gotham_Academy']
2364,The Ghost in the North Hall,3.0,"If you thought getting detention was a pain, j...","['Aunt_Harriet', 'Batman', 'Colton_Rivera', 'H...",[],[],['Gotham_Academy']
2367,Pizza Club,6.0,"Holy cow, it’s Taco Tuesday! Oh…and Olive batt...","['Amadeus_Arkham', 'Batman', 'Calamity', 'Colt...",[],[],['Gotham_Academy']
2368,Curse of the Inishtree Quill,7.0,Special guest student Damian Wayne drops by th...,"['Batman', 'Bookworm', 'Colton_Rivera', 'Damia...",['Joker_75th_Anniversary_Variant'],[],['Gotham_Academy']
2374,Yearbook Part One; Animal Science 101; Queen G...,14.0,An all-new era of GOTHAM ACADEMY begins here w...,"['Clayface_(Karlo)', 'Colton_Rivera', 'Dillyn'...",[],[],['Gotham_Academy']
2375,Yearbook Part Two; Staff Party; Serpents & Sec...,15.0,It’s part two of “Gotham Academy Yearbook”! Th...,"['Batman', 'Bookworm', 'Clayface_(Karlo)', 'Co...",['Anthology'],[],['Gotham_Academy']


In [26]:
# Display the recommendations from the weighted per-feature model.
# NOTE(review): get_recommendations keeps 20 rows, but the saved output shows 10 —
# the output may be stale or truncated; re-run to confirm.
rec_combined

Unnamed: 0,name,issue_number,description,character_credits,concept_credits,story_arc_credits,volume
2373,Robins vs. Zombies: Robin War,13.0,A “Robin War” tie-in! With Robins fighting cop...,"['Colton_Rivera', 'Damian_Wayne', 'Efrem', 'He...",[],['Robin_War'],['Gotham_Academy']
2374,Yearbook Part One; Animal Science 101; Queen G...,14.0,An all-new era of GOTHAM ACADEMY begins here w...,"['Clayface_(Karlo)', 'Colton_Rivera', 'Dillyn'...",[],[],['Gotham_Academy']
2378,Yearbook Part Five; Whatever Happened to Profe...,18.0,As the “Gotham Academy Yearbook” storyline com...,"['Coach_Humphreys', 'Colton_Rivera', 'Damian_W...",[],[],['Gotham_Academy']
2368,Curse of the Inishtree Quill,7.0,Special guest student Damian Wayne drops by th...,"['Batman', 'Bookworm', 'Colton_Rivera', 'Damia...",['Joker_75th_Anniversary_Variant'],[],['Gotham_Academy']
2362,Welcome to Gotham Academy,1.0,WELCOME TO GOTHAM ACADEMY! Gotham City’s most ...,"['Aunt_Harriet', 'Batman', 'Calamity', 'Colton...","['Batman_Villains', 'The_New_52']",[],['Gotham_Academy']
2376,Yearbook Part Three; Maps' Day Out; Boring Sun...,16.0,It’s “Yearbook” part 3! As Olive and the gang ...,"['Barbara_Gordon', 'Batman', 'Colton_Rivera', ...",['Robin'],[],['Gotham_Academy']
2365,The Secret of the Symbol,4.0,The hunt for the Ghost of Gotham Academy begins!,"['Batman', 'Calamity', 'Coach_Humphreys', 'Gra...",['Batman_Villains'],[],['Gotham_Academy']
2370,Calamity,9.0,If the gang thought it was hard to keep up wit...,"['Calamity', 'Clayface_(Karlo)', 'Coach_Humphr...",[],[],['Gotham_Academy']
2375,Yearbook Part Two; Staff Party; Serpents & Sec...,15.0,It’s part two of “Gotham Academy Yearbook”! Th...,"['Batman', 'Bookworm', 'Clayface_(Karlo)', 'Co...",['Anthology'],[],['Gotham_Academy']
2366,Save The Last Dance,5.0,This month’s assignment: Uncover the hideous s...,"['Batman', 'Calamity', 'Colton_Rivera', 'Headm...",[],[],['Gotham_Academy']
