In [90]:
import pandas as pd
import numpy as np

from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
import json

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import hstack
from sklearn.metrics.pairwise import linear_kernel

# Loading CSV Files and Preprocessing

In [91]:
# Load the cleaned issue dataset. The path is relative, so the notebook has to
# be run from a working directory that contains the `data/final/` folder.
df = pd.read_csv("data/final/final_dataset_clean.csv")

In [92]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6767 entries, 0 to 6766
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   character_credits     6767 non-null   object
 1   character_died_in     6767 non-null   object
 2   concept_credits       6767 non-null   object
 3   cover_date            6767 non-null   object
 4   description           6669 non-null   object
 5   has_staff_review      6767 non-null   object
 6   id                    6767 non-null   int64 
 7   issue_number          6767 non-null   object
 8   location_credits      6767 non-null   object
 9   name                  6767 non-null   object
 10  object_credits        6767 non-null   object
 11  person_credits        6767 non-null   object
 12  story_arc_credits     6767 non-null   object
 13  team_credits          6767 non-null   object
 14  team_disbanded_in     6767 non-null   object
 15  volume                6767 non-null   

In [93]:
# Count the rows whose description is missing before anything is filled in.
missing_values = df['description'].isna().sum()
print(f"Missing Values: {missing_values}")

Missing Values: 98


**final_fillna** handles the last bit of preprocessing: there cannot be any null values when the text is vectorized. I found that the description column lost some data between being exported and re-imported, so the resulting nulls are replaced with empty strings.

In [94]:
def final_fillna(df):
    """Replace missing values with '' in every column that gets vectorized.

    The frame is modified in place and nothing is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'description' and 'combined_description' (a KeyError is
        raised otherwise, as before). The credit/volume columns listed below
        are filled only when they are present in ``df``.
    """
    df['description'] = df['description'].fillna('')
    df['combined_description'] = df['combined_description'].fillna('')
    lst = ['character_credits', 'character_died_in', 'concept_credits',
           'location_credits', 'object_credits', 'person_credits', 'story_arc_credits', 
           'team_credits', 'team_disbanded_in', 'volume']
    # This list used to be built and then ignored, so nulls in these columns
    # were never filled. Columns that are absent are skipped so frames that
    # worked with the old version keep working.
    for col in lst:
        if col in df.columns:
            df[col] = df[col].fillna('')
final_fillna(df)

# TF-IDF Vectorizing
I vectorized both the combined_description and each of the individual features in preparation for the model. 

In [95]:
# TF-IDF over the single concatenated text column, with English stop words removed.
# Note: tfidf_matrix is not referenced again in the visible cells; the
# similarity computed further down is built from the per-feature matrices.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined_description'])

**Features within combined_description:** 'character_credits', 'concept_credits', 'location_credits', 'object_credits', 'person_credits', 'story_arc_credits', 'team_credits', 'volume'

In [96]:
# Dedicated vectorizer for the free-text description so it can be weighted on
# its own when the feature matrices are combined (weight 3 in `weights`, the largest).
tfidf_des = TfidfVectorizer(stop_words='english')
tfidf_des_matrix = tfidf_des.fit_transform(df['description'])

In [97]:
# Dedicated vectorizer for character_credits (weight 2 in `weights`).
tfidf_char = TfidfVectorizer(stop_words='english')
tfidf_char_matrix = tfidf_char.fit_transform(df['character_credits'])

In [98]:
# Dedicated vectorizer for concept_credits (weight 1 in `weights`).
tfidf_concept = TfidfVectorizer(stop_words='english')
tfidf_concept_matrix = tfidf_concept.fit_transform(df['concept_credits'])

In [99]:
# Dedicated vectorizer for location_credits (weight 1 in `weights`).
tfidf_location = TfidfVectorizer(stop_words='english')
tfidf_location_matrix = tfidf_location.fit_transform(df['location_credits'])

In [100]:
# Dedicated vectorizer for object_credits. Its weight in `weights` is currently
# 0, so this matrix is zeroed out and does not affect the similarity scores.
tfidf_object = TfidfVectorizer(stop_words='english')
tfidf_object_matrix = tfidf_object.fit_transform(df['object_credits'])

In [101]:
# Dedicated vectorizer for person_credits (weight 1 in `weights`).
tfidf_person = TfidfVectorizer(stop_words='english')
tfidf_person_matrix = tfidf_person.fit_transform(df['person_credits'])

In [102]:
# Dedicated vectorizer for story_arc_credits. Its weight in `weights` is
# currently 0, so this matrix is zeroed out and does not affect the similarity scores.
tfidf_arc = TfidfVectorizer(stop_words='english')
tfidf_arc_matrix = tfidf_arc.fit_transform(df['story_arc_credits'])

In [103]:
# Dedicated vectorizer for team_credits (weight 1 in `weights`).
tfidf_team = TfidfVectorizer(stop_words='english')
tfidf_team_matrix = tfidf_team.fit_transform(df['team_credits'])

In [104]:
# Dedicated vectorizer for volume. Its weight in `weights` is currently 0, so
# this matrix is zeroed out and does not affect the similarity scores.
tfidf_vol = TfidfVectorizer(stop_words='english')
tfidf_vol_matrix = tfidf_vol.fit_transform(df['volume'])

# Combining Matrices and Weighting Them

In [105]:
# The fitted vectorizers, in the same feature order as all_matrices.
# Note: this list is not referenced again in the visible cells.
tfidf_vectorizers = [tfidf_des, tfidf_char, tfidf_concept, 
                    tfidf_location, tfidf_object, tfidf_person,
                    tfidf_arc, tfidf_team, tfidf_vol]

In [106]:
# Per-feature TF-IDF matrices. The order matters: each entry is paired
# positionally with the entry at the same position in `weights`.
all_matrices = [tfidf_des_matrix, tfidf_char_matrix, tfidf_concept_matrix, 
               tfidf_location_matrix, tfidf_object_matrix, tfidf_person_matrix,
               tfidf_arc_matrix, tfidf_team_matrix, tfidf_vol_matrix]

In [107]:
# Relative importance of each feature block, listed in the same order as
# all_matrices. A weight of 0 removes that feature from the similarity score.
weights = [
    3,  # description
    2,  # character_credits
    1,  # concept_credits
    1,  # location_credits
    0,  # object_credits
    1,  # person_credits
    0,  # story_arc_credits
    1,  # team_credits
    0,  # volume
]

In [108]:
# Scale each TF-IDF matrix by its corresponding weight.
# zip() stops silently at the shorter input, so a feature added to one list but
# not the other would be dropped without warning; fail loudly instead.
if len(all_matrices) != len(weights):
    raise ValueError(
        f"Expected one weight per matrix, got {len(all_matrices)} matrices "
        f"and {len(weights)} weights"
    )
weighted_tfidf_matrices = [matrix * weight for matrix, weight in zip(all_matrices, weights)]

In [109]:
# Concatenate the weighted matrices column-wise: one row per row of df, with
# the columns of every feature vocabulary placed side by side.
combined_matrix = hstack(weighted_tfidf_matrices)

# COSINE_SIM & RECOMMENDATION FUNCTION

In [110]:
# Pairwise similarity of every issue against every other issue; the result has
# one row and one column per row of combined_matrix (6767 x 6767 for the frame
# shown by df.info() above).
# linear_kernel computes plain dot products, which equal cosine similarity only
# when every row has unit length.
# NOTE(review): rows of combined_matrix are weighted, concatenated TF-IDF
# blocks, so they are presumably not unit-length -- confirm whether raw dot
# products are intended or sklearn's cosine_similarity should be used instead.
cosine_sim = linear_kernel(combined_matrix, combined_matrix)

In [111]:
def get_recommendations(df, title, cosine_sim=cosine_sim, top_n=10):
    """Return the issues whose similarity to the issue named ``title`` is highest.

    Parameters
    ----------
    df : pandas.DataFrame
        Issue table; row ``i`` must correspond to row/column ``i`` of
        ``cosine_sim``. Needs the columns 'issue_number', 'name',
        'description', 'character_credits', 'concept_credits' and 'volume'.
    title : str
        Exact value to look up in ``df['name']``. If several rows share the
        name, the first one (in row order) is used.
    cosine_sim : 2-D array-like
        Pairwise similarity scores. The default is captured when this cell is
        executed, so re-run the cell (or pass the matrix explicitly) after
        recomputing the similarities.
    top_n : int, default 10
        Number of recommendations wanted. ``top_n + 1`` rows are returned
        because the queried issue normally ranks first against itself; the
        default therefore yields 11 rows, exactly as before.

    Returns
    -------
    pandas.DataFrame
        The selected columns for the highest-scoring rows, best match first.
        Ties keep their original row order.

    Raises
    ------
    IndexError
        If no row of ``df`` has ``name == title``.
    """
    # Look the issue up by *position*: both cosine_sim and the final .iloc are
    # positional, so using the index label is only correct for a default
    # RangeIndex (it breaks after filtering or re-indexing df).
    matches = np.flatnonzero((df['name'] == title).to_numpy())
    if matches.size == 0:
        raise IndexError(f"No issue named {title!r} found in df['name']")
    idx = matches[0]
    # Similarity of every issue to the selected one.
    scores = np.asarray(cosine_sim[idx]).ravel()
    # A stable sort on the negated scores gives descending order with ties in
    # ascending row order, the same ordering as sorted(..., reverse=True).
    ranked = np.argsort(-scores, kind='stable')
    # Keep the best top_n + 1 positions (the query itself plus top_n others).
    issue_indices = ranked[:top_n + 1]
    return df[['issue_number', 'name', 'description', 'character_credits', 'concept_credits', 'volume']].iloc[issue_indices]

# EXAMPLES & TESTING

In [112]:
test1 = get_recommendations(df, "Robins vs. Zombies: Robin War", cosine_sim)
test1

Unnamed: 0,issue_number,name,description,character_credits,concept_credits,volume
2373,13,Robins vs. Zombies: Robin War,A “Robin War” tie-in! With Robins fighting cop...,"['Colton_Rivera', 'Damian_Wayne', 'Efrem', 'He...",[],['Gotham_Academy']
2374,14,Yearbook Part One; Animal Science 101; Queen G...,An all-new era of GOTHAM ACADEMY begins here w...,"['Clayface_(Karlo)', 'Colton_Rivera', 'Dillyn'...",[],['Gotham_Academy']
2378,18,Yearbook Part Five; Whatever Happened to Profe...,As the “Gotham Academy Yearbook” storyline com...,"['Coach_Humphreys', 'Colton_Rivera', 'Damian_W...",[],['Gotham_Academy']
2368,7,Curse of the Inishtree Quill,Special guest student Damian Wayne drops by th...,"['Batman', 'Bookworm', 'Colton_Rivera', 'Damia...",['Joker_75th_Anniversary_Variant'],['Gotham_Academy']
2362,1,Welcome to Gotham Academy,WELCOME TO GOTHAM ACADEMY! Gotham City’s most ...,"['Aunt_Harriet', 'Batman', 'Calamity', 'Colton...","['Batman_Villains', 'The_New_52']",['Gotham_Academy']
2376,16,Yearbook Part Three; Maps' Day Out; Boring Sun...,It’s “Yearbook” part 3! As Olive and the gang ...,"['Barbara_Gordon', 'Batman', 'Colton_Rivera', ...",['Robin'],['Gotham_Academy']
2365,4,The Secret of the Symbol,The hunt for the Ghost of Gotham Academy begins!,"['Batman', 'Calamity', 'Coach_Humphreys', 'Gra...",['Batman_Villains'],['Gotham_Academy']
2370,9,Calamity,If the gang thought it was hard to keep up wit...,"['Calamity', 'Clayface_(Karlo)', 'Coach_Humphr...",[],['Gotham_Academy']
2375,15,Yearbook Part Two; Staff Party; Serpents & Sec...,It’s part two of “Gotham Academy Yearbook”! Th...,"['Batman', 'Bookworm', 'Clayface_(Karlo)', 'Co...",['Anthology'],['Gotham_Academy']
2366,5,Save The Last Dance,This month’s assignment: Uncover the hideous s...,"['Batman', 'Calamity', 'Colton_Rivera', 'Headm...",[],['Gotham_Academy']


**All the recommendations are from the same small volume and are closely related to the issue we input.**

In [113]:
test2 = get_recommendations(df, "A Death in the Family Chapter 1 and 2", cosine_sim)
test2

Unnamed: 0,issue_number,name,description,character_credits,concept_credits,volume
611,426,A Death in the Family Chapter 1 and 2,Jason Todd's been getting out of control in hi...,"['Alfred_Pennyworth', 'Batman', 'Catherine_Tod...","['Batman_Villains', 'Editorial_Note', 'Flashba...",['Batman']
612,427,A Death In the Family Chapter 3 and 4,Finally discovering his mother isn't the happy...,"['Alfred_Pennyworth', 'Batman', 'Jason_Todd', ...","['Assassination', 'Batman_Villains', 'Cliffhan...",['Batman']
595,410,Two Of A Kind,The training of Jason Todd. Batman tries to ke...,"['Alfred_Pennyworth', 'Batman', 'Dick_Grayson'...","['Martial_Arts', 'Origin_Story']",['Batman']
596,411,Second Chance,Jason Todd's debut as Robin continues now that...,"['Alfred_Pennyworth', 'Batman', 'James_Gordon'...","['Baseball', 'Henchmen']",['Batman']
613,428,A Death in the Family Chapter 5,"Picking up directly where last issue left off,...","['Alfred_Pennyworth', 'Ayatollah_Khomeini', 'B...","['Batman_Villains', 'Flashback_', 'Martial_Art...",['Batman']
593,408,"""Did Robin Die Tonight?""","When Dick Grayson is injured by the Joker, Bat...","['Alfred_Pennyworth', 'Batman', 'Catherine_Tod...","['Flashback_', 'Origin_Story']",['Batman']
1315,580,Double Image,"Batman has reasons to believe that the ""Harvey...","['Alfred_Pennyworth', 'Batman', 'Charlatan', '...","['Flashback_', 'Origin_Story']",['Detective_Comics']
720,534,A Wound on the Heart of Heaven,"Legacy' part 5, continued from BATMAN: SHADOW ...","['Barbara_Gordon', 'Batman', 'Lady_Shiva']",[],['Batman']
915,645,"Show Me Yesterday, For I Can't Find Today : A...",With the Red Hood's identity revealed to be Ja...,"['Alfred_Pennyworth', 'Batman', 'Black_Mask', ...",[],['Batman']
3961,8,Death's Door,Batman returns! Bruce Wayne continues his trai...,"['Bane', 'Batman', 'Dick_Grayson', 'Harold', '...","['Ninjas', 'Robin', 'Secret_Identity']",['Robin']


**Testing a larger volume. We get recommendations from different volumes, but still related to the inputted issue. In addition, the other recommendations all have some tie-in to the input's plotline (Jason Todd's Death).**

In [119]:
test3 = get_recommendations(df, "War for the Books of Magic, Part 1", cosine_sim)
test3

Unnamed: 0,issue_number,name,description,character_credits,concept_credits,volume
3601,12.0,"War for the Books of Magic, Part 1",FAUST’S true master is revealed!The divided te...,"['Black_Boris', 'Black_Orchid_(Garcia)', 'Blac...",['Magick'],['Justice_League_Dark']
3598,9.0,The Black Room,New series writer JEFF LEMIRE introduces a new...,"['Andrew_Bennett', 'Black_Orchid_(Garcia)', 'D...",['Magick'],['Justice_League_Dark']
3600,11.0,"The Black Room, Part Three",The secrets of the BLACK ROOM revealed!The tea...,"['Abnegazar', 'Black_Orchid_(Garcia)', 'Deadma...",['Magick'],['Justice_League_Dark']
3605,15.0,"The Death of Magic, Part 1: Up is Down",A new story arc starts here!Trapped in a techn...,"['Alkion', 'Black_Orchid_(Garcia)', 'Deadman',...",['Magick'],['Justice_League_Dark']
3607,17.0,"The Death of Magic, Part 3: Prisoners of Epoch",Constantine and the others are trapped on a wo...,"['Alkion', 'Black_Orchid_(Garcia)', 'Deadman',...",['Magick'],['Justice_League_Dark']
3606,16.0,"The Death of Magic, Part 2: Night of the Hunter",The team is trapped on a magic-less planet tha...,"['Alkion', 'Black_Orchid_(Garcia)', 'Deadman',...",['Magick'],['Justice_League_Dark']
3608,18.0,"The Death of Magic, Part 4: The Last Stand",The conclusion to the “DEATH OF MAGIC” comes c...,"['Alkion', 'Black_Orchid_(Garcia)', 'Deadman',...",['Magick'],['Justice_League_Dark']
3604,14.0,Enter the House of Mystery,A major new storyline starts here!In the after...,"['Alec_Holland', 'Amethyst', 'Animal_Man', 'Ap...",['Magick'],['Justice_League_Dark']
3603,13.0,"War for the Books of Magic, Part 2: Revelations",House of Mystery vs. House of Secrets!The team...,[],['Magick'],['Justice_League_Dark']
3610,20.0,"Horror City, Part 2: The Nightmare Gospel",The Flash and Swamp Thing guest-star as the te...,"['Alec_Holland', 'Barry_Allen', 'Deadman', 'Do...",['Magick'],['Justice_League_Dark']


**All of these are about the Justice League Dark getting trapped in various places.**

# Conclusion

Our recommendation model does well at giving similar/related recommendations. They are always tied in via plotline, main idea, or characters. 