In [83]:
import pandas as pd
import numpy as np
import json

from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

In [5]:
marvel = pd.read_csv("data/marvel_comics.csv")

In [75]:
def clean_marvel_df(input_df):
    df = input_df.copy()
    df["issue_description"] = df["issue_description"].fillna("")
    #df["penciler"] = df["penciler"].fillna("")
    #df["writer"] = df["writer"].fillna("")
    #df["cover_artist"] = df["cover_artist"].fillna("")
    return df

In [76]:
clean_marvel = clean_marvel_df(marvel)

In [77]:
clean_marvel.sample(2)

Unnamed: 0,comic_name,active_years,issue_title,publish_date,issue_description,penciler,writer,cover_artist,Imprint,Format,Rating,Price
28969,Two-Gun Kid (1948 - 1977),(1948 - 1977),Two-Gun Kid (1948) #129,"April 10, 1976",,,,,,Comic,,Free
5722,Cyclops (2014 - 2015),(2014 - 2015),Cyclops (2014) #5,"September 24, 2014",Still weeks away from pick-up by the Starjamme...,Alexander Lozano,Greg Rucka,,,Comic,,$3.99


In [78]:
clean_marvel.isnull().sum()

comic_name               0
active_years             0
issue_title            526
publish_date           526
issue_description        0
penciler              9510
writer                7397
cover_artist         22737
Imprint              23308
Format                2098
Rating               22373
Price                 2098
dtype: int64

In [79]:
clean_marvel.groupby("penciler")["penciler"].count()

penciler
A CO                                7
ART & COMICS INT'L                  1
Aaron Kim Jacinto                   5
Aaron Kim Jacinto, Ario Anindito    1
Aaron Kim Jacinto, R.B. Silva       1
                                   ..
Yvel Guichet                        2
Yves Bigerel                        3
Zach Howard                         2
Zachary Baldus                      1
Zachary Montoya                     1
Name: penciler, Length: 3915, dtype: int64

# Content Base Recommendation System

In [12]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(clean_marvel['issue_description'])
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

In [29]:
def get_recommendations(df, title, cosine_sim=cosine_sim):
    # Get the index of the issue that matches the title
    idx = df[df['comic_name'] == title].index[0]
    # Get the pairwise similarity scores of all issues with that issue
    sim_scores = list(enumerate(cosine_sim[idx]))
    # Sort the issues based on the similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    # Get the scores of the 10 most similar issues
    sim_scores = sim_scores[1:11]
    # Get the issue indices
    issue_indices = [i[0] for i in sim_scores]
    # Return the top 10 most similar issues
    return df[['comic_name', 'issue_title', 'issue_description']].iloc[issue_indices]

In [21]:
clean_marvel["comic_name"][400]

'All-New Inhumans (2015 - 2016)'

In [31]:
recommendations = get_recommendations(clean_marvel, "Adventure Into Fear (1970 - 1975)", cosine_sim=cosine_sim)

In [39]:
recommendations

Unnamed: 0,comic_name,issue_title,issue_description
28752,True Believers: Conan - The Secret of Skull Ri...,True Believers: Conan - The Secret of Skull Ri...,Reprinting Savage Tales (1971) #5
26613,The Astonishing Ant-Man (2015 - 2016),The Astonishing Ant-Man (2015) #3,"Because you demanded it: Scott Lang, the Aston..."
4603,Captain America: First Vengeance (2011),Captain America: First Vengeance (2011) #5,The official prequel to the upcoming summer bl...
25912,Tales to Astonish (1959 - 1968),Tales to Astonish (1959) #4,What happens when a film director is kidnapped...
13338,Iron Man 2: Agents of S.H.I.E.L.D. (2010),Iron Man 2: Agents of S.H.I.E.L.D. (2010) #1,IRON MAN 2! You saw the Hollywood smash - now ...
14513,Man-Thing (2004),Man-Thing (2004) #1,What evil lurks in the brackish waters of the ...
33463,X-Force (1991 - 2004),X-Force (1991) #126,Dead Girl's backstory is revealed!
28728,True Believers: Black Widow - Amazing Adventur...,True Believers: Black Widow - Amazing Adventur...,Amazing Adventures (1970) #1–2
16159,Marvel Team-Up (1972 - 1985),Marvel Team-Up (1972) #68,The Man-Thing is a prisoner of man! Peter Park...
11555,Hulk (2008 - 2012),Hulk (2008) #38,FEAR ITSELF tie-in. Red Hulk's battle with the...


In [22]:
recommendations = get_recommendations(clean_marvel, 'All-New Inhumans (2015 - 2016)', cosine_sim=cosine_sim)
print(recommendations)

21292              Runaways (2003 - 2004)
23029            Spider-Man (1990 - 1998)
7757              Defenders (2011 - 2012)
393        All-New Inhumans (2015 - 2016)
19536            Paradise X (2002 - 2003)
1034     Amazing Spider-Man (1999 - 2013)
4898         Captain Marvel (2002 - 2004)
18687      New Mutants: Dead Souls (2018)
6299              Daredevil (1998 - 2011)
19420             Onslaught: X-Men (1996)
Name: comic_name, dtype: object


# Decision Tree Classifier

In [23]:
clean_marvel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 34992 entries, 0 to 34991
Data columns (total 12 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   comic_name         34992 non-null  object
 1   active_years       34992 non-null  object
 2   issue_title        34466 non-null  object
 3   publish_date       34466 non-null  object
 4   issue_description  34992 non-null  object
 5   penciler           25482 non-null  object
 6   writer             27595 non-null  object
 7   cover_artist       12255 non-null  object
 8   Imprint            11684 non-null  object
 9   Format             32894 non-null  object
 10  Rating             12619 non-null  object
 11  Price              32894 non-null  object
dtypes: object(12)
memory usage: 3.2+ MB


In [81]:
X = clean_marvel.drop('issue_title', axis=1)
y = clean_marvel['issue_title']

In [84]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the decision tree classifier
clf = DecisionTreeClassifier(random_state=42)

# Fit the classifier to the training data
clf.fit(X_train, y_train)

# Predict on the test data
y_pred = clf.predict(X_test)

# Evaluate accuracy (you can use other metrics as needed)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')

ValueError: could not convert string to float: 'Wolverine: The End (2003 - 2004)'

<div align="center"> 
 END of DOCUMENT 

</div>