Reading in our anime data with pandas

In [117]:
import pandas as pd

anime = pd.read_csv("anime.csv")

In [118]:
anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,1535,Death Note,"Mystery, Police, Psychological, Supernatural, ...",TV,37,8.71,1013917
1,16498,Shingeki no Kyojin,"Action, Drama, Fantasy, Shounen, Super Power",TV,25,8.54,896229
2,11757,Sword Art Online,"Action, Adventure, Fantasy, Game, Romance",TV,25,7.83,893100
3,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
4,6547,Angel Beats!,"Action, Comedy, Drama, School, Supernatural",TV,13,8.39,717796
...,...,...,...,...,...,...,...
12289,33662,Taka no Tsume 8: Yoshida-kun no X-Files,"Comedy, Parody",Movie,1,10.00,13
12290,33320,Suijun Genten,,Movie,1,7.00,13
12291,34490,Sushi Azarashi,Comedy,TV,30,3.00,12
12292,34485,Ganko-chan,,,Unknown,,11


Cleaning anime names with regex

In [119]:
import re

def clean_name(name):
  name = re.sub("[^a-zA-Z0-9 ]", "", name)
  return name

In [120]:
anime["clean_name"] = anime["name"].apply(clean_name)

In [121]:
anime

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,clean_name
0,1535,Death Note,"Mystery, Police, Psychological, Supernatural, ...",TV,37,8.71,1013917,Death Note
1,16498,Shingeki no Kyojin,"Action, Drama, Fantasy, Shounen, Super Power",TV,25,8.54,896229,Shingeki no Kyojin
2,11757,Sword Art Online,"Action, Adventure, Fantasy, Game, Romance",TV,25,7.83,893100,Sword Art Online
3,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,Fullmetal Alchemist Brotherhood
4,6547,Angel Beats!,"Action, Comedy, Drama, School, Supernatural",TV,13,8.39,717796,Angel Beats
...,...,...,...,...,...,...,...,...
12289,33662,Taka no Tsume 8: Yoshida-kun no X-Files,"Comedy, Parody",Movie,1,10.00,13,Taka no Tsume 8 Yoshidakun no XFiles
12290,33320,Suijun Genten,,Movie,1,7.00,13,Suijun Genten
12291,34490,Sushi Azarashi,Comedy,TV,30,3.00,12,Sushi Azarashi
12292,34485,Ganko-chan,,,Unknown,,11,Gankochan


Creating a tfidf matrix

In [122]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(ngram_range=(1,2))

tfidf = vectorizer.fit_transform(anime["clean_name"])

Creating a search function

In [123]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

def search(title):
  name = clean_name(title)
  query_vec = vectorizer.transform([name])
  similarity = cosine_similarity(query_vec, tfidf).flatten()
  indices = np.argpartition(similarity, -5) [-5:]
  results = anime.iloc[indices][::-1]
  return results

Building an interactive search box with Google Colab

In [124]:
import ipywidgets as widgets
from IPython.display import display

anime_input = widgets.Text(
    value='Fullmetal Alchemist: Brotherhood',
    description='Anime Title:',
    disabled=False
)
anime_list = widgets.Output()

def on_type(data):
  with anime_list:
    anime_list.clear_output()
    name = data["new"]
    if len(name) > 5:
      display(search(name))

anime_input.observe(on_type, names='value')

display(anime_input, anime_list)

Text(value='Fullmetal Alchemist: Brotherhood', description='Anime Title:')

Output()

Reading in Anime Rating Data

In [125]:
rating = pd.read_csv("rating[1].csv")

In [126]:
rating

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1.0
1,1,24,-1.0
2,1,79,-1.0
3,1,226,-1.0
4,1,241,-1.0
...,...,...,...
2841860,26591,138,9.0
2841861,26591,139,9.0
2841862,26591,202,7.0
2841863,26591,442,8.0


In [127]:
rating.dtypes

user_id       int64
anime_id      int64
rating      float64
dtype: object

Finding users who liked the same movie

In [128]:
anime_id = 5114

In [129]:
similar_users = rating[(rating["anime_id"] == anime_id) & (rating["rating"] > 4)]["user_id"].unique()

In [130]:
similar_users

array([    3,    10,    11, ..., 26586, 26587, 26588])

In [131]:
similar_user_recs = rating[(rating["user_id"].isin(similar_users)) & (rating["rating"] > 4)]["anime_id"]

In [132]:
similar_user_recs

156           20
157          154
158          170
159          199
160          225
           ...  
2841744    32485
2841745    32553
2841746    33028
2841747    33091
2841748    34103
Name: anime_id, Length: 1238399, dtype: int64

In [133]:
similar_user_recs = similar_user_recs.value_counts() / len(similar_users)

similar_user_recs = similar_user_recs[similar_user_recs > .1]

In [134]:
similar_user_recs

5114     1.000000
1535     0.717405
16498    0.617016
11757    0.593425
121      0.588782
           ...   
5258     0.101016
17729    0.100891
10588    0.100264
6325     0.100138
21327    0.100013
Name: anime_id, Length: 399, dtype: float64

Finding how much all users like anime

In [135]:
all_users = rating[(rating["anime_id"].isin(similar_user_recs.index)) & (rating["rating"] > 4)]

In [136]:
all_users_recs = all_users["anime_id"].value_counts() / len(all_users["user_id"].unique())

In [137]:
all_users_recs

1535     0.501400
11757    0.389639
16498    0.381648
6547     0.358566
1575     0.357592
           ...   
28891    0.048436
9135     0.046935
9969     0.045799
5258     0.045678
6421     0.033913
Name: anime_id, Length: 399, dtype: float64

Creating a recommendation score

In [138]:
rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
rec_percentages.columns = ["similar", "all"]

In [139]:
rec_percentages

Unnamed: 0,similar,all
5114,1.000000,0.323273
1535,0.717405,0.501400
16498,0.617016,0.381648
11757,0.593425,0.389639
121,0.588782,0.311995
...,...,...
5258,0.101016,0.045678
17729,0.100891,0.056225
10588,0.100264,0.055332
6325,0.100138,0.051803


Similarity percentages by comparing

In [140]:
rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

Sorting the dataset

In [141]:
rec_percentages = rec_percentages.sort_values("score", ascending=False)

In [142]:
rec_percentages

Unnamed: 0,similar,all,score
5114,1.000000,0.323273,3.093362
6421,0.101769,0.033913,3.000857
9135,0.135023,0.046935,2.876800
9969,0.103777,0.045799,2.265908
11061,0.237545,0.106892,2.222290
...,...,...,...
232,0.107793,0.084216,1.279960
4752,0.162756,0.129609,1.255740
853,0.286611,0.232729,1.231521
3457,0.204794,0.169527,1.208032


Merging

In [143]:
rec_percentages.head(10).merge(anime, left_index=True, right_on="anime_id")

Unnamed: 0,similar,all,score,anime_id,name,genre,type,episodes,rating,members,clean_name
3,1.0,0.323273,3.093362,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,Fullmetal Alchemist Brotherhood
864,0.101769,0.033913,3.000857,6421,Fullmetal Alchemist: Brotherhood Specials,"Adventure, Drama, Fantasy, Magic, Military, Sh...",Special,4,8.11,67962,Fullmetal Alchemist Brotherhood Specials
659,0.135023,0.046935,2.8768,9135,Fullmetal Alchemist: The Sacred Star of Milos,"Action, Adventure, Comedy, Drama, Fantasy, Mag...",Movie,1,7.5,87944,Fullmetal Alchemist The Sacred Star of Milos
334,0.103777,0.045799,2.265908,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266,Gintama039
44,0.237545,0.106892,2.22229,11061,Hunter x Hunter (2011),"Action, Adventure, Shounen, Super Power",TV,148,9.13,425855,Hunter x Hunter 2011
652,0.101016,0.045678,2.211506,5258,Hajime no Ippo: New Challenger,"Comedy, Drama, Shounen, Sports",TV,26,8.75,88995,Hajime no Ippo New Challenger
207,0.171038,0.077847,2.197109,28701,Fate/stay night: Unlimited Blade Works 2nd Season,"Action, Fantasy, Magic, Shounen, Supernatural",TV,13,8.45,205987,Fatestay night Unlimited Blade Works 2nd Season
509,0.126867,0.057888,2.191583,27821,Fate/stay night: Unlimited Blade Works - Prologue,"Action, Fantasy, Magic, Shounen, Supernatural",Special,1,8.32,106382,Fatestay night Unlimited Blade Works Prologue
242,0.141799,0.064744,2.190162,23199,Durarara!!x2 Shou,"Action, Mystery, Supernatural",TV,12,8.15,189407,Durararax2 Shou
397,0.141549,0.064703,2.187656,12365,Bakuman. 3rd Season,"Comedy, Drama, Romance, Shounen",TV,25,8.71,133620,Bakuman 3rd Season


Building a recommendation function

In [144]:
def find_similar_anime(anime_id):
  similar_users = rating[(rating["anime_id"] == anime_id) & (rating["rating"] > 4)]["user_id"].unique()
  similar_user_recs = rating[(rating["user_id"].isin(similar_users)) & (rating["rating"] > 4)]["anime_id"] #Finding similar users

  similar_user_recs = similar_user_recs.value_counts() / len(similar_users)
  similar_user_recs = similar_user_recs[similar_user_recs > .10] #Finding the users who recommended that anime over 10%

  all_users = rating[(rating["anime_id"].isin(similar_user_recs.index)) & (rating["rating"] > 4)]
  all_users_recs = all_users["anime_id"].value_counts() / len(all_users["user_id"].unique()) #Finding the all users who recommend the same anime

  rec_percentages = pd.concat([similar_user_recs, all_users_recs], axis=1)
  rec_percentages.columns = ["similar", "all"]

  rec_percentages["score"] = rec_percentages["similar"] / rec_percentages["all"]

  rec_percentages = rec_percentages.sort_values("score", ascending=False)
  return rec_percentages.head(10).merge(anime, left_index=True, right_on="anime_id")[["score", "name", "genre"]]



Creating an interactive recommendation widget

In [145]:
anime_name_input = widgets.Text(
    value="Fullmetal Alchemist: Brotherhood",
    description="Anime Title:",
    disabled=False
)

recommendation_list = widgets.Output()

def on_type(data):
  with recommendation_list:
    recommendation_list.clear_output()
    name = data["new"]
    if len(name) > 5:
      results = search(name)
      anime_id = results.iloc[0]["anime_id"]
      display(find_similar_anime(anime_id))

anime_name_input.observe(on_type, names="value")

display(anime_name_input, recommendation_list)


Text(value='Fullmetal Alchemist: Brotherhood', description='Anime Title:')

Output()