In [1]:
# Importing libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Reading top anime data

In [2]:
# Files in the data directory
os.listdir('../data')

['top_anime.json', 'user_cleaned.csv', 'user_lst_03092021.json']

In [3]:
# This data was collected on Feb. 27, 2021
with open('../data/top_anime.json') as f:
    data = json.load(f)

In [4]:
# List of columns
columns = [
    'mal_id',
    'url',
    'image_url',
    'trailer_url',
    'title',
    'title_japanese',
    'type',
    'source',
    'episodes',
    'status',
    'rating',
    'score',
    'rank',
    'popularity',
    'members',
    'favorites',
    'synopsis',
    'premiered',
    'studios',
    'genres'
]

In [5]:
# Creating pd.DataFrame object with above columns
df = pd.DataFrame(columns=columns)

In [6]:
# Adding top anime data to df
for col in columns:
    if not col in ['studios', 'genres']:
        df[col] = [ele[col] for ele in data]
    else:
        df[col] = [','.join([ele['name'] for ele in anime[col]]) for anime in data]

In [7]:
# Cleaning data

# Only including anime
df = df.loc[df['type']=='TV']

# Drop dupplicates
df = df.drop_duplicates(subset='mal_id', keep='last')

# Drop anime without rating
df = df.loc[~df['score'].isnull()]

# Filling NaN in 'synopsis' column with blank string
df['synopsis'] = df['synopsis'].fillna('')

In [8]:
# Printing number of rows and columns
print(df.shape)

(3729, 20)


# Content-based recommender

In [9]:
# Synopsis matrix using TfidfVectorizer
tf_synop = TfidfVectorizer(stop_words='english', ngram_range=(1,2), min_df=10)
tf_synop_matrix = tf_synop.fit_transform(df['synopsis'])

In [10]:
# Genre matrix
# Dropping 'Cars', 'Dementia', and 'Police' genres
genre_matrix = df['genres'].str.get_dummies(sep=',').drop(columns=['Cars', 'Dementia', 'Police']).to_numpy()

In [11]:
# Concatenating synopsis matrix, genre matrix, and score matrix
matrix = np.concatenate((tf_synop_matrix.toarray(), genre_matrix), axis=1)

In [12]:
# Cosine similarity matrix
cosine_sim = cosine_similarity(matrix, matrix)

In [13]:
# Indices
indices = pd.Series(df.index, index=df['title'])
indices

title
Fullmetal Alchemist: Brotherhood                  0
Shingeki no Kyojin: The Final Season              1
Shingeki no Kyojin Season 3 Part 2                2
Steins;Gate                                       3
Gintama°                                          4
                                               ... 
Netsuzou TRap                                  9984
Ginyuu Mokushiroku Meine Liebe                 9990
Girls Bravo: First Season                      9991
Gyakuten Saiban: Sono "Shinjitsu", Igi Ari!    9996
Kaibutsu-kun (1980)                            9998
Length: 3729, dtype: int64

# Testing content-based recommender

In [14]:
# Adding cosine_sim column
anime = 'Shingeki no Kyojin Season 3 Part 2'
df['cosine_sim'] = list(cosine_sim[indices[anime]])

In [15]:
df.sort_values(by='cosine_sim', ascending=False)[['title', 'score', 'cosine_sim']][1:20]

Unnamed: 0,title,score,cosine_sim
128,Shingeki no Kyojin Season 2,8.45,0.928812
1,Shingeki no Kyojin: The Final Season,9.16,0.914079
65,Shingeki no Kyojin Season 3,8.6,0.903368
114,Shingeki no Kyojin,8.48,0.902037
6375,Katsute Kami Datta Kemono-tachi e,6.41,0.725576
2827,Kaze no Youjinbou,7.19,0.639826
548,Zetsuen no Tempest,7.99,0.6263
94,One Piece,8.52,0.625896
1246,GetBackers,7.61,0.625
3455,Concrete Revolutio: Choujin Gensou - The Last ...,7.05,0.600351


# Creating collaborative recommender

In [16]:
# Reading user_cleaned.csv
user_df = pd.read_csv("../data/user_cleaned.csv")

In [17]:
# Only including anime
user_df = user_df.loc[user_df['type']=='TV']

In [18]:
# Viewing first 5 rows of user_df
user_df.head()

Unnamed: 0,username,mal_id,title,score,type
2,--hasuki_komai--,6682,11eyes,6,TV
4,--hasuki_komai--,38101,5-toubun no Hanayome,6,TV
5,--hasuki_komai--,25397,Absolute Duo,6,TV
6,--hasuki_komai--,11759,Accel World,6,TV
8,--hasuki_komai--,34881,Aho Girl,5,TV


# Item-item collaborative recommender

In [19]:
# Creating a pivot table for collaborative recommender
pivot = user_df.pivot_table(values='score', index='title', columns='username')

In [20]:
# Dropping users without any anime reviews
pivot = pivot.dropna(axis=0, how='all')

In [21]:
# Filling NaN with 0s
pivot = pivot.fillna(0)

In [22]:
# Identifying animes from user_df that are in top_anime and
# identifying animes from user_df that are not in top_anime

include_anime = []
drop_anime = []
for title in user_df['title'].unique():
    if title in df['title'].unique():
        include_anime.append(title)
    else:
        drop_anime.append(title)

In [23]:
# Dropping animes that are not in top_anime
pivot = pivot.drop(drop_anime)

In [24]:
# Creating item_sim_df using cosine_similarity
item_sim_df = pd.DataFrame(cosine_similarity(pivot, pivot), index=pivot.index, columns=pivot.index)

In [25]:
def recommend_similar_animes(anime):
    """
    Returns a list of recommended animes and cosine similarity scores based on anime input. 
    """
    if anime not in pivot.index:
        return None, None
    else:
        sim_animes = item_sim_df.sort_values(by=anime, ascending=False).index[1:]
        sim_scores = item_sim_df.sort_values(by=anime, ascending=False).loc[:, anime].tolist()[1:]
        return sim_animes, sim_scores

In [26]:
sim_animes, sim_score = recommend_similar_animes("Shingeki no Kyojin Season 3 Part 2")

In [27]:
for anime, score in zip(sim_animes[:20], sim_score[:20]):
    print(anime, score)

Shingeki no Kyojin Season 3 0.9028301498561447
Shingeki no Kyojin Season 2 0.7852675494861486
Shingeki no Kyojin 0.6284973457263251
Yakusoku no Neverland 0.6186378270998584
Kimetsu no Yaiba 0.6112405519004415
One Punch Man 2nd Season 0.5464097847678064
One Punch Man 0.5323816072173391
Seishun Buta Yarou wa Bunny Girl Senpai no Yume wo Minai 0.5318244603818986
Boku no Hero Academia 3rd Season 0.5250242348681234
Re:Zero kara Hajimeru Isekai Seikatsu 0.5228894598565462
Boku no Hero Academia 2nd Season 0.51117392369407
Mob Psycho 100 II 0.5057781168925408
Kaguya-sama wa Kokurasetai: Tensai-tachi no Renai Zunousen 0.5037259908571896
Tokyo Ghoul 0.49938110160497573
Tate no Yuusha no Nariagari 0.49734769943935814
Boku no Hero Academia 0.49568870209462246
Vinland Saga 0.4931060185309363
Re:Zero kara Hajimeru Isekai Seikatsu 2nd Season 0.48900339979789975
Boku no Hero Academia 4th Season 0.48382361673008584
Mob Psycho 100 0.48017968621543605
