In [1]:
# Importing libraries
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import json
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Reading top anime data

In [2]:
# Files in the data directory
# os.listdir returns entries in arbitrary order; sorting keeps the displayed
# listing stable from one run (or machine) to the next
sorted(os.listdir('../data'))

['top_anime.json', 'user_cleaned.csv', 'user_lst.json']

In [3]:
# This data was collected on Feb. 27, 2021
# Explicit UTF-8 so decoding does not depend on the platform's default locale
# encoding (the records include a 'title_japanese' field)
with open('../data/top_anime.json', encoding='utf-8') as f:
    data = json.load(f)

In [4]:
# Fields copied from each top-anime record, in the column order used for df.
# 'studios' and 'genres' come last; they are the two fields that get
# flattened into comma-separated strings when the frame is filled.
columns = [
    'mal_id', 'url', 'image_url', 'trailer_url',
    'title', 'title_japanese',
    'type', 'source', 'episodes', 'status', 'rating',
    'score', 'rank', 'popularity', 'members', 'favorites',
    'synopsis', 'premiered',
    'studios', 'genres',
]

In [5]:
# Empty frame that only declares the column labels; the values are filled in
# by a later cell
df = pd.DataFrame(data=None, columns=columns)

In [6]:
# Adding top anime data to df
# 'studios' and 'genres' hold a list of dicts per anime; only each dict's
# 'name' is kept, joined into one comma-separated string. Every other field
# is copied as-is.
_nested_cols = ('studios', 'genres')

# Building the frame in a single constructor call instead of assigning the
# columns one at a time into a pre-allocated empty frame.
df = pd.DataFrame(
    {
        col: [
            ','.join(entry['name'] for entry in record[col])
            if col in _nested_cols
            else record[col]
            for record in data
        ]
        for col in columns
    },
    columns=columns,
)

In [7]:
# Quick look at the flattened, comma-separated genre strings
df.loc[:, 'genres']

0       Action,Military,Adventure,Comedy,Drama,Magic,F...
1       Action,Military,Mystery,Super Power,Drama,Fant...
2       Action,Drama,Fantasy,Military,Mystery,Shounen,...
3                                         Thriller,Sci-Fi
4       Action,Comedy,Historical,Parody,Samurai,Sci-Fi...
                              ...                        
9995                                 Drama,Romance,School
9996                          Comedy,Drama,Mystery,Police
9997                   Drama,Mystery,Psychological,Seinen
9998                                   Comedy,Horror,Kids
9999                                         Comedy,Magic
Name: genres, Length: 10000, dtype: object

In [8]:
# Cleaning data
# The steps are chained so each one returns a new frame; the original
# assigned 'synopsis' on a .loc-filtered slice, which can trigger pandas'
# SettingWithCopyWarning.
df = (
    # Only including rows whose type is 'TV'
    df.loc[df['type'] == 'TV']
    # Drop duplicates: keep the last row for each mal_id
    .drop_duplicates(subset='mal_id', keep='last')
    # Drop anime without a score
    .dropna(subset=['score'])
    # Filling NaN in 'synopsis' column with blank string
    .assign(synopsis=lambda frame: frame['synopsis'].fillna(''))
)

In [9]:
# Size of the cleaned frame as (rows, columns)
print((len(df.index), len(df.columns)))

(3729, 20)


# Content-based recommender

In [10]:
# TF-IDF features of the synopsis text: English stop words removed, unigrams
# and bigrams, with a minimum document frequency of 10
tf_synop = TfidfVectorizer(
    stop_words='english',
    ngram_range=(1, 2),
    min_df=10,
)
tf_synop_matrix = tf_synop.fit_transform(df['synopsis'])

In [11]:
# Genre matrix: one indicator column per genre name found in the
# comma-separated 'genres' strings (no genre is dropped)
genre_matrix = (
    df['genres']
    .str.get_dummies(sep=',')
    .to_numpy()
)

In [12]:
# Side-by-side feature matrix: the densified synopsis TF-IDF columns followed
# by the genre indicator columns
matrix = np.concatenate(
    [tf_synop_matrix.toarray(), genre_matrix],
    axis=1,
)

In [13]:
# Pairwise cosine similarity between the rows of the combined feature matrix,
# labelled by title on both axes
cosine_sim_content = pd.DataFrame(
    data=cosine_similarity(matrix),
    index=df['title'],
    columns=df['title'],
)

In [14]:
# 19 titles most similar to the query anime (content-based)
anime = 'Shingeki no Kyojin Season 3 Part 2'
# The query title is removed by label rather than by skipping position 0:
# ties or floating-point rounding mean the title itself is not guaranteed
# to sort first.
cosine_sim_content[anime].drop(anime).sort_values(ascending=False).head(19)

title
Shingeki no Kyojin Season 2                           0.928812
Shingeki no Kyojin: The Final Season                  0.914079
Shingeki no Kyojin Season 3                           0.903368
Shingeki no Kyojin                                    0.902037
Katsute Kami Datta Kemono-tachi e                     0.725576
Kaze no Youjinbou                                     0.639826
Zetsuen no Tempest                                    0.626300
One Piece                                             0.625896
GetBackers                                            0.625000
Concrete Revolutio: Choujin Gensou - The Last Song    0.600351
Fullmetal Alchemist: Brotherhood                      0.593283
Fullmetal Alchemist                                   0.593070
Tsubasa Chronicle 2nd Season                          0.592590
Concrete Revolutio: Choujin Gensou                    0.592384
Hunter x Hunter                                       0.584950
Shin Mazinger Shougeki! Z-hen                    

# Collaborative recommender

In [15]:
# Loading the per-user score records from user_cleaned.csv
user_df = pd.read_csv(filepath_or_buffer="../data/user_cleaned.csv")

In [16]:
# Keeping only the rows whose type is 'TV'
user_df = user_df[user_df['type'].eq('TV')]

In [17]:
# A user collected more than once (e.g. a member of several clubs) yields
# repeated (username, title) rows; only the first occurrence is kept
user_df = user_df.drop_duplicates(
    subset=['username', 'title'],
    keep='first',
)

In [18]:
# Preview of the first five rows of user_df
user_df.head(5)

Unnamed: 0,username,mal_id,title,score,type
2,--hasuki_komai--,6682,11eyes,6,TV
4,--hasuki_komai--,38101,5-toubun no Hanayome,6,TV
5,--hasuki_komai--,25397,Absolute Duo,6,TV
6,--hasuki_komai--,11759,Accel World,6,TV
8,--hasuki_komai--,34881,Aho Girl,5,TV


In [19]:
# Title-by-user score table for the collaborative recommender: one row per
# title, one column per username, cells hold the mean score
pivot = user_df.pivot_table(
    index='title',
    columns='username',
    values='score',
    aggfunc='mean',
)

In [20]:
# Dropping rows (titles) that have no score from any user
pivot = pivot.dropna(how='all')

In [21]:
# Missing (title, user) scores become 0
pivot = pivot.fillna(value=0)

In [22]:
# Identifying animes from user_df that are in top_anime and
# identifying animes from user_df that are not in top_anime

# Set built once, so each membership test is a hash lookup instead of
# recomputing df['title'].unique() and scanning it for every user title
top_titles = set(df['title'])

include_anime = []
drop_anime = []
for title in user_df['title'].unique():
    if title in top_titles:
        include_anime.append(title)
    else:
        drop_anime.append(title)

# Dropping animes that are not in top_anime.
# errors='ignore' because a listed title may already be absent from pivot's
# index (for example when this cell is run a second time); without it the
# drop raises KeyError.
pivot = pivot.drop(index=drop_anime, errors='ignore')

In [23]:
# Pairwise cosine similarity between the rows (titles) of the score table,
# labelled by title on both axes
cosine_sim_collab = pd.DataFrame(
    data=cosine_similarity(pivot),
    index=pivot.index,
    columns=pivot.index,
)

In [24]:
# 19 titles most similar to the query anime (collaborative).
# The query title is removed by label rather than by skipping position 0:
# ties or floating-point rounding mean the title itself is not guaranteed
# to sort first.
cosine_sim_collab[anime].drop(anime).sort_values(ascending=False).head(19)

title
Shingeki no Kyojin Season 3                                   0.902848
Shingeki no Kyojin Season 2                                   0.785268
Shingeki no Kyojin                                            0.628497
Yakusoku no Neverland                                         0.618638
Kimetsu no Yaiba                                              0.611241
One Punch Man 2nd Season                                      0.546410
One Punch Man                                                 0.532382
Seishun Buta Yarou wa Bunny Girl Senpai no Yume wo Minai      0.531824
Boku no Hero Academia 3rd Season                              0.525024
Re:Zero kara Hajimeru Isekai Seikatsu                         0.522889
Boku no Hero Academia 2nd Season                              0.511174
Mob Psycho 100 II                                             0.505778
Kaguya-sama wa Kokurasetai: Tensai-tachi no Renai Zunousen    0.503726
Tokyo Ghoul                                                   0.499381
