# ANIME RECOMMENDER SYSTEM - CONTENT-BASED FILTERING

In [3]:
# basic library
import numpy as np 
import pandas as pd 
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

In [4]:
# load data
anime_df = pd.read_csv('dataset/processed_dataset/anime.csv')

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

In [4]:
def get_tf_matrix(col):
    tf = TfidfVectorizer(stop_words='english')
    anime_df[col] = anime_df[col].fillna('')

    tf_matrix = tf.fit_transform(anime_df[col])
    tf_matrix.shape
    return tf_matrix

In [5]:
def get_cosine(tf_matrix):
    # compute cosine similarity matrix
    cosine_sim = sigmoid_kernel(tf_matrix, tf_matrix)
    return cosine_sim

In [1]:
# get indices for every anime
indices = pd.Series(anime_df.index, index=anime_df['title']).drop_duplicates()

NameError: name 'pd' is not defined

In [7]:
# list to string
col_list = ['genre']
for i in col_list:
    anime_df[i] = anime_df[i].astype(str)

In [8]:
# synopsis
tf_matrix_sypnopsis = get_tf_matrix('synopsis')
cosine_sim_sypnopsis = get_cosine(tf_matrix_sypnopsis)

In [9]:
# genre
tf_matrix_genres = get_tf_matrix('genre')
cosine_sim_genres = get_cosine(tf_matrix_genres)

In [10]:
#studio
tf_matrix_studios = get_tf_matrix('studio')
cosine_sim_studios = get_cosine(tf_matrix_studios)

In [11]:
header = ['anime_id', 'title', 'score', 'rating_count', 'ranked', 'popularity', 'members', 'type', 'studio', 'synopsis', 'episode_count', 'genre', 'url', 'img']

In [13]:
# create soup combining feature:
feature = ['genre', 'type', 'studio', 'synopsis']
for i in feature:
    anime_df = anime_df.astype(str)
anime_df['soup'] = anime_df[feature].apply(lambda row: '  '.join(row.values.astype(str)), axis=1)

In [14]:
tf_matrix_soup = get_tf_matrix('soup')
cosine_sim_soup = get_cosine(tf_matrix_soup)

In [17]:
# get recommendation function for content based filtering
def get_rec_content(title, cosine_sim):
    # get index of title
    idx = indices[title]
    
    # pairwise similarity score 
    sim = list(enumerate(cosine_sim[idx]))
    sim = sorted(sim, key=lambda x: x[1], reverse=True)
    sim = sim[1:11]

    # indices sim 
    anime_indices = [i[0] for i in sim]
    return anime_df['title'].iloc[anime_indices]

In [18]:
get_rec_content('Haikyuu!!', cosine_sim_sypnopsis)

10106      Haikyuu!! Movie 1: Owari to Hajimari
15105                      Haikyuu!! To the Top
5639                           Attack No.1 1970
12771        Haikyuu!! Movie 3: Sainou to Sense
10453      Haikyuu!! Movie 2: Shousha to Haisha
9864                    Haikyuu!! Second Season
2794                              Attacker You!
1407                                Attack No.1
3582                           Ashita e Attack!
4884     Shoujo Fight: Norainu-tachi no Odekake
Name: title, dtype: object

In [19]:
get_rec_content('Haikyuu!!', cosine_sim_genres)

160                                              Whistle!
1603                                      Slam Dunk Movie
1697    Slam Dunk: Zenkoku Seiha Da! - Sakuragi Hanamichi
2289    Slam Dunk: Shouhoku Saidai no Kiki! Moero Saku...
2290    Slam Dunk: Hoero Basketman-damashii! Hanamichi...
6603                                     Kuroko no Basket
7531                          Kuroko no Basket 2nd Season
7538                            Kuroko no Basket: Tip Off
7602                  Kuroko no Basket: Oshaberi Shiyokka
7920                                       Diamond no Ace
Name: title, dtype: object

In [21]:
get_rec_content('Haikyuu!!', cosine_sim_studios)

93               Sakigake!! Cromartie Koukou
128                                   Blood+
178                            Video Girl Ai
381                  Blood: The Last Vampire
437        One Piece: Taose! Kaizoku Ganzack
438    Koukaku Kidoutai: Stand Alone Complex
439                                Innocence
492                              Otogizoushi
493               Boku no Chikyuu wo Mamotte
535                                  Jin-Rou
Name: title, dtype: object

In [22]:
get_rec_content('Haikyuu!!', cosine_sim_soup)

10106      Haikyuu!! Movie 1: Owari to Hajimari
15105                      Haikyuu!! To the Top
12771        Haikyuu!! Movie 3: Sainou to Sense
5639                           Attack No.1 1970
10453      Haikyuu!! Movie 2: Shousha to Haisha
9864                    Haikyuu!! Second Season
1407                                Attack No.1
2794                              Attacker You!
3582                           Ashita e Attack!
4884     Shoujo Fight: Norainu-tachi no Odekake
Name: title, dtype: object

In [7]:
import pickle

In [25]:
pickle.dump(cosine_sim_soup, open('pickle/cosine_sim_soup.pickle', 'wb'))

In [5]:
# get indices for every anime
indices_id = pd.Series(anime_df.index, index=anime_df['anime_id']).drop_duplicates()

In [8]:
pickle.dump(indices_id, open('pickle/indices_id.pickle', 'wb'))
