Basic Idea: using 'genre'and 'type' columns to find the similarity between anime and recommend it to the users.



In [1]:
import numpy as np
import pandas as pd
import scipy as sp
import os
import matplotlib.pyplot as plt
from google.colab import files

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
anime = pd.read_csv('/content/drive/MyDrive/anime.csv')
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [4]:
rating = pd.read_csv('/content/drive/MyDrive/rating.csv')
rating.head()

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [5]:
anime.shape

(12294, 7)

In [6]:
rating.shape

(7813737, 3)

In [7]:
missing = anime.loc[(anime['episodes']=="Unknown") & (anime['type'].isnull())]
missing.shape

(25, 7)

In [8]:
anime.isnull().sum()

anime_id      0
name          0
genre        62
type         25
episodes      0
rating      230
members       0
dtype: int64

Treating 'type' and 'genre' column of anime dataset as equal. Therefore using concat & label encoding,the algorithm will not treat a certain category more important than the other categories.

In [9]:
anime = pd.concat([anime, anime['type'].str.get_dummies(), anime['genre'].str.get_dummies(sep=',')], axis=1)
anime.head()

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members,Movie,Music,ONA,...,Shoujo,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Cosine Similarity

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

In [11]:
anime_char = anime.loc[:, "Movie":].copy()
anime_char.head()

Unnamed: 0,Movie,Music,ONA,OVA,Special,TV,Adventure,Cars,Comedy,Dementia,...,Shoujo,Shounen,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Yaoi
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
cos_sim = cosine_similarity(anime_char.values, anime_char.values)
cos_sim.shape

(12294, 12294)

In [13]:
cos_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.31622777],
       [0.        , 1.        , 0.375     , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.375     , 1.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.5       ],
       [0.        , 0.        , 0.        , ..., 1.        , 1.        ,
        0.5       ],
       [0.31622777, 0.        , 0.        , ..., 0.5       , 0.5       ,
        1.        ]])

Indexing 

In [14]:
anime_index = pd.Series(anime.index, index=anime.name).drop_duplicates()

In [15]:
def content_recommender(anime_name, similarity=cos_sim):
    index = anime_index[anime_name]

    print('Users also watched:\n')
    sim_score = list(enumerate(cos_sim[index]))
    sim_score = sorted(sim_score, key=lambda x: x[1], reverse=True)
    sim_score = sim_score[0:11]
    anime_i = [i[0] for i in sim_score]
    
    result = anime[['anime_id','name', 'genre', 'rating']].iloc[anime_i].drop(index)
    return result

In [16]:
content_recommender("Gintama")

Users also watched:



Unnamed: 0,anime_id,name,genre,rating
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",9.25
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",9.16
9,15417,Gintama&#039;: Enchousen,"Action, Comedy, Historical, Parody, Samurai, S...",9.11
10896,34096,Gintama (2017),"Action, Comedy, Historical, Parody, Samurai, S...",
8,15335,Gintama Movie: Kanketsu-hen - Yorozuya yo Eien...,"Action, Comedy, Historical, Parody, Samurai, S...",9.1
63,21899,Gintama: Yorinuki Gintama-san on Theater 2D,"Action, Comedy, Historical, Parody, Samurai, S...",8.6
65,7472,Gintama Movie: Shinyaku Benizakura-hen,"Action, Comedy, Historical, Parody, Samurai, S...",8.59
216,9735,Gintama: Shinyaku Benizakura-hen,"Action, Comedy, Historical, Parody, Samurai, S...",8.31
306,25313,Gintama: Jump Festa 2014 Special,"Action, Comedy, Historical, Parody, Samurai, S...",8.2
1833,161,Peace Maker Kurogane,"Action, Comedy, Historical, Samurai, Shounen",7.43


In [17]:
content_recommender("Monster")

Users also watched:



Unnamed: 0,anime_id,name,genre,rating
981,323,Mousou Dairinin,"Drama, Mystery, Police, Psychological, Superna...",7.74
6998,32438,Mayoiga,"Drama, Horror, Mystery, Psychological",5.8
199,28223,Death Parade,"Drama, Game, Mystery, Psychological, Thriller",8.33
669,7193,Aoi Bungaku Series,"Drama, Historical, Psychological, Seinen, Thri...",7.9
3806,1243,Night Head Genesis,"Drama, Horror, Mystery, Psychological, Superna...",6.88
6009,838,Narutaru: Mukuro Naru Hoshi Tama Taru Ko,"Drama, Seinen, Thriller",6.28
53,6114,Rainbow: Nisha Rokubou no Shichinin,"Drama, Historical, Seinen, Thriller",8.64
54,31240,Re:Zero kara Hajimeru Isekai Seikatsu,"Drama, Fantasy, Psychological, Thriller",8.64
96,9756,Mahou Shoujo Madoka★Magica,"Drama, Magic, Psychological, Thriller",8.51
201,3002,Gyakkyou Burai Kaiji: Ultimate Survivor,"Game, Psychological, Seinen, Thriller",8.33


In [18]:
content_recommender("Death Note")

Users also watched:



Unnamed: 0,anime_id,name,genre,rating
144,1889,Higurashi no Naku Koro ni Kai,"Mystery, Psychological, Supernatural, Thriller",8.41
778,2994,Death Note Rewrite,"Mystery, Police, Psychological, Supernatural, ...",7.84
833,3713,Jigoku Shoujo Mitsuganae,"Mystery, Psychological, Supernatural",7.81
2691,3614,Yakushiji Ryouko no Kaiki Jikenbo,"Mystery, Police, Supernatural",7.19
6323,2781,Saint Luminous Jogakuin,"Mystery, Psychological, Supernatural",6.17
981,323,Mousou Dairinin,"Drama, Mystery, Police, Psychological, Superna...",7.74
49,31043,Boku dake ga Inai Machi,"Mystery, Psychological, Seinen, Supernatural",8.65
541,7724,Shiki,"Mystery, Supernatural, Thriller, Vampire",7.99
1325,2596,Shinreigari: Ghost Hound,"Mystery, Psychological, Sci-Fi, Supernatural",7.59
2199,4879,Mouryou no Hako,"Mystery, Seinen, Supernatural, Thriller",7.33


Subtracting the mean from each rating to standardize and normalize 

In [19]:
anime_std = anime_char.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)


Droping all columns having zero rating

In [20]:
anime_std.fillna(0, inplace=True)
anime_std = anime_std.T
anime_std = anime_std.loc[:, (anime_std != 0).any(axis=0)]


Data is in sparse matrix format to be read by the cosine similarity functions

In [21]:
anime_sparse = sp.sparse.csr_matrix(anime_std.values)

The matrices will show the computed cosine similarity values between each user/user and item/item array

In [22]:
item_sim = cosine_similarity(anime_sparse)
user_sim = cosine_similarity(anime_sparse.T)
item_sim_df = pd.DataFrame(item_sim, index = anime_std.index, columns = anime_std.index)
user_sim_df = pd.DataFrame(user_sim, index = anime_std.columns, columns = anime_std.columns)

Returning the top users with the highest similarity value 


In [23]:
def top_users(user):
    print('Most Similar Users:\n')
    sim_values = user_sim_df.sort_values(by=user, ascending=False).loc[:,user].tolist()[1:6]
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:6]
    zipped = zip(sim_users, sim_values,)
    for user, sim in zipped:
        print('User #{0}, Similarity: {1:.2f}'.format(user, sim)) 

In [24]:
top_users(3)

Most Similar Users:

User #10079, Similarity: 0.81
User #3581, Similarity: 0.81
User #8910, Similarity: 0.81
User #7984, Similarity: 0.81
User #10858, Similarity: 0.81


In [25]:
top_users(5)

Most Similar Users:

User #14, Similarity: 1.00
User #5, Similarity: 1.00
User #79, Similarity: 1.00
User #10913, Similarity: 0.91
User #237, Similarity: 0.91
