Here I want to make a recommender system to find the similarity between shows, users and to help me predict whether a user will enjoy a particular anime.

# Importing Libraries 

In [1]:
# Basic imports
import pandas as pd
import numpy as np
import operator
%matplotlib inline

# EDA imports
import pandas_summary as ps
from plotly.offline import init_notebook_mode, plot, iplot
import plotly.graph_objs as go
init_notebook_mode(connected=True)


# Math imports 
import scipy as sp
import sklearn
from sklearn.metrics.pairwise import cosine_similarity
from math import sqrt
from statistics import mean 

# Data Loading/Preparation

In [2]:
# Loading CVS files as Dataframes
anime = pd.read_csv('myAnimelist-No_Hentai 2.csv')
rating = pd.read_csv('rating.csv')


In [3]:
# Displaying orginal dataframes' dimensions

print('Anime dataframe dimensions: ', anime.shape)
print('Rating dataframe dimensions: ', rating.shape)

Anime dataframe dimensions:  (5560, 14)
Rating dataframe dimensions:  (7813737, 3)


Join the two dataframes on the anime_id columns

In [4]:
# For this analysis I'm only interest in finding recommendations for the TV category

anime_show = anime[anime['type']=='TV']
print('New dataframe with only Anime Tv shows dimensions: ',anime_show.shape)
anime_show.head()

New dataframe with only Anime Tv shows dimensions:  (4103, 14)


Unnamed: 0,animeID,name,type,source,episodes,duration,genre,rating,score,scored_by,rank,popularity,members,favorites
0,1,Cowboy Bebop,TV,Original,26,0:24:00,"['Action', 'Adventure', 'Comedy', 'Drama', 'Sc...",R - 17+ (violence & profanity),8.81,405664,26,39,795733,43460
2,6,Trigun,TV,Manga,26,0:24:00,"['Action', 'Sci-Fi', 'Adventure', 'Comedy', 'D...",PG-13 - Teens 13 or older,8.3,212537,255,146,408548,10432
3,7,Witch Hunter Robin,TV,Original,26,0:25:00,"['Action', 'Magic', 'Police', 'Supernatural', ...",PG-13 - Teens 13 or older,7.33,32837,2371,1171,79397,537
4,8,Bouken Ou Beet,TV,Manga,52,0:23:00,"['Adventure', 'Fantasy', 'Shounen', 'Supernatu...",PG - Children,7.03,4894,3544,3704,11708,14
5,16,Hachimitsu to Clover,TV,Manga,24,0:23:00,"['Comedy', 'Drama', 'Josei', 'Romance', 'Slice...",PG-13 - Teens 13 or older,8.12,57065,419,536,172274,3752


In [5]:
# Merging anime_show and rating dataframe with inner join
merged = rating.merge(anime_show, left_on = 'anime_id', right_on = 'animeID', suffixes= ['_user', ''], how='right')
# Renaming column
merged.rename(columns = {'rating_user':'user_rating'}, inplace = True)
# Dropping extra column
merged = merged.drop(columns="animeID")
merged.head()

Unnamed: 0,user_id,anime_id,user_rating,name,type,source,episodes,duration,genre,rating,score,scored_by,rank,popularity,members,favorites
0,1.0,20.0,-1.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
1,3.0,20.0,8.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
2,5.0,20.0,6.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
3,6.0,20.0,-1.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
4,10.0,20.0,-1.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356


In [6]:
# Making new dataframe that drops all ratings with -1 values
merged2 = merged[merged['user_rating'] != -1]
merged2.head()

Unnamed: 0,user_id,anime_id,user_rating,name,type,source,episodes,duration,genre,rating,score,scored_by,rank,popularity,members,favorites
1,3.0,20.0,8.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
2,5.0,20.0,6.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
5,21.0,20.0,8.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
6,28.0,20.0,9.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356
7,34.0,20.0,9.0,Naruto,TV,Manga,220,0:23:00,"['Action', 'Adventure', 'Comedy', 'Super Power...",PG-13 - Teens 13 or older,7.9,716412,705,10,1091313,39356


In [7]:
# Displaying merged dataframes' dimensions
print('Merged dataframe dimensions: ',merged.shape)
print('Merged dataframe with dropped values dimensions: ',merged2.shape)

Merged dataframe dimensions:  (5185878, 16)
Merged dataframe with dropped values dimensions:  (4285126, 16)


# EDA

In [8]:
# Summary of the merged dataframe
merged_summary = ps.DataFrameSummary(merged)
print('categoricals: ', merged_summary.categoricals.tolist())
print('numerics: ', merged_summary.numerics.tolist())
merged_summary.summary()

categoricals:  ['name', 'source', 'duration', 'genre', 'rating']
numerics:  ['user_id', 'anime_id', 'user_rating', 'episodes', 'score', 'scored_by', 'rank', 'popularity', 'members', 'favorites']


Unnamed: 0,user_id,anime_id,user_rating,name,type,source,episodes,duration,genre,rating,score,scored_by,rank,popularity,members,favorites
count,5.18507e+06,5.18507e+06,5.18507e+06,,,,5.18588e+06,,,,5.18588e+06,5.18588e+06,5.18588e+06,5.18588e+06,5.18588e+06,5.18588e+06
mean,36571,9840.16,6.35733,,,,26.1084,,,,7.6791,204695,1713.36,670.587,361274,9619.32
std,21050.5,9243.69,3.64644,,,,40.6386,,,,0.633216,215322,1721.95,967.504,333502,18007.8
min,1,1,-1,,,,2,,,,3.26,3,1,1,15,0
25%,18758,1562,6,,,,12,,,,7.3,53309,386,97,113765,588
50%,36266,7148,8,,,,13,,,,7.69,131093,1141,320,255164,2148
75%,54732,15809,9,,,,25,,,,8.14,278368,2481,855,501230,9545
max,73516,34358,10,,,,3057,,,,9.25,1.10796e+06,13816,15355,1.61056e+06,120331
counts,5185071,5185071,5185071,5185878,5185878,5185878,5185878,5185878,5185878,5185878,5185878,5185878,5185878,5185878,5185878,5185878
uniques,72941,3296,11,4103,1,16,196,48,2206,6,475,2991,3639,3689,3456,1129


In [9]:
# Summary of the merged dataframe with dropped -1 values
merged2_summary = ps.DataFrameSummary(merged2)
print('categoricals: ', merged2_summary.categoricals.tolist())
print('numerics: ', merged2_summary.numerics.tolist())
merged2_summary.summary()

categoricals:  ['name', 'source', 'duration', 'genre', 'rating']
numerics:  ['user_id', 'anime_id', 'user_rating', 'episodes', 'score', 'scored_by', 'rank', 'popularity', 'members', 'favorites']


Unnamed: 0,user_id,anime_id,user_rating,name,type,source,episodes,duration,genre,rating,score,scored_by,rank,popularity,members,favorites
count,4.28432e+06,4.28432e+06,4.28432e+06,,,,4.28513e+06,,,,4.28513e+06,4.28513e+06,4.28513e+06,4.28513e+06,4.28513e+06,4.28513e+06
mean,36590.9,9822.76,7.90416,,,,26.1273,,,,7.69366,208951,1676.79,648.565,368350,9951.18
std,21051.9,9226.15,1.52277,,,,40.4728,,,,0.631949,217688,1701.1,933.142,336625,18379
min,1,1,1,,,,2,,,,3.26,3,1,1,15,0
25%,18785,1564,7,,,,12,,,,7.3,55059,377,93,115952,618
50%,36296,7088,8,,,,13,,,,7.7,133639,1118,313,258422,2464
75%,54825,15793,9,,,,25,,,,8.15,288641,2456,834,514009,10432
max,73516,33421,10,,,,3057,,,,9.25,1.10796e+06,13816,15355,1.61056e+06,120331
counts,4284319,4284319,4284319,4285126,4285126,4285126,4285126,4285126,4285126,4285126,4285126,4285126,4285126,4285126,4285126,4285126
uniques,68840,2968,10,3775,1,16,188,45,2148,6,465,2988,3360,3399,3384,1128


In [10]:
# Defining x axis values (user_ratings)
data = merged2['user_rating'].value_counts().sort_index(ascending=False)

trace = go.Bar(x = data.index,
               text = [f'{val:.2f} %' for val in (data.values / merged2.shape[0] * 100)],
               textposition = 'auto',
               y = [f'{cnt:.2f}' for cnt in data.values],
               )
layout = dict(title = f'Distribution of {merged2.shape[0]} Users by Rating',
              xaxis = dict(title = 'Rating'),
              yaxis = dict(title = 'User Count'))
fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [11]:
# Defining x axis values filtering the user_ratings of user that rated up to 300 animes
data = merged2.groupby('genre')['user_rating'].count().clip(upper=300)

trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 2,
                                  end = 300,
                                  size = 2))

layout = go.Layout(title = 'Distribution Of Number of Ratings Per Anime (Filtered up to 300 Ratings/Anime)',
                   xaxis = dict(title = 'Number of Ratings Per Anime'),
                   yaxis = dict(title = 'Count'),
                   bargap = 0.2)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

In [12]:
# Defining x axis values (filtering the user_ratings of user that rated up to 300 animes)
data = merged2.groupby('user_id')['user_rating'].count().clip(upper=300)

trace = go.Histogram(x = data.values,
                     name = 'Ratings',
                     xbins = dict(start = 0,
                                  end = 300,
                                  size = 2))
layout = go.Layout(title = 'Distribution of Number of Ratings Per User (Filtered up to 300 Ratings/User)',
                   xaxis = dict(title = 'Ratings Per User'),
                   yaxis = dict(title = 'User Count'),
                   bargap = 0.2)

fig = go.Figure(data=[trace], layout=layout)
iplot(fig)

# Filtering dataframe

In [13]:
# # For computing reasons I'm limiting the dataframe length to 25,000 users

# merged2=merged2[['user_id', 'name', 'user_rating']]
# merged_sub= merged2[merged2.user_id <= 25000]
# print(merged_sub)
# merged_sub.head()

In [14]:
# Limiting dataframe size to include value counts greater then a min value of anime_id and user_rating, for computing reasons

# Filtering for anime ratings > 50
filter_anime = merged2['anime_id'].value_counts() > 50   
filter_anime = filter_anime[filter_anime].index.tolist()

# Filtering for user_ratings > 50
filter_users = merged2['user_id'].value_counts() > 50   
filter_users = filter_users[filter_users].index.tolist()

merged2_new = merged2[(merged2['anime_id'].isin(filter_anime)) & (merged2['user_id'].isin(filter_users))]
print(f'Unfiltered dataframe dimensions: {merged2.shape}')
print(f'New filtered dataframe dimensions: {merged2_new.shape}')

Unfiltered dataframe dimensions: (4285126, 16)
New filtered dataframe dimensions: (3520589, 16)


# Pivoting table -  creating matrix

In [15]:
# Pivoting table of users on one axis and tv show names along the other. This helps us in defining the similarity between users and anime shows

piv = merged2_new.pivot_table(index=['user_id'], columns=['name'], values='user_rating')
print(f'Pivoted table dimensions: {piv.shape}')
piv.head()

Pivoted table dimensions: (25849, 2040)


name,.hack//Roots,.hack//Sign,009-1,07-Ghost,11eyes,12-sai.: Chicchana Mune no Tokimeki,30-sai no Hoken Taiiku,91 Days,A-Channel,A.D. Police (TV),...,Zoids Shinseiki/Zero,Zoku Natsume Yuujinchou,Zoku Sayonara Zetsubou Sensei,Zombie-Loan,"Zone of the Enders: Dolores, I",ef: A Tale of Melodies.,ef: A Tale of Memories.,s.CRY.ed,xxxHOLiC,xxxHOLiC Kei
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3.0,,,,,,,,,,,...,,,,7.0,,,,,,
5.0,,,,,,,,,,,...,,,7.0,,,,,,2.0,
7.0,,,,,,,,,,,...,,,,,,,,,,
11.0,,,,,,,,,,,...,,,,,,,,,,
14.0,,,,,,,,,,,...,,,,,,,,,,


# Transposing/Modifying the Matrix

In [16]:
# Note: As we are subtracting the mean from each rating to standardize
# all users with only one rating or who had rated everything the same will be dropped

# Normalizing the values
piv_norm = piv.apply(lambda x: (x-np.mean(x))/(np.max(x)-np.min(x)), axis=1)
# # Drop all columns containing only zeros representing users who did not rate
piv_norm.fillna(0, inplace=True)


# Transposing the Matrix
piv_norm = piv_norm.T
piv_norm = piv_norm.loc[:, (piv_norm != 0).any(axis=0)]

print(piv_norm.shape)
piv_norm.head()

(2040, 25759)


user_id,3.0,5.0,7.0,11.0,14.0,17.0,21.0,23.0,27.0,29.0,...,73494.0,73495.0,73499.0,73500.0,73502.0,73503.0,73504.0,73507.0,73510.0,73515.0
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
.hack//Roots,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
.hack//Sign,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.362745,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
009-1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
07-Ghost,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,-0.027536,0.0,0.0,0.0,0.0,0.0,0.0
11eyes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-0.745911,0.0,0.0


In [17]:
# Our data needs to be in a sparse matrix format to be read by the following functions
piv_sparse = sp.sparse.csr_matrix(piv_norm.values)
print(piv_sparse.shape)
piv_sparse

(2040, 25759)


<2040x25759 sparse matrix of type '<class 'numpy.float64'>'
	with 3501411 stored elements in Compressed Sparse Row format>

# Cosine similarity

In [18]:
# Matrix of the computed cosine similarity of anime to anime 
anime_similarity = cosine_similarity(piv_sparse)
# print(anime_similarity.shape)

In [19]:
# Matrix of the computed cosine similarity of user to user 
user_similarity = cosine_similarity(piv_sparse.T)
# print(user_similarity.shape)

In [20]:
# Inserting the 2 similarity matricies into separate dataframe objects

# Item to Item cosine similarity 
anime_sim_df = pd.DataFrame(anime_similarity, index = piv_norm.index, columns = piv_norm.index)
# User to User cosine similarity 
user_sim_df = pd.DataFrame(user_similarity, index = piv_norm.columns, columns = piv_norm.columns)

# Functions for Recommendations

In [21]:
# This function will return the top 10 shows with the highest cosine similarity value

def top_animes(anime_title):
    count = 1
    print(f'Top 10 similar Anime shows compared to {anime_title}:\n')
    for anime in anime_sim_df.sort_values(by = anime_title, ascending = False).index[1:11]: 
        print(f'No. {count}: {anime}')
        count +=1

In [22]:
# This function will return the top 5 users with the highest similarity value 

def top_users(user):
    
    if user not in piv_norm.columns:
        return(f'No data available on user {user}')
    
    print(f'Top 10 similar Users compared to {user}:\n')
    sim_values = user_sim_df.sort_values(by=user, ascending=False).loc[:,user].tolist()[1:11]
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:11]
    zipped = zip(sim_users, sim_values,)
    for user, sim in zipped:
        print(f'User #{user}, Similarity value: {sim:.2f}')

In [23]:
# This function constructs a list of lists containing the highest rated shows per similar user
# and returns the name of the show along with the frequency it appears in the list

def similar_user_recs(user):
    
    if user not in piv_norm.columns:
        return(f'No data available on user {user}')
    
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:11]
    best = []
    most_common = {}
    
    for i in sim_users:
        max_score = piv_norm.loc[:, i].max()
        best.append(piv_norm[piv_norm.loc[:, i]==max_score].index.tolist())
    for i in range(len(best)):
        for j in best[i]:
            if j in most_common:
                most_common[j] += 1
            else:
                most_common[j] = 1
    sorted_list = sorted(most_common.items(), key=operator.itemgetter(1), reverse=True)
    return sorted_list[:5]

In [24]:
# This function calculates the weighted average of similar users
# to determine a potential rating for an input user and show

def predicted_rating(anime_name, user):
    sim_users = user_sim_df.sort_values(by=user, ascending=False).index[1:1000]
    user_values = user_sim_df.sort_values(by=user, ascending=False).loc[:,user].tolist()[1:1000]
    rating_list = []
    weight_list = []
    for j, i in enumerate(sim_users):
        rating = piv.loc[i, anime_name]
        similarity = user_values[j]
        if np.isnan(rating):
            continue
        elif not np.isnan(rating):
            rating_list.append(rating*similarity)
            weight_list.append(similarity)
    return sum(rating_list)/sum(weight_list)

# Recommendations

In [25]:
top_animes('Naruto')

Top 10 similar Anime shows compared to Naruto:

No. 1: Bleach
No. 2: Dragon Ball Z
No. 3: Dragon Ball GT
No. 4: Dragon Ball
No. 5: Fairy Tail
No. 6: InuYasha
No. 7: Yu☆Gi☆Oh! Duel Monsters
No. 8: Pokemon
No. 9: Shaman King
No. 10: Sword Art Online


In [26]:
top_users(5)

Top 10 similar Users compared to 5:

User #37643.0, Similarity value: 0.32
User #65261.0, Similarity value: 0.29
User #33848.0, Similarity value: 0.29
User #2300.0, Similarity value: 0.28
User #23306.0, Similarity value: 0.28
User #4512.0, Similarity value: 0.28
User #23869.0, Similarity value: 0.28
User #12033.0, Similarity value: 0.28
User #69394.0, Similarity value: 0.27
User #44912.0, Similarity value: 0.27


In [27]:
similar_user_recs(5)

[('Steins;Gate', 7),
 ('Clannad: After Story', 6),
 ('Fullmetal Alchemist: Brotherhood', 5),
 ('Hajime no Ippo', 5),
 ('Hunter x Hunter (2011)', 5)]

In [28]:
predicted_rating('Bleach', 5)

7.437333747338609

# Mean Squared Error & Root Mean Squared Error

In [29]:
# Function to return list of every show watched by user_id 
def watchlist_of_user(user):
    return piv.T[piv.loc[user,:]>0].index.tolist()

In [30]:
# Make a list of the mean squared errors between actual and predicted value
def sq_errors(anime_title,user):
    watchlist_of_user(user)
    errors = []
    for anime_title in watchlist_of_user(user):
        actual=piv.loc[user, anime_title]
        predicted = predicted_rating(anime_title, user)
        errors.append((actual-predicted)**2)
#         RMSE = np.sqrt(((actual - predicted) ** 2).mean())
        MSE = mean(errors)
        RMSE = sqrt(MSE)
    print(f'The MSE:{MSE}')
    print(f'The RMSE:{RMSE}')

In [31]:
sq_errors('Bleach',5)

The MSE:12.003736068574227
The RMSE:3.4646408282207592
