In [1]:
import pandas as pd
import numpy as np

In [2]:
# import complete anime dataset
anime_data = pd.read_csv('dataset/anime.csv')
anime_data = anime_data[['MAL_ID', 'Name','Genres','Completed']]
anime_data.head()

Unnamed: 0,MAL_ID,Name,Genres,Completed
0,1,Cowboy Bebop,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",718161
1,5,Cowboy Bebop: Tengoku no Tobira,"Action, Drama, Mystery, Sci-Fi, Space",208333
2,6,Trigun,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",343492
3,7,Witch Hunter Robin,"Action, Mystery, Police, Supernatural, Drama, ...",46165
4,8,Bouken Ou Beet,"Adventure, Fantasy, Shounen, Supernatural",7314


In [3]:
df_top = pd.read_csv('dataset/top_anime_unsupervised_use.csv')
df_top.head()

Unnamed: 0,user_id,MAL_ID,rating
0,478,7,5
1,478,15,8
2,478,16,7
3,478,20,7
4,478,22,9


In [4]:
user_subset_group_for_test = df_top.groupby(['user_id'])

Check out one user and make it our test input, e.g. 478

In [5]:
test_user_input = user_subset_group_for_test.get_group(478)
test_user_input=test_user_input.sort_values(by='rating', ascending=False)
print(len(test_user_input))
test_user_input.head()

1404


Unnamed: 0,user_id,MAL_ID,rating
789,478,13601,10
1138,478,31043,10
1079,478,27833,10
31,478,135,10
492,478,5114,10


In [6]:
test_similar_users_anime = df_top[df_top['MAL_ID'].isin(test_user_input['MAL_ID'].tolist())]
test_similar_users_anime.head()

Unnamed: 0,user_id,MAL_ID,rating
0,478,7,5
1,478,15,8
2,478,16,7
3,478,20,7
4,478,22,9


In [7]:
similar_users_group = test_similar_users_anime.groupby('user_id')
similar_users_group.get_group(252916)

Unnamed: 0,user_id,MAL_ID,rating
3653388,252916,15,7
3653389,252916,16,7
3653391,252916,20,10
3653392,252916,22,10
3653398,252916,47,7
...,...,...,...
3655086,252916,38680,10
3655091,252916,38889,10
3655092,252916,39063,6
3655093,252916,39198,7


Next we want to compare users to an input user and find the users that are similar to the input user.
We will use Pearson Correlation Coefficient to measure how strong the linear association between two variables.

In [8]:
def get_pearson_df(user_input):
    # user_input: dataframe with MAL_ID | rating
    similar_users_anime = df_top[df_top['MAL_ID'].isin(user_input['MAL_ID'].tolist())]
    similar_users_group = similar_users_anime.groupby('user_id')
    similar_users_group_sorted = sorted(similar_users_group, key=lambda x:len(x[1]), reverse=True)
    pearson_correlation_dict = {}
    # key: user_id, val: pearson coefficient

    # as the computation takes a long time, we will limit the users for the most similar 100 users
    for uid, group in similar_users_group_sorted:
        group = group.sort_values(by='MAL_ID')
        user_input = user_input.sort_values(by='MAL_ID')
        # N similar movies watched
        N_similar = len(group)
        # get rating for the anime that are in common
        df_in_common = user_input[user_input['MAL_ID'].isin(group['MAL_ID'].tolist())]
        # convert rating to list
        shared_rating = df_in_common['rating'].tolist()
        # get current user group reviews and convert rating to list
        rating_list = group['rating'].tolist()

        # calculate pearson correlation
        xx = sum([i**2 for i in shared_rating]) - pow(sum(shared_rating),2)/float(N_similar)
        yy = sum([i**2 for i in rating_list]) - pow(sum(rating_list),2)/float(N_similar)
        xy = sum( i*j for i, j in zip(shared_rating, rating_list)) - sum(shared_rating)*sum(rating_list)/float(N_similar)

        if xx != 0 and yy != 0:
            pearson_correlation_dict[uid] = xy/np.sqrt(xx*yy)
        else:
            pearson_correlation_dict[uid] = 0

    pearson = pd.DataFrame.from_dict(pearson_correlation_dict, orient='index')
    pearson.columns = ['pearson_correlation']
    pearson['user_id'] = pearson.index
    pearson.index = range(len(pearson))
    return pearson

pearson_df = get_pearson_df(test_user_input)
pearson_df=pearson_df.drop(pearson_df.index[0])
pearson_df.head()

Unnamed: 0,pearson_correlation,user_id
1,0.051128,189037
2,0.145511,68042
3,0.290384,283786
4,0.0,162615
5,0.347333,277841


In [9]:
# sort most similar users

top_similar_users=pearson_df.sort_values(by='pearson_correlation', ascending=False)[:50]
top_similar_users.head()

Unnamed: 0,pearson_correlation,user_id
2998,0.782318,349997
2997,0.749038,63900
2992,0.682592,138623
2956,0.614916,16057
877,0.590065,314671


In [10]:
top_similar_users_rating = top_similar_users.merge(df_top, left_on='user_id', right_on='user_id', how='inner')
top_similar_users_rating.head()

Unnamed: 0,pearson_correlation,user_id,MAL_ID,rating
0,0.782318,349997,188,5
1,0.782318,349997,203,6
2,0.782318,349997,211,6
3,0.782318,349997,213,6
4,0.782318,349997,214,6


In [11]:
top_similar_users_rating['weighted_rating'] = top_similar_users_rating['pearson_correlation']*top_similar_users_rating['rating']
top_similar_users_rating.head()

Unnamed: 0,pearson_correlation,user_id,MAL_ID,rating,weighted_rating
0,0.782318,349997,188,5,3.911591
1,0.782318,349997,203,6,4.69391
2,0.782318,349997,211,6,4.69391
3,0.782318,349997,213,6,4.69391
4,0.782318,349997,214,6,4.69391


In [13]:
top_similar_users_rating_sum = top_similar_users_rating.groupby('MAL_ID').sum()[['pearson_correlation','weighted_rating']]
top_similar_users_rating_sum.columns = ['sum_pearson_correlation','sum_weighted_rating']
top_similar_users_rating_sum.head()

Unnamed: 0_level_0,sum_pearson_correlation,sum_weighted_rating
MAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,18.165635,161.567958
5,15.589602,126.284139
6,10.636725,86.064641
7,4.076475,28.047766
8,1.982132,12.884388


In [14]:
recommendation_df = pd.DataFrame()
recommendation_df['weighted_average_recommendation_score'] = top_similar_users_rating_sum['sum_weighted_rating']/top_similar_users_rating_sum['sum_pearson_correlation']
recommendation_df['anime_id'] = top_similar_users_rating_sum.index
print(len(recommendation_df))
recommendation_df.head()

13530


Unnamed: 0_level_0,weighted_average_recommendation_score,anime_id
MAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,8.894154,1
5,8.100537,5
6,8.091273,6
7,6.880396,7
8,6.500267,8


In [15]:
recommendation_df = recommendation_df.sort_values(by='weighted_average_recommendation_score', ascending=False)
recommendation_df = recommendation_df[~recommendation_df['anime_id'].isin(test_user_input['MAL_ID'])]
recommendation_df[:20]

Unnamed: 0_level_0,weighted_average_recommendation_score,anime_id
MAL_ID,Unnamed: 1_level_1,Unnamed: 2_level_1
42129,10.0,42129
39731,10.0,39731
31004,10.0,31004
8353,10.0,8353
35054,10.0,35054
28023,10.0,28023
28561,10.0,28561
38891,10.0,38891
35058,10.0,35058
35055,10.0,35055


In [16]:
# user original
#anime_data.loc[anime_data['MAL_ID'].isin(test_user_input['MAL_id']).tolist()]
user_original = pd.merge(test_user_input, anime_data)
user_original[:10]

Unnamed: 0,user_id,MAL_ID,rating,Name,Genres,Completed
0,478,13601,10,Psycho-Pass,"Action, Sci-Fi, Police, Psychological",720340
1,478,31043,10,Boku dake ga Inai Machi,"Mystery, Psychological, Supernatural, Seinen",1071295
2,478,27833,10,Durarara!!x2 Ketsu,"Action, Mystery, Supernatural",194936
3,478,135,10,Hikaru no Go,"Comedy, Game, Shounen, Supernatural",62705
4,478,5114,10,Fullmetal Alchemist: Brotherhood,"Action, Military, Adventure, Comedy, Drama, Ma...",1644938
5,478,32983,10,Natsume Yuujinchou Go,"Slice of Life, Demons, Supernatural, Drama, Sh...",72069
6,478,11123,10,Sekaiichi Hatsukoi 2,"Comedy, Drama, Romance, Shounen Ai",107736
7,478,11375,10,.hack//The Movie: Sekai no Mukou ni,"Action, Adventure, Fantasy, Game, Magic, Myste...",9647
8,478,16498,10,Shingeki no Kyojin,"Action, Military, Mystery, Super Power, Drama,...",2182587
9,478,11665,10,Natsume Yuujinchou Shi,"Slice of Life, Demons, Supernatural, Drama, Sh...",114646


In [17]:
# display the top 10 anime recommendation
recommendation_to_user = anime_data.loc[anime_data['MAL_ID'].isin(recommendation_df['anime_id'].head(10))]
recommendation_to_user

Unnamed: 0,MAL_ID,Name,Genres,Completed
5340,8353,Ketsuinu,Comedy,84
9691,28023,Tsuru no Sugomori,Kids,34
9807,28561,Saru Tarou Jishin ni wa Makenai zo!: Jishin e ...,"Drama, Kids",31
10780,31004,Niji no Kakehashi,"Drama, Kids",28
12788,35054,Ni wa no Kotori,Kids,18
12789,35055,Mametarou Ganbare,Kids,20
12792,35058,Trojan wo Jibun no Te de,"Historical, Kids",17
15204,38891,Sora no Method: Mou Hitotsu no Negai,"Slice of Life, Comedy, Fantasy, School",1292
15778,39731,Na Bbeun Sang Sa,"Slice of Life, Hentai, Drama, Romance, Thrille...",131
16848,42129,Bem Movie: Become Human,"Action, Horror, Demons, Supernatural",1512


In [18]:
import math

def counter_cosine_similarity(c1, c2):
    terms = set(c1).union(c2)
    dotprod = sum(c1.get(k, 0) * c2.get(k, 0) for k in terms)
    magA = math.sqrt(sum(c1.get(k, 0)**2 for k in terms))
    magB = math.sqrt(sum(c2.get(k, 0)**2 for k in terms))
    return dotprod / (magA * magB)

In [19]:
user_original_explode = user_original.assign(Genres=user_original.Genres.str.split(", ")).explode('Genres')
recommendation_to_user_explode = recommendation_to_user.assign(Genres=recommendation_to_user.Genres.str.split(", ")).explode('Genres')

In [20]:
from collections import Counter

print("Similarity between user tastes and recommendation:", counter_cosine_similarity(Counter(user_original_explode.Genres), Counter(recommendation_to_user.Genres)))

Similarity between user tastes and recommendation: 0.12338948875458465
