In [95]:
import pandas as pd
import numpy as np

In [96]:
# import complete anime dataset
anime_data = pd.read_csv('dataset/anime.csv')
anime_data = anime_data[['MAL_ID', 'Name','Genres','Completed']]

In [97]:
# import rating for anime that users marked complete
user_rating = pd.read_csv('dataset/rating_complete.csv')
user_rating.head()

Unnamed: 0,user_id,anime_id,rating
0,0,430,9
1,0,1004,5
2,0,3010,7
3,0,570,7
4,0,2762,9


As the dataset is huge, we will limit the data to top 3000 users who gave ratings and top 1000 anime that are reviewed.

In [98]:
top_users = 3000
top_animes = 1000

In [99]:
user_ranking = user_rating.groupby(['user_id']).count().sort_values('anime_id',ascending=False)
ranking = user_ranking[0:top_users]
# key: user_id, value:{mal_id:score}
user_anime_dict = dict()

In [100]:
def populate_user_anime_dict(user_id):
    piece = user_rating.loc[user_rating['user_id'] == user_id]
    current_user = {}
    for index, row in piece.iterrows():
        mal_id = row["anime_id"]
        score = row["rating"]

        current_user[mal_id] = score

    user_anime_dict[user_id] = current_user

In [101]:
for index, row in ranking.iterrows():
    user_id  = index
    populate_user_anime_dict(user_id)

In [102]:
%store user_anime_dict

Stored 'user_anime_dict' (dict)


Generate dataframe to be used from the user_anime_dict

In [177]:
L = sorted([(k,k1,v1) for k,v in user_anime_dict.items() for k1,v1 in v.items()],
            key=lambda x: (x[0], x[1]))
df_top = pd.DataFrame(L, columns=['user_id','MAL_ID','rating'])
df_top.head()

Unnamed: 0,user_id,MAL_ID,rating
0,478,7,5
1,478,15,8
2,478,16,7
3,478,20,7
4,478,22,9


Now let's group up the rows by user_id.

In [179]:
user_subset_group_for_test = df_top.groupby(['user_id'])

Check out one user and make it our test input, e.g. 478

In [180]:
test_user_input = user_subset_group_for_test.get_group(478)
test_user_input=test_user_input.sort_values(by='rating', ascending=False)
print(len(test_user_input))
test_user_input.head()

1404


Unnamed: 0,user_id,MAL_ID,rating
789,478,13601,10
1138,478,31043,10
1079,478,27833,10
31,478,135,10
492,478,5114,10


In [181]:
test_similar_users_anime = df_top[df_top['MAL_ID'].isin(test_user_input['MAL_ID'].tolist())]
test_similar_users_anime.head()

Unnamed: 0,user_id,MAL_ID,rating
0,478,7,5
1,478,15,8
2,478,16,7
3,478,20,7
4,478,22,9


In [182]:
similar_users_group = test_similar_users_anime.groupby('user_id')
similar_users_group.get_group(252916)

Unnamed: 0,user_id,MAL_ID,rating
3653388,252916,15,7
3653389,252916,16,7
3653391,252916,20,10
3653392,252916,22,10
3653398,252916,47,7
...,...,...,...
3655086,252916,38680,10
3655091,252916,38889,10
3655092,252916,39063,6
3655093,252916,39198,7


Next we want to compare users to an input user and find the users that are similar to the input user.
We will use Pearson Correlation Coefficient to measure how strong the linear association between two variables.

In [106]:
#from scipy.stats import pearsonr

In [183]:
def get_pearson_df(user_input):
    # user_input: dataframe with MAL_ID | rating
    similar_users_anime = df_top[df_top['MAL_ID'].isin(user_input['MAL_ID'].tolist())]
    similar_users_group = similar_users_anime.groupby('user_id')
    similar_users_group_sorted = sorted(similar_users_group, key=lambda x:len(x[1]), reverse=True)
    pearson_correlation_dict = {}
    # key: user_id, val: pearson coefficient

    # as the computation takes a long time, we will limit the users for the most similar 100 users
    for uid, group in similar_users_group_sorted:
        group = group.sort_values(by='MAL_ID')
        user_input = user_input.sort_values(by='MAL_ID')
        # N similar movies watched
        N_similar = len(group)
        # get rating for the anime that are in common
        df_in_common = user_input[user_input['MAL_ID'].isin(group['MAL_ID'].tolist())]
        # convert rating to list
        shared_rating = df_in_common['rating'].tolist()
        # get current user group reviews and convert rating to list
        rating_list = group['rating'].tolist()

        # calculate pearson correlation
        xx = sum([i**2 for i in shared_rating]) - pow(sum(shared_rating),2)/float(N_similar)
        yy = sum([i**2 for i in rating_list]) - pow(sum(rating_list),2)/float(N_similar)
        xy = sum( i*j for i, j in zip(shared_rating, rating_list)) - sum(shared_rating)*sum(rating_list)/float(N_similar)

        if xx != 0 and yy != 0:
            pearson_correlation_dict[uid] = xy/np.sqrt(xx*yy)
        else:
            pearson_correlation_dict[uid] = 0

    pearson = pd.DataFrame.from_dict(pearson_correlation_dict, orient='index')
    pearson.columns = ['pearson_correlation']
    pearson['user_id'] = pearson.index
    pearson.index = range(len(pearson))
    return pearson

pearson_df = get_pearson_df(test_user_input)
pearson_df=pearson_df.drop(pearson_df.index[0])
pearson_df.head()

Unnamed: 0,pearson_correlation,user_id
1,0.051128,189037
2,0.145511,68042
3,0.290384,283786
4,0.0,162615
5,0.347333,277841


In [125]:
# sort most similar users

top_similar_users=pearson_df.sort_values(by='pearson_correlation', ascending=False)[:50]
top_similar_users.head()

Unnamed: 0,pearson_correlation,user_id
2998,0.782318,349997
2997,0.749038,63900
2992,0.682592,138623
2956,0.614916,16057
877,0.590065,314671


In [126]:
top_similar_users_rating = top_similar_users.merge(user_rating, left_on='user_id', right_on='user_id', how='inner')
top_similar_users_rating.head()

Unnamed: 0,pearson_correlation,user_id,anime_id,rating
0,0.782318,349997,3918,5
1,0.782318,349997,7969,5
2,0.782318,349997,18525,7
3,0.782318,349997,5573,3
4,0.782318,349997,2189,6


In [127]:
top_similar_users_rating['weighted_rating'] = top_similar_users_rating['pearson_correlation']*top_similar_users_rating['rating']
top_similar_users_rating.head()

Unnamed: 0,pearson_correlation,user_id,anime_id,rating,weighted_rating
0,0.782318,349997,3918,5,3.911591
1,0.782318,349997,7969,5,3.911591
2,0.782318,349997,18525,7,5.476228
3,0.782318,349997,5573,3,2.346955
4,0.782318,349997,2189,6,4.69391


In [128]:
top_similar_users_rating_sum = top_similar_users_rating.groupby('anime_id').sum()[['pearson_correlation','weighted_rating']]
top_similar_users_rating_sum.columns = ['sum_pearson_correlation','sum_weighted_rating']
top_similar_users_rating_sum.head()

Unnamed: 0_level_0,sum_pearson_correlation,sum_weighted_rating
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,18.165635,161.567958
5,15.589602,126.284139
6,10.636725,86.064641
7,4.076475,28.047766
8,1.982132,12.884388


In [132]:
recommendation_df = pd.DataFrame()
recommendation_df['weighted_average_recommendation_score'] = top_similar_users_rating_sum['sum_weighted_rating']/top_similar_users_rating_sum['sum_pearson_correlation']
recommendation_df['anime_id'] = top_similar_users_rating_sum.index
print(len(recommendation_df))
recommendation_df.head()

13530


Unnamed: 0_level_0,weighted_average_recommendation_score,anime_id
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,8.894154,1
5,8.100537,5
6,8.091273,6
7,6.880396,7
8,6.500267,8


In [184]:
recommendation_df = recommendation_df.sort_values(by='weighted_average_recommendation_score', ascending=False)
recommendation_df = recommendation_df[~recommendation_df['anime_id'].isin(test_user_input['MAL_ID'])]
recommendation_df[:20]

Unnamed: 0_level_0,weighted_average_recommendation_score,anime_id
anime_id,Unnamed: 1_level_1,Unnamed: 2_level_1
42129,10.0,42129
8353,10.0,8353
35035,10.0,35035
37962,10.0,37962
35056,10.0,35056
35055,10.0,35055
35058,10.0,35058
39731,10.0,39731
28561,10.0,28561
28023,10.0,28023


In [187]:
# user original
#anime_data.loc[anime_data['MAL_ID'].isin(test_user_input['MAL_id']).tolist()]
user_original = pd.merge(test_user_input, anime_data)
user_original[:50]

Unnamed: 0,user_id,MAL_ID,rating,Name,Genres,Completed
0,478,13601,10,Psycho-Pass,"Action, Sci-Fi, Police, Psychological",720340
1,478,31043,10,Boku dake ga Inai Machi,"Mystery, Psychological, Supernatural, Seinen",1071295
2,478,27833,10,Durarara!!x2 Ketsu,"Action, Mystery, Supernatural",194936
3,478,135,10,Hikaru no Go,"Comedy, Game, Shounen, Supernatural",62705
4,478,5114,10,Fullmetal Alchemist: Brotherhood,"Action, Military, Adventure, Comedy, Drama, Ma...",1644938
5,478,32983,10,Natsume Yuujinchou Go,"Slice of Life, Demons, Supernatural, Drama, Sh...",72069
6,478,11123,10,Sekaiichi Hatsukoi 2,"Comedy, Drama, Romance, Shounen Ai",107736
7,478,11375,10,.hack//The Movie: Sekai no Mukou ni,"Action, Adventure, Fantasy, Game, Magic, Myste...",9647
8,478,16498,10,Shingeki no Kyojin,"Action, Military, Mystery, Super Power, Drama,...",2182587
9,478,11665,10,Natsume Yuujinchou Shi,"Slice of Life, Demons, Supernatural, Drama, Sh...",114646


In [164]:
# display the top 20 anime recommendation
anime_data.loc[anime_data['MAL_ID'].isin(recommendation_df['anime_id'].head(20).tolist())]

Unnamed: 0,MAL_ID,Name,Genres,Completed
741,820,Ginga Eiyuu Densetsu,"Military, Sci-Fi, Space, Drama",52583
5340,8353,Ketsuinu,Comedy,84
6006,9969,Gintama',"Action, Sci-Fi, Comedy, Historical, Parody, Sa...",190008
6474,11061,Hunter x Hunter (2011),"Action, Adventure, Fantasy, Shounen, Super Power",1094486
9691,28023,Tsuru no Sugomori,Kids,34
9807,28561,Saru Tarou Jishin ni wa Makenai zo!: Jishin e ...,"Drama, Kids",31
9913,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",167130
10780,31004,Niji no Kakehashi,"Drama, Kids",28
11684,33050,Fate/stay night Movie: Heaven's Feel - III. Sp...,"Action, Supernatural, Magic, Fantasy",19886
12242,34096,Gintama.,"Action, Comedy, Historical, Parody, Samurai, S...",115105
