In [2]:
import pandas as pd

In [100]:
from sklearn.linear_model import LogisticRegression

user_genre_count = pd.read_csv('dataset/users_genres_count.csv')
top_animes = pd.read_csv('dataset/top_animes_complete.csv')


def generate_sets_genre_count(uid_str, current_user, genre_str):
    uid_str_list = uid_str.split(", ")
    genre_str_list = genre_str.split(", ")

    uid_dict = dict()

    for pair in uid_str_list:
        s_pair = pair.split("|")
        uid = int(s_pair[0])
        if uid == current_user:
            continue

        score = int(s_pair[1])
        uid_dict[uid] = 1

    X_list = []
    y_list = []

    for index, row in user_genre_count.iterrows():
        uid = int(row[0])
        if uid == current_user:
            continue
        genres = []

        for genre in genre_str_list:
            genres.append(row[genre])

        temp = []

        for i in range(0, len(genres)):
            temp.append(genres[i])

        arr = np.array(temp)
        X_list.append(arr)

        if uid in uid_dict:
            y_list.append(1)
        else:
            y_list.append(0)

    X = np.array(X_list)
    y = np.array(y_list)

    return X, y


def get_user_info(user_id, genre_str):
    user_info = user_genre_count.loc[user_genre_count['user_id'] == user_id]
    user_info = user_info.iloc[0]
    genre_str_list = genre_str.split(", ")
    temp = []
    for genre in genre_str_list:
        temp.append(user_info[genre])
    user_line = np.array([np.array(temp)])

    return user_line


def predict(user_id, mal_id, user_line=None):
    anime_info = top_animes.loc[top_animes['MAL_ID'] == mal_id].iloc[0]
    if user_line is None:
        user_line = get_user_info(user_id, anime_info["genres"])

    X, y = generate_sets_genre_count(anime_info["completed_user_ids"], user_id, anime_info["genres"])

    clf = LogisticRegression().fit(X, y)

    result = clf.predict(user_line)

    return result[0]


test_users = [272784, 98466, 11716, 45741, 208254]


def evaluate_unsupervised(algorithm):
    for user in test_users:
        r = algorithm(user)
        count = 0
        user_line = get_user_info(user)
        for mal_id in get_mal_id(r):
            current_result = predict(user, mal_id, user_line)
            if current_result == 1:
                count += 1
        print("Precision for user " + str(user) + " is: " + str(count / len(r)))


In [3]:
# import complete anime dataset
anime_data = pd.read_csv('dataset/anime.csv')
anime_data = anime_data[['MAL_ID', 'Name','Genres','Completed']]

In [4]:
# import rating for anime that users marked complete
user_rating = pd.read_csv('dataset/rating_complete.csv')
user_rating.head()

Unnamed: 0,user_id,anime_id,rating
0,0,430,9
1,0,1004,5
2,0,3010,7
3,0,570,7
4,0,2762,9


As the dataset is huge, we will limit the data to top 3000 users who gave ratings and top 1000 anime that are reviewed.

In [101]:
top_users = 3000
top_animes = 1000

In [102]:
user_ranking = user_rating.groupby(['user_id']).count().sort_values('anime_id',ascending=False)
ranking = user_ranking[0:top_users]
# key: user_id, value:{mal_id:score}
user_anime_dict = dict()

In [104]:
def populate_user_anime_dict(user_id):
    piece = user_rating.loc[user_rating['user_id'] == user_id]
    current_user = {}
    for index, row in piece.iterrows():
        mal_id = row["anime_id"]
        score = row["rating"]

        current_user[mal_id] = score

    user_anime_dict[user_id] = current_user

In [105]:
for index, row in ranking.iterrows():
    user_id  = index
    populate_user_anime_dict(user_id)

Generate dataframe to be used from the user_anime_dict

In [106]:
L = sorted([(k,k1,v1) for k,v in user_anime_dict.items() for k1,v1 in v.items()],
            key=lambda x: (x[0], x[1]))
df_top = pd.DataFrame(L, columns=['user_id','MAL_ID','rating'])
df_top.head()

Unnamed: 0,user_id,MAL_ID,rating
0,478,7,5
1,478,15,8
2,478,16,7
3,478,20,7
4,478,22,9


In [11]:
df_top.to_csv('dataset/top_anime_unsupervised_use.csv', index=False)

After sort out the animes associated with each user, now we will perform unsupervised learning with KNN.

In [77]:
import pandas as pd
import warnings

warnings.filterwarnings('ignore')
pd.set_option("max_colwidth", None)

In [107]:
# load anime dataset
anime_df = pd.read_csv('dataset/anime.csv')
usecols = ["MAL_ID", "Name", "Score", "Genres", "Type", "Episodes", "Premiered",
           "Studios", "Source", "Rating", "Completed"]

anime_df = pd.read_csv('dataset/anime.csv', usecols=usecols)
anime_df

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Completed
0,1,Cowboy Bebop,8.78,"Action, Adventure, Comedy, Drama, Sci-Fi, Space",TV,26,Spring 1998,Sunrise,Original,R - 17+ (violence & profanity),718161
1,5,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1,Unknown,Bones,Original,R - 17+ (violence & profanity),208333
2,6,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",TV,26,Spring 1998,Madhouse,Manga,PG-13 - Teens 13 or older,343492
3,7,Witch Hunter Robin,7.27,"Action, Mystery, Police, Supernatural, Drama, Magic",TV,26,Summer 2002,Sunrise,Original,PG-13 - Teens 13 or older,46165
4,8,Bouken Ou Beet,6.98,"Adventure, Fantasy, Shounen, Supernatural",TV,52,Fall 2004,Toei Animation,Manga,PG - Children,7314
...,...,...,...,...,...,...,...,...,...,...,...
17557,48481,Daomu Biji Zhi Qinling Shen Shu,Unknown,"Adventure, Mystery, Supernatural",ONA,Unknown,Unknown,Unknown,Novel,Unknown,0
17558,48483,Mieruko-chan,Unknown,"Comedy, Horror, Supernatural",TV,Unknown,Unknown,Passione,Manga,Unknown,0
17559,48488,Higurashi no Naku Koro ni Sotsu,Unknown,"Mystery, Dementia, Horror, Psychological, Supernatural, Thriller",TV,Unknown,Summer 2021,Unknown,Visual novel,R - 17+ (violence & profanity),0
17560,48491,Yama no Susume: Next Summit,Unknown,"Adventure, Slice of Life, Comedy",TV,Unknown,Unknown,8bit,Manga,PG-13 - Teens 13 or older,2


For KNN, we will further process data to create a sparse user item matrix to fit into the data. As the calculation will be very costly, we will limit animes count to top 1000.

In [108]:
# load top users with anime rating dataset
top_animes_count = 1000
rating_popular_anime = pd.read_csv('dataset/top_anime_unsupervised_use.csv')
animes_popular = anime_df.sort_values('Completed', ascending=False)[0:top_animes_count]
top_animes = animes_popular['MAL_ID'].unique()
rating_popular_anime = rating_popular_anime[rating_popular_anime.MAL_ID.isin(top_animes)]
rating_popular_anime.head()

Unnamed: 0,user_id,MAL_ID,rating
3,478,20,7
5,478,24,8
7,478,43,6
8,478,47,8
9,478,48,6


In [109]:
# user item interaction matrix
anime_matrix_UII = rating_popular_anime.pivot_table(index='user_id', columns='MAL_ID', values='rating').fillna(0)
anime_matrix_UII

MAL_ID,1,5,6,18,19,20,24,30,31,32,...,40716,40776,40839,40902,40956,41120,41168,41226,41353,41389
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
478,0.0,0.0,0.0,0.0,0.0,7.0,8.0,0.0,0.0,0.0,...,0.0,7.0,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0
781,7.0,0.0,10.0,8.0,10.0,7.0,7.0,0.0,0.0,0.0,...,8.0,9.0,7.0,4.0,7.0,0.0,6.0,7.0,7.0,3.0
853,9.0,10.0,8.0,9.0,8.0,9.0,0.0,8.0,0.0,8.0,...,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0
890,9.0,9.0,9.0,8.0,10.0,7.0,2.0,8.0,7.0,9.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
912,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,0.0,0.0,...,8.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,7.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
352832,0.0,0.0,10.0,10.0,10.0,1.0,0.0,10.0,0.0,10.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
352835,0.0,0.0,9.0,0.0,0.0,7.0,0.0,7.0,0.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
352922,0.0,0.0,0.0,0.0,7.0,0.0,0.0,3.0,0.0,7.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
352924,10.0,9.0,8.0,8.0,9.0,8.0,8.0,10.0,0.0,8.0,...,0.0,8.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [110]:
# convert the pivot table into a sparse matrix
from scipy.sparse import csr_matrix
rating_matrix = csr_matrix(anime_matrix_UII.values)

# create a KNN model and fit rating matrix
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(metric="cosine", algorithm="brute")
model.fit(rating_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [111]:
import collections
import numpy as np

def predict_knn_user_based(user_index, k):
    distances, indices = model.kneighbors(anime_matrix_UII.loc[user_index, :].values.reshape(1, -1), n_neighbors=k)
    # anime the user has watched
    user_watched = set(user_anime_dict[user_index])
    # populate anime k neighbours have watched
    neighbours_watched = {}
    for i in range(0, len(distances.flatten())):
        neighbours_watched[anime_matrix_UII.index[indices.flatten()[i]]] = user_anime_dict[
            anime_matrix_UII.index[indices.flatten()[i]]].copy()
        for k, v in neighbours_watched[anime_matrix_UII.index[indices.flatten()[i]]].items():
            neighbours_watched[anime_matrix_UII.index[indices.flatten()[i]]][k] = [1 - distances.flatten()[i], v]
    user_unwatched = []
    for nw in neighbours_watched:
        # traverse animes neighbour watched
        an = neighbours_watched[nw].keys() - user_watched.intersection(neighbours_watched[nw].keys())
        for un_an in an:
            user_unwatched.append(un_an)
    common = [item for item, count in collections.Counter(user_unwatched).items() if count > 1]
    common_unwatched_rating = []
    for f in common:
        m, w = [], []
        for u in neighbours_watched:
            if neighbours_watched[u].get(f):
                m.append(neighbours_watched[u].get(f)[0] * neighbours_watched[u].get(f)[1])
                w.append(neighbours_watched[u].get(f)[0])
        common_unwatched_rating.append([np.sum(m) / np.sum(w), f])
    common_unwatched_rating = sorted(common_unwatched_rating, reverse=True)
    return common_unwatched_rating

In [112]:
def recommend_user_based(user_id_input, k=10):
    r_user_based = predict_knn_user_based(user_id_input, k)[:10]
    r_mal = {}
    for d, m in r_user_based:
        r_mal[m] = d
    sorted_keys = sorted(r_mal.keys())
    sorted_distance = [r_mal[k] for k in sorted_keys]
    rec = anime_df[(anime_df['MAL_ID'].isin(sorted_keys))]
    rec = rec.sort_values(by='MAL_ID')
    rec.insert(2, "Distance", sorted_distance, True)
    rec = rec.sort_values(by='Distance')
    return rec

In [113]:
recommendation_user_based_knn = recommend_user_based(478, 10)
recommendation_user_based_knn

Unnamed: 0,MAL_ID,Name,Distance,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Completed
4970,7311,Suzumiya Haruhi no Shoushitsu,8.890991,8.65,"Comedy, Mystery, Romance, School, Sci-Fi, Supernatural",Movie,1,Unknown,Kyoto Animation,Light novel,PG-13 - Teens 13 or older,322186
1863,2043,Cat's Eye,8.99328,7.18,"Action, Adventure, Comedy, Mystery, Romance",TV,73,Summer 1983,Tokyo Movie Shinsha,Manga,PG-13 - Teens 13 or older,6701
14293,37491,Gintama.: Shirogane no Tamashii-hen - Kouhan-sen,8.998149,8.86,"Action, Sci-Fi, Comedy, Historical, Parody, Samurai, Shounen",TV,14,Summer 2018,Bandai Namco Pictures,Manga,PG-13 - Teens 13 or older,67276
2199,2402,Ashita no Joe,9.0,8.28,"Action, Drama, Shounen, Slice of Life, Sports",TV,79,Spring 1970,Mushi Production,Manga,PG-13 - Teens 13 or older,15185
11684,33050,Fate/stay night Movie: Heaven's Feel - III. Spring Song,9.0,8.79,"Action, Supernatural, Magic, Fantasy",Movie,1,Unknown,ufotable,Visual novel,R - 17+ (violence & profanity),19886
2668,2921,Ashita no Joe 2,9.000844,8.67,"Action, Drama, Shounen, Slice of Life, Sports",TV,47,Fall 1980,Tokyo Movie Shinsha,Manga,PG-13 - Teens 13 or older,14113
9913,28977,Gintama°,9.001942,9.1,"Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen",TV,51,Spring 2015,Bandai Namco Pictures,Manga,PG-13 - Teens 13 or older,167130
8413,21195,Soul Eater: Late Night Show,9.002789,6.82,"Action, Fantasy, Comedy",Special,51,Unknown,Bones,Manga,PG-13 - Teens 13 or older,10360
6006,9969,Gintama',9.125955,9.08,"Action, Sci-Fi, Comedy, Historical, Parody, Samurai, Shounen",TV,51,Spring 2011,Sunrise,Manga,PG-13 - Teens 13 or older,190008
833,918,Gintama,9.251211,8.96,"Action, Comedy, Historical, Parody, Samurai, Sci-Fi, Shounen",TV,201,Spring 2006,Sunrise,Manga,PG-13 - Teens 13 or older,230260


In [114]:
evaluate_unsupervised(recommend_user_based)

TypeError: get_user_info() missing 1 required positional argument: 'genre_str'

Now let's try KNN item based approach. There are two different ways to calculate the similarities, cosine for distance between items, and pearson for correlation between items.

In [85]:
# generalize above exploration to a fit function
def predict_knn_item_based_pearson(input_MAL_ID, recommend_count):
    '''
    :return: dataframe of recommended anime
    '''
    user_rating_input = anime_matrix_UII[input_MAL_ID]
    similar_to_input_anime = anime_matrix_UII.corrwith(user_rating_input)
    corr_input = pd.DataFrame(similar_to_input_anime, columns=['correlation'])
    corr_input.dropna(inplace=True)
    corr_input.reset_index(inplace=True)
    corr_input = pd.merge(corr_input, anime_df, on="MAL_ID")
    recommendation = corr_input[corr_input['Completed'] > 10000].sort_values(by='correlation', ascending=False).head(recommend_count+1)
    recommendation.drop(recommendation.index[recommendation['MAL_ID'] == input_MAL_ID], inplace=True)
    return recommendation

In [86]:
# test input using MAL_ID 1
test_user_rating = anime_matrix_UII[1]
test_user_rating.head()
# test functionality using anime with MAL ID 1, 5 nearest neighbours
recommendation_item_based_pearson_test = predict_knn_item_based_pearson(1, 5)
recommendation_item_based_pearson_test

Unnamed: 0,MAL_ID,correlation,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Completed
1,5,0.653634,Cowboy Bebop: Tengoku no Tobira,8.39,"Action, Drama, Mystery, Sci-Fi, Space",Movie,1,Unknown,Bones,Original,R - 17+ (violence & profanity),208333
2,6,0.468294,Trigun,8.24,"Action, Sci-Fi, Adventure, Comedy, Drama, Shounen",TV,26,Spring 1998,Madhouse,Manga,PG-13 - Teens 13 or older,343492
49,205,0.44087,Samurai Champloo,8.5,"Action, Adventure, Comedy, Historical, Samurai, Shounen",TV,26,Spring 2004,Manglobe,Original,R - 17+ (violence & profanity),551621
84,467,0.420354,Koukaku Kidoutai: Stand Alone Complex,8.45,"Action, Military, Sci-Fi, Police, Mecha, Seinen",TV,26,Fall 2002,Production I.G,Manga,R - 17+ (violence & profanity),170891
11,43,0.397354,Koukaku Kidoutai,8.29,"Action, Mecha, Police, Psychological, Sci-Fi, Seinen",Movie,1,Unknown,Production I.G,Manga,R+ - Mild Nudity,325682


In [87]:
recommendation_for_all = animes_popular[:100].sort_values(by='Score', ascending=False)[:10]
top_users = rating_popular_anime.user_id.unique()

In [88]:
def recommend_item_based_pearson(user_id_input):
    '''
    Take user_id to get top-rated anime by the user, then for each anime make prediction
    return the highest 10 correlation anime
    :param user_id_input: user id
    :return: dataframe of recommended anime
    '''
    recommendation_df = pd.DataFrame(columns=['MAL_ID', 'correlation'])
    users_groups = rating_popular_anime.groupby('user_id')
    if user_id_input in top_users:
        user_fav_anime = users_groups.get_group(user_id_input).sort_values(by='rating', ascending=False)[:5]['MAL_ID'].tolist()
        for user_fav_MAL_ID in user_fav_anime:
            r = predict_knn_item_based_pearson(user_fav_MAL_ID, 10)
            recommendation_df = recommendation_df.append(r, ignore_index=True)

        recommendation_df.sort_values('correlation', ascending=False).drop_duplicates('MAL_ID')
        return recommendation_df[:10]
    else:
        return recommendation_for_all

In [89]:
recommendation_item_based_knn_pearson = recommend_item_based_pearson(478)
recommendation_item_based_knn_pearson

Unnamed: 0,MAL_ID,correlation,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Completed
0,23281,0.703873,Psycho-Pass 2,7.42,"Action, Sci-Fi, Police, Psychological",TV,11,Fall 2014,Tatsunoko Production,Original,R - 17+ (violence & profanity),352491.0
1,21339,0.523466,Psycho-Pass Movie,7.74,"Action, Military, Police, Sci-Fi",Movie,1,Unknown,Production I.G,Original,R - 17+ (violence & profanity),141876.0
2,23283,0.414906,Zankyou no Terror,8.12,"Mystery, Psychological, Thriller",TV,11,Summer 2014,MAPPA,Original,R - 17+ (violence & profanity),547800.0
3,14513,0.362664,Magi: The Labyrinth of Magic,8.07,"Action, Adventure, Fantasy, Magic, Shounen",TV,25,Fall 2012,A-1 Pictures,Manga,PG-13 - Teens 13 or older,535193.0
4,28223,0.359191,Death Parade,8.2,"Game, Mystery, Psychological, Drama, Thriller",TV,12,Winter 2015,Madhouse,Original,R - 17+ (violence & profanity),857277.0
5,6746,0.352121,Durarara!!,8.18,"Action, Mystery, Supernatural",TV,24,Winter 2010,Brain's Base,Light novel,R - 17+ (violence & profanity),651208.0
6,22535,0.350316,Kiseijuu: Sei no Kakuritsu,8.4,"Action, Sci-Fi, Horror, Psychological, Drama, Seinen",TV,24,Fall 2014,Madhouse,Manga,R - 17+ (violence & profanity),942917.0
7,18115,0.345764,Magi: The Kingdom of Magic,8.28,"Action, Adventure, Magic, Fantasy, Shounen",TV,25,Fall 2013,A-1 Pictures,Manga,PG-13 - Teens 13 or older,463350.0
8,20507,0.345692,Noragami,8.01,"Action, Adventure, Comedy, Supernatural, Shounen",TV,12,Winter 2014,Bones,Manga,PG-13 - Teens 13 or older,1181215.0
9,14075,0.344398,Zetsuen no Tempest,7.99,"Action, Mystery, Psychological, Drama, Magic, Fantasy, Shounen",TV,24,Fall 2012,Bones,Manga,PG-13 - Teens 13 or older,269385.0


Now let's try cosine based calculation approach

In [90]:
# load top users with anime rating dataset
rating_data = df_top.merge(anime_df, left_on = 'MAL_ID', right_on = 'MAL_ID', how = 'left')
rating_data = rating_data[["user_id", "Name", "MAL_ID","rating"]]
combine_anime_rating = rating_data.dropna(axis = 0, subset = ['Name'])
anime_ratingCount = (combine_anime_rating.groupby(by = ['Name'])['rating'].count().reset_index()[['Name', 'rating']])
rating_data = combine_anime_rating.merge(anime_ratingCount, left_on = 'Name', right_on = 'Name', how = 'left')
rating_data = rating_data.drop(columns = "rating_x")
rating_data = rating_data.rename(columns={"rating_y": "rating"})
rating_data.head()

Unnamed: 0,user_id,Name,MAL_ID,rating
0,478,Witch Hunter Robin,7,853
1,478,Eyeshield 21,15,864
2,478,Hachimitsu to Clover,16,1374
3,478,Naruto,20,2211
4,478,Tennis no Ouji-sama,22,821


In [91]:
count_group = rating_data.groupby('user_id')['rating'].count()
top_users_count = 3000
top_users_process = count_group.dropna().sort_values(ascending=False)[:top_users_count]
top_r = rating_data.join(top_users_process, rsuffix='_r', how='inner', on='user_id')
count_group = rating_data.groupby('MAL_ID')['rating'].count()
top_animes = count_group.dropna().sort_values(ascending=False)[:top_animes_count]
top_r = top_r.join(top_animes, rsuffix='_r', how='inner', on='MAL_ID')
pivot = pd.crosstab(top_r.user_id, top_r.MAL_ID, top_r.rating, aggfunc=np.sum)
pivot.fillna(0, inplace=True)
piviot_table = rating_data.pivot_table(index="Name",columns="user_id", values="rating").fillna(0)
piviot_table.head()

user_id,478,781,853,890,912,943,1177,1231,1393,1397,...,352366,352478,352660,352669,352761,352832,352835,352922,352924,352930
Name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"""0""",0.0,0.0,0.0,307.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,307.0,0.0,0.0
"""Aesop"" no Ohanashi yori: Ushi to Kaeru, Yokubatta Inu",0.0,0.0,0.0,12.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""Bungaku Shoujo"" Kyou no Oyatsu: Hatsukoi",0.0,0.0,0.0,0.0,0.0,0.0,0.0,922.0,0.0,922.0,...,0.0,922.0,0.0,0.0,922.0,0.0,0.0,0.0,0.0,922.0
"""Bungaku Shoujo"" Memoire",0.0,0.0,0.0,0.0,0.0,0.0,0.0,982.0,0.0,982.0,...,0.0,0.0,982.0,982.0,982.0,0.0,0.0,982.0,0.0,982.0
"""Bungaku Shoujo"" Movie",0.0,0.0,0.0,0.0,0.0,0.0,0.0,1196.0,0.0,1196.0,...,0.0,1196.0,1196.0,1196.0,1196.0,0.0,0.0,0.0,1196.0,1196.0


In [92]:
from scipy.sparse import csr_matrix
piviot_table_matrix = csr_matrix(piviot_table.values)
# create knn model and fit matrix
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(metric="cosine", algorithm="brute")
model.fit(piviot_table_matrix)

NearestNeighbors(algorithm='brute', metric='cosine')

In [93]:
def predict_knn_item_based_cosine(anime_name_input, n_neighbors):
    query = piviot_table.loc[anime_name_input, :].values.reshape(1, -1)

    distance, suggestions = model.kneighbors(query, n_neighbors)
    recommendation = {}
    for i in range(0, len(distance.flatten())):
        if i != 0:
            recommendation[piviot_table.index[suggestions.flatten()[i]]] = distance.flatten()[i]

    return recommendation

In [94]:
predict_knn_item_based_cosine('To Heart: Remember My Memories', 10)

{'To Heart': 0.2732727223710337,
 'To Heart: Remember My Memories Specials': 0.29498835462217965,
 'To Heart 2': 0.30237780857340457,
 'To Heart 2 OVA': 0.35235157990445853,
 'To Heart 2 AD': 0.3677287767366777,
 'To Heart 2 AD Plus': 0.38812794166617737,
 'To Heart 2 Special': 0.40202581826607475,
 'To Heart Omake': 0.4271426754758034,
 'To Heart 2 Adnext': 0.4333862988611539}

In [95]:
def recommend_item_based_cosine(user_id_input):
    '''
    Take user_id to get top-rated anime by the user, then for each anime make prediction
    return the highest 10 correlation anime
    :param user_id_input: user id
    :return: dataframe of recommended anime
    '''
    recommendation_df = pd.DataFrame(columns=['MAL_ID'])
    users_groups = rating_popular_anime.groupby('user_id')
    if user_id_input in top_users:
        user_fav_anime = users_groups.get_group(user_id_input).sort_values(by='rating', ascending=False)[:5]['MAL_ID'].tolist()
        for user_fav_MAL_ID in user_fav_anime:
            anime_name=anime_df[anime_df['MAL_ID']==user_fav_MAL_ID]['Name'].values[0]
            r = predict_knn_item_based_cosine(anime_name, 10)
            r_df = anime_df[(anime_df['Name'].isin(r.keys()))]
            distances = []
            for index, row in r_df.iterrows():
                distances.append(r[row['Name']])
            r_df.insert(2, "Distance", distances, True)
            recommendation_df = recommendation_df.append(r_df, ignore_index=True)

        recommendation_df = recommendation_df.sort_values(by='Distance', ascending=True).drop_duplicates('MAL_ID')
        return recommendation_df[:10]
    else:
        return recommendation_for_all

In [96]:
recommendation_item_based_knn_pearson = recommend_item_based_cosine(478)
recommendation_item_based_knn_pearson

Unnamed: 0,MAL_ID,Name,Distance,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Completed
22,10087,Fate/Zero,0.022017,8.34,"Action, Supernatural, Magic, Fantasy",TV,13,Fall 2011,ufotable,Light novel,R - 17+ (violence & profanity),773576.0
7,23281,Psycho-Pass 2,0.076945,7.42,"Action, Sci-Fi, Police, Psychological",TV,11,Fall 2014,Tatsunoko Production,Original,R - 17+ (violence & profanity),352491.0
25,22297,Fate/stay night: Unlimited Blade Works,0.080051,8.22,"Action, Fantasy, Magic, Supernatural",TV,12,Fall 2014,ufotable,Visual novel,R - 17+ (violence & profanity),592974.0
39,6547,Angel Beats!,0.080328,8.15,"Action, Comedy, Drama, School, Supernatural",TV,13,Spring 2010,P.A. Works,Original,PG-13 - Teens 13 or older,1229098.0
26,28701,Fate/stay night: Unlimited Blade Works 2nd Season,0.089437,8.33,"Action, Fantasy, Magic, Supernatural",TV,13,Spring 2015,ufotable,Visual novel,R - 17+ (violence & profanity),506098.0
4,16498,Shingeki no Kyojin,0.094947,8.48,"Action, Military, Mystery, Super Power, Drama, Fantasy, Shounen",TV,25,Spring 2013,Wit Studio,Manga,R - 17+ (violence & profanity),2182587.0
42,11757,Sword Art Online,0.095,7.25,"Action, Game, Adventure, Romance, Fantasy",TV,25,Summer 2012,A-1 Pictures,Light novel,PG-13 - Teens 13 or older,1907261.0
40,9253,Steins;Gate,0.095953,9.11,"Thriller, Sci-Fi",TV,24,Spring 2011,White Fox,Visual novel,PG-13 - Teens 13 or older,1134756.0
37,4224,Toradora!,0.096454,8.24,"Slice of Life, Comedy, Romance, School",TV,25,Fall 2008,J.C.Staff,Light novel,PG-13 - Teens 13 or older,1191775.0
18,356,Fate/stay night,0.099712,7.34,"Action, Supernatural, Magic, Romance, Fantasy",TV,24,Winter 2006,Studio Deen,Visual novel,R - 17+ (violence & profanity),510563.0


Now let's combine our recommendation result from unsupervised knn with supervised training model to see the accuracy result.

In [97]:
def get_mal_id(recommendation_df):
    return list(recommendation_df['MAL_ID'])

In [99]:
print("User based KNN: ")
evaluate_unsupervised(recommend_user_based)

User based KNN: 


ValueError: Incompatible dimension for X and Y matrices: X.shape[1] == 1000 while Y.shape[1] == 3000