In [1]:
import pandas as pd
import numpy as np
import warnings
import random
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer, MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression

warnings.filterwarnings('ignore')

## Using TF-IDF - Content based recommendation


### Data Preprocessing Part
- Load dataset - get user top rating anime / top users
- Clean up data frame for training and testing

In [2]:
top_users_count = 3000
top_animes_count = 1000

# contains user_id, MAL_ID, rating
rating_top_anime = pd.read_csv('dataset/top_anime_unsupervised_use.csv')

# contains all anime info
usecols = ["MAL_ID", "Name", "Score", "Genres", "Type", "Episodes", "Premiered",
           "Studios", "Source", "Rating", "Members", "Completed"]

# contains all anime info
anime_df = pd.read_csv('dataset/anime.csv', usecols=usecols, low_memory=True)
animes_popular = anime_df.sort_values('Completed', ascending=False)[0:top_animes_count]

In [3]:
# get user top rating anime 
user_rating_data = pd.read_csv('dataset/rating_complete.csv')
user_ranking = user_rating_data.groupby(['user_id']).count().sort_values('anime_id',ascending=False)

top_animes = animes_popular['MAL_ID'].unique()
rating_popular_anime = animes_popular[animes_popular.MAL_ID.isin(top_animes)]

user_genre_count = pd.read_csv('dataset/users_genres_count.csv')
top_animes_complete = pd.read_csv('dataset/top_animes_complete.csv')
# get top users
top_users = user_ranking[0:top_users_count]
rating_popular_anime.reset_index(drop=True, inplace=True)

- Now we are getting 3000 top users and 1000 most popular anime

- Split multiple label and remove the unknown catalog

In [4]:
def process_multilabel(series):
    series = series.split(",")
    if "Unknown" in series:
        series.remove("Unknown")
    return series

anime_data = rating_popular_anime.copy()

anime_data["Genres"] = anime_data["Genres"].map(process_multilabel)
anime_data["Studios"] = anime_data["Studios"].map(process_multilabel)
anime_data["Score"] = anime_data["Score"].replace("Unknown", 0).astype(float)
anime_data["Episodes"] = anime_data["Episodes"].replace("Unknown", 0).astype(int)
anime_data.head()

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members,Completed
0,16498,Shingeki no Kyojin,8.48,"[Action, Military, Mystery, Super Power, D...",TV,25,Spring 2013,[Wit Studio],Manga,R - 17+ (violence & profanity),2531397,2182587
1,1535,Death Note,8.63,"[Mystery, Police, Psychological, Supernatur...",TV,37,Fall 2006,[Madhouse],Manga,R - 17+ (violence & profanity),2589552,2146116
2,11757,Sword Art Online,7.25,"[Action, Game, Adventure, Romance, Fantasy]",TV,25,Summer 2012,[A-1 Pictures],Light novel,PG-13 - Teens 13 or older,2214395,1907261
3,30276,One Punch Man,8.57,"[Action, Sci-Fi, Comedy, Parody, Super Pow...",TV,12,Fall 2015,[Madhouse],Web manga,R - 17+ (violence & profanity),2123866,1841220
4,31964,Boku no Hero Academia,8.11,"[Action, Comedy, School, Shounen, Super Po...",TV,13,Spring 2016,[Bones],Manga,PG-13 - Teens 13 or older,1909814,1655900


- Normalized and getting some details of the data

In [5]:
def to_category(df, column, is_multilabel=False):
    # helper function for to handling the multi-catagory label
    lb = LabelBinarizer()
    if is_multilabel:
        lb = MultiLabelBinarizer()
        
    expandedLabelData = lb.fit_transform(df[column])
    labelClasses = lb.classes_

    category_df = pd.DataFrame(expandedLabelData, columns=labelClasses)
    del df[column]
    return pd.concat([df, category_df], axis=1)

# clean up data
def generate_cleaned_data(anime_data):
    anime_metadata = anime_data.copy()
    anime_metadata = to_category(anime_metadata, 'Source')
    anime_metadata = to_category(anime_metadata, 'Premiered')
    anime_metadata = to_category(anime_metadata, 'Studios', is_multilabel=True)
    anime_metadata = to_category(anime_metadata, 'Type')
    anime_metadata = to_category(anime_metadata, "Rating")

    Genres = anime_metadata["Genres"]
    anime_id = anime_metadata[['MAL_ID', 'Name']]

    del anime_metadata["Genres"]
    del anime_metadata['MAL_ID']
    del anime_metadata['Name']
    del anime_metadata['Unknown']
    return anime_metadata, Genres, anime_id


In [8]:
anime_metadata, Genres, anime_id = generate_cleaned_data(anime_data)
anime_metadata[["Score", "Episodes", "Members"]] = MinMaxScaler().fit_transform(anime_metadata[["Score", "Episodes", "Members"]])
anime_metadata.head()

Unnamed: 0,Score,Episodes,Members,Completed,4-koma manga,Card game,Game,Light novel,Manga,Music,...,ONA,OVA,Special,TV,G - All Ages,PG - Children,PG-13 - Teens 13 or older,R - 17+ (violence & profanity),R+ - Mild Nudity,Rx - Hentai
0,0.877163,0.048096,0.976679,2182587,0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
1,0.903114,0.072144,1.0,2146116,0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,0
2,0.66436,0.048096,0.849557,1907261,0,0,0,1,0,0,...,0,0,0,1,0,0,1,0,0,0
3,0.892734,0.022044,0.813254,1841220,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
4,0.813149,0.024048,0.727416,1655900,0,0,0,0,1,0,...,0,0,0,1,0,0,1,0,0,0


In [9]:
anime_metadata1 = anime_metadata.copy()
anime_metadata = anime_metadata.values

### Model Part
- Using TF-IDF transform genre
- Fit knn model

In [10]:
def tf_idf_transform_genre(anime_data, genres_col):
    tfv = TfidfVectorizer(min_df=3, max_features=None,
                      strip_accents='unicode', analyzer='word', token_pattern=r'\w{1,}',
                      ngram_range=(1, 3),
                      stop_words = 'english')

    # Filling NaNs with empty string
    genres_original = anime_data['Genres'].fillna('').astype(str)
    genres_vector_tf_idf = tfv.fit_transform(genres_original)
    genres_vector_one_hot = to_category(pd.DataFrame(genres_col), "Genres", True).values

    return genres_vector_tf_idf, genres_vector_one_hot

# create knn model
def knn_model(anime_metadata, synopsis_vector_tf_idf):
    all_meta_data = np.concatenate((anime_metadata.values, synopsis_vector_tf_idf.todense()), axis=1)

    # create knn model
    cb_model_knn = NearestNeighbors(metric='cosine', n_neighbors=9)
    cb_model_knn.fit(csr_matrix(all_meta_data))
    return all_meta_data, cb_model_knn


genres_vector_tf_idf, genres_vector_one_hot = tf_idf_transform_genre(anime_data, Genres)

- Content based recommendation based on differnt query_index

In [12]:
def get_recommended(vector, query_index, n_neighbors=10):
    model_knn = NearestNeighbors(metric='cosine', n_neighbors=n_neighbors)
    model_knn.fit(csr_matrix(vector))

    distances, indices = model_knn.kneighbors(vector[query_index,:].reshape(1, -1), n_neighbors = n_neighbors)
    result = []
    for i in range(0, len(distances.flatten())):
        index = indices.flatten()[i]
        if index == query_index:
            continue
        result.append(anime_data.iloc[index])
        
    return pd.DataFrame(result)

- Test the functionatily

In [13]:
query_index = anime_id[anime_id.MAL_ID == 5231].index[0]
anime_data.iloc[[query_index]]

Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members,Completed
886,5231,Inazuma Eleven,7.59,"[Sports, Super Power, Shounen]",TV,127,Fall 2008,[OLM],Game,G - All Ages,138185,101939


### Evaluation Part
- Before going to the model part, we first borrow the model from supervised training to evaluate the unsupervised model training result

In [14]:
def generate_sets_genre_count(uid_str, current_user, genre_str):
    uid_str_list = uid_str.split(", ")
    genre_str_list = genre_str.split(", ")
    uid_dict = dict()
    for pair in uid_str_list:
        s_pair = pair.split("|")
        uid = int(s_pair[0])
        if uid == current_user:
            continue
        score = int(s_pair[1])
        uid_dict[uid] = 1
    X_list = []
    y_list = []
    for index, row in user_genre_count.iterrows():
        uid = int(row[0])
        if uid == current_user:
            continue
        genres = []
        for genre in genre_str_list:
            genres.append(row[genre])
        temp = []
        for i in range(0, len(genres)):
            temp.append(genres[i])
        arr = np.array(temp)
        X_list.append(arr)
        if uid in uid_dict:
            y_list.append(1)
        else:
            y_list.append(0)
    X = np.array(X_list)
    y = np.array(y_list)
    return X, y

def get_user_info(user_id, genre_str):
    user_info = user_genre_count.loc[user_genre_count['user_id'] == user_id]
    user_info = user_info.iloc[0]
    genre_str_list = genre_str.split(", ")
    temp = []
    for genre in genre_str_list:
        temp.append(user_info[genre])
    user_line = np.array([np.array(temp)])
    return user_line

def predict(user_id, mal_id, user_line=None):
    anime_info = top_animes_complete.loc[top_animes_complete['MAL_ID'] == mal_id].iloc[0]
    if user_line is None:
        user_line = get_user_info(user_id, anime_info["genres"])
    X, y = generate_sets_genre_count(anime_info["completed_user_ids"], user_id, anime_info["genres"])
    clf = LogisticRegression().fit(X, y)
    result = clf.predict(user_line)
    return result[0]

def generate_random_test_users(test_users_count):
    return random.sample(list(top_users.index), test_users_count)


def evaluate_unsupervised(algorithm, n=5):
    test_users = generate_random_test_users(n)
    total_precision = 0
    for user in test_users:
        r_mal_id = algorithm(user)
        count = 0
        for mal_id in r_mal_id:
            current_result = predict(user, mal_id)
            if current_result == 1:
                count += 1
        total_precision += count/len(r_mal_id)
        if n <= 5:
            print("Precision for user " + str(user) + " is: " + str(count / len(r_mal_id)))
    if n > 5:
        print("Precision mean with current algorithm is ", total_precision / n)
    return total_precision / n

def get_mal_id(recommendation_df):
    return list(recommendation_df['MAL_ID'])

- Since our recommendation evaluation is based on top 1000 anime, so we need to make sure the recommendation is within this range

### 1.1 Content based -> studio
- recommendation based on anime studio information

In [23]:
print("Recommendation based on anime studio information")
random_anime_id = int(random.choice(anime_id["MAL_ID"].to_list()))
studio_query = anime_id[anime_id.MAL_ID == random_anime_id].index[0]
get_recommended(anime_metadata, studio_query, 10)

Recommendation based on anime studio information


Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members,Completed
539,12729,High School DxD OVA,7.3,"[Comedy, Demons, Ecchi, Romance, School]",OVA,2,Unknown,[TNK],Light novel,R+ - Mild Nudity,209100,168222
933,30300,"High School DxD New: Oppai, Tsutsumimasu!",7.29,"[Comedy, Demons, Ecchi, Romance, School]",OVA,1,Unknown,[TNK],Light novel,R+ - Mild Nudity,116137,96284
447,34626,Kono Subarashii Sekai ni Shukufuku wo! 2: Kono...,8.07,"[Magic, Adventure, Fantasy, Comedy, Supern...",OVA,1,Unknown,[Studio Deen],Light novel,PG-13 - Teens 13 or older,239082,200972
351,32380,Kono Subarashii Sekai ni Shukufuku wo!: Kono S...,7.86,"[Adventure, Comedy, Supernatural, Magic, F...",OVA,1,Unknown,[Studio Deen],Light novel,PG-13 - Teens 13 or older,273954,245860
407,9515,Highschool of the Dead: Drifters of the Dead,6.64,"[Comedy, Ecchi, Supernatural]",OVA,1,Unknown,[Madhouse],Manga,R+ - Mild Nudity,242026,216089
516,268,Golden Boy,8.03,"[Adventure, Comedy, Ecchi]",OVA,6,Unknown,[APPP],Manga,R+ - Mild Nudity,251683,176125
523,18753,Yahari Ore no Seishun Love Comedy wa Machigatt...,7.6,"[Comedy, Romance, School]",OVA,1,Unknown,[Brain's Base],Light novel,PG-13 - Teens 13 or older,198096,173086
554,5042,Kiss x Sis,6.93,"[Comedy, Ecchi, Harem, Romance, School, S...",OVA,12,Unknown,[feel.],Manga,R+ - Mild Nudity,256088,163235
590,4901,Black Lagoon: Roberta's Blood Trail,8.05,"[Action, Seinen]",OVA,5,Unknown,[Madhouse],Manga,R+ - Mild Nudity,208186,153249


In [24]:
# get a list of top liked anime
recommended_anime = rating_popular_anime["MAL_ID"].tolist()

def get_recommended_based_on_studio(user_id_input):
    # get user most liked movie
    user = top_users.loc[top_users.index == user_id_input]
    user_top_liked = user
    # init with a top anime for recommendation
    random_anime_id = int(random.choice(anime_id["MAL_ID"].to_list()))
    query_index = anime_id[anime_id.MAL_ID == random_anime_id].index[0]
    if int(user['anime_id']) in recommended_anime:
        user_top_liked = int(user['anime_id'])
    else:
        mask = rating_top_anime[rating_top_anime['user_id'] == user_id_input].sort_values('rating',ascending=False)[:10]
        tmp = mask[mask['MAL_ID'].isin(recommended_anime)]['MAL_ID'].to_list()
        if len(tmp) >= 1:
            user_top_liked = int(tmp[0])
        query_index = anime_id[anime_id.MAL_ID == user_top_liked].index[0]
    return get_recommended(anime_metadata, query_index, 10)['MAL_ID'].to_list()

In [25]:
evaluate_unsupervised(get_recommended_based_on_studio)

Precision for user 102440 is: 1.0
Precision for user 5045 is: 0.6666666666666666
Precision for user 57106 is: 0.8888888888888888
Precision for user 122020 is: 1.0
Precision for user 103368 is: 1.0


0.9111111111111111

#### Content based -> Keywords using TD IDF

In [26]:
print("Recommendation based on anime keywords information")
random_anime_id = int(random.choice(anime_id["MAL_ID"].to_list()))
keywords_query = anime_id[anime_id.MAL_ID == random_anime_id].index[0]
get_recommended(genres_vector_tf_idf, keywords_query, 10)

Recommendation based on anime keywords information


Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members,Completed
642,15039,Ano Hi Mita Hana no Namae wo Bokutachi wa Mada...,7.95,"[Slice of Life, Supernatural, Drama]",Movie,1,Unknown,[A-1 Pictures],Original,PG-13 - Teens 13 or older,220264,143081
533,101,Air,7.32,"[Slice of Life, Supernatural, Drama, Romance]",TV,12,Winter 2005,[Kyoto Animation],Visual novel,PG-13 - Teens 13 or older,259381,170036
611,1530,Kanon (2006),8.01,"[Slice of Life, Supernatural, Drama, Romance]",TV,24,Fall 2006,[Kyoto Animation],Visual novel,PG-13 - Teens 13 or older,232486,149675
115,18153,Kyoukai no Kanata,7.78,"[Slice of Life, Supernatural, Fantasy]",TV,12,Fall 2013,[Kyoto Animation],Light novel,PG-13 - Teens 13 or older,848484,531679
690,28675,Kyoukai no Kanata Movie 2: I'll Be Here - Mira...,8.2,"[Fantasy, Slice of Life, Supernatural]",Movie,1,Unknown,[Kyoto Animation],Light novel,PG-13 - Teens 13 or older,189288,133388
515,16001,Kokoro Connect: Michi Random,8.01,"[Comedy, Drama, Romance, School, Slice of ...",Special,4,Unknown,[SILVER LINK.],Light novel,PG-13 - Teens 13 or older,209714,176453
66,2167,Clannad,8.07,"[Comedy, Drama, Romance, School, Slice of ...",TV,23,Fall 2007,[Kyoto Animation],Visual novel,PG-13 - Teens 13 or older,1095634,725568
202,33255,Saiki Kusuo no Ψ-nan,8.45,"[Comedy, School, Shounen, Slice of Life, S...",TV,120,Summer 2016,"[J.C.Staff, Egg Firm]",Manga,PG-13 - Teens 13 or older,591140,365729
952,38249,Saiki Kusuo no Ψ-nan: Kanketsu-hen,8.22,"[Comedy, School, Shounen, Slice of Life, S...",Special,1,Unknown,"[J.C.Staff, Egg Firm]",Manga,PG-13 - Teens 13 or older,114514,94072


In [28]:
def get_recommended_based_keywords(user_id_input):
    # get user most liked movie
    user = top_users.loc[top_users.index == user_id_input]
    # init with a top anime for recommendation
    user_top_liked = int(random.choice(anime_id["MAL_ID"].to_list()))
    query_index = anime_id[anime_id.MAL_ID == user_top_liked].index[0]
    if int(user['anime_id']) in recommended_anime:
        user_top_liked = int(user['anime_id'])
    else:
        mask = rating_top_anime[rating_top_anime['user_id'] == user_id_input].sort_values('rating',ascending=False)[:10]
        tmp = mask[mask['MAL_ID'].isin(recommended_anime)]['MAL_ID'].to_list()
        if len(tmp) >= 1:
            user_top_liked = int(tmp[0])
        query_index = anime_id[anime_id.MAL_ID == user_top_liked].index[0]
    return get_recommended(genres_vector_tf_idf, query_index, 10)['MAL_ID'].to_list()

evaluate_unsupervised(get_recommended_based_keywords)

Precision for user 261314 is: 1.0
Precision for user 243289 is: 0.7777777777777778
Precision for user 201507 is: 0.7777777777777778
Precision for user 103781 is: 0.7777777777777778
Precision for user 281881 is: 1.0


0.8666666666666666

#### Content based -> Genres

In [29]:
print("Recommendation based on anime genre information")
random_anime_id = int(random.choice(anime_id["MAL_ID"].to_list()))
genre_query = anime_id[anime_id.MAL_ID == random_anime_id].index[0]
get_recommended(genres_vector_one_hot, genre_query, 10)

Recommendation based on anime genre information


Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members,Completed
322,6573,Darker than Black: Ryuusei no Gemini,7.48,"[Action, Sci-Fi, Mystery, Super Power]",TV,12,Fall 2009,[Bones],Original,R - 17+ (violence & profanity),349653,262714
145,2025,Darker than Black: Kuro no Keiyakusha,8.12,"[Action, Sci-Fi, Mystery, Super Power]",TV,25,Spring 2007,[Bones],Original,R - 17+ (violence & profanity),759790,455508
485,38656,Darwin's Game,7.3,"[Action, Sci-Fi, Mystery, Super Power, Sho...",TV,11,Winter 2020,[Nexus],Manga,R - 17+ (violence & profanity),305455,184837
454,16049,Toaru Kagaku no Railgun S,8.06,"[Action, Sci-Fi, Super Power]",TV,24,Spring 2013,[J.C.Staff],Manga,R - 17+ (violence & profanity),292223,199391
299,6213,Toaru Kagaku no Railgun,7.72,"[Action, Sci-Fi, Super Power]",TV,24,Fall 2009,[J.C.Staff],Manga,PG-13 - Teens 13 or older,458949,277246
939,35848,Promare,8.08,"[Action, Mecha, Sci-Fi, Super Power]",Movie,1,Unknown,[Trigger],Original,PG-13 - Teens 13 or older,170644,95395
128,20787,Black Bullet,7.17,"[Action, Sci-Fi, Mystery, Seinen]",TV,13,Spring 2014,"[Kinema Citrus, Orange]",Light novel,R - 17+ (violence & profanity),709300,503294
315,8937,Toaru Majutsu no Index II,7.61,"[Action, Magic, Sci-Fi, Super Power]",TV,24,Fall 2010,[J.C.Staff],Light novel,R - 17+ (violence & profanity),366426,265217
559,10163,C: The Money of Soul and Possibility Control,7.25,"[Action, Mystery, Super Power, Thriller]",TV,11,Spring 2011,[Tatsunoko Production],Original,PG-13 - Teens 13 or older,250212,162154


In [30]:
def get_recommended_based_on_genres_vector(user_id_input):
    # get user most liked movie
    user = top_users.loc[top_users.index == user_id_input]
    user_top_liked = int(random.choice(anime_id["MAL_ID"].to_list()))
    query_index = anime_id[anime_id.MAL_ID == user_top_liked].index[0]
    if int(user['anime_id']) in recommended_anime:
        user_top_liked = int(user['anime_id'])
    else:
        mask = rating_top_anime[rating_top_anime['user_id'] == user_id_input].sort_values('rating',ascending=False)[:10]
        tmp = mask[mask['MAL_ID'].isin(recommended_anime)]['MAL_ID'].to_list()
        if len(tmp) >= 1:
            user_top_liked = int(tmp[0])
        else:
            # find another anime for recommendation
            user_top_liked = int(mask['MAL_ID'].to_list()[0])
        query_index = anime_id[anime_id.MAL_ID == user_top_liked].index[0]
    return get_recommended(genres_vector_one_hot, query_index, 10)['MAL_ID'].to_list()

evaluate_unsupervised(get_recommended_based_on_genres_vector)

Precision for user 246493 is: 0.3333333333333333
Precision for user 286901 is: 0.4444444444444444
Precision for user 161305 is: 0.4444444444444444
Precision for user 220437 is: 0.8888888888888888
Precision for user 112577 is: 0.8888888888888888


0.5999999999999999

#### Content based - All Aspects

In [32]:
all_data = np.concatenate((anime_metadata, genres_vector_tf_idf.todense(), genres_vector_one_hot), axis=1)

print("Recommendation based on all aspects")
random_anime_id = int(random.choice(anime_id["MAL_ID"].to_list()))
all_aspects_query = anime_id[anime_id.MAL_ID == random_anime_id].index[0]
get_recommended(all_data, all_aspects_query, 10)

Recommendation based on all aspects


Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members,Completed
649,28725,Kokoro ga Sakebitagatterunda.,7.96,"[Drama, Romance, School]",Movie,1,Unknown,[A-1 Pictures],Original,PG-13 - Teens 13 or older,234870,141350
466,6351,"Clannad: After Story - Mou Hitotsu no Sekai, K...",7.85,"[Drama, Romance, School]",Special,1,Unknown,[Kyoto Animation],Visual novel,PG-13 - Teens 13 or older,220084,193930
376,4059,"Clannad: Mou Hitotsu no Sekai, Tomoyo-hen",8.0,"[Drama, Romance, School, Slice of Life]",Special,1,Unknown,[Kyoto Animation],Visual novel,PG-13 - Teens 13 or older,255959,230911
737,35851,Sayonara no Asa ni Yakusoku no Hana wo Kazarou,8.45,"[Drama, Fantasy]",Movie,1,Unknown,[P.A. Works],Original,PG-13 - Teens 13 or older,244902,124565
821,2926,Myself; Yourself,7.21,"[Drama, Romance, School]",TV,13,Fall 2007,[Doga Kobo],Visual novel,PG-13 - Teens 13 or older,164171,111131
325,38329,Seishun Buta Yarou wa Yumemiru Shoujo no Yume ...,8.68,"[Supernatural, Drama, Romance, School]",Movie,1,Unknown,[CloverWorks],Light novel,PG-13 - Teens 13 or older,359086,259853
495,31173,Akagami no Shirayuki-hime 2nd Season,8.0,"[Drama, Romance, Fantasy, Shoujo]",TV,12,Winter 2016,[Bones],Manga,PG-13 - Teens 13 or older,247889,182447
600,16662,Kaze Tachinu,8.12,"[Drama, Historical, Romance]",Movie,1,Unknown,[Studio Ghibli],Manga,PG-13 - Teens 13 or older,207697,151546
259,38826,Tenki no Ko,8.41,"[Slice of Life, Drama, Romance, Fantasy]",Movie,1,Unknown,[CoMix Wave Films],Original,PG-13 - Teens 13 or older,475388,319958


In [33]:
def get_recommended_based_on_all_vector(user_id_input):
    # get user most liked movie
    user = top_users.loc[top_users.index == user_id_input]
    user_top_liked = int(random.choice(anime_id["MAL_ID"].to_list()))
    query_index = anime_id[anime_id.MAL_ID == user_top_liked].index[0]
    if int(user['anime_id']) in recommended_anime:
        user_top_liked = int(user['anime_id'])
    else:
        mask = rating_top_anime[rating_top_anime['user_id'] == user_id_input].sort_values('rating',ascending=False)[:10]
        tmp = mask[mask['MAL_ID'].isin(recommended_anime)]['MAL_ID'].to_list()
        if len(tmp) >= 1:
            user_top_liked = int(tmp[0])
        else:
            # find another anime for recommendation
            user_top_liked = int(mask['MAL_ID'].to_list()[0])
        query_index = anime_id[anime_id.MAL_ID == user_top_liked].index[0]
    return get_recommended(all_data, query_index, 10)['MAL_ID'].to_list()

evaluate_unsupervised(get_recommended_based_on_all_vector)

Precision for user 39883 is: 1.0
Precision for user 275681 is: 0.6666666666666666
Precision for user 112577 is: 1.0
Precision for user 306454 is: 1.0
Precision for user 260560 is: 1.0


0.9333333333333332

#### Content based - Top Features

In [34]:
reduced_all_data = PCA(n_components=250).fit_transform(all_data)

print("Recommendation based on all aspects")
random_anime_id = int(random.choice(anime_id["MAL_ID"].to_list()))
top_features_query = anime_id[anime_id.MAL_ID == random_anime_id].index[0]
get_recommended(reduced_all_data, top_features_query, 10)

Recommendation based on all aspects


Unnamed: 0,MAL_ID,Name,Score,Genres,Type,Episodes,Premiered,Studios,Source,Rating,Members,Completed
898,34279,Grancrest Senki,7.25,"[Action, Drama, Fantasy, Romance]",TV,24,Winter 2018,[A-1 Pictures],Light novel,R - 17+ (violence & profanity),211902,100552
917,10418,Deadman Wonderland: Akai Knife Tsukai,6.98,"[Action, Sci-Fi, Horror, Shounen]",OVA,1,Unknown,[Manglobe],Manga,R - 17+ (violence & profanity),117178,97803
865,24655,Date A Live Movie: Mayuri Judgment,7.36,"[Harem, Romance, Sci-Fi]",Movie,1,Unknown,[Production IMS],Light novel,PG-13 - Teens 13 or older,141636,104180
885,36106,Shingeki no Kyojin: Lost Girls,7.75,"[Action, Horror, Supernatural, Drama, Fant...",OVA,3,Unknown,[Wit Studio],Novel,R - 17+ (violence & profanity),174320,101966
896,37055,Youjo Senki Movie,8.28,"[Action, Military, Magic]",Movie,1,Unknown,[Nut],Light novel,R - 17+ (violence & profanity),160580,100698
994,2961,Digimon Adventure Movie,7.57,"[Action, Fantasy, Kids, Sci-Fi]",Movie,1,Unknown,[Toei Animation],Original,PG - Children,96213,90359
929,34437,Code Geass: Fukkatsu no Lelouch,7.97,"[Action, Military, Sci-Fi, Super Power, Dr...",Movie,1,Unknown,[Sunrise],Original,R - 17+ (violence & profanity),215184,96591
892,16009,Kamisama no Inai Nichiyoubi,7.35,"[Adventure, Mystery, Fantasy]",TV,12,Summer 2013,[Madhouse],Light novel,R - 17+ (violence & profanity),190201,101189
856,8888,Code Geass: Boukoku no Akito 1 - Yokuryuu wa M...,7.41,"[Action, Mecha, Military, Sci-Fi]",Movie,1,Unknown,[Sunrise],Original,R - 17+ (violence & profanity),150559,105431


In [36]:
def get_recommended_based_on_top_features(user_id_input):
    # get user most liked movie
    user = top_users.loc[top_users.index == user_id_input]
    user_top_liked = int(random.choice(anime_id["MAL_ID"].to_list()))
    query_index = anime_id[anime_id.MAL_ID == user_top_liked].index[0]
    if int(user['anime_id']) in recommended_anime:
        user_top_liked = int(user['anime_id'])
    else:
        mask = rating_top_anime[rating_top_anime['user_id'] == user_id_input].sort_values('rating',ascending=False)[:10]
        tmp = mask[mask['MAL_ID'].isin(recommended_anime)]['MAL_ID'].to_list()
        if len(tmp) >= 1:
            user_top_liked = int(tmp[0])
        else:
            # find another anime for recommendation
            user_top_liked = int(mask['MAL_ID'].to_list()[0])
        query_index = anime_id[anime_id.MAL_ID == user_top_liked].index[0]
    return get_recommended(reduced_all_data, query_index, 10)['MAL_ID'].to_list()

evaluate_unsupervised(get_recommended_based_on_top_features)

Precision for user 323819 is: 1.0
Precision for user 43331 is: 1.0
Precision for user 304151 is: 1.0
Precision for user 189530 is: 1.0
Precision for user 175308 is: 0.2222222222222222


0.8444444444444444

#### Now, we choose three of the content based algorithm for a more accurate evaluation

- Recommendation based on all vector

In [38]:
evaluate_unsupervised(get_recommended_based_on_all_vector, 50)

Precision mean with current algorithm is  0.8488888888888887


0.8488888888888887

- Recommendation based on studio

In [41]:
evaluate_unsupervised(get_recommended_based_on_studio, 50)

Precision mean with current algorithm is  0.7355555555555555


0.7355555555555555

- Recommendation based on top features

In [None]:
evaluate_unsupervised(get_recommended_based_on_top_features, 50)