In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn import svm
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
import warnings

# NOTE(review): this silences every warning category for the rest of the session,
# including any deprecation or convergence warnings raised by the libraries used below.
warnings.filterwarnings('ignore')

In [2]:
# Per-user table: later cells read its 'user_id' column plus one column per genre name.
user_genre_rate = pd.read_csv('dataset/users_genres_rating.csv')
# Per-anime table: later cells read its 'completed_user_ids' and 'genres' columns.
top_animes = pd.read_csv('dataset/top_animes_complete.csv')

# Bare expression so the notebook renders the frame as this cell's output.
top_animes

Unnamed: 0,MAL_ID,completed_user_ids,genres
0,16498,"189037|6, 162615|10, 68042|9, 283786|9, 259790...","Action, Military, Mystery, Super Power, Drama,..."
1,1535,"189037|9, 162615|10, 68042|10, 283786|10, 2597...","Mystery, Police, Psychological, Supernatural, ..."
2,11757,"189037|7, 162615|10, 68042|4, 283786|9, 259790...","Action, Game, Adventure, Romance, Fantasy"
3,30276,"189037|8, 162615|10, 68042|9, 283786|9, 259790...","Action, Sci-Fi, Comedy, Parody, Super Power, S..."
4,31964,"189037|7, 162615|10, 68042|8, 283786|8, 259790...","Action, Comedy, School, Shounen, Super Power"
...,...,...,...
995,906,"189037|7, 162615|10, 259790|5, 291207|6, 18228...","Action, Adventure, Comedy, Fantasy, Sci-Fi, Sh..."
996,33929,"189037|7, 162615|10, 68042|8, 283786|8, 259790...","Action, Comedy, School, Shounen, Super Power"
997,10029,"189037|7, 162615|10, 68042|8, 283786|8, 259790...","Historical, Romance, School, Shoujo"
998,10012,"189037|7, 162615|10, 68042|8, 283786|8, 259790...","Comedy, Parody, Supernatural"


In [3]:
# Bare expression: render the per-user genre table as this cell's output.
user_genre_rate

Unnamed: 0,user_id,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,...,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Unknown,Vampire,Yaoi,Yuri
0,189037,6.995090,7.008240,6.974138,7.005295,6.823377,6.922018,6.980873,7.009602,6.994641,...,7.029516,6.993506,6.992481,7.008621,6.968397,6.921053,7.000000,6.806202,6.416667,6.781250
1,162615,10.000000,10.000000,10.000000,10.000000,10.000000,10.000000,10.000000,10.000000,10.000000,...,10.000000,10.000000,10.000000,10.000000,10.000000,10.000000,10.000000,10.000000,10.000000,10.000000
2,68042,6.133354,6.352613,5.881720,6.059456,5.532951,6.019753,6.150605,5.374648,6.076405,...,6.125761,6.062645,6.294023,6.114603,6.090909,6.663551,5.305556,5.791667,4.500000,4.625000
3,283786,6.991761,6.892308,6.766667,6.776759,6.170787,6.898876,7.001941,6.631732,6.793103,...,6.831423,6.960280,7.042403,7.031311,7.070434,7.439655,6.378378,7.191667,3.000000,6.000000
4,259790,5.585144,5.640308,4.486486,5.257695,3.876543,5.528169,6.159873,5.056338,5.366282,...,5.375774,5.569322,4.805732,5.501062,5.923307,6.744681,3.130435,5.462264,4.920000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,163153,4.171990,4.543011,8.000000,4.254167,5.400000,4.410256,4.784367,3.311594,4.127413,...,4.826291,4.655172,4.935484,4.112676,4.352679,4.513514,0.000000,4.263158,4.400000,3.333333
2996,245817,8.002262,7.834395,0.000000,7.406114,7.681818,8.098361,7.935714,6.802326,7.582375,...,7.198795,7.846154,7.488372,8.360000,8.173333,8.605263,0.000000,8.018868,6.500000,0.000000
2997,145568,6.615087,6.848485,6.142857,6.349537,6.700000,6.361111,6.773723,5.700000,6.732673,...,6.266667,6.666667,6.898551,6.702479,6.602740,7.422222,0.000000,6.103448,0.000000,0.000000
2998,346470,7.954348,7.762774,7.000000,7.802448,7.800000,7.942308,8.145455,7.456790,7.818966,...,7.975904,8.500000,8.076923,8.108108,8.086207,8.416667,0.000000,8.176471,0.000000,7.666667


In [4]:
def init_dict():
    """Return a new accumulator dict with "mean_error" and "score" both set to 0."""
    return {"mean_error": 0, "score": 0}
    
# One independent running-total dict per model family (four separate dict objects).
linear_regression_dict, svm_dict, naive_dict, logistic_dict = (
    init_dict() for _ in range(4)
)

In [5]:
def handle_one_model(model, store_dict, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and add its test metrics to ``store_dict``.

    ``store_dict`` is mutated in place:
      * "mean_error" grows by the mean squared error of the test predictions;
      * "score" grows by ``model.score(X_test, y_test)``.

    NOTE(review): what ``score`` measures is decided by the estimator itself
    (for scikit-learn, presumably R^2 for regressors and mean accuracy for
    classifiers), so totals from different model families may not be directly
    comparable -- confirm before interpreting them side by side.
    """
    model.fit(X_train, y_train)

    predictions = model.predict(X_test)
    store_dict["mean_error"] += mean_squared_error(y_test, predictions)
    store_dict["score"] += model.score(X_test, y_test)

In [6]:
def generate_sets_genre_score(uid_str, genre_str, test_size=0.2, random_state=None):
    """Build a train/test split for one anime from its raters and its genres.

    Every row of the module-level ``user_genre_rate`` table whose ``user_id``
    appears in ``uid_str`` contributes one sample: the features are that row's
    values in the columns named by ``genre_str`` (in the order given), and the
    target is the score paired with that user in ``uid_str``.

    Parameters
    ----------
    uid_str : str
        ``", "``-separated ``"<user_id>|<score>"`` pairs, both parsed with
        ``int``. If a user id is repeated, the last score wins.
    genre_str : str
        ``", "``-separated genre names; each must be a column of
        ``user_genre_rate``.
    test_size : float or int, default 0.2
        Forwarded unchanged to ``train_test_split``.
    random_state : optional, default None
        Forwarded unchanged to ``train_test_split``; pass a fixed value to get
        the same split on every call.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split`` for the feature matrix and target vector.

    Raises
    ------
    KeyError
        If a genre name is not a column of ``user_genre_rate``.
    IndexError, ValueError
        If a ``uid_str`` entry has no ``"|"`` or a non-integer part.
    """
    genre_columns = genre_str.split(", ")

    # user id -> score given to this anime.
    uid_score = {}
    for pair in uid_str.split(", "):
        parts = pair.split("|")
        uid_score[int(parts[0])] = int(parts[1])

    # Keep only users that both rated this anime and have a genre profile.
    rated_users = user_genre_rate[user_genre_rate['user_id'].isin(list(uid_score))]

    # Select by label: integer keys on a label-indexed row (the former `row[0]`)
    # rely on positional fallback, which pandas has deprecated.
    X = rated_users[genre_columns].to_numpy(dtype=float)
    y = np.array([uid_score[int(uid)] for uid in rated_users['user_id']])

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [7]:
def compute_animes(X_train, X_test, y_train, y_test):
    """Evaluate the four model families on one train/test split.

    A fresh estimator of each kind is created and handed to
    ``handle_one_model`` together with that family's module-level totals
    dict, in this order: linear regression, SVC, Gaussian naive Bayes,
    logistic regression.
    """
    split = (X_train, X_test, y_train, y_test)

    # (estimator factory, totals dict) pairs; each estimator is instantiated
    # right before it is evaluated.
    candidates = (
        (linear_model.LinearRegression, linear_regression_dict),
        (svm.SVC, svm_dict),
        (naive_bayes.GaussianNB, naive_dict),
        (LogisticRegression, logistic_dict),
    )

    for make_estimator, totals in candidates:
        handle_one_model(make_estimator(), totals, *split)

In [8]:
# Start every execution of this cell from zeroed totals: the dicts are module-level
# and mutated in place, so without a reset a second run would add to the earlier
# sums while `count` restarts at 0, inflating every reported average.
for totals in (linear_regression_dict, svm_dict, naive_dict, logistic_dict):
    totals.update(init_dict())

# train_test_split draws from NumPy's global random state when it is given no
# random_state, so seeding here makes the splits (and the averages) repeatable.
np.random.seed(42)

MAX_ANIMES = 50  # how many leading rows of top_animes to evaluate

count = 0  # number of animes evaluated; output_model divides the totals by it
for index, row in top_animes.head(MAX_ANIMES).iterrows():
    X_train, X_test, y_train, y_test = generate_sets_genre_score(row['completed_user_ids'], row['genres'])
    compute_animes(X_train, X_test, y_train, y_test)
    count += 1

In [9]:
def output_model(store_dict, name):
    """Print the per-run averages of one model's accumulated metrics.

    Prints a "Using <name>:" header, then the "mean_error" and "score" totals
    from ``store_dict``, each divided by the module-level ``count``.
    """
    print("Using " + name + ":")

    # (printed label, key in store_dict), in output order.
    report_lines = (
        ("average mean squared error", "mean_error"),
        ("average score", "score"),
    )
    for label, key in report_lines:
        print(f"{label}:  {store_dict[key] / count}")

In [10]:
# Print the averaged metrics accumulated in logistic_dict (LogisticRegression runs).
output_model(logistic_dict, "Logistic Regression")

Using Logistic Regression:
average mean squared error:  1.8749342819004085
average score:  0.37462050139337166


In [11]:
# Print the averaged metrics accumulated in linear_regression_dict (LinearRegression runs).
output_model(linear_regression_dict, "Linear Regression")

Using Linear Regression:
average mean squared error:  1.4097520798899144
average score:  0.39627377394447416


In [12]:
# Print the averaged metrics accumulated in svm_dict (svm.SVC runs).
output_model(svm_dict, "Support Vector Machines")

Using Super Vector Machines:
average mean squared error:  1.7091085446335936
average score:  0.38422941006424843


In [13]:
# Print the averaged metrics accumulated in naive_dict (GaussianNB runs).
output_model(naive_dict, "Naive Bayes")

Using Naive Bayes:
average mean squared error:  2.1806474970892746
average score:  0.35430281680163084
