In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn import svm
from sklearn import naive_bayes

In [3]:
# Anime-level table; the evaluation loop below reads its
# 'completed_user_ids' column, one row per anime.
top_animes = pd.read_csv('dataset/top_animes_complete.csv')
# Per-user table; compute_animes() treats the first column as the user id
# and every remaining column as a feature.
user_genre_count = pd.read_csv('dataset/users_genres_count.csv')

# Bare last expression so the notebook renders a preview of the frame.
user_genre_count

Unnamed: 0,user_id,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,...,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Unknown,Vampire,Yaoi,Yuri
0,189037,3462,2670,116,5477,385,436,2405,729,2799,...,1694,462,665,580,1329,114,55,129,36,32
1,162615,3285,2605,115,5235,383,408,2320,695,2685,...,1555,453,625,563,1246,106,49,122,36,32
2,68042,3292,2507,93,4928,349,405,2231,710,2526,...,1479,431,619,541,1243,107,36,120,36,24
3,283786,3277,2275,90,4690,445,356,2061,687,2494,...,1483,428,566,511,1221,116,37,120,28,3
4,259790,2531,1821,74,3931,324,284,1570,639,1993,...,1131,339,314,471,1004,94,23,106,25,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,123966,411,219,3,712,4,50,296,252,337,...,354,4,80,66,239,33,0,23,0,0
1996,31159,381,231,10,687,18,48,365,104,299,...,352,43,51,64,263,42,0,18,0,0
1997,161305,708,301,13,534,13,56,338,177,332,...,127,113,51,112,248,38,0,22,0,0
1998,36853,454,341,8,627,25,55,445,117,281,...,163,62,61,86,235,37,0,25,5,0


In [4]:
def init_dict():
    """Return a fresh accumulator dict with both running sums set to zero.

    Keys:
        "mean_error": running sum of mean squared errors.
        "score": running sum of model scores.
    """
    return {"mean_error": 0, "score": 0}
    
# One independent accumulator per model; each call to init_dict() builds a
# separate dict, so the running sums are never shared between models.
linear_regression_dict, svm_dict, naive_dict = init_dict(), init_dict(), init_dict()

In [5]:
def handle_one_model(model, store_dict, X_train, X_test, y_train, y_test):
    """Fit ``model`` on the training split and add its test metrics to ``store_dict``.

    Parameters
    ----------
    model : object exposing ``fit``, ``predict`` and ``score``.
    store_dict : dict
        Accumulator with "mean_error" and "score" entries; mutated in place.
    X_train, X_test, y_train, y_test :
        The train/test split the model is fitted and evaluated on.

    Returns
    -------
    None
    """
    model.fit(X_train, y_train)

    # Running sums only; the caller divides by the number of runs afterwards.
    store_dict["mean_error"] += mean_squared_error(y_test, model.predict(X_test))
    store_dict["score"] += model.score(X_test, y_test)

In [6]:
def compute_animes(uid_str, random_state=None):
    """Fit and score the three models on one anime's "completed" labels.

    Builds a binary target over the rows of the module-level
    ``user_genre_count`` frame (1 when the row's user id appears in
    ``uid_str``, otherwise 0), makes an 80/20 train/test split, and hands a
    LinearRegression, an SVC and a GaussianNB to ``handle_one_model`` together
    with their respective accumulator dicts.

    Parameters
    ----------
    uid_str : str
        ``", "``-separated ``"<user id>|<score>"`` pairs. Only the user id
        part is used; empty pairs (e.g. from a trailing separator) are skipped.
    random_state : int, RandomState instance or None, optional
        Forwarded to ``train_test_split``. The default ``None`` keeps the
        original unseeded shuffle; pass an int for reproducible splits.

    Raises
    ------
    ValueError
        If a user id in ``uid_str`` is not an integer literal.
    """
    # A set is enough: only membership of the user id matters.
    completed_uids = {
        int(pair.split("|")[0])
        for pair in uid_str.split(", ")
        if pair.strip()
    }

    # Positional contract: column 0 holds the user id and every remaining
    # column is a feature. .iloc makes that explicit instead of relying on
    # integer keys falling back to positions on a label-indexed Series.
    user_ids = user_genre_count.iloc[:, 0].astype(int)
    X = user_genre_count.iloc[:, 1:].to_numpy()
    y = user_ids.isin(completed_uids).to_numpy().astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    lr = linear_model.LinearRegression()
    handle_one_model(lr, linear_regression_dict, X_train, X_test, y_train, y_test)

    svc = svm.SVC()
    handle_one_model(svc, svm_dict, X_train, X_test, y_train, y_test)

    nb = naive_bayes.GaussianNB()
    handle_one_model(nb, naive_dict, X_train, X_test, y_train, y_test)

In [7]:
# Zero the accumulators in place before evaluating, so re-running this cell
# does not add a second pass on top of the first while `count` starts again
# from 0 (which would inflate every reported average).
for _store in (linear_regression_dict, svm_dict, naive_dict):
    _store.update(init_dict())

# `count` is the number of animes evaluated; output() divides the sums by it.
count = 0
for completed_user_ids in top_animes['completed_user_ids']:
    compute_animes(completed_user_ids)
    count += 1

In [8]:
def output(store_dict, name):
    """Print the per-run averages stored in ``store_dict`` under the heading ``name``.

    Each accumulated sum is divided by the module-level ``count`` (the number
    of evaluation runs) before being printed.

    Parameters
    ----------
    store_dict : dict
        Accumulator with "mean_error" and "score" entries.
    name : str
        Model name shown in the heading line.
    """
    print("".join(("Using ", name, ":")))

    report_lines = (
        ("average mean squared error:  ", "mean_error"),
        ("average score:  ", "score"),
    )
    for label, key in report_lines:
        average = store_dict[key] / count
        print(label + str(average))

In [9]:
# Averages for the LinearRegression runs.
output(store_dict=linear_regression_dict, name="Linear Regression")

Using Linear Regression:
average mean squared error:  0.13620131328562313
average score:  0.26276373283689924


In [10]:
# Averages for the svm.SVC runs (SVM stands for Support Vector Machine).
output(svm_dict, "Support Vector Machines")

Using Super Vector Machines:
average mean squared error:  0.21271249999999997
average score:  0.7872875000000001


In [11]:
# Averages for the GaussianNB runs.
output(store_dict=naive_dict, name="Naive Bayes")

Using Naive Bayes:
average mean squared error:  0.4971250000000002
average score:  0.5028749999999998
