In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn import linear_model
from sklearn import svm
from sklearn import naive_bayes
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegressionCV

from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import warnings

# Suppress every warning category for the rest of the session.
# NOTE(review): a blanket 'ignore' presumably also hides library warnings
# raised while fitting/scoring the models below — consider a narrower filter.
warnings.filterwarnings('ignore')

In [2]:
# Per-user genre counts; the genre columns are used as model features below.
user_genre_count = pd.read_csv('dataset/users_genres_count.csv')
# Anime table; later cells read its 'MAL_ID', 'completed_user_ids' and 'genres' columns.
top_animes = pd.read_csv('dataset/top_animes_complete.csv')

# Show the genre-count frame so its columns can be inspected.
user_genre_count

Unnamed: 0,user_id,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,...,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Unknown,Vampire,Yaoi,Yuri
0,189037,3462,2670,116,5477,385,436,2405,729,2799,...,1694,462,665,580,1329,114,55,129,36,32
1,162615,3285,2605,115,5235,383,408,2320,695,2685,...,1555,453,625,563,1246,106,49,122,36,32
2,68042,3292,2507,93,4928,349,405,2231,710,2526,...,1479,431,619,541,1243,107,36,120,36,24
3,283786,3277,2275,90,4690,445,356,2061,687,2494,...,1483,428,566,511,1221,116,37,120,28,3
4,259790,2531,1821,74,3931,324,284,1570,639,1993,...,1131,339,314,471,1004,94,23,106,25,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,163153,407,186,1,480,15,39,371,138,259,...,213,58,31,71,224,37,0,19,5,3
2996,245817,442,157,0,458,44,61,280,86,261,...,166,13,43,100,300,38,0,53,2,0
2997,145568,517,264,7,432,10,72,411,30,303,...,180,39,69,121,292,45,0,29,0,0
2998,346470,460,274,1,572,5,52,275,162,348,...,166,2,65,111,232,24,0,17,0,3


In [3]:
def init_dict():
    """Return a new accumulator whose ``mean_error`` and ``score`` totals are 0."""
    return {"mean_error": 0, "score": 0}
    
# One running-total accumulator per model family; handle_one_model adds each
# anime's mean squared error and score to the matching dict.
linear_regression_dict = init_dict()
svm_dict = init_dict()
naive_dict = init_dict()
logistic_dict = init_dict()

# Decision-tree results, nested two levels deep:
#   outer key: mal_id, value: dict keyed by tree depth
#   inner key: tree depth (0 = no max depth specified), value: dict of metrics
#              with the keys "accuracy", "precision", "recall" and "test_score"
tree_dict = dict()

In [4]:
def handle_one_model(model, store_dict, X_train, X_test, y_train, y_test):
    """Fit ``model`` and add its held-out metrics to ``store_dict``.

    The model is trained on ``(X_train, y_train)``. Its mean squared error on
    the test split is added to ``store_dict["mean_error"]`` and the value of
    ``model.score(X_test, y_test)`` is added to ``store_dict["score"]``.
    ``store_dict`` is mutated in place; nothing is returned.
    """
    model.fit(X_train, y_train)

    # Accumulate rather than overwrite: the caller averages over many animes.
    store_dict["mean_error"] += mean_squared_error(y_test, model.predict(X_test))
    store_dict["score"] += model.score(X_test, y_test)

In [5]:
def generate_sets_genre_count(uid_str, genre_str, random_state=None):
    """Build a train/test split for predicting whether a user completed one anime.

    Features are the per-user counts for the anime's genres, taken from the
    module-level ``user_genre_count`` frame. The label is 1 when the id read
    from the frame's first column appears in ``uid_str`` and 0 otherwise.

    Parameters
    ----------
    uid_str : str
        ", "-separated "<user_id>|<score>" pairs. Only the id part is used;
        blank entries are skipped.
    genre_str : str
        ", "-separated genre names; each must be a column of
        ``user_genre_count``.
    random_state : int or None, optional
        Forwarded to ``train_test_split``. The default (None) keeps the
        original unseeded shuffle; pass an int for a reproducible split.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` with 20% of the rows held out.
    """
    genre_names = genre_str.split(", ")

    # A set is all that is needed: only membership of the id is ever tested.
    completed_uids = {
        int(pair.split("|", 1)[0])
        for pair in uid_str.split(", ")
        if pair.strip()
    }

    # One vectorized column selection replaces the per-row iterrows() loop.
    X = user_genre_count[genre_names].to_numpy()

    # The id is read from the first column *by position*, which is what the
    # previous `row[0]` did, but without relying on the integer-key fallback
    # of a label-indexed Series.
    # NOTE(review): the displayed frame lists "Unnamed: 0" before "user_id",
    # so the first column may be a saved row index rather than the user id —
    # confirm, and switch to user_genre_count["user_id"] if so.
    row_uids = user_genre_count.iloc[:, 0].astype(int)
    y = row_uids.isin(completed_uids).to_numpy().astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [6]:
def compute_animes_models(X_train, X_test, y_train, y_test):
    """Fit and score each non-tree model on one anime's split.

    Runs, in order, linear regression, an SVC, Gaussian naive Bayes and
    logistic regression (all with default settings), adding each model's
    metrics to its module-level accumulator via ``handle_one_model``.
    """
    model_runs = (
        (linear_model.LinearRegression, linear_regression_dict),
        (svm.SVC, svm_dict),
        (naive_bayes.GaussianNB, naive_dict),
        (LogisticRegression, logistic_dict),
    )

    # Each estimator is created right before it is fitted, one after another.
    for make_model, totals in model_runs:
        handle_one_model(make_model(), totals, X_train, X_test, y_train, y_test)

In [7]:
def handle_one_tree(X_train, X_test, y_train, y_test, depth):
    """Fit one decision tree and score it on the held-out split.

    ``depth == 0`` means "no depth limit"; any other value is passed to
    ``DecisionTreeClassifier`` as ``max_depth``.

    Returns ``(scores, fitted_depth)``. ``scores`` is a dict with the keys
    "accuracy", "precision", "recall" and "test_score". ``fitted_depth`` is
    ``get_depth()`` of the fitted tree when no limit was given, else -1.
    """
    unlimited = depth == 0

    # max_depth=None is the estimator's default, i.e. the same as omitting it.
    tree = DecisionTreeClassifier(max_depth=None if unlimited else depth)
    tree.fit(X_train, y_train)

    predicted = tree.predict(X_test)

    scores = {
        "accuracy": accuracy_score(y_test, predicted),
        "precision": precision_score(y_test, predicted),
        "recall": recall_score(y_test, predicted),
        "test_score": tree.score(X_test, y_test),
    }

    return scores, (tree.get_depth() if unlimited else -1)

In [8]:
def compute_animes_trees(X_train, X_test, y_train, y_test):
    """Score decision trees of increasing depth on one anime's split.

    An unrestricted tree is fitted first (stored under key 0); its fitted
    depth then bounds the sweep, so trees with ``max_depth`` 1 up to (but not
    including) that depth are fitted and stored under their depth.

    Returns a dict mapping depth -> metrics dict from ``handle_one_tree``.
    """
    unlimited_scores, fitted_depth = handle_one_tree(X_train, X_test, y_train, y_test, 0)
    per_depth = {0: unlimited_scores}

    for limit in range(1, fitted_depth):
        # The second element of the result is -1 for limited trees; ignore it.
        per_depth[limit] = handle_one_tree(X_train, X_test, y_train, y_test, limit)[0]

    return per_depth

In [9]:
# Number of animes (taken from the top of `top_animes`) to evaluate.
MAX_ANIMES = 50

# Start from zeroed accumulators so that re-running this cell does not stack a
# second pass on top of the previous totals: the report cells divide the
# totals by `count`, which is reset here on every run.
for totals in (linear_regression_dict, svm_dict, naive_dict, logistic_dict):
    totals.update(init_dict())
tree_dict.clear()

# `count` ends up as the number of animes actually processed; the report cells
# use it as the divisor for every average.
count = 0
for index, row in top_animes.head(MAX_ANIMES).iterrows():
    mal_id = row['MAL_ID']
    X_train, X_test, y_train, y_test = generate_sets_genre_count(row['completed_user_ids'], row['genres'])

    # Trees are stored per anime; the other models only accumulate totals.
    tree_dict[mal_id] = compute_animes_trees(X_train, X_test, y_train, y_test)
    compute_animes_models(X_train, X_test, y_train, y_test)
    count += 1

In [10]:
def output_model(store_dict, name):
    """Print one model's accumulated metrics averaged over the processed animes.

    ``store_dict`` holds the summed "mean_error" and "score"; both are divided
    by the module-level ``count`` before printing. ``name`` is the label shown
    in the header line.
    """
    print("Using " + name + ":")

    report_lines = (
        ("average mean squared error:  ", "mean_error"),
        ("average score:  ", "score"),
    )
    for label, key in report_lines:
        print(label + str(store_dict[key] / count))

In [11]:
# Report the averaged metrics accumulated for logistic regression.
output_model(logistic_dict, "Logistic Regression")

Using Logistic Regression:
average mean squared error:  0.1833333333333334
average score:  0.8166666666666664


In [12]:
# Report the averaged metrics accumulated for linear regression.
# NOTE(review): for a regressor, `model.score` is presumably R^2 rather than
# accuracy, so this "score" is not directly comparable with the classifiers'
# — confirm before comparing models.
output_model(linear_regression_dict, "Linear Regression")

Using Linear Regression:
average mean squared error:  0.1369973231326117
average score:  0.12563051230328467


In [13]:
# Report the averaged metrics accumulated for the support vector classifier
# (svm.SVC); the label previously read "Super Vector Machines".
output_model(svm_dict, "Support Vector Machines")

Using Super Vector Machines:
average mean squared error:  0.19370000000000004
average score:  0.8062999999999997


In [14]:
# Report the averaged metrics accumulated for Gaussian naive Bayes.
output_model(naive_dict, "Naive Bayes")

Using Naive Bayes:
average mean squared error:  0.3321000000000002
average score:  0.6678999999999999


In [15]:
def output_one_depth(depth, score_dict):
    """Print the metrics recorded for one tree depth, followed by a blank line.

    ``depth == 0`` is reported as "Not specified max depth"; ``score_dict``
    must contain the keys "accuracy", "precision", "recall" and "test_score".
    """
    header = "Not specified max depth:" if depth == 0 else "Max depth is " + str(depth) + ": "
    print(header)

    metric_lines = (
        ("accuracy score: ", "accuracy"),
        ("precision score: ", "precision"),
        ("recall score: ", "recall"),
        ("test score: ", "test_score"),
    )
    for label, key in metric_lines:
        print(label + str(score_dict[key]))

    print()

In [16]:
def out_tree():
    """Print the per-depth tree metrics for the first anime in ``tree_dict``.

    Only the first entry is reported; nothing is printed when ``tree_dict``
    is empty.
    """
    if not tree_dict:
        return

    first_mal_id = next(iter(tree_dict))
    print("result for mal_id=" + str(first_mal_id) + ": \n")

    for depth, depth_scores in tree_dict[first_mal_id].items():
        output_one_depth(depth, depth_scores)

    print("--------------------")

# Show the decision-tree metrics of the first processed anime only.
out_tree()

result for mal_id=16498: 

Not specified max depth:
accuracy score: 0.8516666666666667
precision score: 0.9260628465804066
recall score: 0.9109090909090909
test score: 0.8516666666666667

Max depth is 1: 
accuracy score: 0.9166666666666666
precision score: 0.9166666666666666
recall score: 1.0
test score: 0.9166666666666666

Max depth is 2: 
accuracy score: 0.915
precision score: 0.9264957264957265
recall score: 0.9854545454545455
test score: 0.915

Max depth is 3: 
accuracy score: 0.91
precision score: 0.918918918918919
recall score: 0.9890909090909091
test score: 0.91

Max depth is 4: 
accuracy score: 0.9066666666666666
precision score: 0.9186440677966101
recall score: 0.9854545454545455
test score: 0.9066666666666666

Max depth is 5: 
accuracy score: 0.915
precision score: 0.9250425894378195
recall score: 0.9872727272727273
test score: 0.915

Max depth is 6: 
accuracy score: 0.9016666666666666
precision score: 0.925476603119584
recall score: 0.9709090909090909
test score: 0.901666666

In [17]:
def get_max(depth_dict):
    """Return the largest "test_score" among the per-depth entries.

    The comparison starts from 0, so an empty ``depth_dict`` (or one whose
    scores are all below 0) yields 0.
    """
    candidates = [0]
    candidates.extend(entry["test_score"] for entry in depth_dict.values())
    return max(candidates)

In [18]:
def calculate_avg_tree_score():
    """Print the average, over animes, of each anime's best tree test score.

    For every entry of ``tree_dict`` the highest "test_score" across depths is
    taken (via ``get_max``); the sum is divided by the module-level ``count``.
    """
    best_per_anime = (get_max(per_depth) for per_depth in tree_dict.values())
    print(sum(best_per_anime) / count)

# Print the average best-depth decision-tree test score.
calculate_avg_tree_score()

0.8127666666666664
