In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): no category or module is given, so this also hides any warnings
# emitted by the libraries used in the cells below -- consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
# Read the three input tables in one pass (paths are relative to the notebook):
#   user_genre_count -- per-user genre counts used to build training rows
#   top_animes       -- one row per anime: MAL_ID, completed_user_ids, genres
#   test_users       -- per-user genre counts, looked up first by get_user_info
user_genre_count, top_animes, test_users = map(
    pd.read_csv,
    (
        'dataset/users_genres_count.csv',
        'dataset/top_animes_complete.csv',
        'dataset/users_genres_count_test.csv',
    ),
)

# Last expression of the cell: show the training table.
user_genre_count

Unnamed: 0,user_id,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,...,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Unknown,Vampire,Yaoi,Yuri
0,189037,3462,2670,116,5477,385,436,2405,729,2799,...,1694,462,665,580,1329,114,55,129,36,32
1,162615,3285,2605,115,5235,383,408,2320,695,2685,...,1555,453,625,563,1246,106,49,122,36,32
2,68042,3292,2507,93,4928,349,405,2231,710,2526,...,1479,431,619,541,1243,107,36,120,36,24
3,283786,3277,2275,90,4690,445,356,2061,687,2494,...,1483,428,566,511,1221,116,37,120,28,3
4,259790,2531,1821,74,3931,324,284,1570,639,1993,...,1131,339,314,471,1004,94,23,106,25,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2995,163153,407,186,1,480,15,39,371,138,259,...,213,58,31,71,224,37,0,19,5,3
2996,245817,442,157,0,458,44,61,280,86,261,...,166,13,43,100,300,38,0,53,2,0
2997,145568,517,264,7,432,10,72,411,30,303,...,180,39,69,121,292,45,0,29,0,0
2998,346470,460,274,1,572,5,52,275,162,348,...,166,2,65,111,232,24,0,17,0,3


In [3]:
# Show the loaded anime table as this cell's output.
top_animes

Unnamed: 0,MAL_ID,completed_user_ids,genres
0,16498,"189037|6, 162615|10, 68042|9, 283786|9, 259790...","Action, Military, Mystery, Super Power, Drama,..."
1,1535,"189037|9, 162615|10, 68042|10, 283786|10, 2597...","Mystery, Police, Psychological, Supernatural, ..."
2,11757,"189037|7, 162615|10, 68042|4, 283786|9, 259790...","Action, Game, Adventure, Romance, Fantasy"
3,30276,"189037|8, 162615|10, 68042|9, 283786|9, 259790...","Action, Sci-Fi, Comedy, Parody, Super Power, S..."
4,31964,"189037|7, 162615|10, 68042|8, 283786|8, 259790...","Action, Comedy, School, Shounen, Super Power"
...,...,...,...
995,906,"189037|7, 162615|10, 259790|5, 291207|6, 18228...","Action, Adventure, Comedy, Fantasy, Sci-Fi, Sh..."
996,33929,"189037|7, 162615|10, 68042|8, 283786|8, 259790...","Action, Comedy, School, Shounen, Super Power"
997,10029,"189037|7, 162615|10, 68042|8, 283786|8, 259790...","Historical, Romance, School, Shoujo"
998,10012,"189037|7, 162615|10, 68042|8, 283786|8, 259790...","Comedy, Parody, Supernatural"


In [4]:
def generate_sets_genre_count(uid_str, current_user, genre_str, test_size=0.2, random_state=None):
    """Build a train/test split for one anime from the global ``user_genre_count`` table.

    Each row of ``user_genre_count`` (except the row of ``current_user``) becomes
    one sample: its features are that user's values in the anime's genre columns,
    and its label is 1 when the user's id appears in ``uid_str``, otherwise 0.

    Parameters
    ----------
    uid_str : str
        ``", "``-separated ``"<user_id>|<score>"`` entries. Only the user id is
        used; the score is ignored.
    current_user : int
        User id to leave out of the samples (the user being predicted for).
    genre_str : str
        ``", "``-separated genre names; each must be a column of
        ``user_genre_count``.
    test_size : float, optional
        Forwarded to ``train_test_split``. Defaults to 0.2.
    random_state : int or None, optional
        Forwarded to ``train_test_split``; pass an int for a reproducible split.
        The default ``None`` keeps the split random.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)`` as returned by ``train_test_split``.

    Raises
    ------
    KeyError
        If ``user_genre_count`` lacks ``user_id`` or one of the genre columns.
    ValueError
        If a user id in ``uid_str`` is not an integer.
    """
    genre_cols = genre_str.split(", ")

    # Ids of the users listed for this anime. Blank entries (e.g. an empty
    # uid_str) are skipped instead of crashing on int("").
    listed_ids = set()
    for entry in uid_str.split(", "):
        if not entry.strip():
            continue
        listed_ids.add(int(entry.split("|")[0]))

    # Read the id by column name: positional access (row[0]) depends on the CSV
    # column order and is deprecated for label-indexed Series in recent pandas.
    user_ids = user_genre_count["user_id"].astype(int)
    keep = user_ids != current_user

    X = user_genre_count.loc[keep, genre_cols].to_numpy()
    y = user_ids[keep].isin(listed_ids).to_numpy().astype(int)

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    return X_train, X_test, y_train, y_test

In [5]:
def get_user_info(user_id,genre_str):
    """Return one user's genre values as a single-row 2-D array.

    The user is looked up by ``user_id`` in the global ``test_users`` table
    first and, if absent there, in the global ``user_genre_count`` table. The
    first matching row is used.

    Parameters
    ----------
    user_id : int
        Value to match against the ``user_id`` column.
    genre_str : str
        ``", "``-separated genre names; the output columns follow this order.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(1, number_of_genres)``.

    Raises
    ------
    IndexError
        If ``user_id`` is found in neither table.
    KeyError
        If a genre name is not a column of the matched row.
    """
    genre_cols = genre_str.split(", ")

    # Try the test users first, then fall back to the training table.
    for frame in (test_users, user_genre_count):
        matches = frame.loc[frame['user_id'] == user_id]
        if not matches.empty:
            break
    else:
        # Same exception type .iloc[0] raised before, but with a usable message.
        raise IndexError(
            f"user_id {user_id!r} not found in test_users or user_genre_count"
        )

    user_row = matches.iloc[0]
    features = np.array([user_row[genre] for genre in genre_cols])

    # Wrap in an outer array so the result is 2-D (one sample).
    return np.array([features])

In [6]:
def predict(user_id, mal_id, user_line=None):
    """Predict the label (1 or 0) of one user for one anime.

    A logistic regression is fitted on the training part of the split produced
    by ``generate_sets_genre_count`` for the anime's ``completed_user_ids`` and
    ``genres``, then applied to the user's feature row. Label 1 means the user
    id is listed in the anime's ``completed_user_ids``.

    Parameters
    ----------
    user_id : int
        User to predict for; also excluded from the training samples.
    mal_id : int
        Value to match against the ``MAL_ID`` column of the global ``top_animes``.
    user_line : array-like, optional
        Pre-built 2-D feature row for the user. When ``None`` it is built with
        ``get_user_info`` from the anime's genre string.

    Returns
    -------
    The predicted label for the user (first element of the prediction).

    Raises
    ------
    IndexError
        If ``mal_id`` is not present in ``top_animes``.
    """
    matches = top_animes.loc[top_animes['MAL_ID'] == mal_id]
    if matches.empty:
        # Same exception type .iloc[0] raised before, but with a usable message.
        raise IndexError(f"mal_id {mal_id!r} not found in top_animes")
    anime_info = matches.iloc[0]

    if user_line is None:
        user_line = get_user_info(user_id,anime_info["genres"])

    # Only the training part of the split is used here.
    X_train, _X_test, y_train, _y_test = generate_sets_genre_count(
        anime_info["completed_user_ids"], user_id, anime_info["genres"]
    )

    # LogisticRegression cannot be fitted on a single class; when every
    # training label is the same, that label is the only possible prediction.
    classes = np.unique(y_train)
    if classes.size == 1:
        return classes[0]

    clf = LogisticRegression().fit(X_train, y_train)

    return clf.predict(user_line)[0]

In [7]:
# Example run: prints the predicted label for one (user, anime) pair.
# Use a user_id from dataset/users_genres_count_test.csv or dataset/users_genres_count.csv
# and a mal_id from dataset/top_animes_complete.csv.
result = predict(189037,64)
print(result)
# Disabled experiment: count how many animes in top_animes are predicted 0 for user 72699.
# NOTE(review): get_user_info needs the anime's genre string as its second argument,
# so the feature row is built per anime inside the loop.
# count = 0
# i = 0
# for index, row in top_animes.iterrows():
#     user_line = get_user_info(72699, row["genres"])
    
#     print(i)
#     i+=1
#     mal_id = row["MAL_ID"]
#     result = predict(72699,mal_id, user_line)
#     if result == 0:
#         count += 1

# print(count)

1
