In [24]:
import pandas as pd
import numpy as np
anime = pd.read_csv('anime-recommendations-database/anime.csv')
rating = pd.read_csv('anime-recommendations-database/rating.csv')
pd.set_option('display.max_rows', 10)

In [2]:
numUser = 500
numItem = 100
rating_sample = rating[rating["user_id"].isin(range(1,numUser+1))]
rating_sample = rating_sample[rating_sample["anime_id"].isin(range(1,numItem+1))]
rating_sample = rating_sample[rating_sample.rating != -1]

ani_sample = anime[anime["anime_id"].isin(range(1,numItem+1))]

In [48]:
merged = rating_sample.merge(ani_sample, on='anime_id')
matInput = merged.sample(frac=1).reset_index(drop=true)

In [65]:
def toMatrix (df, train_ratio = 0.8):
    # user
    uid = pd.get_dummies(df['user_id'])
    
    # anime
    aid = pd.get_dummies(df['anime_id'])
    
    # genre
    genre = df['genre'].str.get_dummies(sep=',')
    
    # type
    aniType = pd.get_dummies(df['type']) 
    
    # episode
    epi = df['episodes']
    check = epi.loc[epi == 'Unknown']
    if not check.empty:
        mean = pd.to_numeric(epi.loc[epi != 'Unknown']).mean()
        epi.replace('Unknown', mean)
    epi = pd.to_numeric(epi)
    epi = (epi-epi.mean())/epi.std()
    
    # merge and split
    val = df['rating_x']
    prep = pd.concat([uid,aid,genre,aniType,epi,val],axis=1)
    
    msk = np.random.rand(len(prep)) < train_ratio
    train = prep[msk]
    test = prep[~msk]
    
    X_train = train.loc[:, train.columns != 'rating_x'].values.astype('float')
    Y_train = train['rating_x'].values.astype('float')
    X_test = test.loc[:, test.columns != 'rating_x'].values.astype('float')
    Y_test = test['rating_x'].values.astype('float')
    
    return (X_train, Y_train, X_test, Y_test)

In [66]:
X_train, Y_train, X_test, Y_test = toMatrix (merged)

In [94]:
def getInteractions(data):
    data_copy = data.copy()
    data_copy['rating'].replace(-1, 0)
    return data_copy.pivot_table(index = 'anime_id', columns = 'user_id', values = 'rating').fillna(0).T

In [95]:
getInteractions(rating_sample)

anime_id,1,5,6,7,8,15,16,17,18,19,...,91,92,93,94,95,96,97,98,99,100
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,8.0,0.0,0.0,6.0,0.0,6.0,6.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
492,9.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
494,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
495,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
497,9.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,10.0,0.0,8.0,8.0,0.0,7.0,0.0,0.0,0.0,0.0


In [6]:
print(anime.columns)

Index(['anime_id', 'name', 'genre', 'type', 'episodes', 'rating', 'members'], dtype='object')


In [87]:
def getItemFeatures(anime):
    ret = anime.copy()
    
    # drop useless column
    ret = ret.drop(columns=['name', 'members'])
    
    # create dummy genre
    ret = ret.drop('genre', 1).join(ret['genre'].str.get_dummies(sep=', '))
    
    # normalize the episodes
    ret = ret.replace({'Unknown': None})
    ret['episodes']  = pd.to_numeric(ret['episodes'])
    ret = ret.replace({None: ret['episodes'].mean()})
    ret['episodes'] = (ret['episodes'] - ret['episodes'].mean())/ret['episodes'].std()
    ret = ret.drop('episodes',1)
    
    # create dummy type
    ret = ret.drop('type',1).join(pd.get_dummies(ret['type']))
    return ret

In [88]:
result = getItemFeatures(ani_sample)

In [96]:
result.columns

Index(['anime_id', 'rating', 'Action', 'Adventure', 'Cars', 'Comedy',
       'Dementia', 'Demons', 'Drama', 'Ecchi', 'Fantasy', 'Game', 'Harem',
       'Historical', 'Horror', 'Josei', 'Magic', 'Martial Arts', 'Mecha',
       'Military', 'Music', 'Mystery', 'Police', 'Psychological', 'Romance',
       'Samurai', 'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'Shoujo Ai',
       'Shounen', 'Slice of Life', 'Space', 'Sports', 'Super Power',
       'Supernatural', 'Thriller', 'Vampire', 'Movie', 'OVA', 'TV'],
      dtype='object')

In [97]:
anime['genre']

0                     Drama, Romance, School, Supernatural
1        Action, Adventure, Drama, Fantasy, Magic, Mili...
2        Action, Comedy, Historical, Parody, Samurai, S...
3                                         Sci-Fi, Thriller
4        Action, Comedy, Historical, Parody, Samurai, S...
                               ...                        
12289                                               Hentai
12290                                               Hentai
12291                                               Hentai
12292                                               Hentai
12293                                               Hentai
Name: genre, Length: 12294, dtype: object

In [98]:
anime['genre'].head(10)

0                 Drama, Romance, School, Supernatural
1    Action, Adventure, Drama, Fantasy, Magic, Mili...
2    Action, Comedy, Historical, Parody, Samurai, S...
3                                     Sci-Fi, Thriller
4    Action, Comedy, Historical, Parody, Samurai, S...
5               Comedy, Drama, School, Shounen, Sports
6              Action, Adventure, Shounen, Super Power
7                       Drama, Military, Sci-Fi, Space
8    Action, Comedy, Historical, Parody, Samurai, S...
9    Action, Comedy, Historical, Parody, Samurai, S...
Name: genre, dtype: object

In [245]:
def normalzieRating(rating):
    rating = rating.replace(-1, 0)
    ret = pd.merge(rating[['user_id', 'rating']].groupby('user_id').mean(),
                   rating[['user_id', 'rating']].groupby('user_id').std(), 
                   left_index=True, right_index=True)
    ret = ret.fillna(1)
    ret = ret.replace(0,1)
    ret = ret.rename(index=str, columns={"rating_x": "mean", "rating_y": "std"})
    ret['user_id'] = pd.to_numeric(ret.index)
    ret = ret.reset_index(drop=True)
    return ret

In [255]:
def normalize_rating(rating):
    result = normalzieRating(rating)
    rating_normalized = pd.merge(rating, result, on='user_id')
    rating_normalized = rating_normalized.replace(-1, 0)
    rating_normalized['rating'] = (rating_normalized['rating'] - rating_normalized['mean'])/rating_normalized['std']
    return result, rating_normalized

In [260]:
print(len(rating[['user_id', 'anime_id']]))
print(len(rating[['user_id', 'anime_id']].drop_duplicates()))

7813737
7813730


In [None]:
q = """ select count(*) from rating """

print(ps.sqldf(q, locals()))

In [281]:
animee = anime.fillna(anime.mean())

In [282]:
anime.isna().any()

anime_id    False
name        False
genre        True
type         True
episodes    False
rating      False
members     False
dtype: bool

In [287]:

result = getItemFeatures(ani_sample)
result

Unnamed: 0,anime_id,rating,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,...,Slice of Life,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Movie,OVA,TV
21,44,8.83,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,1,0
22,1,8.82,1,1,0,1,0,0,1,0,...,0,1,0,0,0,0,0,0,0,1
38,19,8.72,0,0,0,0,0,0,1,0,...,0,0,0,0,0,1,0,0,0,1
74,21,8.58,1,1,0,1,0,0,1,0,...,0,0,0,1,0,0,0,0,0,1
130,32,8.45,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4536,23,6.70,1,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,1
4826,55,6.62,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5350,69,6.48,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
5438,51,6.46,1,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0


In [300]:
result['combined'] = ''
for i in result.columns:
    if i != 'anime_id' and i != 'rating' and i != 'Movie' and i != 'OVA' and i != 'TV' and i != 'Special': 
        result['combined'] = result['combined'].astype(str) + result[i].astype(str)
result['combined']

In [301]:
result

Unnamed: 0,anime_id,rating,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,...,Space,Sports,Super Power,Supernatural,Thriller,Vampire,Movie,OVA,TV,combined
21,44,8.83,1,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,1000001000010001000000110000000000000100000100...
22,1,8.82,1,1,0,1,0,0,1,0,...,1,0,0,0,0,0,0,0,1,1101001000000000000000000100000100000110100100...
38,19,8.72,0,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,0,1,0000001000001000000111000010000000010000000100...
74,21,8.58,1,1,0,1,0,0,1,0,...,0,0,1,0,0,0,0,0,1,1101001010000000000000000000010001000110100101...
130,32,8.45,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,1,0,0,0000101000000000100001000100000000000000010100...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4536,23,6.70,1,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,1,1000000000000000000000000000010010000100000000...
4826,55,6.62,1,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1100000010001000000000000100000000000110000001...
5350,69,6.48,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1000000010000000010000000100000000000100000001...
5438,51,6.46,1,0,0,0,0,0,1,0,...,0,0,0,1,0,0,0,1,0,1000001000000000000000100001000000100100000100...


In [291]:
result.columns

Index(['anime_id', 'rating', 'Action', 'Adventure', 'Cars', 'Comedy',
       'Dementia', 'Demons', 'Drama', 'Ecchi', 'Fantasy', 'Game', 'Harem',
       'Historical', 'Horror', 'Josei', 'Magic', 'Martial Arts', 'Mecha',
       'Military', 'Music', 'Mystery', 'Police', 'Psychological', 'Romance',
       'Samurai', 'School', 'Sci-Fi', 'Seinen', 'Shoujo', 'Shoujo Ai',
       'Shounen', 'Slice of Life', 'Space', 'Sports', 'Super Power',
       'Supernatural', 'Thriller', 'Vampire', 'Movie', 'OVA', 'TV',
       'combined'],
      dtype='object')

In [None]:
result['']