# Data Preprocessing for Hybrid Filtering

In [1]:
import pandas as pd
import numpy as np

# Switch the kernel's working directory so the relative CSV paths used in
# later cells ('rating.csv', 'anime.csv', 'added_sparse_matrix.csv') resolve.
# NOTE(review): this depends on a per-user home directory layout — confirm
# the data location before re-running on another machine.
%cd ~/data

C:\Users\user\data


In [2]:
"""importing data of rating and genre"""

# Both files are read relative to the current working directory.
# `rating` holds user_id / anime_id / rating rows; `ani` holds per-anime
# metadata including the comma-separated `genre` column (see previews below).
rating = pd.read_csv('rating.csv')
ani = pd.read_csv('anime.csv')

In [3]:
# Preview the first five rows of the ratings table.
rating.head(5)

Unnamed: 0,user_id,anime_id,rating
0,1,20,-1
1,1,24,-1
2,1,79,-1
3,1,226,-1
4,1,241,-1


In [4]:
# Preview the first five rows of the anime metadata table.
ani.head(5)

Unnamed: 0,anime_id,name,genre,type,episodes,rating,members
0,32281,Kimi no Na wa.,"Drama, Romance, School, Supernatural",Movie,1,9.37,200630
1,5114,Fullmetal Alchemist: Brotherhood,"Action, Adventure, Drama, Fantasy, Magic, Mili...",TV,64,9.26,793665
2,28977,Gintama°,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.25,114262
3,9253,Steins;Gate,"Sci-Fi, Thriller",TV,24,9.17,673572
4,9969,Gintama&#039;,"Action, Comedy, Historical, Parody, Samurai, S...",TV,51,9.16,151266


In [5]:
"""Making rating matrix to sparse matrix by users"""

def sparse_umat(rating):
    """Pivot long-format ratings into a user-by-anime table.

    Rows are keyed by ``user_id`` and columns by ``anime_id``. No ``values``
    argument is passed, so the remaining column(s) of ``rating`` are pivoted
    and the result carries two-level columns (value column name, ``anime_id``);
    later cells select the ``'rating'`` group from it. Duplicate
    (user, anime) pairs are combined by ``pivot_table``'s default aggregation.

    After pivoting, cells equal to -1 are replaced by 0. User/anime pairs
    with no row in ``rating`` remain NaN.
    """
    user_by_anime = rating.pivot_table(columns=['anime_id'], index=['user_id'])
    return user_by_anime.replace(to_replace=-1, value=0)

In [6]:
"""Extracting all genre information used"""

def genre_preprocessor(ani):
    """Return the sorted, de-duplicated genre names found in ``ani['genre']``.

    Each non-missing entry is a comma-separated genre string. Spaces are
    stripped before splitting, so multi-word genres collapse into one token
    (e.g. ``'Slice of Life'`` becomes ``'SliceofLife'``).

    Parameters
    ----------
    ani : pandas.DataFrame
        Must contain a ``genre`` column. Entries that are not strings
        (missing values) are ignored.

    Returns
    -------
    numpy.ndarray
        Unique genre tokens in sorted order, as produced by ``np.unique``.
    """
    tokens = []
    # Iterate over the values rather than indexing by position: `genre[i]`
    # is a label lookup and fails when the frame's index is not 0..n-1
    # (for example after filtering rows).
    for entry in ani['genre']:
        if isinstance(entry, str):
            # Split once per row instead of once per token.
            tokens.extend(entry.replace(' ', '').split(','))
    return np.unique(tokens)

In [7]:
"""Extracted 43 genres."""

# `genre_type` fixes the genre order used for the feature columns later on.
genre_type = genre_preprocessor(ani)
genre_type

array(['Action', 'Adventure', 'Cars', 'Comedy', 'Dementia', 'Demons',
       'Drama', 'Ecchi', 'Fantasy', 'Game', 'Harem', 'Hentai',
       'Historical', 'Horror', 'Josei', 'Kids', 'Magic', 'MartialArts',
       'Mecha', 'Military', 'Music', 'Mystery', 'Parody', 'Police',
       'Psychological', 'Romance', 'Samurai', 'School', 'Sci-Fi', 'Seinen',
       'Shoujo', 'ShoujoAi', 'Shounen', 'ShounenAi', 'SliceofLife',
       'Space', 'Sports', 'SuperPower', 'Supernatural', 'Thriller',
       'Vampire', 'Yaoi', 'Yuri'], 
      dtype='|S13')

In [8]:
%%time

# Pivot the full ratings table into the user-by-anime matrix.
# Slow step: the recorded run took close to nine minutes of wall time.
umatrix = sparse_umat(rating)
umatrix

Wall time: 8min 50s


In [13]:
"""Making users genre feature array"""

def sparse_featmat(umatrix, ani, genre_type):
    """Compute each user's mean-centred rating deviation per genre.

    For every row of ``umatrix['rating']`` the ratings greater than 0 are
    centred on that row's mean. Each centred rating is credited to every
    genre of the corresponding anime, and each genre total is divided by the
    number of that user's rated anime carrying the genre.

    Anime whose ``genre`` is missing, or whose id does not appear in ``ani``,
    contribute nothing.

    Parameters
    ----------
    umatrix : pandas.DataFrame
        User-by-anime table with two-level columns whose ``'rating'`` group
        is keyed by ``anime_id`` (the layout returned by ``sparse_umat``).
    ani : pandas.DataFrame
        Needs ``anime_id`` and ``genre`` columns; ``genre`` is a
        comma-separated string or missing.
    genre_type : array-like of str
        Genre names with spaces removed; defines the output column order.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(umatrix), len(genre_type))``. An entry is NaN
        when the user has no rated anime in that genre.
    """
    genre_type = np.asarray(genre_type)
    ratings = umatrix['rating']
    n_genres = len(genre_type)

    # Resolve each anime's genre once, instead of filtering `ani` again for
    # every rated anime of every user. The first row per id wins, and ids
    # that have any missing-genre row are skipped.
    null_genre_ids = set(ani.loc[ani['genre'].isnull(), 'anime_id'])
    genre_by_id = {}
    for anime_id, genre in zip(ani['anime_id'], ani['genre']):
        genre_by_id.setdefault(anime_id, genre)

    # membership[c, g] is 1.0 when the anime in rating column c has genre g.
    # Rows stay all-zero for unknown ids and missing genres, which replaces
    # the former catch-all `except:` with an explicit skip.
    membership = np.zeros((ratings.shape[1], n_genres))
    for col, anime_id in enumerate(ratings.columns):
        genre = genre_by_id.get(anime_id)
        if anime_id in null_genre_ids or not isinstance(genre, str):
            continue
        membership[col] = np.isin(genre_type, genre.replace(' ', '').split(','))

    values = ratings.values
    # Start from NaN: users with no positive rating keep an all-NaN row.
    u_mfeat = np.full((len(umatrix), n_genres), np.nan)

    # Genres a user never rated give 0/0; NaN is the intended result there,
    # so silence only the floating-point warnings for this loop.
    with np.errstate(divide='ignore', invalid='ignore'):
        for i in range(values.shape[0]):
            rated = values[i] > 0  # NaN compares False, so unrated cells drop out
            if not rated.any():
                continue
            user_ratings = values[i, rated]
            diff = user_ratings - user_ratings.mean()
            rated_membership = membership[rated]
            # Per-genre sum of centred ratings over per-genre rated count.
            u_mfeat[i] = diff.dot(rated_membership) / rated_membership.sum(axis=0)

    return u_mfeat

In [12]:
# Build the per-user genre feature table. Without this assignment
# `gen_sparse` is undefined when the notebook is run on a fresh kernel.
# NOTE(review): reconstructed from the displayed output (genre-named columns,
# default 0-based row index) — confirm this matches how it was first built.
gen_sparse = pd.DataFrame(sparse_featmat(umatrix, ani, genre_type), columns=genre_type)
gen_sparse.head(5)

Unnamed: 0,Action,Adventure,Cars,Comedy,Dementia,Demons,Drama,Ecchi,Fantasy,Game,...,ShounenAi,SliceofLife,Space,Sports,SuperPower,Supernatural,Thriller,Vampire,Yaoi,Yuri
0,0.0,0.0,,0.0,,0.0,,0.0,0.0,0.0,...,,,,,,0.0,,,,
1,,,,0.0,,,,,,,...,,,,0.0,,,,,,
2,-0.274895,-0.033302,,0.092677,,-1.065217,-0.113604,0.234783,0.011706,-0.565217,...,,0.234783,,1.034783,0.21256,0.018116,0.434783,-1.565217,,
3,,,,,,,,,,,...,,,,,,,,,,
4,0.346179,0.831693,1.64488,-0.199357,-3.35512,-0.197225,0.862829,-1.84512,0.055406,-0.25512,...,,0.151729,2.89488,1.64488,-0.184388,-0.008181,2.930594,0.073452,,


In [16]:
"""Merging rating matrix with feature matrix and transform it to array"""

# Column-wise concatenation: the rating columns come first, followed by the
# genre feature columns of `gen_sparse`. Both inputs must have one row per
# user in the same order.
ufeat_matrix = np.hstack([umatrix['rating'].values, gen_sparse])
ufeat_matrix

array([[         nan,          nan,          nan, ...,          nan,
                 nan,          nan],
       [         nan,          nan,          nan, ...,          nan,
                 nan,          nan],
       [         nan,          nan,          nan, ...,  -1.56521739,
                 nan,          nan],
       ..., 
       [         nan,          nan,          nan, ...,          nan,
                 nan,          nan],
       [ 10.        ,  10.        ,  10.        , ...,  -0.45657694,
                 nan,  -1.54748603],
       [         nan,          nan,          nan, ...,          nan,
                 nan,          nan]])

In [25]:
# Replace every NaN with 0. After this step an unrated anime and an
# undefined genre score are both stored as 0.
ufeat_matrix = np.nan_to_num(ufeat_matrix)
ufeat_matrix

array([[  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [  0.        ,   0.        ,   0.        , ...,  -1.56521739,
          0.        ,   0.        ],
       ..., 
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ],
       [ 10.        ,  10.        ,  10.        , ...,  -0.45657694,
          0.        ,  -1.54748603],
       [  0.        ,   0.        ,   0.        , ...,   0.        ,
          0.        ,   0.        ]])

In [18]:
# Persist the combined matrix to the working directory. `to_csv` is called
# with its default index setting, so the row index is written as the first
# (unnamed) column of the file.
pd.DataFrame(ufeat_matrix).to_csv('added_sparse_matrix.csv', encoding = 'utf-8')