In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
# Print NumPy floats with three decimal places.
np.set_printoptions(precision=3)
# Tokenize on runs of one or more word characters, so single-character tokens are kept too.
vectorizer = TfidfVectorizer(token_pattern=u'(?u)\\b\\w+\\b')

# Toy corpus: each string is one document made of space-separated fruit names.
docs = np.array([
        'りんご りんご りんご みかん いちご', # document 1
        'みかん いちご いちご すいか',        # document 2
        'みかん すいか ぶどう',               # document 3
        'いちご ぶどう',                      # document 4
        ])

# Fit the vectorizer on the corpus and compute its TF-IDF matrix.
vecs = vectorizer.fit_transform(docs)

# Print the matrix as a dense array.
print (vecs.toarray())

In [None]:
# Variant: document 1 now contains りんご once instead of three times.
docs = np.array([
        'りんご みかん いちご',           # document 1
        'みかん いちご いちご すいか',    # document 2
        'みかん すいか ぶどう',           # document 3
        'いちご ぶどう',                  # document 4
        ])

# Refit on the modified corpus and print the dense TF-IDF matrix.
vecs = vectorizer.fit_transform(docs)

print (vecs.toarray())

In [None]:
# Variant: a fifth document containing りんご is added to the original corpus.
docs = np.array([
        'りんご りんご りんご みかん いちご', # document 1
        'みかん いちご いちご すいか',        # document 2
        'みかん すいか ぶどう',               # document 3
        'いちご ぶどう',                      # document 4
        'りんご りんご',                      # document 5
        ])

# Refit on the extended corpus and print the dense TF-IDF matrix.
vecs = vectorizer.fit_transform(docs)

print (vecs.toarray())

In [None]:
# Cosine similarity
def cos_sim(v1, v2):
    """Return the cosine similarity between two 1-D vectors.

    Parameters
    ----------
    v1, v2 : array-like
        Vectors of equal length.

    Returns
    -------
    float
        dot(v1, v2) / (||v1|| * ||v2||). When either vector has zero norm the
        ratio is undefined (0/0); 0.0 is returned instead of NaN so that an
        empty document simply counts as "not similar".
    """
    norm_product = np.linalg.norm(v1) * np.linalg.norm(v2)
    # Guard the division: a zero vector would otherwise yield NaN and a RuntimeWarning.
    if norm_product == 0:
        return 0.0
    return np.dot(v1, v2) / norm_product

# Same four-document corpus as in the first TF-IDF example.
docs = np.array([
        'りんご りんご りんご みかん いちご', # document 1
        'みかん いちご いちご すいか',        # document 2
        'みかん すいか ぶどう',               # document 3
        'いちご ぶどう',                      # document 4
        ])
vecs = vectorizer.fit_transform(docs)

# Densify the sparse matrix once and reuse it, instead of converting it again
# for every operand of every similarity call.
doc_vectors = vecs.toarray()

# Cosine similarity between document 1 and each of the remaining documents.
for other in doc_vectors[1:]:
    print(cos_sim(doc_vectors[0], other))

In [None]:
# Load the data
import pandas as pd
# Download the CSV files from https://www.kaggle.com/CooperUnion/anime-recommendations-database/home
# Both files are read from the current working directory.
anime = pd.read_csv("anime.csv")
rating = pd.read_csv("rating.csv")

In [None]:
# Check the shape and the Python type of both tables.
print(anime.shape)
print(rating.shape)
print(type(anime))
print(type(rating))

In [None]:
# Show the first 5 rows of the anime table.
anime.head() 

In [None]:
# Show the first 5 rows of the rating table.
rating.head()

In [None]:
# Summary statistics of the anime table.
anime.describe()

In [None]:
# Summary statistics of the rating table.
rating.describe()

In [None]:
# Show the 10 most popular anime, i.e. the 10 rows with the largest `members` value.
anime.sort_values('members', ascending= False)[:10]

In [None]:
# Distribution of `members` over all anime (11-bin histogram).
%matplotlib inline
anime['members'].hist(bins=11, figsize=(10,10), color = 'red')

In [None]:
# Same histogram restricted to anime with fewer than 100,000 members.
%matplotlib inline
anime[anime['members'] < 100000]['members'].hist(bins=11, figsize=(10,10), color = 'red')

In [None]:
# Discard anime with 100,000 members or fewer (only rows with members > 100000 are kept).
# Note: this overwrites `anime`, so the unfiltered table is gone after this cell runs.
anime = anime[anime['members'] > 100000]

In [None]:
# Distribution of the scores (50-bin histogram of the `rating` column).
%matplotlib inline
rating['rating'].plot.hist(bins=50, figsize=(10,10), color = 'red')

In [None]:
# Summary statistics of the rating table, rounded to 2 decimal places.
round(rating.describe(),2)

In [None]:
# Remove the rows whose score is -1 (only rows with rating > -1 are kept),
# then show the rounded summary statistics of what remains.
rating = rating[rating.rating > -1]
round(rating.describe(),2)

In [None]:
# Score distribution after the removal (the -1 entries are gone).
%matplotlib inline
rating['rating'].hist(bins=50, figsize=(10,10), color = 'red')

In [None]:
# Check for missing data: number of null values per column of the anime table.
anime.isnull().sum()

In [None]:
# Number of null values per column of the rating table.
rating.isnull().sum()

In [None]:
# Build a small frame that contains missing values (NaN in `toy`, NaT in `born`)
# to demonstrate dropna / fillna below.
df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
                   "toy": [np.nan, 'Batmobile', 'Bullwhip'],
                   "born": [pd.NaT, pd.Timestamp("1940-04-25"),
                            pd.NaT]})
df

In [None]:
# Drop the rows that contain missing data (the result is only displayed; `df` is not reassigned).
df.dropna()

In [None]:
# Fill the missing data with 0 (the result is only displayed; `df` is not reassigned).
df.fillna(0)

In [None]:
# Drop the rows with missing data from the anime table,
# then confirm that no null values remain in any column.
anime = anime.dropna()
anime.isnull().sum()

In [None]:
# Keep only the columns that are needed: the id and the title.
anime = anime[['anime_id','name']]
anime.head()

In [None]:
# Merge the two tables with an inner join.
# The key is named explicitly: without `on`, pandas joins on every column name
# the two frames share, so the result would silently change if `anime` still
# carried additional columns that also exist in `rating`.
merged_df = pd.merge(rating, anime, how='inner', on='anime_id')
merged_df.head()

In [None]:
# Sample data for explaining joins (left table): people with an age and a job code.
# job_code contains a missing value and the value 10.
# NOTE(review): `age` mixes strings ('25', '60') with integers (40, 32) — confirm this is intended.
df1 = pd.DataFrame({'name': ['Tom', 'Bob', 'Alice', 'michael','Eric'],
                    'age': ['25', '60',np.nan, 40, 32],
                    'job_code': [np.nan, 3, 2, 10, 3]},
                    )
df1

In [None]:
# Sample data for explaining joins (right table): job code -> job title lookup for codes 1-4.
# NOTE(review): 'chef ' carries a trailing space — confirm this is intended.
df2 = pd.DataFrame({'job_code': [1,2,3,4],
                    'job': ['chef ', 'teacher', 'engineer', 'doctor']},
                   )
df2

In [None]:
# Inner join: keep only the rows whose job_code appears in both frames.
pd.merge(df1, df2, on='job_code', how='inner')

In [None]:
# Left outer join: every row of df1 is kept.
# No key is given, so the join uses the column the two frames have in common (job_code).
pd.merge(df1,df2,how='left')

In [None]:
# Right outer join: every row of df2 is kept.
# No key is given, so the join uses the column the two frames have in common (job_code).
pd.merge(df1,df2,how='right')

In [None]:
# Full outer join: the rows of both frames are kept.
# No key is given, so the join uses the column the two frames have in common (job_code).
pd.merge(df1,df2, how='outer')

In [None]:
# Choose the user to recommend for: list the 10 users with the most rows in merged_df.
merged_df['user_id'].value_counts().head(10)

In [None]:
# Score distribution of user 57620.
merged_df[merged_df['user_id']==57620 ]['rating'].hist(bins=50, figsize=(10,10), color = 'red')

In [None]:
# Score distribution of user 42635.
merged_df[merged_df['user_id']==42635]['rating'].hist(bins=50, figsize=(10,10), color = 'red')

In [None]:
# Anime that user 42635 likes: the rows this user scored 9 or higher, highest score first.
merged_df[(merged_df['user_id'] == 42635 ) & (merged_df['rating'] >= 9)].sort_values(by=['rating'],ascending=False)

In [None]:
# Rows user 42635 scored 9 or higher, highest score first.
# NOTE(review): this cell repeats the preceding cell's query verbatim.
merged_df[(merged_df['user_id'] == 42635 ) & (merged_df['rating'] >= 9)].sort_values(by=['rating'],ascending=False)

In [None]:
#ユーザごとの平均スコア
%matplotlib inline
rating.groupby(['user_id']).mean()['rating'].hist(bins=50, figsize=(10,10), color = 'red')

In [None]:
# Show the first 5 rows of the merged table.
merged_df.head()

In [None]:
import sys

In [None]:
# Pearson correlation coefficient
def pearson_correlation_coefficient(x_df,x_ave,y_df,y_ave):
    """Return the Pearson-style correlation between two users' ratings.

    Parameters
    ----------
    x_df, y_df : DataFrame
        Rating rows of the two users; each must provide a ``rating`` column.
        When both frames also have an ``anime_id`` column, ratings are paired
        by title (first occurrence per title, titles present in both frames).
        Otherwise they are paired by position, truncated to the shorter input.
    x_ave, y_ave : float
        Centre subtracted from each user's ratings. It is supplied by the
        caller and is not recomputed from the rows passed in.

    Returns
    -------
    float
        sum(dx * dy) / (sqrt(sum(dx**2)) * sqrt(sum(dy**2))), or NaN when
        either user shows no deviation from their centre (undefined 0/0).
    """
    def _has_item_key(frame):
        columns = getattr(frame, 'columns', None)
        return columns is not None and 'anime_id' in columns

    if _has_item_key(x_df) and _has_item_key(y_df):
        # Pair ratings by title rather than by row position: the two frames are
        # not guaranteed to list titles in the same order or without duplicates.
        x_by_item = x_df.drop_duplicates('anime_id').set_index('anime_id')['rating']
        y_by_item = y_df.drop_duplicates('anime_id').set_index('anime_id')['rating']
        shared = x_by_item.index.intersection(y_by_item.index)
        x = x_by_item.loc[shared].values.astype(float)
        y = y_by_item.loc[shared].values.astype(float)
    else:
        # No key to align on: pair positionally and stop at the shorter input.
        x = np.asarray(x_df['rating'], dtype=float)
        y = np.asarray(y_df['rating'], dtype=float)
        n_pairs = min(len(x), len(y))
        x, y = x[:n_pairs], y[:n_pairs]

    dx = x - x_ave
    dy = y - y_ave
    denominator = np.sqrt(np.sum(dx ** 2)) * np.sqrt(np.sum(dy ** 2))

    # Zero spread on either side makes the coefficient undefined; return NaN
    # explicitly instead of dividing 0 by 0 and emitting a RuntimeWarning.
    if denominator == 0:
        return float('nan')

    return np.sum(dx * dy) / denominator

In [None]:
# Setup
# Target user, all of that user's rating rows, and the mean of those ratings.
user_u = 42635
user_u_df  = merged_df[merged_df['user_id']==user_u]
user_u_ave = user_u_df['rating'].mean()

# Every distinct user id present in the merged table.
user_list = merged_df['user_id'].unique()

# Empty result frame that the similarity loop fills in.
similarity_df = pd.DataFrame(columns=['user_id','similarity'])

In [None]:
import time
start = time.time()

# Start from an empty frame so that re-running this cell does not add a second
# copy of every row to an already populated similarity_df.
similarity_df = pd.DataFrame(columns=['user_id','similarity'])

for user_v in user_list:

    user_v_df  = merged_df[merged_df['user_id']==user_v]
    user_v_ave = user_v_df['rating'].mean()

    # Skip users who have 10 or fewer ratings.
    if len(user_v_df) <= 10:
        continue

    # Keep only the rows of user_u_df whose title also appears in user_v_df.
    tmp_user_u_df = user_u_df[user_u_df['anime_id'].isin(user_v_df['anime_id'])]

    # Skip users who share 10 or fewer rows with the target user.
    if len(tmp_user_u_df) <= 10:
        continue

    # Keep only the rows of user_v_df whose title appears in tmp_user_u_df.
    tmp_user_v_df = user_v_df[user_v_df['anime_id'].isin(tmp_user_u_df['anime_id'])]

    value = pearson_correlation_coefficient(tmp_user_u_df,user_u_ave,tmp_user_v_df,user_v_ave)

    # One-row frame for this user. Going through a Series keeps both columns
    # float64, matching what appending a Series produced.
    row = pd.Series([user_v,value], similarity_df.columns).to_frame().T

    # DataFrame.append was removed in pandas 2.0, so rows are added with pd.concat.
    # The frame is still grown one row at a time on purpose: this cell is the slow
    # baseline that the list-based cell after it is timed against.
    # The first row replaces the empty frame so its object dtype is not carried over.
    similarity_df = row if similarity_df.empty else pd.concat([similarity_df, row], ignore_index=True)

# Remove the target user's own entry.
similarity_df = similarity_df[similarity_df['user_id'] != user_u]
print(time.time()-start)

In [None]:
# For speed comparison: same computation as the previous cell, but the rows are
# collected in a plain list and turned into a DataFrame once, after the loop.
# Relies on `time` having been imported by the previous cell.
similarity_list = []
start = time.time()


for user_v in user_list:
    
    user_v_df  = merged_df[merged_df['user_id']==user_v]
    user_v_ave = user_v_df['rating'].mean()

    # Skip users who have 10 or fewer ratings.
    if len(user_v_df) <= 10:    
        continue
        
    # Keep only the rows of user_u_df whose title also appears in user_v_df.
    tmp_user_u_df = user_u_df[user_u_df['anime_id'].isin(user_v_df['anime_id'])]              
    
    # Skip users who share 10 or fewer rows with the target user.
    if len(tmp_user_u_df) <= 10:  
        continue
        
    # Keep only the rows of user_v_df whose title appears in tmp_user_u_df.
    tmp_user_v_df = user_v_df[user_v_df['anime_id'].isin(tmp_user_u_df['anime_id'])]            
    
    value = pearson_correlation_coefficient(tmp_user_u_df,user_u_ave,tmp_user_v_df,user_v_ave)
   
    similarity_list.append([user_v,value])

# Build the frame in one step, then remove the entry of user 42635 (the target user).
cmp_similarity_df = pd.DataFrame(similarity_list,columns=['user_id','similarity'])
cmp_similarity_df = cmp_similarity_df[cmp_similarity_df['user_id'] != 42635]                     

# Elapsed wall-clock time in seconds.
print(time.time()-start)

In [None]:
# Number of users for whom a similarity was computed (rows, columns).
similarity_df.shape

In [None]:
# Distribution of the similarity values (50-bin histogram).
similarity_df['similarity'].hist(bins=50, figsize=(10,10), color = 'red')

In [None]:
# Sort users from most to least similar and save the result to similarity.csv
# in the current working directory, without the index column.
sorted_similarity_df = similarity_df.sort_values(by=['similarity'], ascending=False)
sorted_similarity_df.to_csv('similarity.csv', index=False)

In [None]:
# List of anime user U has not watched: rows of `anime` whose id does not appear in user_u_df.
not_watched = anime[~anime['anime_id'].isin(user_u_df['anime_id'])]

In [None]:
# Build a frame with the rating rows of the 2,500 users ranked highest by similarity.
similar_user_df = merged_df[merged_df['user_id'].isin(sorted_similarity_df[:2500]['user_id'])]    

# Take the per-user mean. Only the numeric columns are aggregated: the frame also
# carries the string column `name`, which cannot be averaged and makes a bare
# .mean() raise TypeError on pandas >= 2.0.
similar_user_ave_df = similar_user_df.groupby(['user_id'], as_index=False)[['anime_id', 'rating']].mean()

# Drop the column that is not needed and rename the mean score column.
similar_user_ave_df2 = similar_user_ave_df.drop('anime_id', axis=1).rename(columns={"rating":"ave_rating"})    

In [None]:
# Per-user means before the unneeded column is dropped.
similar_user_ave_df.head()

In [None]:
# Per-user mean score with the column renamed to ave_rating.
similar_user_ave_df2.head()

In [None]:
# Frame containing only the anime the target user has not watched.
similar_user_df2 = similar_user_df[similar_user_df['anime_id'].isin(not_watched['anime_id'])]

# Attach each user's mean score (ave_rating) to the frame via a left join on user_id.
similar_user_df3 = similar_user_df2.merge(similar_user_ave_df2,how='left',on='user_id')

In [None]:
# Similar users' ratings restricted to titles the target user has not watched.
similar_user_df2.head()

In [None]:
# The same rows with each user's ave_rating attached.
similar_user_df3.head()

In [None]:
# First rows of similar_user_df3.
# NOTE(review): this cell repeats the preceding cell verbatim.
similar_user_df3.head()

In [None]:
def f(x):
    """Return (user_id, anime_id, name, rating - ave_rating) for one row.

    `x` is any mapping-like row that provides the keys 'user_id', 'anime_id',
    'name', 'rating' and 'ave_rating'. The last element of the returned tuple
    is the rating expressed relative to the user's average.
    """
    user_id = x['user_id']
    anime_id = x['anime_id']
    title = x['name']
    centred_rating = x['rating'] - x['ave_rating']
    return (user_id, anime_id, title, centred_rating)

# Apply f to every row and rebuild a frame from the resulting tuples.
# The `rating` column now holds rating - ave_rating rather than the raw score.
similar_user_df4 = pd.DataFrame(list(similar_user_df3.apply(f,axis=1)),columns=['user_id','anime_id','name','rating'])
similar_user_df4.head()

In [None]:
# Mean of the `rating` column per title, ordered from the highest mean to the lowest.
# The result has exactly two columns: name and rating.
result = (
    similar_user_df4
    .groupby('name', as_index=False)[['rating']]
    .mean()
    .sort_values(by=['rating'], ascending=False)
)

In [None]:
# Display the final ranking (one row per title, sorted by mean rating, highest first).
result