## 협업필터링(collaborative filtering) 사용을 위해 

## movielens data 를 matrix 형태로 만들고 cosine-similarity 를 구하는 내용입니다.

In [1]:
import numpy as np
import pandas as pd

### Data loading & preprocessing 

In [None]:
# Location of the sample ratings table (CSV) that the rest of the notebook loads.
ratings_fn = 'https://raw.githubusercontent.com/kiakass/blog/main/recommend_cf.csv'

In [3]:
# Load the ratings with the 'user' column as the row index, then replace
# missing ratings with 0 so every later step receives a fully numeric matrix.
# NOTE(review): after this step an unrated item is indistinguishable from a
# rating of 0, which feeds into both the SVD and the cosine similarities --
# confirm this is the intended treatment of missing values.
df_ratings = pd.read_csv(ratings_fn, index_col='user')
df_ratings = df_ratings.fillna(0)
df_ratings

Unnamed: 0_level_0,item1,item2,item3,item4
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
user1,5,5.0,3.0,2.0
user2,4,0.0,3.0,2.0
user3,2,5.0,2.0,5.0
user4,5,4.0,1.0,5.0
user5,4,0.0,2.0,1.0
user6,2,1.0,3.0,0.0
user7,4,4.0,0.0,2.0


In [4]:
# Normalization: rescale each item column to the [0, 1] range.
# Min-max scaling puts the columns on a common scale; it does not remove or
# dampen outliers (an extreme value simply becomes the 0 or 1 end point).
from sklearn.preprocessing import MinMaxScaler

min_max_scaler = MinMaxScaler(feature_range=(0, 1))

In [5]:
# Learn each column's min/max from the ratings, then rescale those same
# ratings with them. The scaler is kept fitted so the scaling can be undone
# later with inverse_transform.
df_combine_output_scaled = min_max_scaler.fit(df_ratings).transform(df_ratings)

In [6]:
# Wrap the scaled values in a DataFrame again, re-attaching the item columns
# and the user labels (passed as a plain list of labels, as before).
df_scaled_output = pd.DataFrame(
    data=df_combine_output_scaled,
    index=df_ratings.index.tolist(),
    columns=df_ratings.columns,
)
df_scaled_output

Unnamed: 0,item1,item2,item3,item4
user1,1.0,1.0,1.0,0.4
user2,0.666667,0.0,1.0,0.4
user3,0.0,1.0,0.666667,1.0
user4,1.0,0.8,0.333333,1.0
user5,0.666667,0.0,0.666667,0.2
user6,0.0,0.2,1.0,0.0
user7,0.666667,0.8,0.0,0.4


In [7]:
# Sanity check: undo the scaling on the scaled matrix; the displayed array
# should reproduce the original ratings.
min_max_scaler.inverse_transform(df_combine_output_scaled)

array([[5., 5., 3., 2.],
       [4., 0., 3., 2.],
       [2., 5., 2., 5.],
       [5., 4., 1., 5.],
       [4., 0., 2., 1.],
       [2., 1., 3., 0.],
       [4., 4., 0., 2.]])

### 1. SVD, Singular Value Decomposition(특이값분해): $A = USV^{T}$
https://blog.naver.com/kiakass/222200041769

In [8]:
from scipy.sparse.linalg import svds

# Truncated SVD of the scaled user-by-item matrix, keeping k=3 singular
# triplets (with the default solver k can be at most min(A.shape) - 1).
# svds is documented for ndarray / sparse matrix / LinearOperator input, so
# pass the underlying float array rather than the DataFrame itself.
# svds does not promise descending order for the singular values; U, sigma
# and Vt are mutually aligned, so always use the three together.
U, sigma, Vt = svds(df_scaled_output.to_numpy(dtype=float), k=3)

In [9]:
# Inspect the left singular vectors: one row per user, one column per
# retained singular value.
U

array([[-0.12422546, -0.11101413, -0.54521869],
       [-0.15254182, -0.48588619, -0.33028912],
       [ 0.76311683,  0.26474522, -0.4138005 ],
       [-0.27266344,  0.37025334, -0.48623564],
       [-0.3083597 , -0.34551615, -0.24643132],
       [ 0.39478943, -0.51105735, -0.1992514 ],
       [-0.23165055,  0.40478621, -0.29250679]])

In [10]:
# Turn the 1-D vector of singular values into the diagonal matrix S of
# A = U S V^T. np.diag applied to a 2-D array extracts its diagonal instead,
# so only convert while sigma is still 1-D; this keeps the cell safe to re-run.
# Do not sort sigma on its own: its order must stay aligned with the columns
# of U and the rows of Vt.
if np.ndim(sigma) == 1:
    sigma = np.diag(sigma)

$A = USV^{T}$

In [11]:
# Rebuild the low-rank approximation A ~ U S V^T from the SVD factors.
# (The variable name is kept exactly as-is because a later cell reads it.)
user_predict_ratiings = (U @ sigma) @ Vt
user_predict_ratiings

array([[ 0.98776729,  0.77066231,  0.9885708 ,  0.69002404],
       [ 0.67527709,  0.1614275 ,  1.00804485,  0.19585626],
       [ 0.00473868,  1.08884033,  0.67109408,  0.88765112],
       [ 1.00808234,  0.95152703,  0.34088479,  0.80837655],
       [ 0.67043767,  0.07069843,  0.67018997,  0.11059366],
       [-0.00511639,  0.10407838,  0.99521968,  0.12130399],
       [ 0.65991446,  0.67341027, -0.00630868,  0.56008735]])

#### rating 값으로 복원

In [12]:
# Map the reconstructed matrix from the scaled range back to the original
# rating scale with the already-fitted scaler. Nothing is clipped here, so
# restored values can fall slightly outside the observed rating range.
df_output_inverse = min_max_scaler.inverse_transform(user_predict_ratiings)

In [13]:
# Label the restored predictions with the same items and users as the
# original ratings, and display them rounded to two decimals.
preds_df = pd.DataFrame(
    data=df_output_inverse,
    index=df_ratings.index.tolist(),
    columns=df_ratings.columns,
)
preds_df.round(2)

Unnamed: 0,item1,item2,item3,item4
user1,4.96,3.85,2.97,3.45
user2,4.03,0.81,3.02,0.98
user3,2.01,5.44,2.01,4.44
user4,5.02,4.76,1.02,4.04
user5,4.01,0.35,2.01,0.55
user6,1.98,0.52,2.99,0.61
user7,3.98,3.37,-0.02,2.8


#### predictions vs real value 비교

In [14]:
# Show the original ratings again for comparison with the predictions.
df_ratings

Unnamed: 0_level_0,item1,item2,item3,item4
user,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
user1,5,5.0,3.0,2.0
user2,4,0.0,3.0,2.0
user3,2,5.0,2.0,5.0
user4,5,4.0,1.0,5.0
user5,4,0.0,2.0,1.0
user6,2,1.0,3.0,0.0
user7,4,4.0,0.0,2.0


### 2.Cosine_similarity - item based CF : $\cos(\theta) = \frac{\sum_{i=1}^{n} A_i B_i}{\sqrt{\sum_{i=1}^{n} A_i^2} \sqrt{\sum_{i=1}^{n} B_i^2}}$

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
# Scaled user-by-item matrix that the similarity computations start from.
df_scaled_output

Unnamed: 0,item1,item2,item3,item4
user1,1.0,1.0,1.0,0.4
user2,0.666667,0.0,1.0,0.4
user3,0.0,1.0,0.666667,1.0
user4,1.0,0.8,0.333333,1.0
user5,0.666667,0.0,0.666667,0.2
user6,0.0,0.2,1.0,0.0
user7,0.666667,0.8,0.0,0.4


In [21]:
# Transposed view: one row per item, one column per user.
df_scaled_output.T

Unnamed: 0,user1,user2,user3,user4,user5,user6,user7
item1,1.0,0.666667,0.0,1.0,0.666667,0.0,0.666667
item2,1.0,0.0,1.0,0.8,0.0,0.2,0.8
item3,1.0,1.0,0.666667,0.333333,0.666667,1.0,0.0
item4,0.4,0.4,1.0,1.0,0.2,0.0,0.4


In [17]:
# Item-based CF: transpose so that each row is an item's vector of user
# ratings, then take the pairwise cosine similarity between those rows.
item_base_cf = cosine_similarity(df_scaled_output.transpose())

In [18]:
# Label the item-by-item similarity matrix with the item names on both axes
# (the columns of the scaled matrix are the index of its transpose).
icf = pd.DataFrame(
    data=item_base_cf,
    index=df_scaled_output.columns,
    columns=df_scaled_output.columns,
)
icf

Unnamed: 0,item1,item2,item3,item4
item1,1.0,0.701404,0.669439,0.713068
item2,0.701404,1.0,0.585409,0.871227
item3,0.669439,0.585409,1.0,0.608943
item4,0.713068,0.871227,0.608943,1.0


### 3.Cosine_similarity - user based cf

In [19]:
# User-based CF: pairwise cosine similarity between the user rows of the
# scaled matrix, giving a user-by-user similarity matrix.
user_base_cf = cosine_similarity(df_scaled_output)

In [20]:
# Label the user-by-user similarity matrix with the user names on both axes.
ucf_score = pd.DataFrame(
    data=user_base_cf,
    columns=df_scaled_output.index,
    index=df_scaled_output.index,
)
ucf_score

Unnamed: 0,user1,user2,user3,user4,user5,user6,user7
user1,1.0,0.811248,0.743596,0.859201,0.824934,0.661944,0.82029
user2,0.811248,1.0,0.538612,0.666364,0.975681,0.774143,0.427766
user3,0.743596,0.538612,1.0,0.779803,0.427675,0.543557,0.688024
user4,0.859201,0.666364,0.779803,1.0,0.681158,0.291655,0.922374
user5,0.824934,0.975681,0.427675,0.681158,1.0,0.678282,0.487787
user6,0.661944,0.774143,0.543557,0.291655,0.678282,1.0,0.140642
user7,0.82029,0.427766,0.688024,0.922374,0.487787,0.140642,1.0
