<a href="https://colab.research.google.com/github/hyuna0926/ds-sa-cp2/blob/%EC%A0%84%ED%98%84%EC%95%84/1_23_cp2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Collaborative Filtering - ALS

## ALS
1. 초기 아이템, 사용자 행렬 초기화
2. 아이템 행렬 고정 사용자행렬 최적화
3. 사용자 행렬 고정 아이템행렬 최적화
4. 위 2,3번 반복

[ALS](https://eda-ai-lab.tistory.com/529)

- 공식으로 구현해보기

In [96]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix

In [41]:
from os import makedirs
#클래스 만들기

class AlternatingLeastSquares():
  def __init__(self, R, k, reg_param, epochs, verbose=False):
    """
    R : rating matrix # 별점 행렬
    k : latent parameter #m*k, k*n 차원의 수 데이터가 많으면 보통 20개
    reg_param : lambda
    epochs : training epochs  #몇 번 반복할 것인지
    verbose : print status #학습과정 출력할지 여부
    """
    self.R=R
    self.num_users, self.num_items = R.shape
    self.k = k
    self.reg_param = reg_param
    self.epochs = epochs
    self.verbose = verbose
  

  # 사용자
  def user_latent(self, i, Ri):
    """
    error : rating - prediction error
    i : user index
    Ri : Rating of user index i
    return : convergence value of user latent of i index

    user = np.dot(np.linalg.inv(np.dot(vt,v)+lambda_i),np.dot(vt, Rij))
    """
    Vt = self.items.T
    V = np.dot(np.diag(Ri),self.items)
    lambda_i = self.reg_param * np.eye(self.k)
    Rij= np.dot(np.diag(Ri), self.R[i].T)

    # np.linalg.solve은 np.linalg.inv()(역행렬) 만들고 np.dot()한 결과와 동일함 
    # du = np.linalg.solve(np.dot(Vt, V)+ lambda_i, np.dot(Vt,Rij)).T

    du = np.dot(np.linalg.inv(np.dot(Vt,V)+lambda_i),np.dot(Vt, Rij))
    return du


  # 아이템
  def item_latent(self, j,Rj):
    """
    item = np.dot(np.linalg.inv(np.dot(ut,u)+lambda_i), np.dot(ut, Rij))
    """
    Ut = self.users.T
    U = np.dot(np.diag(Rj),self.users)
    lambda_i = self.reg_param*np.eye(self.k)
    Rij = np.dot(np.diag(Rj), self.R[:,j])

    # di = np.linalg.solve(np.dot(Ut,U)+lambda_i, np.dot(Ut,Rij))
    di = np.dot(np.linalg.inv(np.dot(Ut,U)+lambda_i), np.dot(Ut,Rij))
    return di


  #rmse
  def cost(self):
    '''
    pow(x,y) = x의 y제곱
    return rmse
    '''
    xi, yi = self.R.nonzero() #0이 아닌 값들의 index 반환
    mse,mae = 0,0
    for x,y in zip(xi,yi):
      mse += pow(self.R[x,y] - self.get_prediction(x,y),2)
      mae += abs(self.R[x,y] - self.get_prediction(x,y))

    return np.sqrt(mse/len(xi)), mae/len(xi)
  



  # 예측값
  def get_prediction(self, i,j):
    """
    get predicted rating: user_i, item_j
    """
    return self.users[i,:].dot(self.items[j,:].T)


  # 학습
  def fit(self):
    # 초기(랜덤) 사용자, 아이템 행렬 만들어주기
    self.users = np.random.normal(size=(self.num_users, self.k))
    self.items = np.random.normal(size=(self.num_items, self.k))

    # 반복(epochs)
    self.training_process = []
    self.user_error =0;
    self.item_error =0; 
    for epoch in range(self.epochs):
      #rating이 존재하는 index를 기준으로 training
      for i, Ri in enumerate(self.R):
        self.users[i]=self.user_latent(i,Ri)
        self.user_error = self.cost()
      
      for j, Rj in enumerate(self.R.T):
        self.items[j] = self.item_latent(j,Rj)
        self.item_error = self.cost()
      
      cost = self.cost()
      self.training_process.append((epoch,cost))

      #print status
      if self.verbose == True and ((epoch+1)%10 == 0):
        print(f"Iteration: {epoch+1}; rmse={cost[0]}; mae={cost[1]}")


  # 결과
  def get_complete_matrix(self):
    return self.users.dot(self.items.T)



## CODE 뜯고 맛보기

In [40]:
def sss(a):
  c=0
  d=0
  for i in range(1,a):
    c+=i
    d-=i
  return c,d
sss(10)[0]

45

In [2]:
R = np.array([
        [1, 0, 0, 1, 3],
        [2, 0, 3, 1, 1],
        [1, 2, 0, 5, 0],
        [1, 0, 0, 4, 4],
        [2, 1, 5, 4, 0],
        [5, 1, 5, 4, 0],
        [0, 0, 0, 1, 0],
    ])

In [3]:
R.nonzero()

(array([0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5, 6]),
 array([0, 3, 4, 0, 2, 3, 4, 0, 1, 3, 0, 3, 4, 0, 1, 2, 3, 0, 1, 2, 3, 3]))

In [4]:
for i, Ri in enumerate(R):
  print(i,Ri)

0 [1 0 0 1 3]
1 [2 0 3 1 1]
2 [1 2 0 5 0]
3 [1 0 0 4 4]
4 [2 1 5 4 0]
5 [5 1 5 4 0]
6 [0 0 0 1 0]


In [5]:
R[1,:]

array([2, 0, 3, 1, 1])

In [6]:
user, item=R.shape
k=2

In [7]:
print(user,item)

7 5


- 랜덤

In [30]:
users=np.random.normal(size=(user, k))
items=np.random.normal(size=(item, k))

In [31]:
items

array([[-0.74701711, -0.17321202],
       [ 0.22829509, -1.09728407],
       [ 1.27890807, -0.32934075],
       [ 1.76217103, -0.46742663],
       [ 0.29966062, -0.19417342]])

In [9]:
users

array([[ 0.07934117,  0.55114124],
       [-0.31357278,  0.30594198],
       [-1.05635029, -0.74633996],
       [-0.18757459,  0.26913108],
       [ 0.11136912, -1.18153912],
       [-0.4979724 , -0.52980363],
       [-0.18822109, -0.78212667]])

- USER 최적화

In [10]:
reg_param =0.01
def user_latent(i, Ri):
    """
    error : rating - prediction error
    i : user index
    Ri : Rating of user index i
    return : convergence value of user latent of i index

    user = np.dot(np.linalg.inv(np.dot(vt,v)+lambda_i),np.dot(vt, Rij))
    """
    Vt = items.T
    V = np.dot(np.diag(Ri),items)
    lambda_i = reg_param * np.eye(k)
    Rij= np.dot(np.diag(Ri), R[i].T)
    #np.linalg.solve은 np.linalg.inv()(역행렬) 만들고 np.dot()한 결과와 동일함 
    du = np.linalg.solve(np.dot(Vt, V)+ lambda_i, np.dot(Vt,Rij)).T
    return du

In [11]:
for i, Ri in enumerate(R):
    users[i]=user_latent(i,Ri)
users    

array([[-2.59052301, -1.82118326],
       [-0.49688413, -0.0993656 ],
       [ 2.22480851, -3.95831303],
       [-3.03147213, -7.11849284],
       [ 2.71700813, -1.0351145 ],
       [-0.60502494, -4.78911648],
       [ 0.63177772, -0.55785019]])

In [17]:
def user_latent_1(i, Ri):
    """
    error : rating - prediction error
    i : user index
    Ri : Rating of user index i
    return : convergence value of user latent of i index

    user = np.dot(np.linalg.inv(np.dot(vt,v)+lambda_i),np.dot(vt, Rij))
    """
    Vt = items.T
    V = np.dot(np.diag(Ri),items)
    lambda_i = reg_param * np.eye(k)
    Rij= np.dot(np.diag(Ri), R[i].T)
    #np.linalg.solve은 np.linalg.inv()(역행렬) 만들고 np.dot()한 결과와 동일함 
    du = np.dot(np.linalg.inv(np.dot(Vt,V)+lambda_i),np.dot(Vt, Rij))
    return du

In [18]:
for i, Ri in enumerate(R):
    users[i]=user_latent_1(i,Ri)
users    

array([[-2.59052301, -1.82118326],
       [-0.49688413, -0.0993656 ],
       [ 2.22480851, -3.95831303],
       [-3.03147213, -7.11849284],
       [ 2.71700813, -1.0351145 ],
       [-0.60502494, -4.78911648],
       [ 0.63177772, -0.55785019]])

- item 최적화

In [34]:
def item_latent(j,Rj):
    """
    item = np.dot(np.linalg.inv(ut,u)+lambda_i, np.dot(ut, Rij))
    """
    Ut = users.T
    U = np.dot(np.diag(Rj),users)
    lambda_i = reg_param*np.eye(k)
    Rij = np.dot(np.diag(Rj), R[:,j])

    di = np.linalg.solve(np.dot(Ut,U)+lambda_i, np.dot(Ut,Rij))
    return di

In [35]:
for j, Rj in enumerate(R.T):
   items[j] = item_latent(j, Rj)
items

array([[ 2.06094569, -1.45060098],
       [-0.44603877, -2.83496424],
       [-0.05551008, -0.44946957],
       [-0.12641285, -3.74322564],
       [-6.72849183,  3.51034187]])

In [36]:
def item_latent_1(j,Rj):
    """
    item = np.dot(np.linalg.inv(np.dot(Ut,u)+lambda_i), np.dot(Ut, Rij))
    """
    Ut = users.T
    U = np.dot(np.diag(Rj),users)
    lambda_i = reg_param*np.eye(k)
    Rij = np.dot(np.diag(Rj), R[:,j])

    di =  np.dot(np.linalg.inv(np.dot(Ut,U)+lambda_i), np.dot(Ut,Rij))
    return di

In [37]:
for j, Rj in enumerate(R.T):
   items[j] = item_latent_1(j, Rj)
items

array([[ 2.06094569, -1.45060098],
       [-0.44603877, -2.83496424],
       [-0.05551008, -0.44946957],
       [-0.12641285, -3.74322564],
       [-6.72849183,  3.51034187]])

In [None]:
users.dot(items.T)

array([[-0.29984125,  0.71107765, -0.96051395,  0.81720924,  2.61590126],
       [ 0.3272503 , -0.16889076,  0.61323903, -0.02690368, -1.02164265],
       [ 3.44584415,  1.96589219,  3.77428699,  5.05083658,  0.54808445],
       [ 1.47866286,  2.10895272,  0.71291596,  3.9700372 ,  4.05590039],
       [ 4.23640621,  0.91296556,  5.7178503 ,  4.06707422, -3.86730051],
       [ 3.37741174,  1.17691668,  4.23669326,  3.88216353, -1.72720063],
       [ 0.73819437,  0.37786422,  0.83957064,  1.02036505, -0.01327932]])

## 더미데이터로 실행

In [42]:
if __name__ == "__main__":
    # rating matrix - User X Item : (7 X 5)
    R = np.array([
        [1, 0, 0, 1, 3],
        [2, 0, 3, 1, 1],
        [1, 2, 0, 5, 0],
        [1, 0, 0, 4, 4],
        [2, 1, 5, 4, 0],
        [5, 1, 5, 4, 0],
        [0, 0, 0, 1, 0],
    ])

In [43]:
als = AlternatingLeastSquares(R = R, reg_param = 0.01, epochs=100, verbose=True, k=3)
als.fit()

Iteration: 10; rmse=0.010575607114263882; mae=0.006549823312835146
Iteration: 20; rmse=0.00854460377106766; mae=0.005388997754247893
Iteration: 30; rmse=0.007285048492959748; mae=0.0046423788527967195
Iteration: 40; rmse=0.006428866404872557; mae=0.004176452036164084
Iteration: 50; rmse=0.00581078512884749; mae=0.0038339531724855672
Iteration: 60; rmse=0.005345332805738212; mae=0.003564112606165212
Iteration: 70; rmse=0.004983008100715914; mae=0.0033442203019312163
Iteration: 80; rmse=0.004693003502318026; mae=0.0031603302629142046
Iteration: 90; rmse=0.004455220506722262; mae=0.003003390666838034
Iteration: 100; rmse=0.004256112552257482; mae=0.0029054737766648


In [44]:
als.get_complete_matrix()

array([[ 1.01324929,  0.53267807, -1.59309973,  1.00524934,  2.99556046],
       [ 2.00108156,  0.07552257,  2.99515009,  0.99289908,  0.99554818],
       [ 0.9962799 ,  1.99908018, -1.35349391,  4.99957716,  4.41429551],
       [ 1.00042033,  1.62379618, -1.33444292,  3.99861113,  4.00041731],
       [ 2.00610086,  0.99958456,  5.00027982,  4.00059116,  0.74626557],
       [ 4.99768857,  1.00075309,  5.00077522,  4.00063448,  4.92697459],
       [-0.05843708,  0.40542251, -0.34464777,  0.99565272,  0.57538344]])

In [45]:
als.cost()

(0.004256112552257482, 0.0029054737766648)

## movielens 데이터 사용

In [48]:
df=pd.read_csv('/content/drive/MyDrive/ml-100k/movie_lens.csv', index_col=0)
df_c = df.astype(dtype='int32',errors='ignore')
df_c.iloc[:,5:7]=df_c.iloc[:,5:7].astype('category')
df_c.iloc[:,-1:]=df_c.iloc[:,-1:].astype('category')

In [50]:
df_c.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100000 entries, 0 to 99999
Data columns (total 27 columns):
 #   Column       Non-Null Count   Dtype   
---  ------       --------------   -----   
 0   user_id      100000 non-null  int32   
 1   movie_id     100000 non-null  int32   
 2   rating       100000 non-null  int32   
 3   timestamp    100000 non-null  int32   
 4   age          100000 non-null  int32   
 5   gender       100000 non-null  category
 6   occupation   100000 non-null  category
 7   unknown      100000 non-null  int32   
 8   Action       100000 non-null  int32   
 9   Adventure    100000 non-null  int32   
 10  Animation    100000 non-null  int32   
 11  Children's   100000 non-null  int32   
 12  Comedy       100000 non-null  int32   
 13  Crime        100000 non-null  int32   
 14  Documentary  100000 non-null  int32   
 15  Drama        100000 non-null  int32   
 16  Fantasy      100000 non-null  int32   
 17  Film-Noir    100000 non-null  int32   
 18  Horro

- 데이터 분리

In [85]:
col_names=['user_id','movie_id','rating','timestamp']
train = pd.read_csv('/content/drive/MyDrive/ml-100k/u1.base',sep='\t',names=col_names)
test = pd.read_csv('/content/drive/MyDrive/ml-100k/u1.test',sep='\t',names=col_names)

In [86]:
train['user_id'].nunique()

943

In [87]:
test['user_id'].nunique()

459

In [88]:
test['movie_id'].nunique()

1410

In [89]:
train['movie_id'].nunique()

1650

In [82]:
# train, test = train_test_split(df_c[['user_id','movie_id','rating']],test_size=0.2,
                              #  shuffle=True, random_state=3)

In [83]:
# train['user_id'].nunique()


943

In [59]:
# print(train.shape, test.shape)

(80000, 3) (20000, 3)


In [97]:
train_matrix = train.pivot_table('rating', index='user_id', columns='movie_id').fillna(0)
test_matrix = test.pivot_table('rating', index='user_id', columns='movie_id').fillna(0)
train_matrix=csr_matrix(train_matrix.values)
test_matrix=csr_matrix(test_matrix.values)


In [98]:
train_matrix.shape

(943, 1650)

In [99]:
test_matrix.shape

(459, 1410)

In [107]:
train_matrix.toarray().shape

(943, 1650)

In [None]:
als = AlternatingLeastSquares(R=train_matrix.toarray() ,k=20, reg_param=0.01, epochs=50, verbose=True)
als.fit()