<a href="https://colab.research.google.com/github/hyuna0926/ds-sa-cp2/blob/%EC%A0%84%ED%98%84%EC%95%84/ALS_1_26.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [22]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MinMaxScaler

## 데이터 분리

In [3]:
df=pd.read_csv('/content/drive/MyDrive/ml-100k/movie_lens.csv', index_col=0)
df_c = df.astype(dtype='int32',errors='ignore')
df_c.iloc[:,5:7]=df_c.iloc[:,5:7].astype('category')
df_c.iloc[:,-1:]=df_c.iloc[:,-1:].astype('category')

- 데이터 삭제

In [4]:
unknown = df_c['movie_title'] !='unknown'
df_c = df_c[unknown]

In [5]:
train, test = train_test_split(df_c[['user_id','movie_id','rating']],test_size=0.2,
                               shuffle=True, random_state=3)

In [9]:
train_matrix = train.pivot_table('rating', index='user_id', columns='movie_id').fillna(0)
test_matrix = test.pivot_table('rating', index='user_id', columns='movie_id').fillna(0)

train_matrix=csr_matrix(train_matrix.values).toarray()
test_matrix=csr_matrix(test_matrix.values).toarray()

In [10]:
train_matrix

array([[0., 3., 4., ..., 0., 0., 0.],
       [4., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 5., 0., ..., 0., 0., 0.]])

## ALS

In [52]:
class AlternatingLeastSquares():
  def __init__(self,R, k,alpha ,reg_param, epochs, verbose=True,random_state=42):
    """
    R : rating matrix # 별점 행렬
    k : latent parameter #m*k, k*n 차원의 수 데이터가 많으면 보통 20개
    reg_param : lambda
    epochs : training epochs  #몇 번 반복할 것인지
    verbose : print status #학습과정 출력할지 여부
    """
    self.R=R
    self.num_users, self.num_items = R.shape
    self.k = k
    self.alpha=alpha
    self.reg_param = reg_param
    self.epochs = epochs
    self.verbose = verbose
    self.random_state= random_state
  

  # 사용자
  def user_latent(self, i, Ri):
    """
    i : user index
    Ri : Rating of user index i
    Cu : 신뢰행렬
    return : convergence value of user latent of i index

    user = np.dot(np.linalg.inv(np.dot(vt,v)+lambda_i),np.dot(vt, Rij))
    """
    Cu = 1+self.alpha*np.diag(Ri)
    Vt = self.items.T
    V = np.dot(Cu,self.items)
    lambda_i = self.reg_param * np.eye(self.k)
    Rij= np.dot(Cu, self.R[i].T)

    # np.linalg.solve은 np.linalg.inv()(역행렬) 만들고 np.dot()한 결과와 동일함 
    # du = np.linalg.solve(np.dot(Vt, V)+ lambda_i, np.dot(Vt,Rij)).T

    du = np.dot(np.linalg.inv(np.dot(Vt,V)+lambda_i),np.dot(Vt, Rij))
    return du


  # 아이템
  def item_latent(self, j,Rj):
    """
    Ci: 신뢰도 행렬
    item = np.dot(np.linalg.inv(np.dot(ut,u)+lambda_i), np.dot(ut, Rij))
    """
    Ci = 1+self.alpha*np.diag(Rj)
    Ut = self.users.T
    U = np.dot(Ci,self.users)
    lambda_i = self.reg_param*np.eye(self.k)
    Rij = np.dot(Ci, self.R[:,j])

    # di = np.linalg.solve(np.dot(Ut,U)+lambda_i, np.dot(Ut,Rij))
    di = np.dot(np.linalg.inv(np.dot(Ut,U)+lambda_i), np.dot(Ut,Rij))
    return di


  #rmse,mae
  def cost(self):
    '''
    pow(x,y) = x의 y제곱
    return rmse
    '''
    xi, yi = self.R.nonzero() #0이 아닌 값들의 index 반환
    mse,mae = 0,0
    for x,y in zip(xi,yi):
      mse += pow(self.R[x,y] - self.get_prediction(x,y),2)
      mae += abs(self.R[x,y] - self.get_prediction(x,y))

    return np.sqrt(mse/len(xi)), mae/len(xi)
  


  # 예측값
  def get_prediction(self, i,j):
    """
    get predicted rating: user_i, item_j
    """
    return self.users[i,:].dot(self.items[j,:].T)


  # 학습
  def fit(self):
    # 초기(랜덤) 사용자, 아이템 행렬 만들어주기
    np.random.seed(self.random_state)
    self.users = np.random.normal(size=(self.num_users, self.k))
    self.items = np.random.normal(size=(self.num_items, self.k))

    # 반복(epochs)
    self.training_process = []
    # error : rating - prediction error
    self.user_error =0;
    self.item_error =0; 
    for epoch in range(self.epochs):
      #rating이 존재하는 index를 기준으로 training
      for i, Ri in enumerate(self.R):
        self.users[i]=self.user_latent(i,Ri)
        self.user_error = self.cost()
      
      for j, Rj in enumerate(self.R.T):
        self.items[j] = self.item_latent(j,Rj)
        self.item_error = self.cost()
      
      cost = self.cost()
      self.training_process.append((epoch,cost))

      #print status
      if self.verbose == True and ((epoch+1)%1 == 0):
        print(f"Iteration: {epoch+1}; rmse={cost[0]}; mae={cost[1]}")
  
  #trasform
  def transform(self,test):
    for epoch in range(self.epochs):
      #rating이 존재하는 index를 기준으로 training
      for i, testi in enumerate(test):
        self.users[i]=self.user_latent(i,testi)
        self.user_error = self.cost()
      
      for j, testj in enumerate(test.T):
        self.items[j] = self.item_latent(j,testj)
        self.item_error = self.cost()
      
      cost = self.cost()
      self.training_process.append((epoch,cost))

      #print status
      if self.verbose == True and ((epoch+1)%1 == 0):
        print(f"Iteration: {epoch+1}; rmse={cost[0]}; mae={cost[1]}")


  # 결과
  def get_complete_matrix(self):
    return self.users.dot(self.items.T)
  
  #user,item
  def user_item(self):
    return self.users, self.items

In [None]:
als = AlternatingLeastSquares(R=train_matrix ,k=200,alpha=40,reg_param=40, epochs=3, verbose=True)
als.fit()

In [20]:
als.cost()

(0.6676728494825777, 0.46379485033625373)

In [24]:
user_vecs, item_vecs = als.user_item()

### 추천하기

In [None]:
matrix = df_c.pivot_table('rating', index='user_id', columns='movie_id').fillna(0)
matrix

#### 유저가 안 본 영화 중 추천해주기

In [34]:
def recommend_un(user_id,mf_train,user_vecs,item_vecs,num_items=10):
  # 유저위치
  # user_ind = np.where(matrix.index==user_id)[0][0]
  pref_vec = mf_train[user_id-1,:]#.toarray()                   # 훈련 데이터의 실제 평점
  pref_vec = pref_vec.reshape(-1)+1                           # 1을 더해 안본것들을 1로 만들고
  pref_vec[pref_vec > 1] = 0                                  # 본 것들을 모두 0으로 만듦으로써 곱해질때 본 것들이 안나오게!
  rec_vector = user_vecs[user_id-1,:].dot(item_vecs.T)         # 추천 시스템에 기반한 예측 평점


  # Min-Max Scaling
  min_max = MinMaxScaler()
  rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0] 
  recommend_vector = pref_vec*rec_vector_scaled  # 구매하지 않은 아이템에 대해서만 예측 평점이 남도록

  product_idx = np.argsort(recommend_vector)[::-1][:num_items+1] # num_items만큼 내림차순으로 평점 정렬한 index
    

  movie_list=df_c[['movie_id','movie_title']].drop_duplicates()

  recommend = movie_list[movie_list['movie_id'].isin(product_idx+1)].reset_index(drop=True)

  return recommend

In [35]:
recommend_un(12,train_matrix,user_vecs,item_vecs,25)

Unnamed: 0,movie_id,movie_title
0,983,"Rich Man's Wife, The (1996)"
1,939,Murder in the First (1995)
2,880,Soul Food (1997)
3,815,One Fine Day (1996)
4,278,Bed of Roses (1996)
5,764,If Lucy Fell (1996)
6,458,Nixon (1995)
7,466,Red Rock West (1992)
8,696,City Hall (1996)
9,582,"Piano, The (1993)"


### 성능평가
> **precision@k, recall@k**<br>
> *유저가 본 영화 중 4점 이상인 영화와 추천영화 비교*

#### 유저가 평가한 4점 이상인 영화

In [29]:
def get_movie_4(user_id):
  movie_list=df_c[['movie_id','movie_title']].drop_duplicates()

  trans = matrix[matrix.index==user_id].T
  trans.columns=['rating']

  movie_ind=trans[trans['rating']>=4].index.tolist()  
  movie_list=df_c[['movie_id','movie_title']].drop_duplicates()
  user_movie = movie_list[movie_list['movie_id'].isin(movie_ind)]
  
  return user_movie

In [30]:
get_movie_4(12)

Unnamed: 0,movie_id,movie_title
0,242,Kolya (1996)
309,381,Muriel's Wedding (1994)
881,238,Raising Arizona (1987)
3800,202,Groundhog Day (1993)
8613,591,Primal Fear (1996)
9676,71,"Lion King, The (1994)"
12911,300,Air Force One (1997)
14546,754,Red Corner (1997)
15692,98,"Silence of the Lambs, The (1991)"
16957,684,In the Line of Fire (1993)


#### 유저에게 추천하기(본 것들도 포함)

In [31]:
def recommend(user_id,mf_train,user_vecs,item_vecs,num_items=25):
  # 유저위치
  rec_vector = user_vecs[user_id-1,:].dot(item_vecs.T)         # 추천 시스템에 기반한 예측 평점


  # Min-Max Scaling
  min_max = MinMaxScaler()
  rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0] 

  product_idx = np.argsort(rec_vector_scaled)[::-1][:num_items+1] # num_items만큼 내림차순으로 평점 정렬한 index
    
  movie_list=df_c[['movie_id','movie_title']].drop_duplicates()

  recommend = movie_list[movie_list['movie_id'].isin(product_idx+1)].reset_index(drop=True)

  return recommend

In [36]:
recommend(12,train_matrix,user_vecs,item_vecs)

Unnamed: 0,movie_id,movie_title
0,983,"Rich Man's Wife, The (1996)"
1,939,Murder in the First (1995)
2,880,Soul Food (1997)
3,815,One Fine Day (1996)
4,278,Bed of Roses (1996)
5,764,If Lucy Fell (1996)
6,458,Nixon (1995)
7,466,Red Rock West (1992)
8,696,City Hall (1996)
9,582,"Piano, The (1993)"


#### precision/recall@k

In [37]:
def precision_recall(user_id,data):
  count=0
  user_seen = get_movie_4(user_id)['movie_title']
  user_recommend = recommend(user_id,data,user_vecs,item_vecs)['movie_title']
  for i in user_seen:
    for j in user_recommend:
      if i==j:
        count+=1
  
  precision = count/len(user_recommend)
  if len(user_seen)==0:
    pass
  else:
    recall = count/len(user_seen)
  
  return precision, recall

In [42]:
precision_recall(1,train_matrix)

(0.038461538461538464, 0.006172839506172839)

In [44]:
precision_recall(30,train_matrix)

(0.038461538461538464, 0.034482758620689655)

In [45]:
precision_recall(922,train_matrix)

(0.038461538461538464, 0.017543859649122806)