<a href="https://colab.research.google.com/github/hyuna0926/cp2/blob/main/ALS_class.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MinMaxScaler

## 데이터 분리

In [2]:
df=pd.read_csv('/content/drive/MyDrive/ml-100k/movie_lens.csv', index_col=0)
df_c = df.astype(dtype='int32',errors='ignore')
df_c.iloc[:,5:7]=df_c.iloc[:,5:7].astype('category')
df_c.iloc[:,-1:]=df_c.iloc[:,-1:].astype('category')

- 데이터 삭제

In [3]:
unknown = df_c['movie_title'] !='unknown'
df_c = df_c[unknown]

In [4]:
train, test = train_test_split(df_c[['user_id','movie_id','rating']],test_size=0.2,
                               shuffle=True, random_state=3)

In [111]:
train_matrix_p = train.pivot_table('rating', index='user_id', columns='movie_id').fillna(0)
test_matrix_p = test.pivot_table('rating', index='user_id', columns='movie_id').fillna(0)

train_matrix=csr_matrix(train_matrix_p.values).toarray()
test_matrix=csr_matrix(test_matrix_p.values).toarray()

In [None]:
train_matrix_p

In [None]:
train_matrix

## ALS

In [6]:
class AlternatingLeastSquares():
  def __init__(self,R, k,alpha ,reg_param, epochs, verbose=True,random_state=42):
    """
    R : rating matrix # 별점 행렬
    k : latent parameter #m*k, k*n 차원의 수 데이터가 많으면 보통 20개
    reg_param : lambda
    epochs : training epochs  #몇 번 반복할 것인지
    verbose : print status #학습과정 출력할지 여부
    """
    self.R=R
    self.num_users, self.num_items = R.shape
    self.k = k
    self.alpha=alpha
    self.reg_param = reg_param
    self.epochs = epochs
    self.verbose = verbose
    self.random_state= random_state
  

  # 사용자
  def user_latent(self, i, Ri):
    """
    i : user index
    Ri : Rating of user index i
    Cu : 신뢰행렬
    return : convergence value of user latent of i index

    user = np.dot(np.linalg.inv(np.dot(vt,v)+lambda_i),np.dot(vt, Rij))
    """
    Cu = 1+self.alpha*np.diag(Ri)
    Vt = self.items.T
    V = np.dot(Cu,self.items)
    lambda_i = self.reg_param * np.eye(self.k)
    Rij= np.dot(Cu, self.R[i].T)

    # np.linalg.solve은 np.linalg.inv()(역행렬) 만들고 np.dot()한 결과와 동일함 
    # du = np.linalg.solve(np.dot(Vt, V)+ lambda_i, np.dot(Vt,Rij)).T

    du = np.dot(np.linalg.inv(np.dot(Vt,V)+lambda_i),np.dot(Vt, Rij))
    return du


  # 아이템
  def item_latent(self, j,Rj):
    """
    Ci: 신뢰도 행렬
    item = np.dot(np.linalg.inv(np.dot(ut,u)+lambda_i), np.dot(ut, Rij))
    """
    Ci = 1+self.alpha*np.diag(Rj)
    Ut = self.users.T
    U = np.dot(Ci,self.users)
    lambda_i = self.reg_param*np.eye(self.k)
    Rij = np.dot(Ci, self.R[:,j])

    # di = np.linalg.solve(np.dot(Ut,U)+lambda_i, np.dot(Ut,Rij))
    di = np.dot(np.linalg.inv(np.dot(Ut,U)+lambda_i), np.dot(Ut,Rij))
    return di


  #rmse,mae
  def cost(self):
    '''
    pow(x,y) = x의 y제곱
    return rmse
    '''
    xi, yi = self.R.nonzero() #0이 아닌 값들의 index 반환
    mse,mae = 0,0
    for x,y in zip(xi,yi):
      mse += pow(self.R[x,y] - self.get_prediction(x,y),2)
      mae += abs(self.R[x,y] - self.get_prediction(x,y))

    return np.sqrt(mse/len(xi)), mae/len(xi)
  


  # 예측값
  def get_prediction(self, i,j):
    """
    get predicted rating: user_i, item_j
    """
    return self.users[i,:].dot(self.items[j,:].T)


  # 학습
  def fit(self):
    # 초기(랜덤) 사용자, 아이템 행렬 만들어주기
    np.random.seed(self.random_state)
    self.users = np.random.normal(size=(self.num_users, self.k))
    self.items = np.random.normal(size=(self.num_items, self.k))

    # 반복(epochs)
    self.training_process = []
    # error : rating - prediction error
    self.user_error =0;
    self.item_error =0; 
    for epoch in range(self.epochs):
      #rating이 존재하는 index를 기준으로 training
      for i, Ri in enumerate(self.R):
        self.users[i]=self.user_latent(i,Ri)
        self.user_error = self.cost()
      
      for j, Rj in enumerate(self.R.T):
        self.items[j] = self.item_latent(j,Rj)
        self.item_error = self.cost()
      
      cost = self.cost()
      self.training_process.append((epoch,cost))

      #print status
      if self.verbose == True and ((epoch+1)%1 == 0):
        print(f"Iteration: {epoch+1}; rmse={cost[0]}; mae={cost[1]}")
  
  #trasform
  def transform(self,test):
    for epoch in range(self.epochs):
      #rating이 존재하는 index를 기준으로 training
      for i, testi in enumerate(test):
        self.users[i]=self.user_latent(i,testi)
        self.user_error = self.cost()
      
      for j, testj in enumerate(test.T):
        self.items[j] = self.item_latent(j,testj)
        self.item_error = self.cost()
      
      cost = self.cost()
      self.training_process.append((epoch,cost))

      #print status
      if self.verbose == True and ((epoch+1)%1 == 0):
        print(f"Iteration: {epoch+1}; rmse={cost[0]}; mae={cost[1]}")


  # 결과
  def get_complete_matrix(self):
    return self.users.dot(self.items.T)
  
  #user,item
  def user_item(self):
    return self.users, self.items

- *7번 돌리는데 2시간 20분 정도 걸림*

In [7]:
als = AlternatingLeastSquares(R=train_matrix ,k=200,alpha=40,reg_param=40, epochs=7, verbose=True)
als.fit()

Iteration: 1; rmse=0.21830177009663754; mae=0.12710955175978447
Iteration: 2; rmse=0.11177790789022614; mae=0.07035021528329607
Iteration: 3; rmse=0.08553627422899603; mae=0.0557619210315803
Iteration: 4; rmse=0.07202212685324985; mae=0.04806444034202659
Iteration: 5; rmse=0.06420737431694523; mae=0.04362685767997608
Iteration: 6; rmse=0.05849492754633684; mae=0.0402861467008401
Iteration: 7; rmse=0.05427037963131837; mae=0.037800975320189036


In [107]:
als.cost()

(0.05427037963131837, 0.037800975320189036)

In [110]:
als.get_complete_matrix()

array([[-7.92977717e+00,  2.99253071e+00,  3.97666706e+00, ...,
        -4.47760542e-02,  1.96877492e-01,  2.32135797e-02],
       [ 3.94415899e+00, -2.10482492e+00,  5.71294326e-01, ...,
         2.21739594e-01, -1.52355559e-02, -7.19407200e-02],
       [ 1.19957904e-01, -1.73061846e-01, -3.76083241e-01, ...,
         1.83283117e-01,  6.52096491e-02, -8.01917471e-02],
       ...,
       [ 4.97130445e+00, -6.91913060e-01,  1.50456338e+00, ...,
        -5.89256803e-03, -1.06361608e-02, -3.84681306e-02],
       [-6.46708239e-01, -1.61100308e-01, -2.97112780e+00, ...,
         1.13145303e-02, -3.21123669e-02, -1.35224516e-01],
       [-1.74791752e+00,  4.98393877e+00,  2.98856802e+00, ...,
         2.18995348e-01,  2.70787371e-01,  3.07517551e-01]])

In [8]:
user_vecs, item_vecs = als.user_item()

### 추천하기

In [None]:
matrix = df_c.pivot_table('rating', index='user_id', columns='movie_id').fillna(0)
matrix

#### 유저가 안 본 영화 중 추천해주기

In [10]:
def recommend_un(user_id,mf_train,user_vecs,item_vecs,num_items=10):
  # 유저위치
  # user_ind = np.where(matrix.index==user_id)[0][0]
  pref_vec = mf_train[user_id-1,:]#.toarray()                   # 훈련 데이터의 실제 평점
  pref_vec = pref_vec.reshape(-1)+1                           # 1을 더해 안본것들을 1로 만들고
  pref_vec[pref_vec > 1] = 0                                  # 본 것들을 모두 0으로 만듦으로써 곱해질때 본 것들이 안나오게!
  rec_vector = user_vecs[user_id-1,:].dot(item_vecs.T)         # 추천 시스템에 기반한 예측 평점


  # Min-Max Scaling
  min_max = MinMaxScaler()
  rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0] 
  recommend_vector = pref_vec*rec_vector_scaled  # 구매하지 않은 아이템에 대해서만 예측 평점이 남도록

  product_idx = np.argsort(recommend_vector)[::-1][:num_items+1] # num_items만큼 내림차순으로 평점 정렬한 index
    

  movie_list=df_c[['movie_id','movie_title']].drop_duplicates()

  recommend = movie_list[movie_list['movie_id'].isin(product_idx+1)].reset_index(drop=True)

  return recommend

In [11]:
recommend_un(12,train_matrix,user_vecs,item_vecs,25)

Unnamed: 0,movie_id,movie_title
0,8,Babe (1995)
1,377,Heavyweights (1994)
2,510,"Magnificent Seven, The (1954)"
3,144,Die Hard (1988)
4,526,Ben-Hur (1959)
5,265,"Hunt for Red October, The (1990)"
6,179,"Clockwork Orange, A (1971)"
7,22,Braveheart (1995)
8,631,"Crying Game, The (1992)"
9,581,Kalifornia (1993)


## 성능평가
> **precision@k, recall@k**<br>
> *유저가 본 영화 중 평점이 높은 순으로 정렬 후 추천영화와 비교*

- 유저가 평가한 4점 이상인 영화(필요없을듯)

In [29]:
def get_movie_4(user_id, item=25):
  movie_list=df_c[['movie_id','movie_title']].drop_duplicates()

  trans = matrix[matrix.index==user_id].T
  trans.columns=['rating']

  movie_ind=trans[trans['rating']>=4].index.tolist()  
  movie_list=df_c[['movie_id','movie_title']].drop_duplicates()
  user_movie = movie_list[movie_list['movie_id'].isin(movie_ind)]
  
  return user_movie[:item]

#### 유저가 높게 평가한 순으로 정렬 후 뽑기

In [113]:
def top_get_movie(user_id,matrix_p,item=25):
  movie_list=df_c[['movie_id','movie_title']].drop_duplicates()

  trans = matrix_p[matrix_p.index==user_id].T
  trans.columns=['rating']

  movie_ind=trans.sort_values(['rating']).tail(item).index.tolist()  
  user_movie = movie_list[movie_list['movie_id'].isin(movie_ind)]
  
  return user_movie

In [None]:
top_get_movie(12,train_matrix_p)

#### 유저에게 추천하기(본 것들도 포함)

In [115]:
def recommend(user_id,mf_train,user_vecs,item_vecs,num_items=25):
  # 유저위치
  rec_vector = user_vecs[user_id-1,:].dot(item_vecs.T)         # 추천 시스템에 기반한 예측 평점


  # Min-Max Scaling
  min_max = MinMaxScaler()
  rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0] 

  product_idx = np.argsort(rec_vector_scaled)[::-1][:num_items+1] # num_items만큼 내림차순으로 평점 정렬한 index
    
  movie_list=df_c[['movie_id','movie_title']].drop_duplicates()

  recommend = movie_list[movie_list['movie_id'].isin(product_idx+1)].reset_index(drop=True)

  return recommend

In [None]:
recommend(12,train_matrix,user_vecs,item_vecs)

### precision/recall@k

In [119]:
def precision_recall(user_id,data):
  count=0
  user_seen = top_get_movie(user_id,train_matrix_p)['movie_title'] #유저가 본 영화
  user_recommend = recommend(user_id,data,user_vecs,item_vecs)['movie_title'] #유저가 추천받은 영화
  for i in user_seen:
    for j in user_recommend:
      if i==j:
        count+=1
  
  precision = count/len(user_recommend)
  recall = count/len(user_seen)
  
  return precision, recall

In [122]:
precision_recall(12,train_matrix)

(0.5769230769230769, 0.6)

In [121]:
precision_recall(30,train_matrix)

(0.46153846153846156, 0.48)

In [123]:
precision_recall(922,train_matrix)

(0.46153846153846156, 0.48)

In [124]:
precision_recall(196,train_matrix)

(0.38461538461538464, 0.4)

### mean_precision@k

In [125]:
def mean_precision_k(k,data):
  precision=0
  total =0
  for i in range(1,k+1):
    #train,test셋으로 분리했기 때문에 없는 user_id는 넘김
    try:
      total+=1
      precision+=precision_recall(i,data)[0]
    except:
      pass
  mean_precision = precision/total
  return mean_precision

In [126]:
mean_precision_k(400,train_matrix)

0.30113461538461544

### mean recall@k

In [127]:
def mean_recall_k(k,data):
  recall=0
  total =0
  for i in range(1,k+1):
    try:
      total+=1
      recall+=precision_recall(i,data)[1]
    except:
      pass
  mean_recall = recall/total
  return mean_recall

In [128]:
mean_recall_k(400,train_matrix)

0.3116999999999999