In [5]:
import pandas as pd
import numpy as np
import random
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve

In [6]:
data=pd.read_csv('/content/drive/MyDrive/project_2_1/data/full_data.csv')
data.movie_title=data.movie_title.str[:-1]

In [7]:
# 평점 4점 이상인 데이터 1, 이외 데이터를 0으로 하는 binary 타겟 값 형성 

In [8]:
data['target']=data['rating']
data.loc[data['rating']<4, 'target'] = 0
data.loc[data['rating']>3, 'target'] = 1
data

Unnamed: 0,user_id,item_id,rating,timestamp,movie_title,release_date,Action,Adventure,Animation,Children,...,Romance,Sci-Fi,Thriller,War,Western,age,gender,occupation,term,target
0,196,242,3,881250949,Kolya,854064000,0,0,0,0,...,0,0,0,0,0,49,0,writer,27186949,0
1,196,257,2,881251577,Men in Black,860371200,1,1,0,0,...,0,1,0,0,0,49,0,writer,20880377,0
2,196,111,4,881251793,"Truth About Cats & Dogs, The",830476800,0,0,0,0,...,1,0,0,0,0,49,0,writer,50774993,1
3,196,25,4,881251955,"Birdcage, The",839030400,0,0,0,0,...,0,0,0,0,0,49,0,writer,42221555,1
4,196,382,4,881251843,"Adventures of Priscilla, Queen of the Desert, The",757382400,0,0,0,0,...,0,0,0,0,0,49,0,writer,123869443,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99985,873,313,5,891392177,Titanic,852076800,1,0,0,0,...,1,0,0,0,0,48,1,administrator,39315377,1
99986,873,326,4,891392656,G.I. Jane,852076800,1,0,0,0,...,0,0,0,1,0,48,1,administrator,39315856,1
99987,873,348,3,891392577,Desperate Measures,886118400,0,0,0,0,...,0,0,1,0,0,48,1,administrator,5274177,0
99988,873,358,2,891392698,Spawn,852681600,1,1,0,0,...,0,1,1,0,0,48,1,administrator,38711098,0


In [9]:
#유저와 영화에 대한 타겟 데이터를 나타내는 행렬 생성  

In [10]:
users=list(np.sort(data['user_id'].unique()))
movies=list(data['movie_title'].unique())
target_data=list(data['rating'])

rows=data['user_id'].astype('category').cat.codes
cols=data['movie_title'].astype('category').cat.codes

target_table=sparse.csr_matrix((target_data,(rows,cols)),shape=(len(users),len(movies)))
target_table

<943x1657 sparse matrix of type '<class 'numpy.int64'>'
	with 99585 stored elements in Compressed Sparse Row format>

In [11]:
# 학습을 진행하기에 충분한 데이터가 존재하는지 확인 (0.5% 이상) 

In [12]:
matrix_size=target_table.shape[0]*target_table.shape[1]
sparsity=(1-(len(target_table.nonzero()[0])/matrix_size))*100
sparsity

93.62676802229176

In [13]:
#학습 데이터 생성(값이 존재하는 데이터의 20%를 삭제) 

In [14]:
test_set=target_table.copy()
train_set=target_table.copy()

In [15]:
nonzero_inds=train_set.nonzero()
nonzero_pairs=list(zip(nonzero_inds[0],nonzero_inds[1]))

In [16]:
random.seed(21)
num_samples = int(np.ceil(0.2 * len(nonzero_pairs)))
samples = random.sample(nonzero_pairs, num_samples)
user_inds = [index[0] for index in samples]
item_inds = [index[1] for index in samples]

In [17]:
train_set[user_inds,item_inds]=0
train_set.eliminate_zeros()

In [18]:
users_altered=list(set(user_inds))
item_altered=list(set(item_inds))

In [19]:
#행렬 분해 및 학습 진행하는 모델 생성 

In [20]:
def implicit_weighted_ALS(train_set, lambda_val=0.1,alpha=40,n_iter=10,rank_size=20,seed=21):
  conf=(alpha*train_set)
  num_user=conf.shape[0]
  num_item=conf.shape[1]

  rstate=np.random.RandomState(seed)
  X=sparse.csr_matrix(rstate.normal(size=(num_user,rank_size)))
  Y=sparse.csr_matrix(rstate.normal(size=(num_item,rank_size)))
  X_eye=sparse.eye(num_user)
  Y_eye=sparse.eye(num_item)

  lambda_eye=lambda_val*sparse.eye(rank_size)

  for i in range(n_iter):
    yTy=Y.T.dot(Y)
    xTx=X.T.dot(X)

    for u in range(num_user):
      conf_samp=conf[u,:].toarray()
      pref=conf_samp.copy()
      pref[pref!=0]=1
      CuI=sparse.diags(conf_samp,[0])
      yTCuIY=Y.T.dot(CuI).dot(Y)
      yTCupu=Y.T.dot(CuI+Y_eye).dot(pref.T)

      X[u] = spsolve(yTy + yTCuIY + lambda_eye, yTCupu)

    for i in range(num_item):
      conf_samp=conf[:,i].T.toarray()
      pref=conf_samp.copy()
      pref[pref!=0]=1

      CiI=sparse.diags(conf_samp,[0])
      xTCiIX=X.T.dot(CiI).dot(pref.T)
      xTCiPi = X.T.dot(CiI+ X_eye).dot(pref.T)

      Y[i]=spsolve(xTx+xTCiIX+lambda_eye,xTCiPi)

    return X, Y.T

In [21]:
user_vecs, item_vecs = implicit_weighted_ALS(train_set, lambda_val = 0.1, alpha = 15, n_iter = 50,rank_size = 20)

  warn('spsolve requires A be CSC or CSR matrix format',


In [22]:
#학습 결과 예측된 유저 행렬과 영화 행렬을 곱하여 예측 행렬 생성 

In [23]:
pred_dic={}
for i in range(user_vecs.shape[0]):
  preference=user_vecs[i].dot(item_vecs).toarray()[0].tolist()
  pred_dic[i]=preference

In [24]:
rec_table=pd.DataFrame(pred_dic,index=movies)

In [26]:
rec_table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,933,934,935,936,937,938,939,940,941,942
Kolya,0.004249,0.010032,0.001395,0.004212,0.000367,0.008120,0.005002,0.006764,0.002504,0.011350,...,-0.002315,0.014113,0.007590,0.010952,0.009066,-0.002093,0.012950,-0.005172,0.011371,1.993775e-03
Men in Black,0.007811,-0.001016,-0.001194,-0.004396,0.010729,0.002624,0.006120,0.003250,0.004497,0.001265,...,0.005949,0.006197,0.001306,0.005797,0.002064,0.007611,0.000871,-0.001063,-0.000193,5.263511e-03
"Truth About Cats & Dogs, The",0.111257,0.181888,-0.058255,0.081439,0.081159,0.120899,0.094836,0.036107,-0.000726,0.147202,...,0.103141,0.285830,0.111505,0.250611,0.178143,0.130072,0.228944,0.047364,0.025463,-2.729720e-02
"Birdcage, The",0.212090,0.160737,0.069514,0.016459,0.202251,0.211832,0.145220,0.114659,0.195849,0.210800,...,0.091737,0.145158,0.073731,0.163440,0.043858,0.034346,0.189749,-0.047892,0.093851,9.542940e-03
"Adventures of Priscilla, Queen of the Desert, The",0.011042,0.042745,0.014335,0.031901,-0.009812,0.030131,0.008428,0.024373,0.027306,0.043049,...,-0.017043,0.022050,0.005115,0.009627,0.007337,0.002569,0.021649,-0.017855,0.032992,1.421217e-02
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Walk in the Sun, A",0.177972,0.088747,0.078735,0.031516,0.156512,0.107551,0.119066,0.110775,0.088820,0.118547,...,0.083804,0.122711,0.035598,0.138702,0.091614,0.058520,0.125273,-0.028302,0.045866,1.579135e-02
"Reluctant Debutante, The",0.041422,0.026106,0.019726,0.010809,0.033450,0.019432,0.030185,0.034124,0.016788,0.023893,...,0.024868,0.037549,0.000577,0.032146,0.023286,0.027019,0.019824,0.000112,0.005995,7.338590e-03
Killer: A Journal of Murder,0.029047,0.039012,-0.011148,0.015767,0.037988,0.034806,0.022450,-0.004965,0.022460,0.057139,...,0.003320,0.059825,0.033751,0.068307,0.028396,0.014040,0.049307,-0.004352,0.023847,5.554352e-03
I Don't Want to Talk About It (De eso no se habla),0.001533,0.003881,-0.003852,0.003637,0.000623,0.002180,0.002555,0.000782,-0.002933,0.004494,...,0.000802,0.009242,0.005213,0.007705,0.006210,0.005397,0.009169,-0.000827,0.001982,-5.319043e-09


In [27]:
def run(user_id):
    return rec_table[user_id].sort_values(ascending=False)[1:21]

In [28]:
run(0)

Terror in a Texas Town           12.278429
Angus                             2.701045
Big One, The                      2.549068
Palookaville                      2.452889
Very Natural Thing, A             2.149605
Denise Calls Up                   1.957402
Rent-a-Kid                        1.953711
Browning Version, The             1.928833
Excess Baggage                    1.890310
Brothers McMullen, The            1.885332
Manhattan                         1.822295
Three Colors: Blue                1.737474
Love and Death on Long Island     1.654154
Every Other Weekend               1.606012
Fresh                             1.494836
English Patient, The              1.375102
Miracle on 34th Street            1.319225
Man Who Knew Too Little, The      1.286789
Suture                            1.284910
In Love and War                   1.280662
Name: 0, dtype: float64

In [29]:
# 모델 성능 평가 

In [30]:
item_titles=[]
for i in item_inds:
  item_titles.append(movies[i])

In [31]:
#삭제한 20%의 데이터를 추출하여 평가 진행 

In [32]:
test_data=pd.DataFrame({'user_id':user_inds,'movie_title':item_titles})
test_data=test_data.sort_values(by='user_id').reset_index(drop=True)
test_data

Unnamed: 0,user_id,movie_title
0,0,Independence Day (ID4)
1,0,Renaissance Man
2,0,Wild America
3,0,Orlando
4,0,"Bronx Tale, A"
...,...,...
19912,942,Kama Sutra: A Tale of Love
19913,942,Enchanted April
19914,942,Best Men
19915,942,Hana-bi


In [33]:
#삭제된 데이터에 포함된 유저를 대상으로 예측 진행하여 precision과 recall값 도출 

In [34]:
def test_score():
  precision_list=[]
  recall_list=[]
  for i in test_data['user_id'].unique():
    test_title=list(test_data[test_data.user_id==i].movie_title)
    try:
      if len(test_title)>1:
        recommended_list=list(run(i).index)
        count=0
        for value in test_title:
          if value in recommended_list:
            count+=1
        precision=count/len(recommended_list)
        recall=count/(len(test_title))
        precision_list.append(precision)
        recall_list.append(recall)

    except:
      pass

  return f'precision: {np.mean(precision_list)}, recall: {np.mean(recall_list)}'

In [35]:
test_score()

'precision: 0.05667735042735043, recall: 0.06916143462254289'