In [1]:
import pandas as pd
import numpy as np
import random
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve

In [4]:
data=pd.read_csv('/content/drive/MyDrive/project_2_1/data/full_data.csv')
data.movie_title=data.movie_title.str[:-1]

In [5]:
data['target']=data['rating']
data.loc[data['rating']<4, 'target'] = 0
data.loc[data['rating']>3, 'target'] = 1
data

Unnamed: 0,user_id,item_id,rating,timestamp,movie_title,release_date,Action,Adventure,Animation,Children,...,Romance,Sci-Fi,Thriller,War,Western,age,gender,occupation,term,target
0,196,242,3,881250949,Kolya,854064000,0,0,0,0,...,0,0,0,0,0,49,0,writer,27186949,0
1,196,257,2,881251577,Men in Black,860371200,1,1,0,0,...,0,1,0,0,0,49,0,writer,20880377,0
2,196,111,4,881251793,"Truth About Cats & Dogs, The",830476800,0,0,0,0,...,1,0,0,0,0,49,0,writer,50774993,1
3,196,25,4,881251955,"Birdcage, The",839030400,0,0,0,0,...,0,0,0,0,0,49,0,writer,42221555,1
4,196,382,4,881251843,"Adventures of Priscilla, Queen of the Desert, The",757382400,0,0,0,0,...,0,0,0,0,0,49,0,writer,123869443,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99985,873,313,5,891392177,Titanic,852076800,1,0,0,0,...,1,0,0,0,0,48,1,administrator,39315377,1
99986,873,326,4,891392656,G.I. Jane,852076800,1,0,0,0,...,0,0,0,1,0,48,1,administrator,39315856,1
99987,873,348,3,891392577,Desperate Measures,886118400,0,0,0,0,...,0,0,1,0,0,48,1,administrator,5274177,0
99988,873,358,2,891392698,Spawn,852681600,1,1,0,0,...,0,1,1,0,0,48,1,administrator,38711098,0


In [65]:
movies=pd.read_csv('/content/drive/MyDrive/project_2_1/data/item.csv')[['movie_id','movie_title']]

In [66]:
users=list(np.sort(data['user_id'].unique()))
target_data=list(data['target'])

rows=data['user_id'].astype('category').cat.codes
cols=data['movie_title'].astype('category').cat.codes

target_table=sparse.csr_matrix((target_data,(rows,cols)),shape=(len(users),len(movies)))
target_table

<943x1680 sparse matrix of type '<class 'numpy.int64'>'
	with 99585 stored elements in Compressed Sparse Row format>

In [7]:
matrix_size=target_table.shape[0]*target_table.shape[1]
sparsity=(1-(len(target_table.nonzero()[0])/matrix_size))*100
sparsity

96.47006721700603

In [8]:
test_set=target_table.copy()
train_set=target_table.copy()

In [9]:
nonzero_inds=train_set.nonzero()
nonzero_pairs=list(zip(nonzero_inds[0],nonzero_inds[1]))

In [36]:
df=data[(data.timestamp>=889237455) & (data.rating>3)]

In [37]:
user_inds = [index for index in df.user_id]
item_inds = [index for index in df.item_id]

In [None]:
train_set[user_inds,item_inds]=0
train_set.eliminate_zeros()

In [42]:
users_altered=list(set(user_inds))
item_altered=list(set(item_inds))

In [43]:
def implicit_weighted_ALS(train_set, lambda_val=0.1,alpha=40,n_iter=10,rank_size=20,seed=21):
  conf=(alpha*train_set)
  num_user=conf.shape[0]
  num_item=conf.shape[1]

  rstate=np.random.RandomState(seed)
  X=sparse.csr_matrix(rstate.normal(size=(num_user,rank_size)))
  Y=sparse.csr_matrix(rstate.normal(size=(num_item,rank_size)))
  X_eye=sparse.eye(num_user)
  Y_eye=sparse.eye(num_item)

  lambda_eye=lambda_val*sparse.eye(rank_size)

  for i in range(n_iter):
    yTy=Y.T.dot(Y)
    xTx=X.T.dot(X)

    for u in range(num_user):
      conf_samp=conf[u,:].toarray()
      pref=conf_samp.copy()
      pref[pref!=0]=1
      CuI=sparse.diags(conf_samp,[0])
      yTCuIY=Y.T.dot(CuI).dot(Y)
      yTCupu=Y.T.dot(CuI+Y_eye).dot(pref.T)

      X[u] = spsolve(yTy + yTCuIY + lambda_eye, yTCupu)

    for i in range(num_item):
      conf_samp=conf[:,i].T.toarray()
      pref=conf_samp.copy()
      pref[pref!=0]=1

      CiI=sparse.diags(conf_samp,[0])
      xTCiIX=X.T.dot(CiI).dot(pref.T)
      xTCiPi = X.T.dot(CiI+ X_eye).dot(pref.T)

      Y[i]=spsolve(xTx+xTCiIX+lambda_eye,xTCiPi)

    return X, Y.T

In [44]:
user_vecs, item_vecs = implicit_weighted_ALS(train_set, lambda_val = 0.1, alpha = 15, n_iter = 50,rank_size = 20)

  warn('spsolve requires A be CSC or CSR matrix format',


In [45]:
pred_dic={}
for i in range(user_vecs.shape[0]):
  preference=user_vecs[i].dot(item_vecs).toarray()[0].tolist()
  pred_dic[i]=preference

In [46]:
rec_table=pd.DataFrame(pred_dic,index=movies)

In [47]:
rec_table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,933,934,935,936,937,938,939,940,941,942
Kolya,0.000335,0.000480,-0.000128,0.000037,-0.000041,-0.000010,9.744219e-05,0.000188,0.000088,0.000371,...,0.000154,0.000502,0.000456,0.000402,0.000048,0.000189,0.000286,0.000055,0.000465,-0.000071
Men in Black,0.000347,-0.000066,-0.000108,-0.000197,0.000092,-0.000029,9.171307e-07,0.000112,0.000237,-0.000013,...,0.000284,0.000218,0.000138,-0.000023,-0.000021,0.000168,0.000076,0.000029,0.000004,0.000063
"Truth About Cats & Dogs, The",0.002813,0.005318,-0.001882,0.001320,-0.000585,0.001059,2.033325e-03,0.000536,0.000307,0.004038,...,0.004286,0.004686,0.003911,0.005054,0.003374,0.003814,0.002875,0.000512,0.003191,-0.001590
"Birdcage, The",0.016406,0.016881,-0.004243,0.000318,-0.002566,0.009015,1.097367e-02,0.008406,0.012716,0.022839,...,0.018937,0.007251,0.010204,0.008285,0.006563,0.000098,0.005361,-0.000817,0.010255,0.000449
"Adventures of Priscilla, Queen of the Desert, The",0.001417,0.001704,-0.000525,0.000428,-0.001033,-0.000166,1.601896e-03,0.000822,0.001195,0.001317,...,0.001184,-0.000016,0.000689,0.000812,0.000123,-0.000170,-0.000239,-0.000808,0.001093,-0.000185
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Walk in the Sun, A",0.006674,0.006256,-0.001097,0.001282,-0.004252,-0.000770,2.982095e-03,0.004378,0.002017,0.005182,...,0.006294,0.002419,0.003183,0.003006,0.004460,-0.000324,-0.000323,-0.002100,0.003324,0.001173
"Reluctant Debutante, The",0.002062,0.001684,-0.000302,0.000216,-0.001291,-0.000636,9.753890e-04,0.001837,0.000183,0.000992,...,0.001488,0.000772,0.000552,0.000895,0.000951,-0.000204,-0.000158,-0.000989,0.001242,0.000693
Killer: A Journal of Murder,0.003254,0.003897,-0.000999,0.000643,-0.000317,0.000659,1.656040e-03,0.001083,0.002284,0.003594,...,0.003660,0.002322,0.003388,0.002548,0.000963,0.000680,0.001356,-0.000477,0.001560,0.000005
I Don't Want to Talk About It (De eso no se habla),0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [48]:
def run(user_id):
    return rec_table[user_id].sort_values(ascending=False)[1:21]

In [49]:
run(0)

Ben-Hur                                   0.065655
Kull the Conqueror                        0.060810
Mina Tannenbaum                           0.058065
Carrie                                    0.048211
Legal Deceit                              0.047691
Thin Line Between Love and Hate, A        0.046669
Star Trek VI: The Undiscovered Country    0.044711
Little Women                              0.043634
Flirt                                     0.040697
Color of Night                            0.039389
Starship Troopers                         0.038915
Curdled                                   0.038401
Chasing Amy                               0.037936
Hungarian Fairy Tale, A                   0.037889
Angels and Insects                        0.037840
Nixon                                     0.037729
B*A*P*S                                   0.036794
Gate of Heavenly Peace, The               0.036623
I Love Trouble                            0.036521
Carrington                     

In [85]:
item_titles=[]
for i in item_inds:
  item_titles.append(str(movies[movies.movie_id==i].movie_title.values[0])[:-1])

In [86]:
test_data=pd.DataFrame({'user_id':user_inds,'movie_title':item_titles})
test_data=test_data.sort_values(by='user_id').reset_index(drop=True)
test_data

Unnamed: 0,user_id,movie_title
0,1,When the Cats Away (Chacun cherche son chat)
1,1,Delicatessen
2,1,"Truth About Cats & Dogs, The"
3,1,Kolya
4,3,Cop Land
...,...,...
11298,942,That Darn Cat!
11299,942,Air Bud
11300,942,"Ghost and Mrs. Muir, The"
11301,942,To Be or Not to Be


In [87]:
def test_score():
  precision_list=[]
  recall_list=[]
  for i in test_data['user_id'].unique():
    test_title=list(test_data[test_data.user_id==i].movie_title)
    try:
      if len(test_title)>1:
        recommended_list=list(run(i).index)
        count=0
        for value in test_title:
          if value in recommended_list:
            count+=1
        precision=count/len(recommended_list)
        recall=count/(len(test_title))
        precision_list.append(precision)
        recall_list.append(recall)

    except:
      pass

  return f'precision: {np.mean(precision_list)}, recall: {np.mean(recall_list)}'

In [88]:
test_score()

'precision: 0.025364963503649633, recall: 0.017614080364328304'