In [1]:
import pandas as pd
import numpy as np
import random
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve

In [2]:
data=pd.read_csv('/content/drive/MyDrive/project_2_1/data/full_data.csv')
data.movie_title=data.movie_title.str[:-1]

In [3]:
data['target']=data['rating']
data.loc[data['rating']<4, 'target'] = 0
data.loc[data['rating']>3, 'target'] = 1
data

Unnamed: 0,user_id,item_id,rating,timestamp,movie_title,release_date,Action,Adventure,Animation,Children,...,Romance,Sci-Fi,Thriller,War,Western,age,gender,occupation,term,target
0,196,242,3,881250949,Kolya,854064000,0,0,0,0,...,0,0,0,0,0,49,0,writer,27186949,0
1,196,257,2,881251577,Men in Black,860371200,1,1,0,0,...,0,1,0,0,0,49,0,writer,20880377,0
2,196,111,4,881251793,"Truth About Cats & Dogs, The",830476800,0,0,0,0,...,1,0,0,0,0,49,0,writer,50774993,1
3,196,25,4,881251955,"Birdcage, The",839030400,0,0,0,0,...,0,0,0,0,0,49,0,writer,42221555,1
4,196,382,4,881251843,"Adventures of Priscilla, Queen of the Desert, The",757382400,0,0,0,0,...,0,0,0,0,0,49,0,writer,123869443,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
99985,873,313,5,891392177,Titanic,852076800,1,0,0,0,...,1,0,0,0,0,48,1,administrator,39315377,1
99986,873,326,4,891392656,G.I. Jane,852076800,1,0,0,0,...,0,0,0,1,0,48,1,administrator,39315856,1
99987,873,348,3,891392577,Desperate Measures,886118400,0,0,0,0,...,0,0,1,0,0,48,1,administrator,5274177,0
99988,873,358,2,891392698,Spawn,852681600,1,1,0,0,...,0,1,1,0,0,48,1,administrator,38711098,0


In [4]:
users=list(np.sort(data['user_id'].unique()))
movies=list(data['movie_title'].unique())
target_data=list(data['target'])

rows=data['user_id'].astype('category').cat.codes
cols=data['movie_title'].astype('category').cat.codes

target_table=sparse.csr_matrix((target_data,(rows,cols)),shape=(len(users),len(movies)))
target_table

<943x1657 sparse matrix of type '<class 'numpy.int64'>'
	with 99585 stored elements in Compressed Sparse Row format>

In [5]:
matrix_size=target_table.shape[0]*target_table.shape[1]
sparsity=(1-(len(target_table.nonzero()[0])/matrix_size))*100
sparsity

96.47006721700603

In [6]:
test_set=target_table.copy()
train_set=target_table.copy()

In [7]:
nonzero_inds=train_set.nonzero()
nonzero_pairs=list(zip(nonzero_inds[0],nonzero_inds[1]))

In [8]:
random.seed(21)
num_samples = int(np.ceil(0.2 * len(nonzero_pairs)))
samples = random.sample(nonzero_pairs, num_samples)
user_inds = [index[0] for index in samples]
item_inds = [index[1] for index in samples]

In [9]:
train_set[user_inds,item_inds]=0
train_set.eliminate_zeros()

In [12]:
users_altered=list(set(user_inds))
item_altered=list(set(item_inds))

In [13]:
def implicit_weighted_ALS(train_set, lambda_val=0.1,alpha=40,n_iter=10,rank_size=20,seed=21):
  conf=(alpha*train_set)
  num_user=conf.shape[0]
  num_item=conf.shape[1]

  rstate=np.random.RandomState(seed)
  X=sparse.csr_matrix(rstate.normal(size=(num_user,rank_size)))
  Y=sparse.csr_matrix(rstate.normal(size=(num_item,rank_size)))
  X_eye=sparse.eye(num_user)
  Y_eye=sparse.eye(num_item)

  lambda_eye=lambda_val*sparse.eye(rank_size)

  for i in range(n_iter):
    yTy=Y.T.dot(Y)
    xTx=X.T.dot(X)

    for u in range(num_user):
      conf_samp=conf[u,:].toarray()
      pref=conf_samp.copy()
      pref[pref!=0]=1
      CuI=sparse.diags(conf_samp,[0])
      yTCuIY=Y.T.dot(CuI).dot(Y)
      yTCupu=Y.T.dot(CuI+Y_eye).dot(pref.T)

      X[u] = spsolve(yTy + yTCuIY + lambda_eye, yTCupu)

    for i in range(num_item):
      conf_samp=conf[:,i].T.toarray()
      pref=conf_samp.copy()
      pref[pref!=0]=1

      CiI=sparse.diags(conf_samp,[0])
      xTCiIX=X.T.dot(CiI).dot(pref.T)
      xTCiPi = X.T.dot(CiI+ X_eye).dot(pref.T)

      Y[i]=spsolve(xTx+xTCiIX+lambda_eye,xTCiPi)

    return X, Y.T

In [20]:
user_vecs, item_vecs = implicit_weighted_ALS(train_set, lambda_val = 0.1, alpha = 15, n_iter = 50,rank_size = 20)

  warn('spsolve requires A be CSC or CSR matrix format',


In [43]:
pred_dic={}
for i in range(user_vecs.shape[0]):
  preference=user_vecs[i].dot(item_vecs).toarray()[0].tolist()
  pred_dic[i]=preference

In [52]:
rec_table=pd.DataFrame(pred_dic,index=movies)

In [49]:
rec_table

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,933,934,935,936,937,938,939,940,941,942
Kolya,0.000434,0.000310,-0.000103,0.000015,0.000240,-0.000045,0.000038,0.000051,0.000122,0.000260,...,0.000135,0.000468,0.000363,0.000101,0.000009,0.000188,0.000097,0.000116,0.000256,-0.000128
Men in Black,0.000440,0.000028,-0.000071,-0.000129,0.000104,0.000041,-0.000066,0.000084,0.000030,-0.000161,...,0.000227,-0.000091,-0.000245,-0.000010,0.000056,-0.000037,0.000004,-0.000030,-0.000059,-0.000013
"Truth About Cats & Dogs, The",0.002522,0.003038,-0.000870,0.001281,0.001150,-0.000651,0.001580,-0.001163,0.000585,0.001508,...,0.002540,0.003496,0.001269,0.002581,0.003196,0.002340,0.001345,-0.000115,0.000914,-0.001516
"Birdcage, The",0.009030,0.009710,-0.003006,-0.000617,0.002053,0.010625,0.005587,0.004138,0.010129,0.011556,...,0.013702,0.003747,0.004523,0.003268,0.007465,-0.002162,0.002690,-0.001151,0.003107,0.001250
"Adventures of Priscilla, Queen of the Desert, The",0.001057,0.001212,0.000115,0.000525,-0.000278,-0.000045,0.001175,0.000445,0.000823,0.000503,...,0.001307,0.000154,0.000977,0.000731,0.000380,-0.000189,0.000327,-0.000631,0.001095,-0.000479
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
"Walk in the Sun, A",0.004097,0.003707,-0.000385,-0.000396,-0.001533,0.000525,0.001079,0.001662,0.001844,0.002365,...,0.004424,0.000734,0.000172,0.001272,0.003209,-0.000398,-0.000775,-0.001216,0.002095,-0.000066
"Reluctant Debutante, The",0.001945,0.001489,-0.000081,-0.000287,-0.000684,-0.000314,0.000774,0.001032,0.000436,0.000298,...,0.001195,0.000051,0.000215,0.000483,0.001003,-0.000533,-0.000139,-0.001032,0.000718,-0.000021
Killer: A Journal of Murder,0.001394,0.000994,0.000098,-0.000256,-0.000123,0.000647,0.000724,0.000617,0.001227,0.000723,...,0.002080,0.000554,0.000594,0.000398,0.000429,-0.000912,0.000596,-0.000680,0.001015,0.000133
I Don't Want to Talk About It (De eso no se habla),0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


In [53]:
def run(user_id):
    return rec_table[user_id].sort_values(ascending=False)[1:21]

In [54]:
run(0)

Angel Baby                                0.040376
Kull the Conqueror                        0.036441
Mina Tannenbaum                           0.033723
Little Women                              0.030847
Legal Deceit                              0.026991
Starship Troopers                         0.024092
Star Trek VI: The Undiscovered Country    0.023567
Flirt                                     0.023515
20,000 Leagues Under the Sea              0.023387
Excess Baggage                            0.022921
Carrington                                0.022591
Color of Night                            0.022590
Thin Line Between Love and Hate, A        0.022066
Carrie                                    0.021572
Bitter Moon                               0.021286
Chasing Amy                               0.021005
Angels and Insects                        0.020884
How to Be a Player                        0.020593
Rent-a-Kid                                0.020577
Microcosmos: Le peuple de l'her

In [63]:
item_titles=[]
for i in item_inds:
  item_titles.append(movies[i])

In [70]:
test_data=pd.DataFrame({'user_id':user_inds,'movie_title':item_titles})
test_data=test_data.sort_values(by='user_id').reset_index(drop=True)
test_data

Unnamed: 0,user_id,movie_title
0,0,Smoke
1,0,Killer (Bulletproof Heart)
2,0,Buddy
3,0,Twin Town
4,0,An Unforgettable Summer
...,...,...
11027,942,French Twist (Gazon maudit)
11028,942,Pete's Dragon
11029,942,"Thin Line Between Love and Hate, A"
11030,942,Things to Do in Denver when You're Dead


In [71]:
def test_score():
  precision_list=[]
  recall_list=[]
  for i in test_data['user_id'].unique():
    test_title=list(test_data[test_data.user_id==i].movie_title)
    try:
      if len(test_title)>1:
        recommended_list=list(run(i).index)
        count=0
        for value in test_title:
          if value in recommended_list:
            count+=1
        precision=count/len(recommended_list)
        recall=count/(len(test_title))
        precision_list.append(precision)
        recall_list.append(recall)

    except:
      pass

  return f'precision: {np.mean(precision_list)}, recall: {np.mean(recall_list)}'

In [72]:
test_score()

'precision: 0.061418685121107264, recall: 0.12483097654702882'