In [0]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /gdrive
/gdrive


In [0]:
cd My\ Drive

/gdrive/My Drive


In [0]:
cd Colab\ Notebooks

/gdrive/My Drive/Colab Notebooks


In [0]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

class MF(object):
  def __init__(self, Y_data, k, lam=0.1, Xinit=None, Winit=None, learning_rate=0.5, max_iter=1000, print_every=100, user_based=1):
    self.Y_raw_data = Y_data
    self.k = k
    self.lam = lam
    self.learning_rate = learning_rate
    self.max_iter = max_iter
    self.print_every = print_every
    self.user_based = user_based
    self.n_users = int(np.max(Y_data[:, 0])) + 1
    self.n_items = int(np.max(Y_data[:, 1])) + 1
    self.n_ratings = Y_data.shape[0]
    if Xinit is None:
      self.X = np.random.randn(self.n_items, k)
    else:
      self.X = Xinit
    if Winit is None:
      self.W = np.random.randn(k, self.n_users)
    else:
      self.W = Winit
    self.Y_data_n = self.Y_raw_data.copy()
    
  def normalize_Y(self):
    if self.user_based:
      user_col = 0
      item_col = 1
      n_objects = self.n_users
    else:
      item_col = 0
      user_col = 1
      n_objects = self.n_items
    users = self.Y_raw_data[:, user_col]
    self.mu = np.zeros((n_objects,))
    for n in range(n_objects):
      ids = np.where(users == n)[0].astype(np.int32)
      item_ids = self.Y_data_n[ids, item_col]
      ratings = self.Y_data_n[ids, 2]
      m = np.mean(ratings)
      if np.isnan(m):
        m = 0
      self.mu[n] = m
      self.Y_data_n[ids, 2] = ratings - self.mu[n]
  def loss(self):
    L = 0
    for i in range(self.n_ratings):
      n, m, rate = self.Y_data_n[i, 0] , self.Y_data_n[i, 1], self.Y_data_n[i, 2]
      L += 0.5*(rate - self.X[m, :].dot(self.W[:, n]))**2
    L /= self.n_ratings
    L += self.lam*(np.linalg.norm(self.X, 'fro') + np.linalg.norm(self.W, 'fro'))
    return L
  def get_items_rated_by_user(self, user_id):
    ids = np.where(self.Y_data_n[:, 0] == user_id)[0]
    item_ids = self.Y_data_n[ids, 1].astype(np.int32)
    ratings = self.Y_data_n[ids, 2]
    return (item_ids, ratings)
  def get_users_who_rate_item(self, item_id):
    ids = np.where(self.Y_data_n[:, 1] == item_id)[0]
    user_ids = self.Y_data_n[ids, 0].astype(np.int32)
    ratings = self.Y_data_n[ids, 2]
    return (user_ids, ratings)
    
  def updateX(self):
    for m in range(self.n_items):
      user_ids, ratings = self.get_users_who_rate_item(m)
      Wm = self.W[:, user_ids]
      grad_xm = -(ratings - self.X[m, :].dot(Wm)).dot(Wm.T)/self.n_ratings + self.lam*self.X[m, :]
      self.X[m, :] -= self.learning_rate*grad_xm.reshape((self.k, ))
        
  def updateW(self):
    for n in range(self.n_users):
      item_ids, ratings = self.get_items_rated_by_user(n)
      Xn = self.X[item_ids, :]
      grad_wn = -Xn.T.dot(ratings - Xn.dot(self.W[:, n]))/self.n_ratings + self.lam*self.W[:, n]
      self.W[:, n] -= self.learning_rate*grad_wn.reshape((self.k, ))
    
        
  def fit(self):
    self.normalize_Y()
    for it in range(self.max_iter):
      self.updateX()
      self.updateW()
      if(it + 1) % self.print_every == 0:
        rmse_train = self.evaluate_RMSE(self.Y_raw_data)
        print('Iter = ', it+1, ', loss = ', self.loss(), ', RMSE train = ', rmse_train)
  def pred(self, u, i):
    u = int(u)
    i = int(i)
    if self.user_based:
      bias = self.mu[u]
    else:
      bias = self.mu[i]
    pred = self.X[i, :].dot(self.W[:, u]) + bias
    if pred < 0:
      pred = 0
    if pred > 5:
      pred = 5
    return pred
  def pred_for_user(self, user_id):
    ids = np.where(self.Y_data_n[:, 0] == user_id)[0]
    items_rated_by_u = self.Y_data_n[ids, 1]
    y_pred = self.X.dot(self.W[:, user_id]) + self.mu[user_id]
    predicted_ratings = []
    for i in range(self.n_items):
      if i not in items_rated_by_u:
        predicted_ratings.append((i, y_pred[i]))
    return predicted_ratings
  def evaluate_RMSE(self, rate_test):
    n_tests = rate_test.shape[0]
    SE = 0
    for n in range(n_tests):
      pred = self.pred(rate_test[n, 0], rate_test[n, 1])
      SE += (pred - rate_test[n, 2])**2
    RMSE = np.sqrt(SE/n_tests)
    return RMSE
    

In [0]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('ml-100k/ub.base', sep='\t', names=r_cols, encoding='latin-1')
ratings_test = pd.read_csv('ml-100k/ub.test', sep='\t', names=r_cols, encoding='latin-1')

rate_train = ratings_base.as_matrix()
rate_test = ratings_test.as_matrix()

rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

  
  import sys


In [0]:
rs = MF(rate_train, k = 10, lam = .1, print_every = 10, 
    learning_rate = 0.75, max_iter = 70, user_based = 0)
rs.fit()
RMSE = rs.evaluate_RMSE(rate_test)
print('/n User-based MF, RMSE = ', RMSE)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Iter =  10 , loss =  10.75615525862199 , RMSE train =  1.1746593053280765
Iter =  20 , loss =  4.9723702423217775 , RMSE train =  1.0050273853052887
Iter =  30 , loss =  2.4041082257943365 , RMSE train =  0.9965310026332356
Iter =  40 , loss =  1.2301151393566168 , RMSE train =  0.9961859685394788
Iter =  50 , loss =  0.691905737310728 , RMSE train =  0.9961786157363083
Iter =  60 , loss =  0.4450962711894609 , RMSE train =  0.9961802067626432
Iter =  70 , loss =  0.33191264773278545 , RMSE train =  0.9961807334873811
/n User-based MF, RMSE =  1.049803828649758


In [0]:
rs.pred_for_user(23)


[(0, 3.45224655362373),
 (1, 3.4522567215175095),
 (2, 3.4522970646301228),
 (3, 3.4521848338009735),
 (4, 3.452237505197922),
 (5, 3.452212895935477),
 (9, 3.452241941424959),
 (12, 3.45222811630653),
 (13, 3.452235113194202),
 (14, 3.4522836045529757),
 (15, 3.4522260556527846),
 (16, 3.4521491154647994),
 (17, 3.452268690756878),
 (18, 3.452189482700394),
 (19, 3.452227916408497),
 (20, 3.4522578693735673),
 (21, 3.4522452019871848),
 (22, 3.452257358669432),
 (23, 3.4521871885958966),
 (24, 3.4522445063116263),
 (25, 3.45221744089716),
 (26, 3.4522133126701657),
 (27, 3.4522363745820766),
 (28, 3.4522875397244426),
 (29, 3.4521751780256267),
 (30, 3.4522190407012805),
 (31, 3.4521862089591204),
 (32, 3.4522345558627405),
 (33, 3.4521821472148546),
 (34, 3.452233557202249),
 (35, 3.4522043284911623),
 (36, 3.452260346748395),
 (37, 3.4522126689201196),
 (38, 3.4522073066443406),
 (39, 3.452182671638893),
 (41, 3.4522786850733937),
 (42, 3.4522493525436686),
 (43, 3.452301229030032),

In [0]:
x = rs.X
w = rs.W
pre = x.dot(w)
u_23 = pre[:, 23]
print(u_23[13])

1.079292464666561e-06


In [0]:
ratings_1m_based = pd.read_csv('ratings.dat', sep='::', names=r_cols, encoding='latin-1')
ratings_1m = ratings_1m_based.as_matrix()

  """Entry point for launching an IPython kernel.
  


In [0]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(ratings_1m, test_size=0.25, random_state=1)


In [0]:
rm = MF(train, k=2, lam=.1, print_every=2, learning_rate =2 , max_iter=10, user_based=1)
rm.fit()
RMSE = rm.evaluate_RMSE(test)
print('Item-based MF, RMSE = ', RMSE)

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


Iter =  2 , loss =  13.211398656957934 , RMSE train =  1.1653577277434233
Iter =  4 , loss =  8.465128538532104 , RMSE train =  1.053933215603122
Iter =  6 , loss =  5.494424802293621 , RMSE train =  1.031922929602697
Iter =  8 , loss =  3.604519585158595 , RMSE train =  1.0281444317765804
Iter =  10 , loss =  2.3969270437511327 , RMSE train =  1.0275011979725845
Item-based MF, RMSE =  1.036091435953699


In [0]:
rm.pred_for_user(600)

NameError: ignored