First download dataset to start ML! Here I stored data set on my Google Drive.

In [70]:
!pip install -U -q PyDrive
import os
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

# Download file to local (colab) working directory
def google_drive_cpdir(source_id, local_target):
  try:
    os.makedirs(local_target)
  except: 
    pass
  
  file_list = drive.ListFile({'q': "'{source_id}' in parents". \
                              format(source_id=source_id)}).GetList()
  result_list = []
  
  for f in file_list:
    if f["title"].startswith("."):
      continue
    fname = os.path.join(local_target, f['title'])
    if f['mimeType'] == 'application/vnd.google-apps.folder':
      result_list += google_drive_cpdir(f['id'], fname)
    else:
      f_ = drive.CreateFile({'id': f['id']})
      f_.GetContentFile(fname)
      result_list.append(fname)
  
  return result_list

# Copy file
cp_list = google_drive_cpdir('1SDt7ES_zOdaFR9ZFedZ9vF_M50odeUQl', '~/dataset')
cp_list

['~/dataset/users.dat',
 '~/dataset/README',
 '~/dataset/ratings.dat',
 '~/dataset/movies.dat']

In [71]:
with open('~/dataset/README', 'r') as f:
  print(f.read())

SUMMARY

These files contain 1,000,209 anonymous ratings of approximately 3,900 movies 
made by 6,040 MovieLens users who joined MovieLens in 2000.

USAGE LICENSE

Neither the University of Minnesota nor any of the researchers
involved can guarantee the correctness of the data, its suitability
for any particular purpose, or the validity of results based on the
use of the data set.  The data set may be used for any research
purposes under the following conditions:

     * The user may not state or imply any endorsement from the
       University of Minnesota or the GroupLens Research Group.

     * The user must acknowledge the use of the data set in
       publications resulting from the use of the data set
       (see below for citation information).

     * The user may not redistribute the data without separate
       permission.

     * The user may not use this information for any commercial or
       revenue-bearing purposes without first obtaining permission
       from a facult

In [72]:
!pip install -q pandas sklearn
import os

import tensorflow as tf
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [73]:
def read_movie_lens_1m():
  rating_file = os.path.join('~/dataset', 'ratings.dat')
  with open(rating_file, 'r') as f:
    df = pd.read_csv(f, sep='::', names=['userID','itemID','rating'], usecols=[0,1,2], engine='python')
    return df
  
df = read_movie_lens_1m()
df.userID = df.userID.astype(int)
df.itemID = df.itemID.astype(int)
df.rating = df.rating.astype(float)
df.head()

Unnamed: 0,userID,itemID,rating
0,1,1193,5.0
1,1,661,3.0
2,1,914,3.0
3,1,3408,4.0
4,1,2355,5.0


In [74]:
df_dim = len(df.userID.unique()), len(df.itemID.unique())
item_dim = df_dim[1]
print("Dataset: User x Item dimension is ", df_dim)

train, test = train_test_split(df, test_size=0.2)

train_dim = len(train.userID.unique()), item_dim
test_dim = len(test.userID.unique()), item_dim
print("Train: User x Item dimension is ", train_dim)
print("Test: User x Item dimension is ", test_dim)

Dataset: User x Item dimension is  (6040, 3706)
Train: User x Item dimension is  (6040, 3706)
Test: User x Item dimension is  (6035, 3706)


In [75]:
user_idx = df.userID.unique()
item_idx = df.itemID.unique()
user_map = {k: v for v, k in enumerate(user_idx)}
item_map = {k: v for v, k in enumerate(item_idx)}

train_pivot = np.zeros(df_dim, dtype=float)

for index, row in train.iterrows():
  u_pos, i_pos = user_map[row['userID']], item_map[row['itemID']]
  train_pivot[u_pos, i_pos] = row['rating']

test_pivot = np.copy(train_pivot)

for index, row in test.iterrows():
  u_pos, i_pos = user_map[row['userID']], item_map[row['itemID']]
  test_pivot[u_pos, i_pos] = row['rating']

train_pivot /= 5
test_pivot /= 5

print("Sample Training set")
train_pivot

Sample Training set


array([[1. , 0.6, 0.6, ..., 0. , 0. , 0. ],
       [1. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0.8, ..., 0. , 0. , 0. ],
       [0.8, 0. , 0. , ..., 0. , 0. , 0. ]])

https://github.com/miguelgfierro/sciblog_support/blob/master/Intro_to_Recommendation_Systems/Intro_Recommender.ipynb

In [114]:
#encoding_dim = 100

#input_img = Input(shape=(item_dim,))
#encoded = Dense(encoding_dim, activation='sigmoid')(input_img)
#decoded = Dense(item_dim, activation=None)(encoded)

#autoencoder = Model(input_img, decoded)

#autoencoder.compile(optimizer='rmsprop', loss='mean_squared_error')


def entry_stop_gradients(target, mask):
    mask_h = tf.abs(mask-1)
    return tf.stop_gradient(mask_h * target) + mask * target

graph = tf.Graph()

with graph.as_default():
  # Training Parameters
  learning_rate = 0.01
  batch_size = 100
  num_epochs = 100

  display_step = 1000

  # Network Parameters
  num_hidden_1 = 500 # 1st layer num features
  num_hidden_2 = 200 # 2nd layer num features (the latent dim)
  num_input = item_dim

  X = tf.placeholder("float", [None, num_input])
  Y = tf.placeholder("float", [None, num_input])

  weights = {
      'encoder_h1': tf.Variable(tf.random_normal([num_input, num_hidden_1])),
      'encoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_hidden_2])),
      'decoder_h1': tf.Variable(tf.random_normal([num_hidden_2, num_hidden_1])),
      'decoder_h2': tf.Variable(tf.random_normal([num_hidden_1, num_input])),
  }
  biases = {
      'encoder_b1': tf.Variable(tf.random_normal([num_hidden_1])),
      'encoder_b2': tf.Variable(tf.random_normal([num_hidden_2])),
      'decoder_b1': tf.Variable(tf.random_normal([num_hidden_1])),
      'decoder_b2': tf.Variable(tf.random_normal([num_input])),
  }

  activation = tf.nn.leaky_relu
  
  # Building the encoder
  def encoder(x):
      layer_1 = activation(tf.add(tf.matmul(x, weights['encoder_h1']),
                                     biases['encoder_b1']))
      layer_2 = activation(tf.add(tf.matmul(layer_1, weights['encoder_h2']),
                                     biases['encoder_b2']))
      return layer_2

  # Building the decoder
  def decoder(x):
      layer_1 = activation(tf.add(tf.matmul(x, weights['decoder_h1']),
                                     biases['decoder_b1']))
      layer_2 = tf.nn.sigmoid(tf.add(tf.matmul(layer_1, weights['decoder_h2']),
                                     biases['decoder_b2']))
      return layer_2

  # Construct model
  encoder_op = encoder(X)
  decoder_op = decoder(encoder_op)

  y_pred = decoder_op
  y_true = Y

  #X_train_batch = tf.train.batch([train_pivot], batch_size=batch_size)

  # Mask unseen data from loss function
  y_pred_autoenc = tf.where(tf.greater(X, 0.), y_pred, y_true)

  # Define loss and optimizer, minimize the squared error
  reg_losses = tf.get_collection(tf.GraphKeys.REGULARIZATION_LOSSES)
  reg_constant = 0.01  # Choose an appropriate one.
  
  loss = tf.reduce_mean(tf.pow(y_true - y_pred, 2))
  optimizer_loss = tf.reduce_mean(tf.pow(y_true - y_pred, 2)) + sum(reg_losses) * reg_constant
  
  optimizer = tf.train.AdamOptimizer(learning_rate).minimize(optimizer_loss)

  # Initialize the variables (i.e. assign their default value)
  init = tf.global_variables_initializer()

In [115]:
#autoencoder.fit(train_pivot,
#                train_pivot,
#                epochs=100,
#                batch_size=256,
#                shuffle=True)
import math

train_count = df_dim[0]

saver = tf.train.Saver()
  
# Start Training
# Start a new TF session
sess = tf.Session(graph=graph)

# Run the initializer
sess.run(init)

# Training
for epoch in range(1, num_epochs+1):
  # Prepare Data

  final_loss = np.zeros((1))

  for start, end in zip(range(0, train_count, batch_size),
                          range(batch_size, train_count + 1, batch_size)):

    # Run optimization op (backprop) and cost op (to get loss value)
    _, l = sess.run([optimizer, optimizer_loss], \
                    feed_dict={X: train_pivot[start:end], Y: test_pivot[start:end]})

    final_loss += (end-start) * l

  final_loss = math.sqrt(final_loss)/float(train_count)

  # Display logs per step
  print('Epoch %i: Minibatch MSE Loss: %f' % (epoch, l))


#saver.save(sess, './model/model.ckpt')

Epoch 1: Minibatch MSE Loss: 0.271090
Epoch 2: Minibatch MSE Loss: 0.163188
Epoch 3: Minibatch MSE Loss: 0.137914
Epoch 4: Minibatch MSE Loss: 0.129262
Epoch 5: Minibatch MSE Loss: 0.126099
Epoch 6: Minibatch MSE Loss: 0.124411
Epoch 7: Minibatch MSE Loss: 0.123188
Epoch 8: Minibatch MSE Loss: 0.121881
Epoch 9: Minibatch MSE Loss: 0.120981
Epoch 10: Minibatch MSE Loss: 0.120593
Epoch 11: Minibatch MSE Loss: 0.120373
Epoch 12: Minibatch MSE Loss: 0.120305
Epoch 13: Minibatch MSE Loss: 0.119825
Epoch 14: Minibatch MSE Loss: 0.119708
Epoch 15: Minibatch MSE Loss: 0.119230
Epoch 16: Minibatch MSE Loss: 0.119221
Epoch 17: Minibatch MSE Loss: 0.119002
Epoch 18: Minibatch MSE Loss: 0.118977
Epoch 19: Minibatch MSE Loss: 0.118967
Epoch 20: Minibatch MSE Loss: 0.118748
Epoch 21: Minibatch MSE Loss: 0.118756
Epoch 22: Minibatch MSE Loss: 0.118697
Epoch 23: Minibatch MSE Loss: 0.118609
Epoch 24: Minibatch MSE Loss: 0.117813
Epoch 25: Minibatch MSE Loss: 0.116808
Epoch 26: Minibatch MSE Loss: 0.11

Epoch 72: Minibatch MSE Loss: 0.108692
Epoch 73: Minibatch MSE Loss: 0.108692
Epoch 74: Minibatch MSE Loss: 0.108692
Epoch 75: Minibatch MSE Loss: 0.108692
Epoch 76: Minibatch MSE Loss: 0.108692
Epoch 77: Minibatch MSE Loss: 0.108692
Epoch 78: Minibatch MSE Loss: 0.108692
Epoch 79: Minibatch MSE Loss: 0.108692
Epoch 80: Minibatch MSE Loss: 0.108692
Epoch 81: Minibatch MSE Loss: 0.108692
Epoch 82: Minibatch MSE Loss: 0.108692
Epoch 83: Minibatch MSE Loss: 0.108692
Epoch 84: Minibatch MSE Loss: 0.108692
Epoch 85: Minibatch MSE Loss: 0.108692
Epoch 86: Minibatch MSE Loss: 0.108692
Epoch 87: Minibatch MSE Loss: 0.108692
Epoch 88: Minibatch MSE Loss: 0.108692
Epoch 89: Minibatch MSE Loss: 0.108692
Epoch 90: Minibatch MSE Loss: 0.108692
Epoch 91: Minibatch MSE Loss: 0.108692
Epoch 92: Minibatch MSE Loss: 0.108692
Epoch 93: Minibatch MSE Loss: 0.108692
Epoch 94: Minibatch MSE Loss: 0.108692
Epoch 95: Minibatch MSE Loss: 0.108692
Epoch 96: Minibatch MSE Loss: 0.108692
Epoch 97: Minibatch MSE L

In [116]:
sample_idx = 10
sampleX, sampleY = train_pivot[sample_idx], test_pivot[sample_idx]

stackX = np.vstack([sampleX])
stackY = np.vstack([sampleY])

#ckpt = tf.train.get_checkpoint_state('./model/')
#saver.restore(sess, ckpt.model_checkpoint_path)
pred = sess.run([y_pred],  feed_dict={X: stackX, Y: stackY})

pred = pred[0][0]

for i, v in enumerate(sampleY):
  print(i, '*' if sampleX[i] != sampleY[i] else ' ', sampleY[i], pred[i])

0   0.0 0.0
1   0.0 0.0
2   0.0 1.0
3   0.0 0.0
4   0.2 1.0
5   1.0 0.0
6   0.0 0.0
7   1.0 0.0
8   0.0 0.0
9   0.0 1.0
10   0.0 0.0
11   0.0 0.0
12   0.0 0.0
13   1.0 1.0
14   0.0 0.0
15   0.8 1.0
16   0.0 0.0
17   0.0 0.0
18   1.0 0.0
19   0.0 1.0
20   0.6 0.0
21   0.0 0.0
22   0.0 0.0
23   0.0 0.0
24   0.0 0.0
25   0.0 0.0
26   0.0 0.0
27   0.0 0.0
28   0.0 0.0
29   0.0 0.0
30   0.0 0.0
31   0.0 0.0
32   0.0 0.0
33   0.0 0.0
34   0.0 0.0
35   0.0 0.0
36   0.0 0.0
37   0.0 0.0
38   1.0 1.0
39   0.0 1.0
40   0.0 0.0
41   0.0 0.0
42   0.0 0.0
43   0.0 0.0
44   0.0 0.0
45   0.0 0.0
46   0.0 0.0
47   0.0 0.0
48   0.0 0.0
49   0.6 0.0
50   0.0 0.0
51   1.0 1.0
52   0.0 0.0
53   0.0 0.0
54   0.0 0.0
55   0.0 0.0
56   0.0 0.0
57   0.0 0.0
58   0.0 0.0
59   0.0 0.0
60   0.0 0.0
61   0.0 1.0
62   0.0 0.0
63   0.0 0.0
64   0.0 0.0
65   0.0 0.0
66   0.0 0.0
67 * 0.8 0.0
68   0.0 0.0
69   0.0 0.0
70   0.0 0.0
71   0.0 0.0
72   0.0 0.0
73   0.0 0.0
74 * 0.6 1.0
75   0.0 0.0
76   0.0 0.0
77   0.0 

 0.0 0.0
1645   0.0 0.0
1646   0.0 1.0
1647   0.0 0.0
1648   0.0 0.0
1649   0.0 0.0
1650   0.0 0.0
1651   0.0 1.0
1652   0.0 0.0
1653   0.0 0.0
1654   0.0 0.0
1655   0.0 0.0
1656   0.0 0.0
1657   0.0 0.0
1658   0.0 0.0
1659   0.0 0.0
1660   0.0 0.0
1661   0.0 0.0
1662   0.0 0.0
1663   0.0 0.0
1664   0.0 0.0
1665   0.0 0.0
1666   0.0 0.0
1667   0.0 0.0
1668   0.0 0.0
1669   0.0 0.0
1670   0.0 0.0
1671   0.0 0.0
1672   0.0 0.0
1673   0.0 0.0
1674   0.0 0.0
1675   0.0 0.0
1676   0.0 0.0
1677   0.0 0.0
1678   0.0 0.0
1679   0.0 1.0
1680   0.0 0.0
1681   0.0 0.0
1682   0.0 0.0
1683   0.0 0.0
1684   0.0 0.0
1685   0.0 0.0
1686   0.0 0.0
1687   0.0 0.0
1688   0.0 0.0
1689   0.0 0.0
1690   0.0 0.0
1691   0.0 0.0
1692   0.0 0.0
1693   0.0 0.0
1694   0.0 0.0
1695   0.0 0.0
1696   0.0 0.0
1697   0.0 0.0
1698   0.0 0.0
1699   0.0 0.0
1700   0.0 0.0
1701   0.0 0.0
1702   0.0 0.0
1703   0.0 0.0
1704   0.0 0.0
1705   0.0 0.0
1706   0.0 0.0
1707   0.0 0.0
1708   0.0 0.0
1709   0.0 0.0
1710   0.0 0.0
1

  0.0 0.0
3117   0.0 0.0
3118   0.0 0.0
3119   0.0 0.0
3120   0.0 0.0
3121   0.0 0.0
3122   0.0 0.0
3123   0.0 0.0
3124   0.0 0.0
3125   0.0 0.0
3126   0.0 0.0
3127   0.0 0.0
3128   0.0 0.0
3129   0.0 1.0
3130   0.0 0.0
3131   0.0 0.0
3132   0.0 0.0
3133   0.0 0.0
3134   0.0 0.0
3135   0.0 0.0
3136   0.0 0.0
3137   0.0 1.0
3138   0.0 1.0
3139   0.0 0.0
3140   0.0 0.0
3141   0.0 0.0
3142   0.0 0.0
3143   0.0 0.0
3144   0.0 0.0
3145   0.0 0.0
3146   0.0 1.0
3147   0.0 0.0
3148   0.0 0.0
3149   0.0 1.0
3150   0.0 0.0
3151   0.0 0.0
3152   0.0 0.0
3153   0.0 0.0
3154   0.0 0.0
3155   0.0 0.0
3156   0.0 0.0
3157   0.0 0.0
3158   0.0 1.0
3159   0.0 1.0
3160   0.0 0.0
3161   0.0 0.0
3162   0.0 1.0
3163   0.0 0.0
3164   0.0 0.0
3165   0.0 0.0
3166   0.0 0.0
3167   0.0 0.0
3168   0.0 0.0
3169   0.0 0.0
3170   0.0 0.0
3171   0.0 0.0
3172   0.0 0.0
3173   0.0 0.0
3174   0.0 1.0
3175   0.0 0.0
3176   0.0 0.0
3177   0.0 0.0
3178   0.0 0.0
3179   0.0 0.0
3180   0.0 0.0
3181   0.0 0.0
3182   0.0 0.0
