# Collaborative Filtering Recommendations via Matrix Factorization

## Step 1: Set parameters and download dataset if it does not exist

In [1]:
import os
import urllib.request
import zipfile

In [2]:
# dataset choices: 100k, 1m, 10m, 20m, latest-small, latest
dataset = '100k'
data_dir = 'data/'
input_dir = '{}ml-{}'.format(data_dir, dataset)

In [3]:
def download_dataset(dataset, out_dir):
    zip_format = '{}ml-{}.zip'
    base_url = 'http://files.grouplens.org/datasets/movielens/'
    
    zip_url = zip_format.format(base_url, dataset)

    os.makedirs(data_dir, exist_ok=True)

    zip_file, _ = urllib.request.urlretrieve(zip_url)

    with zipfile.ZipFile(zip_file, 'r') as zip_ref:
        zip_ref.extractall(out_dir)

    os.unlink(zip_file)

In [4]:
if not os.path.exists(input_dir):
    download_dataset(dataset, data_dir)
    print("Database downloaded.")
else:
    print("Database already exists.")

Database already exists.


## Step 2: Read the rating data as a coordinate list

In [5]:
import numpy as np

In [6]:
train_input_file = 'u1.base'
test_input_file = 'u1.test'

In [7]:
def get_file_data(input_file, separator='\t', ignore_extra=True):
    with open(input_file, 'r') as in_file:
        file_data = np.array([[int(v) for v in line.split(separator)] for line in in_file], dtype=np.int32)
    if ignore_extra:
        file_data = file_data[:, :3]
    return file_data

In [8]:
train_data = get_file_data(os.path.join(input_dir, train_input_file))
test_data = get_file_data(os.path.join(input_dir, test_input_file))

user_indices_train = train_data[:, 0]
user_indices_test = test_data[:, 0]
num_users = max(np.max(user_indices_train), np.max(user_indices_test))

item_indices_train = train_data[:, 1]
item_indices_test = test_data[:, 1]
num_items = max(np.max(item_indices_train), np.max(item_indices_test))

rating_values = np.array(train_data[:, 2], dtype=np.float32)
num_ratings = np.size(rating_values)
mean_rating = np.mean(rating_values)

In [9]:
rank = 5
_lambda = 10.0
learn_rate = 0.01
threshold = 0.5
max_iter = 1000
report_freq = 50

In [10]:
print(
    train_data.shape, test_data.shape, 
    (num_users, num_items, num_ratings), 
    user_indices_train.shape, user_indices_test.shape,
    item_indices_train.shape, item_indices_test.shape,
    (mean_rating, rank, _lambda)
)

(80000, 3) (20000, 3) (943, 1682, 80000) (80000,) (20000,) (80000,) (20000,) (3.5283501, 5, 10.0)


## Step 3: Build TensorFlow graph

In [11]:
import tensorflow as tf

In [12]:
W = tf.Variable(tf.truncated_normal([num_users, rank], stddev=0.2, mean=0), name="users")
H = tf.Variable(tf.truncated_normal([rank, num_items], stddev=0.2, mean=0), name="items")

In [13]:
result = tf.matmul(W, H)
result_values = tf.gather(tf.reshape(result, [-1]), user_indices_train * tf.shape(result)[1] + item_indices_train,
                          name="extract_training_ratings")

In [14]:
diff_op = tf.subtract(tf.add(result_values, mean_rating, name="add_mean"), rating_values, name="raw_training_error")

In [15]:
with tf.name_scope("training_cost") as scope:
    base_cost = tf.reduce_sum(tf.square(diff_op, name="squared_difference"), name="sum_squared_error")
    # Add regularization.
    regularizer = tf.multiply(tf.add(tf.reduce_sum(tf.square(W)), tf.reduce_sum(tf.square(H))), _lambda, name="regularize")
    cost = tf.divide(tf.add(base_cost, regularizer), num_ratings * 2.0, name="average_error")

In [16]:
# Use an exponentially decaying learning rate.
global_step = tf.Variable(0, trainable=False)
learning_rate = tf.train.exponential_decay(learn_rate, global_step, 10000, 0.96, staircase=True)

In [17]:
with tf.name_scope("train") as scope:
    optimizer = tf.train.GradientDescentOptimizer(learning_rate)
    # Passing global_step to minimize() will increment it at each step so
    # that the learning rate will be decayed at the specified intervals.
    train_step = optimizer.minimize(cost, global_step=global_step)

In [18]:
with tf.name_scope("training_accuracy") as scope:
    # Just measure the absolute difference against the threshold
    good = tf.less(tf.abs(diff_op), threshold)

    accuracy_tr = tf.div(tf.reduce_sum(tf.cast(good, tf.float32)), num_ratings)

### Step 4: Run training and evaluation

In [19]:
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Run the graph and see how we're doing on every 500th iteration.
    for i in range(max_iter+1):
        if i % report_freq == 0:
            res = sess.run([accuracy_tr, cost])
            acc_tr = res[0]
            cost_ev = res[1]
            print("Training accuracy at step %s: %s" % (i, acc_tr))
            print("Training cost: %s" % (cost_ev))
        else:
            sess.run(train_step)

Training accuracy at step 0: 0.320188
Training cost: 0.653117
Training accuracy at step 50: 0.320237
Training cost: 0.65311
Training accuracy at step 100: 0.320275
Training cost: 0.653103
Training accuracy at step 150: 0.320275
Training cost: 0.653096
Training accuracy at step 200: 0.3203
Training cost: 0.653089
Training accuracy at step 250: 0.320325
Training cost: 0.653082
Training accuracy at step 300: 0.320338
Training cost: 0.653076
Training accuracy at step 350: 0.32035
Training cost: 0.653069
Training accuracy at step 400: 0.32035
Training cost: 0.653062
Training accuracy at step 450: 0.32035
Training cost: 0.653055
Training accuracy at step 500: 0.320338
Training cost: 0.653048
Training accuracy at step 550: 0.32035
Training cost: 0.653041
Training accuracy at step 600: 0.320363
Training cost: 0.653035
Training accuracy at step 650: 0.320363
Training cost: 0.653028
Training accuracy at step 700: 0.320388
Training cost: 0.653021
Training accuracy at step 750: 0.32035
Training co