In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf


In [2]:
# Load the ratings table from the ml-latest-small dataset folder (path is
# relative to the notebook's working directory) and preview the first rows.
rating_df = pd.read_csv("datasets/ml-latest-small/ratings.csv")
rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [3]:
# Z-score the ratings. The mean and standard deviation are kept so that
# predictions can later be mapped back to the original rating scale.
# NOTE: the "rating" column is overwritten in place, so re-running this cell
# recomputes rating_mean / rating_std from already-standardised values.
rating_mean, rating_std = rating_df["rating"].mean(), rating_df["rating"].std()

rating_df["rating"] = rating_df["rating"].sub(rating_mean).div(rating_std)

rating_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,0.478109,964982703
1,1,3,0.478109,964981247
2,1,6,0.478109,964982224
3,1,47,1.437315,964983815
4,1,50,1.437315,964982931


In [7]:
# Undo a shuffle that was applied as `shuffled = original[perm]`.
def unshuffle_c(x, perm):
    """Restore the original row order of an array shuffled with ``perm``.

    Parameters
    ----------
    x : array-like indexable by an integer array (e.g. ``np.ndarray``)
        Data whose first axis was shuffled as ``x = original[perm]``.
    perm : array-like of int
        The permutation of ``range(len(x))`` that produced ``x``.

    Returns
    -------
    The rows of ``x`` reordered so that ``result[perm[j]] == x[j]``,
    i.e. ``result`` equals the pre-shuffle array.
    """
    # For a permutation, argsort(perm)[i] is the position j with perm[j] == i,
    # which is exactly the inverse permutation. This avoids materialising the
    # len(x) x len(perm) boolean comparison matrix.
    return x[np.argsort(perm)]

#### Create Training Dataset
    X = Movie features (20 features per movie)
    W = User preference features (20 corresponding features per user)
    Y = Movie-user ratings (movies as rows, users as columns)
    R = Rating-availability mask (1 where a rating exists, 0 otherwise)
    
We will train the model on only the first 1000 movie IDs and the first 100 user IDs.

Learnable parameters: X and W (both feature matrices).

In [10]:
total_feature = 20

def getRatingDataset(rating_df, max_movies = 1000, max_users = 100):
    """Build trainable factor matrices and the (shuffled) rating / mask matrices.

    Movie and user ids are used as 1-based row / column positions
    (``id - 1``); ids below 1 are ignored.

    Parameters
    ----------
    rating_df : pandas.DataFrame
        Must contain the columns ``"movieId"``, ``"userId"`` and ``"rating"``.
    max_movies : int, default 1000
        Only movies with ``movieId <= max_movies`` are kept.
    max_users : int, default 100
        Only users with ``userId <= max_users`` are kept.

    Returns
    -------
    parameters : dict
        ``{"X": tf.Variable, "W": tf.Variable}`` of shapes
        ``(n_movies, total_feature)`` and ``(n_users, total_feature)``,
        randomly initialised (float32).
    R : tf.Tensor
        ``(n_movies, n_users)`` float32 mask, 1 where a rating exists.
    Y : tf.Tensor
        ``(n_movies, n_users)`` float32 ratings; cells without a rating hold
        random noise and are meant to be masked out with ``R``.
    permutation_array : numpy.ndarray
        Row permutation applied to ``X``, ``Y`` and ``R`` (movie axis).
    """
    total_user = int(rating_df["userId"].max())
    total_movie = int(rating_df["movieId"].max())

    # Clamp to what the data actually contains so slicing and the permutation
    # below stay consistent even when there are fewer ids than requested.
    n_movies = min(total_movie, max_movies)
    n_users = min(total_user, max_users)

    X = np.random.randn(n_movies, total_feature) * 0.70
    W = np.random.randn(n_users, total_feature) * 0.70

    movie_ids = rating_df["movieId"].to_numpy()
    userIds = rating_df["userId"].to_numpy()
    ratings = rating_df["rating"].to_numpy()

    # Select the retained ratings up front instead of allocating dense
    # (max movieId x max userId) matrices and slicing a corner out of them.
    keep = (movie_ids >= 1) & (movie_ids <= n_movies) & (userIds >= 1) & (userIds <= n_users)
    rows = movie_ids[keep] - 1
    cols = userIds[keep] - 1

    # Unrated cells are filled with noise scaled by the std of all ratings.
    Y = np.random.randn(n_movies, n_users) * ratings.std()
    Y[rows, cols] = ratings[keep]

    R = np.zeros((n_movies, n_users))
    R[rows, cols] = 1

    # Shuffle the movie axis; the permutation is returned so callers can undo it.
    permutation_array = np.random.permutation(n_movies)
    X = X[permutation_array]
    Y = Y[permutation_array]
    R = R[permutation_array]

    X = tf.Variable(X, dtype = tf.float32)
    W = tf.Variable(W, dtype = tf.float32)

    parameters = {
        "X": X,
        "W": W
    }

    R = tf.constant(R, dtype = tf.float32)
    Y = tf.constant(Y, dtype= tf.float32)
    return parameters, R, Y, permutation_array

In [11]:
# Build the trainable factors plus the rating (Y) and mask (R) matrices; the
# permutation is kept so the movie rows can be put back in order later.
parameters, R, Y, permutation_array = getRatingDataset(rating_df)

In [12]:
BATCH_SIZE = 2000

# Stream (Y, R) row pairs in batches of BATCH_SIZE rows, with batching and
# prefetching parallelism left to tf.data's autotuner.
dataset = (
    tf.data.Dataset.from_tensor_slices((Y, R))
    .batch(BATCH_SIZE, num_parallel_calls=tf.data.AUTOTUNE)
    .prefetch(tf.data.AUTOTUNE)
)



In [15]:
# Peek at the first (Y, R) batch to sanity-check shapes and dtypes.
next(iter(dataset))

(<tf.Tensor: shape=(1000, 100), dtype=float32, numpy=
 array([[ 0.9391379 , -0.62168336, -1.5469475 , ..., -1.4051894 ,
         -0.9901998 , -0.63588667],
        [ 0.77678704, -1.7437524 , -1.0201    , ...,  1.1673007 ,
         -0.8892034 , -1.3624628 ],
        [ 2.7822034 ,  0.27709946, -0.11831682, ..., -0.4682122 ,
         -0.6648403 , -0.99462634],
        ...,
        [ 2.193403  , -0.5298526 ,  0.28266573, ...,  0.09966536,
          1.0181814 ,  0.79938084],
        [-0.3775971 , -0.8789895 , -1.5345936 , ..., -0.7527844 ,
         -2.047726  , -0.6896821 ],
        [-0.9181034 , -0.00406718,  0.5964418 , ...,  0.20640166,
          0.57484823,  1.7189838 ]], dtype=float32)>,
 <tf.Tensor: shape=(1000, 100), dtype=float32, numpy=
 array([[0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        ...,
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.],
        [0., 0., 0., ..., 0., 0., 0.]], d

In [723]:
from ipywidgets import IntProgress
from IPython.display import display


def train_model(dataset, parameters, num_iterations = 500, lr = 0.002, lambd = 0.2, print_cost = False):
    """Fit the movie (X) and user (W) factor matrices by gradient descent.

    The per-batch loss is the squared error over rated cells plus an L2
    penalty on ``W`` and on the batch's rows of ``X``, each divided by the
    number of rated cells in the batch.

    Parameters
    ----------
    dataset : iterable of (Y, R) batches
        Consecutive row blocks of the rating matrix ``Y`` and its mask ``R``,
        in the same row order as ``parameters["X"]``.
    parameters : dict
        ``{"X": tf.Variable, "W": tf.Variable}``; both are updated in place.
    num_iterations : int
        Number of passes over ``dataset``.
    lr : float
        SGD learning rate.
    lambd : float
        Initial L2 weight. It is divided by ``sqrt(i + 1)`` at every
        iteration ``i`` that is a multiple of 200 (the divisions compound).
    print_cost : bool
        If true, print the mean per-batch cost every 200 iterations.

    Returns
    -------
    (parameters, A)
        The same ``parameters`` dict and the prediction matrix ``A`` of the
        last processed batch (``None`` if no batch was processed).
    """
    X = parameters["X"]
    W = parameters["W"]

    f = IntProgress(min=0, max=num_iterations)
    display(f)

    # Defined up front so the function also works when num_iterations == 0
    # or the dataset yields no batches.
    A = None
    mean_loss = 0.0

    for i in range(num_iterations):
        if i % 200 == 0:
            lambd /= np.sqrt(i + 1)

        f.value += 1
        total_loss = 0.0
        total_batch = 0
        # Running row offset into X: a short final batch must not shift the
        # slice, which happens when the offset is derived from batch index
        # times the current batch's row count.
        row_start = 0

        for Y, R in dataset:
            row_end = row_start + int(Y.shape[0])

            # Work on a copy of this batch's rows so only they are updated.
            X_ = tf.Variable(X[row_start:row_end], dtype = tf.float32)
            rated = tf.math.reduce_sum(R)

            with tf.GradientTape() as tape:
                A = X_ @ tf.transpose(W)

                # divide_no_nan yields 0 (and zero gradients) for a batch
                # without any rating instead of NaN; otherwise it is a plain
                # division.
                loss_value = tf.math.divide_no_nan(
                    tf.math.reduce_sum(R * tf.math.square(Y - A)), rated)
                loss_value += tf.math.divide_no_nan(
                    lambd * tf.math.reduce_sum(tf.math.square(W)), rated)
                loss_value += tf.math.divide_no_nan(
                    lambd * tf.math.reduce_sum(tf.math.square(X_)), rated)

            total_loss += loss_value.numpy()

            # NOTE(review): the optimizer and X_ are re-created for every
            # batch, so the momentum state presumably starts from zero on each
            # step and momentum=0.2 likely has no cumulative effect. Left
            # unchanged to preserve the tuned training behaviour - confirm
            # whether this is intended.
            optimizer = tf.keras.optimizers.SGD(learning_rate=lr, momentum= 0.2)

            trainableParameters = [W, X_]

            grads = tape.gradient(loss_value, trainableParameters)
            optimizer.apply_gradients(zip(grads, trainableParameters))

            # Write the updated rows back into the full movie matrix.
            X[row_start:row_end].assign(X_)
            total_batch += 1
            row_start = row_end

        # Report the mean over batches, matching the "Avg Cost" label.
        mean_loss = total_loss / total_batch if total_batch else 0.0

        if i % 200 == 0 and print_cost:
            print("Loop {} Avg Cost: {}".format(i, mean_loss))

    print("Last Loop Cost: {}".format(mean_loss))

    return parameters, A

In [724]:
# Train for 10000 iterations with learning rate 3, printing the cost as training progresses.
parameters, A = train_model(dataset, parameters, num_iterations = 10000, print_cost = True, lr = 3)

IntProgress(value=0, max=10000)

Loop 0 Avg Cost: 6.75318717956543
Loop 200 Avg Cost: 0.302849680185318
Loop 400 Avg Cost: 0.12474086880683899
Loop 600 Avg Cost: 0.07162811607122421
Loop 800 Avg Cost: 0.04466801881790161
Loop 1000 Avg Cost: 0.029381712898612022
Loop 1200 Avg Cost: 0.020677348598837852
Loop 1400 Avg Cost: 0.015534299425780773
Loop 1600 Avg Cost: 0.012279553338885307
Loop 1800 Avg Cost: 0.010064054280519485
Loop 2000 Avg Cost: 0.008459285832941532
Loop 2200 Avg Cost: 0.007238605059683323
Loop 2400 Avg Cost: 0.006274149287492037
Loop 2600 Avg Cost: 0.005489342845976353
Loop 2800 Avg Cost: 0.004836109932512045
Loop 3000 Avg Cost: 0.00428309291601181
Loop 3200 Avg Cost: 0.0038090855814516544
Loop 3400 Avg Cost: 0.003399169771000743
Loop 3600 Avg Cost: 0.003042392199859023
Loop 3800 Avg Cost: 0.002730366075411439
Loop 4000 Avg Cost: 0.002456436399370432
Loop 4200 Avg Cost: 0.00221517332829535
Loop 4400 Avg Cost: 0.002002067631110549
Loop 4600 Avg Cost: 0.0018133308039978147
Loop 4800 Avg Cost: 0.00164575560

In [725]:
# Bring the learned movie factors back to the original (pre-shuffle) row order;
# the user factors were never shuffled.
params = dict(
    X = unshuffle_c(parameters["X"].numpy(), permutation_array),
    W = parameters["W"].numpy(),
)

# Ratings and mask restored to the same row order.
Y_unshuffle, R_unshuffle = (
    unshuffle_c(tensor.numpy(), permutation_array) for tensor in (Y, R)
)

# Predicted rating of the first user for the first movie, mapped back to the
# original rating scale.
((params["X"][0] @ params["W"][0].T) * rating_std + rating_mean).round(1)

4.0

In [746]:
def predict(parameters, movieIds, userIds):
    """Score every requested movie against every requested user.

    ``movieIds`` and ``userIds`` are used directly as row indices into
    ``parameters["X"]`` and ``parameters["W"]``. The result has one row per
    movie and one column per user, as float64 rounded to 3 decimals.
    """
    movie_rows = parameters["X"][np.array(movieIds)]
    user_rows = parameters["W"][np.array(userIds)]

    scores = movie_rows @ user_rows.T
    return np.array(scores, dtype = np.float64).round(3)

In [755]:
userIds = range(20)
movieIds = range(100)

# One row per movie, one column per user (standardised scale).
A = predict(params, movieIds, userIds)

# Flatten the grid row-major: users cycle fastest, movies change every
# A.shape[1] entries, matching A.ravel().
userIds = np.tile(userIds, A.shape[0])
movieIds = np.repeat(movieIds, A.shape[1])

# Ids in the ratings table are 1-based, and predictions are mapped back to
# the original rating scale.
predictions = pd.DataFrame({
    "userId": userIds + 1,
    "movieId": movieIds + 1,
    "P_rating": A.ravel() * rating_std + rating_mean,
})

# Copy of the ratings with the standardisation undone, for side-by-side comparison.
ratingsDf_1 = rating_df.assign(rating = rating_df["rating"] * rating_std + rating_mean)

predictions.merge(ratingsDf_1, on = ["userId", "movieId"])

Unnamed: 0,userId,movieId,P_rating,rating,timestamp
0,1,1,4.013439,4.0,964982703
1,5,1,4.003014,4.0,847434962
2,7,1,4.503428,4.5,1106635946
3,15,1,2.510112,2.5,1510577970
4,17,1,4.471109,4.5,1305696483
...,...,...,...,...,...
133,6,93,3.999886,4.0,845554584
134,6,95,4.007184,4.0,845553559
135,11,95,2.994888,3.0,902154458
136,14,95,4.991331,5.0,835441295
