In [37]:
import csv
import pandas as pd
import matplotlib.pyplot as plt
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder
from sklearn.metrics import mean_squared_error
import tensorflow as tf
from tensorflow import keras
from keras.models import Model
from keras.layers import Input, Reshape, Dot
from keras.layers import Embedding
from keras.layers import Concatenate, Dense, Dropout
from keras.optimizers import Adam
from keras.regularizers import l2
import numpy as np
import torch
import torch.nn as nn                 # the torch module to implement the Neural Networks
import torch.nn.parallel              # for parallel computations
import torch.optim as optim           # for optimizers
import torch.utils.data               # tools
from torch.autograd import Variable   # for Stochastic Gradient Descent
import random as rnd

df = pd.read_csv('netflix_prize_smaller.csv').astype({
    'MovieId' : 'int16',
    'CustId' : 'int32',
    'Rating' : 'int8',
    'Date' : 'datetime64[us]'
})
probe_df = pd.read_csv('probe_data_new.csv')
print(df)
df['CustId'] = df['CustId'].astype(np.uint32)
df['MovieId'] = df['MovieId'].astype(np.uint16)
df['Rating'] = df['Rating'].astype(np.uint8)

# https://stackoverflow.com/questions/57507832/unable-to-allocate-array-with-shape-and-data-type

print(df.shape)


      CustId  Rating       Date  MovieId
0    1488844       3 2005-09-06        1
1     822109       5 2005-05-13        1
2     885013       4 2005-10-19        1
3      30878       4 2005-12-26        1
4     823519       3 2004-05-03        1
..       ...     ...        ...      ...
755  1954284       5 2005-05-04     1000
756   299636       2 2005-10-27     1000
757  1635449       1 2005-01-10     1000
758   906984       4 2005-05-13     1000
759  2633357       3 2005-10-13     1000

[5010199 rows x 4 columns]
(5010199, 4)


In [38]:
# make predicitons on qualify.txt, don't have the ratings for
# make prections on probe.txt, HAVE the ratings in the training set !!
   # parse data into matrix form
   # extract the real ratings

# # Preprocess the data
# unique_users = df['CustId'].unique()
# unique_movies = df['MovieId'].unique()

# user_to_index = {user: idx for idx, user in enumerate(unique_users)}
# movie_to_index = {movie: idx for idx, movie in enumerate(unique_movies)}

# df['CustId'] = df['CustId'].map(user_to_index)
# df['MovieId'] = df['MovieId'].map(movie_to_index)

# https://dantegates.github.io/2020/04/21/a-tutorial-on-collaborative-filtering-in-sklearn.html

# df['user_id_encoding'] = OrdinalEncoder().fit_transform(df.CustId.values.reshape((-1, 1))).astype(int).reshape(-1)
# df['movie_id_encoding'] = OrdinalEncoder().fit_transform(df.MovieId.values.reshape((-1, 1))).astype(int).reshape(-1)

# ratings_matrix = csr_matrix((df.Rating, (df.user_id_encoding, df.movie_id_encoding)))
# print('Total size of X:', ratings_matrix.shape[0] * ratings_matrix.shape[1], '\nNumber of non-zero elements in X:', ratings_matrix.count_nonzero())
# print("% filled:", ratings_matrix.count_nonzero() / (ratings_matrix.shape[0] * ratings_matrix.shape[1]) * 100)
# print(ratings_matrix)

# df_p = pd.pivot_table(df,values='Rating',index='Cust_Id',columns='Movie_Id')



df_movie_summary = df.groupby('MovieId')['Rating'].agg(['count'])
df_movie_summary.index = df_movie_summary.index.map(int)
movie_benchmark = round(df_movie_summary['count'].quantile(0.98),0)
drop_movie_list = df_movie_summary[df_movie_summary['count'] < movie_benchmark].index

print(f'Movie minimum times of review: {movie_benchmark}')

df_cust_summary = df.groupby('CustId')['Rating'].agg(['count'])
df_cust_summary.index = df_cust_summary.index.map(int)
cust_benchmark = round(df_cust_summary['count'].quantile(0.98),0)
drop_cust_list = df_cust_summary[df_cust_summary['count'] < cust_benchmark].index

print(f'Customer minimum times of review: {cust_benchmark}')

df = df[~df['MovieId'].isin(drop_movie_list)]
df = df[~df['CustId'].isin(drop_cust_list)]
print('After Trim Shape: {}'.format(df.shape))


Movie minimum times of review: 56544.0
Customer minimum times of review: 62.0
After Trim Shape: (123981, 4)


In [39]:
num_movies = df['MovieId'].nunique()
num_users = df['CustId'].nunique()

unique_users = df['CustId'].unique()
unique_movies = df['MovieId'].unique()

print("Users: ", num_users)
print("Movies :",num_movies)

x = df[['CustId','Rating', 'MovieId']].values
y = df['Rating'].values
print(df)
x_training, x_test, y_training, y_test = train_test_split(x, y, test_size=0.2)
# x_train, x_val, y_train, y_val = train_test_split(x_training, y_training, test_size=0.1)

# convert to a users x movies matrix
def transform(data):
    new_data = []
    for cust_id in unique_users:
        # get all movies from this user
        movie_ids = data[:,2][data[:,0]==cust_id]
        # get all ratings from this user
        rating_ids = data[:,1][data[:,0]==cust_id]
        ratings = np.zeros(num_movies)
        # fill in the user's row with the ratings
        for movie_id in movie_ids :
          idx_in_rating_list =  list(unique_movies).index(movie_id)
          ratings[idx_in_rating_list] = rating_ids[list(movie_ids).index(movie_id)]
        new_data.append(list(ratings))

    return new_data

num_probe_movies = probe_df['MovieId'].nunique()
num_probe_users = probe_df['CustId'].nunique()

unique_probe_users = probe_df['CustId'].unique()
unique_probe_movies = probe_df['MovieId'].unique()
def transform_probe(data):
    new_data = []
    for cust_id in unique_probe_users:
        # get all movies from this user
        movie_ids = data[:,1][data[:,0]==cust_id]
        # get all ratings from this user
        rating_ids = data[:,2][data[:,0]==cust_id]
        ratings = np.zeros(num_probe_movies)
        # fill in the user's row with the ratings
        for movie_id in movie_ids :
          idx_in_rating_list =  list(unique_probe_movies).index(movie_id)
          ratings[idx_in_rating_list] = rating_ids[list(movie_ids).index(movie_id)]
        new_data.append(list(ratings))

    return new_data
x_training = np.array(x_training)
x_test = np.array(x_test)
x_training = transform(x_training)
print(x_training)
print(len(x_training))
print(x_test)
print(len(x_test))
print(len(y_test))
x_test_original = x_test
x_test = transform(x_test)

print(len(x_test))

Users:  8112
Movies : 20
        CustId  Rating       Date  MovieId
3      1990901       4 2004-05-24       30
8       306466       3 2004-04-02       30
16     1987434       2 2005-07-15       30
23     1468812       5 2004-04-05       30
36     2422606       4 2004-05-11       30
...        ...     ...        ...      ...
89940    63786       4 2005-07-04      985
89961  1677862       1 2005-10-05      985
89967   502274       4 2005-10-12      985
89974   477466       4 2005-10-17      985
89990  2180413       4 2005-12-02      985

[123981 rows x 4 columns]
[[4.0, 2.0, 0.0, 0.0, 0.0, 5.0, 0.0, 1.0, 3.0, 5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 1.0], [3.0, 4.0, 4.0, 3.0, 4.0, 0.0, 4.0, 4.0, 4.0, 4.0, 0.0, 0.0, 4.0, 3.0, 4.0, 4.0, 4.0, 4.0, 0.0, 0.0], [2.0, 4.0, 5.0, 0.0, 4.0, 0.0, 0.0, 4.0, 0.0, 4.0, 1.0, 3.0, 4.0, 3.0, 1.0, 3.0, 4.0, 5.0, 3.0, 0.0], [5.0, 0.0, 5.0, 0.0, 4.0, 3.0, 5.0, 3.0, 0.0, 0.0, 4.0, 4.0, 5.0, 2.0, 0.0, 0.0, 5.0, 4.0, 5.0, 0.0], [4.0, 4.0, 5.0, 4.0, 0.

In [42]:
x_training = torch.FloatTensor(x_training)
x_test_tensor = torch.FloatTensor(x_test)

# x_training = x_training / 5.0
# x_test_tensor = x_test_tensor / 5.0
x_test_tensor[x_test_tensor == 0] = -1
x_test_tensor[x_test_tensor == 1] = 0
x_test_tensor[x_test_tensor == 2] = 0
x_test_tensor[x_test_tensor > 2] = 1

x_training[x_training == 0] = -1
x_training[x_training == 1] = 0
x_training[x_training == 2] = 0
x_training[x_training > 2] = 1



class RBM():
    def __init__(self, visible_nodes, hidden_nodes):
        self.Weights = torch.randn(hidden_nodes, visible_nodes)
        self.bias = torch.randn(1, hidden_nodes)
        self.bias2 = torch.randn(1, visible_nodes)
        
    def sample_hidden_nodes(self, visible_neurons):
        wx = torch.mm(visible_neurons, self.Weights.t())
        activation = wx + self.bias.expand_as(wx)
        p_h_given_v = torch.sigmoid(activation)
        return p_h_given_v, torch.bernoulli(p_h_given_v)
    
    def sample_visible_nodes(self, y):
        wy = torch.mm(y, self.Weights)
        activation = wy + self.bias2.expand_as(wy)
        p_v_given_h = torch.sigmoid(activation)
        return p_v_given_h, torch.bernoulli(p_v_given_h)
    
    def train(self, input_ratings_vector, visible_nodes_k_samples, probability_vector, prob_hidden_nodes_k_samples):
        self.Weights += (torch.mm(input_ratings_vector.t(), probability_vector) - 
                   torch.mm(visible_nodes_k_samples.t(), prob_hidden_nodes_k_samples)).t()
        self.bias2 += torch.sum((input_ratings_vector-visible_nodes_k_samples), 0)
        self.bias += torch.sum((probability_vector-prob_hidden_nodes_k_samples), 0)



visible_nodes = len(x_training[0])      # no. of visible nodes
hidden_nodes = 300                    # the features to be detected by RBM, hence can define any relevant number
batch_size = 300
rbm = RBM(visible_nodes, hidden_nodes)
optimizer = torch.optim.Adam([rbm.Weights, rbm.bias, rbm.bias2], lr=0.001)
num_epoch = 3        # 10 because as we have a binary outcome and less data, the model will converge quickly


import torch.nn as nn

# Define the loss function
loss_function = nn.MSELoss()

# creating a for loop to iterate through these epochs and in each epoch all observations go in the network 
# and then updating the weights after observations of each batch that passed through the network
# and then we get our final visible nodes with new ratings for the movies that were not orignally rated
for epoch in range(1, num_epoch+1):
    train_loss = 0                      
    s = 0                          
    
    for user_number in range(0, num_users - batch_size, batch_size):
        id_user = unique_users[user_number]
        id_end_user = unique_users[user_number + batch_size]
        # CHECK THIS; LIKELY WRONG
        # should unique_users be sorted?
        visible_node = x_training[id_user:id_end_user]
        movie_vector = x_training[id_user:id_end_user]
        probability_hidden_node,_ = rbm.sample_hidden_nodes(movie_vector)
        
        for gibbs_samples in range(10):
            _,hidden_nodes_at_k = rbm.sample_hidden_nodes(visible_node)
            _,visible_node = rbm.sample_visible_nodes(hidden_nodes_at_k)
            visible_node[movie_vector<0] = movie_vector[movie_vector<0] 
    
        phk,_ = rbm.sample_hidden_nodes(visible_node)
        rbm.train(movie_vector, visible_node, probability_hidden_node, phk)
        
        # Zero the gradients before backward pass and optimization step
        optimizer.zero_grad()

        
        loss = loss_function(movie_vector[movie_vector >= 0], visible_node[movie_vector >= 0])
        if(not loss.isnan()):
            train_loss+=loss.item()
            print(train_loss)
            s += 1

        # Update weights
        optimizer.step()
    print(train_loss, s)
    if(s > 0):
        print('epoch: ' + str(epoch) + ' loss: ' + str(train_loss/s) )

def predict(rbm_model, input_data):
    _, hidden_nodes = rbm_model.sample_hidden_nodes(input_data)
    probabilities, visible_nodes = rbm_model.sample_visible_nodes(hidden_nodes)
    return probabilities, visible_nodes

def map_probabilities(probabilities):
    final_predictions = []
    final_probabilities = []
    for user_row in probabilities :
        user_predictions = np.zeros(len(user_row))
        user_probs = np.zeros(len(user_row))
        for i, user_rating in enumerate(user_row) :
            prediction = 0
            if user_rating < 0.03 :
                prediction = 1
            elif user_rating < 0.25 :
                prediction = 2
            elif user_rating < 0.44 :
                prediction = 3
            elif user_rating < 0.98 :
                prediction = 5
            else :
                prediction = 4
            user_predictions[i] = prediction
            user_probs[i] = user_rating
        final_predictions.append(user_predictions)
        final_probabilities.append(user_probs)
    return final_predictions, final_probabilities

# probe = np.array(probe_df)

# num_movies = probe_df['MovieId'].nunique()
# num_users = probe_df['CustId'].nunique()

# unique_users = probe_df['CustId'].unique()
# unique_movies = probe_df['MovieId'].unique()

# probe = transform_probe(probe)
# probabilities, results = predict(rbm, torch.FloatTensor(probe))
probabilities, results = predict(rbm, x_test_tensor)
new_results, final_probs = map_probabilities(probabilities)
# print(len(probe))
# print(len(probe[0]))
# print(len(new_results))
# print(len(new_results[0]))
# MSE = mean_squared_error(probe, results)

def find_mse(test, results) :
    sum = 0
    count = 0
    for i, user in enumerate(test) :
        for j, rating in enumerate(user) :
            if rating > 0 :
                result = results[i][j]
                print(f"Guessed: {result} Actual: {rating}")
                sum += pow((rating - result), 2)
                count+=1
    return sum / count
    
def map_probs_to_value(probs, results) :
    ratings = []
    prob_list = []
    for i, user in enumerate(results) :
        for j, rating in enumerate(user) :
            if rating > 0 :
                prob = probs[i][j]
                print(f"Rating: {rating} Maps to prob: {prob}")
                ratings.append(rating)
                prob_list.append(prob)
    return ratings, prob_list

print(x_test)
def get_test_results_from_matrix(test, results) :
    test_results = []
    print(test)
    for entry in test:
        user_id = entry[0]
        movie_id = entry[2]
        user_idx = list(unique_users).index(user_id)
        movie_idx = list(unique_movies).index(movie_id)
        prediction = results[user_idx][movie_idx]
        test_results.append(prediction)
    return test_results

rbm_predictions = get_test_results_from_matrix(x_test_original, new_results)
MSE = mean_squared_error(y_test, rbm_predictions)
print("RMSE: ", pow(MSE,1/2))
MSE = find_mse(x_test, new_results)
# ratings, prob_list = map_probs_to_value(final_probs, x_test)
# plt.scatter(prob_list, ratings)
# ratings_probs_df = pd.DataFrame({'Rating': ratings, 'Probability': prob_list})
# five_ratings = ratings_probs_df[ratings_probs_df['Rating'] == 3]
# plt.hist(five_ratings['Probability'], bins=100, edgecolor='black')
# plt.xlim([0, 1])
print(f"RSME: {pow(MSE, 1/2) }")

# why normalizing it making it worse
# loss functions? check why user ids are creating empty tensors
# MAX RSME: 0.9977795245011049


0 0
0 0
0 0
[[0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 3.0, 0.0, 0.0, 0.0, 4.0, 5.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0], [0.0, 0.0, 0.0, 2.0, 0.0, 3.0, 3.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 4.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0, 1.0, 5.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0, 4.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 4.0, 0.0, 4.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 4.0, 0.0, 0.0, 0.0, 0.0, 4.0, 0.0, 0.0], [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0], [0.0, 0.0, 5.0, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0, 0.0, 5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], [0.0, 0.0, 4.0, 0.0, 0.0, 0.0, 0.0, 0.0, 3.0, 0.0, 0.0, 0.0, 4.0, 5.0

In [41]:
# # This was some github library that had built in RBM functions 

# # #instantiate the sparse matrix generation
# # am = AffinityMatrix(df = data, **header)

# # #obtain the sparse matrix
# # X, _, _ = am.gen_affinity_matrix()

# # # Set the embedding dimension d of Matrix factorization
# # e_dimension = 200 # was 50

# # X_train_array = [x_train[:, 0], x_train[:, 1]]
# # X_val_array = [x_val[:, 0], x_val[:, 1]]
# # X_test_array = [x_test[:, 0], x_test[:, 1]]
# # user = Input(shape=(1,))
# # u = Embedding(n_users, e_dimension, embeddings_initializer='he_normal',
# #               embeddings_regularizer=l2(1e-7))(user) # embedding was 1e-6
# # u = Reshape((e_dimension,))(u)
# # movie = Input(shape=(1,))
# # m = Embedding(n_movies, e_dimension, embeddings_initializer='he_normal',
# #               embeddings_regularizer=l2(1e-7))(movie)  # embedding was 1e-6
# # m = Reshape((e_dimension,))(m)

# # x = Dot(axes=1)([u, m])
# # ## new : concat user, id along with dot
# # x = Concatenate()([u, m,x])
# # x = Dropout(0.2)(x)

# # # Build last deep learning layers
# # x = Dense(256, activation='relu')(x)
# # x = Dropout(0.2)(x)
# # # x = Dense(128, activation='relu')(x)
# # # x = Dropout(0.2)(x)
# # x = Dense(1)(x)

# # model = Model(inputs=[user, movie], outputs=x)
# # model.compile(loss='mean_squared_error',
# #               optimizer=Adam(lr=0.003),  ## 0.001
# #               metrics=[tf.keras.metrics.RootMeanSquaredError()]
# #               )

# # # Set up for early stop if the validation loss stop improving for more than 1 epoch
# # callbacks_list = [keras.callbacks.EarlyStopping(monitor='val_loss',
# #                                                 patience=1,
# #                                                 ),
# #                   # Saves the weights after every epoch
# #                   keras.callbacks.ModelCheckpoint(
# #                       filepath='Model_1',
# #                       monitor='val_loss',
# #                       save_best_only=True,
# #                       )]

# # # Print model info summary
# # model.summary()

# # history = model.fit(x=X_train_array, y=y_train, batch_size=64,
# #                     epochs=10, # 20
# #                     verbose=1,
# #                     callbacks=callbacks_list,
# #                     validation_data=(X_val_array, y_val)
# #                     )

# # # Save the model (we should make a good habit of always saving our models after training)
# # model.save("Model_1")

# # m = tf.keras.metrics.RootMeanSquaredError()
# # m.update_state(model.predict(X_test_array), y_test)
# # m.result().numpy()




# from sklearn.neural_network import BernoulliRBM
# from sklearn.datasets import load_digits
# from sklearn.model_selection import train_test_split
# from sklearn.pipeline import Pipeline
# from sklearn import linear_model, metrics, preprocessing


# # Create a Restricted Boltzmann Machine
# rbm = BernoulliRBM(random_state=2, verbose=True,learning_rate=0.01, n_iter=2)

# # Create a logistic regression classifier
# logistic = linear_model.LogisticRegression(solver='newton-cg', tol=1)

# # Create a pipeline that first transforms the data using the RBM, then fits the transformed data using the logistic regression classifier
# classifier = Pipeline(steps=[('rbm', rbm), ('logistic', logistic)])

# # Train the classifier
# classifier.fit(x_train, y_train)

# # Evaluate the classifier
# results = classifier.predict(x_test)
# MSE = metrics.mean_squared_error(y_test, results)
# # for predicted, actual, data in zip(results, y_test, x_test) :
# #     print(f'Predicted: {predicted} with {x_test}, Actual: {actual}')

# print(f"RSME: {pow(MSE, 1/2) } with learning rate {0.01}")
# # print("Logistic regression using RBM features:\n%s\n" % (
# #     metrics.classification_report(y_test, classifier.predict(x_test))))


# # [4044631 rows x 4 columns]
# # [BernoulliRBM] Iteration 1, pseudo-likelihood = 0.00, time = 47.62s
# # [BernoulliRBM] Iteration 2, pseudo-likelihood = 0.00, time = 421.26s
# # [BernoulliRBM] Iteration 3, pseudo-likelihood = 0.00, time = 203.87s
# # [BernoulliRBM] Iteration 4, pseudo-likelihood = 0.00, time = 149.02s
# # [BernoulliRBM] Iteration 5, pseudo-likelihood = 0.00, time = 422.02s
# # [BernoulliRBM] Iteration 6, pseudo-likelihood = 0.00, time = 555.76s
# # [BernoulliRBM] Iteration 7, pseudo-likelihood = 0.00, time = 532.41s
# # [BernoulliRBM] Iteration 8, pseudo-likelihood = 0.00, time = 494.89s
# # [BernoulliRBM] Iteration 9, pseudo-likelihood = 0.00, time = 538.57s
# # [BernoulliRBM] Iteration 10, pseudo-likelihood = 0.00, time = 567.80s

# # 65 mins, 95%

# # [236006 rows x 4 columns]
# # [BernoulliRBM] Iteration 1, pseudo-likelihood = 0.00, time = 11.13s
# # [BernoulliRBM] Iteration 2, pseudo-likelihood = 0.00, time = 28.39s
# # [BernoulliRBM] Iteration 3, pseudo-likelihood = 0.00, time = 26.53s
# # [BernoulliRBM] Iteration 4, pseudo-likelihood = 0.00, time = 27.49s
# # [BernoulliRBM] Iteration 5, pseudo-likelihood = 0.00, time = 28.75s
# # [BernoulliRBM] Iteration 6, pseudo-likelihood = 0.00, time = 27.04s
# # [BernoulliRBM] Iteration 7, pseudo-likelihood = 0.00, time = 24.84s
# # [BernoulliRBM] Iteration 8, pseudo-likelihood = 0.00, time = 25.66s
# # [BernoulliRBM] Iteration 9, pseudo-likelihood = 0.00, time = 26.84s
# # [BernoulliRBM] Iteration 10, pseudo-likelihood = 0.00, time = 24.89s

# # 5 mins, 99%