In [None]:
# !pip3 install surprise

## Import Packages

In [None]:
#Local Imports
import time
import datetime
import os
from os import path
import numpy as np
from numpy import *
import pandas as pd
import pickle
import tqdm as tqdm
import requests, zipfile, io
from collections import defaultdict

#Import for Recommendation Models
from surprise import Reader, Dataset, SVD, accuracy, SVDpp, SlopeOne, BaselineOnly, CoClustering, KNNBasic, KNNWithMeans, KNNWithZScore
from surprise.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

#Import for visualizations
import matplotlib.pyplot as plt

## Configuration Chunk

In [None]:
# Input CSVs from the MovieLens "ml-latest-small" archive, relative to the working directory.
movies_datapath = 'ml-latest-small/movies.csv'
ratings_datapath = 'ml-latest-small/ratings.csv'
# Rows per chunk when streaming movies.csv (load_data() defines its own local chunk size).
chunk_size = 500000
# Folder that receives pickled predictions and metrics; created here so later writes find it.
result_dir = 'results/'
os.makedirs(result_dir,exist_ok=True)

## Dataset helpers

In [None]:
# Load the movie mapping data, which maps each movie id
# in the ratings to its title and genres.
# The resulting data structure is a dictionary where the
# movie id is the key and [title, genres] is the value.
def load_mapping_data():
    """Build a ``movieId -> [title, genres]`` lookup from the movies CSV.

    The file at ``movies_datapath`` is streamed in chunks of ``chunk_size``
    rows, so only one chunk is held as a DataFrame at any time.

    Returns:
        dict: each movie id mapped to a two-element list ``[title, genres]``.
    """
    column_types = {"movieId": int, "title": str, "genres": str}
    wanted_columns = list(column_types.keys())
    lookup = {}
    reader = pd.read_csv(movies_datapath, usecols=wanted_columns,
                         dtype=column_types, chunksize=chunk_size)
    for df_chunk in tqdm.tqdm(reader):
        ids = df_chunk["movieId"].tolist()
        titles = df_chunk["title"].tolist()
        genres = df_chunk["genres"].tolist()
        # A later row with the same movie id overwrites the earlier one.
        lookup.update(
            (movie_id, [title, genre])
            for movie_id, title, genre in zip(ids, titles, genres)
        )
    # Release the reference to the last chunk before returning.
    del df_chunk
    return lookup

In [None]:
# Load the rating data, which is around 27M records; it takes around 2 minutes.
# The resulting data structure is a dictionary where the
# user id is the key and all of their ratings are the values, for example for user 1:
# 1 = [
#     [userId, movieId, rating, timestamp],
#     [userId, movieId, rating, timestamp],
#     [userId, movieId, rating, timestamp],
#   ]

def load_data():
    """Read the ratings CSV and group the rows by user.

    The file at ``ratings_datapath`` is streamed in chunks so the whole CSV
    never has to sit in memory as a single DataFrame.

    Returns:
        tuple: ``(rating_data, unique_user_id)`` where

        * ``rating_data`` is a dict mapping each user id to a list of
          ``[userId, movieId, rating, timestamp]`` rows, in file order;
        * ``unique_user_id`` is a list holding every user id exactly once,
          in order of first appearance.
    """
    rating_data = {}
    # Local chunk size; deliberately independent of the module-level chunk_size.
    chunk_size = 50000
    df_dtype = {
        "userId": int,
        "movieId": int,
        "rating": float,
        "timestamp": int,
    }
    cols = list(df_dtype.keys())
    reader = pd.read_csv(ratings_datapath, usecols=cols, dtype=df_dtype, chunksize=chunk_size)
    for df_chunk in tqdm.tqdm(reader):
        rows = zip(
            df_chunk["userId"].tolist(),
            df_chunk["movieId"].tolist(),
            df_chunk["rating"].tolist(),
            df_chunk["timestamp"].tolist(),
        )
        for user_id, movie_id, rating, timestamp in rows:
            rating_data.setdefault(user_id, []).append([user_id, movie_id, rating, timestamp])

    # Derive the user list from the dict keys rather than collecting ids per
    # chunk: a user whose ratings straddle a chunk boundary would otherwise be
    # listed once per chunk and then be split (and emitted) more than once.
    unique_user_id = list(rating_data.keys())

    return rating_data, unique_user_id

In [None]:
# Split the data into training and testing sets.
# This process isn't done on the whole dataset at once; instead it's done
# per user id: each user's ratings are split 80% training / 20% testing.
# Together, the resulting training and testing datasets cover the whole original dataset.
def spilt_data(rating_data, unique_user_id, random_state=None):
    """Split every user's ratings 80/20 into training and testing rows.

    Args:
        rating_data: dict mapping a user id to that user's list of rating rows.
        unique_user_id: iterable of user ids to process. Repeated ids are
            handled once only, so no user's rows are emitted twice.
        random_state: optional seed forwarded to ``train_test_split`` to make
            the split reproducible. ``None`` keeps the split random.

    Returns:
        tuple: ``(training_data, testing_data)``, two flat lists of rating rows.

    Notes:
        A user with exactly one rating contributes that rating to BOTH lists,
        so every user is represented in training and in testing. Users with no
        ratings are skipped. The elapsed wall-clock time, in whole seconds, is
        printed before returning.
    """
    training_data = []
    testing_data = []
    t0 = time.time()
    # dict.fromkeys drops repeated ids while keeping first-seen order.
    for u in dict.fromkeys(unique_user_id):
        user_ratings = rating_data[u]
        if not user_ratings:
            # train_test_split cannot split an empty list.
            continue
        if len(user_ratings) == 1:
            x_train = user_ratings
            x_test = user_ratings
        else:
            x_train, x_test = train_test_split(user_ratings, test_size=0.2, random_state=random_state)
        training_data.extend(x_train)
        testing_data.extend(x_test)
    # Take the end timestamp after the loop so the printed duration is real.
    total = time.time() - t0
    print(int(total))
    return training_data, testing_data

In [None]:
def get_movie_title(movie_id, movie_data):
    """Return the title stored for ``movie_id``, or ``None`` if the id is unknown.

    ``movie_data`` maps a movie id to a sequence whose first element is the title.
    """
    if movie_id not in movie_data:
        return None
    return movie_data[movie_id][0]

In [None]:
def get_movie_genre(movie_id, movie_data):
    """Return the genre string stored for ``movie_id``, or ``None`` if the id is unknown.

    ``movie_data`` maps a movie id to a sequence whose second element is the genre string.
    """
    if movie_id not in movie_data:
        return None
    return movie_data[movie_id][1]


## Get train test data

In [None]:
def get_train_test_data(new_sample = False):
    """Return the training and testing rating DataFrames.

    Args:
        new_sample: when True, reload the ratings, draw a fresh per-user
            split and overwrite the cached pickles. When False, reuse the
            cached pickles so repeated runs see the same split.

    Returns:
        tuple: ``(training_dataframe, testing_dataframe)``, both with columns
        ``userId, movieId, rating, timestamp``.

    Notes:
        The cache lives in the files ``training_dataframe`` and
        ``testing_dataframe`` in the working directory. If either file is
        missing, a new sample is drawn even when ``new_sample`` is False, so a
        fresh checkout does not fail with ``FileNotFoundError``.
        The cache is read with ``pickle.load``; only use pickles you created
        yourself, since unpickling untrusted files can execute arbitrary code.
    """
    train_path = 'training_dataframe'
    test_path = 'testing_dataframe'
    columns = ["userId","movieId","rating","timestamp"]

    cache_missing = not (os.path.exists(train_path) and os.path.exists(test_path))
    if cache_missing and not new_sample:
        print("Cached train/test split not found; drawing a new sample.")

    if new_sample or cache_missing:
        rating_data, unique_user_id = load_data()
        training_data, testing_data = spilt_data(rating_data, unique_user_id)
        # Passing columns= also yields a correctly shaped frame when a list is empty.
        training_dataframe = pd.DataFrame.from_records(training_data, columns=columns)
        testing_dataframe = pd.DataFrame.from_records(testing_data, columns=columns)

        with open(train_path, 'wb') as file:
            pickle.dump(training_dataframe, file)

        with open(test_path, 'wb') as file:
            pickle.dump(testing_dataframe, file)

    else:
        with open(train_path, 'rb') as file:
            training_dataframe = pickle.load(file)

        with open(test_path, 'rb') as file:
            testing_dataframe = pickle.load(file)

    return training_dataframe, testing_dataframe

In [None]:
def convert_traintest_dataframe_forsurprise(training_dataframe, testing_dataframe):
    """Convert the pandas train/test frames into the objects surprise works with.

    Both frames lose their last column and are then reduced to
    ``userId, movieId, rating`` before being loaded with a ``Reader`` whose
    rating scale is ``(0, 5)``.

    Returns:
        tuple: ``(trainset, testset)`` where ``trainset`` is the object
        returned by ``Dataset.load_from_df`` (not yet built into folds or a
        full trainset) and ``testset`` is the result of calling
        ``construct_testset`` on the test data's raw ratings.
    """
    rating_columns = ['userId', 'movieId', 'rating']
    reader = Reader(rating_scale=(0,5))

    def _as_dataset(frame):
        # Strip the trailing column first, then keep only the columns surprise needs.
        without_last = frame.iloc[:, :-1]
        return Dataset.load_from_df(without_last[rating_columns], reader)

    trainset = _as_dataset(training_dataframe)
    test_dataset = _as_dataset(testing_dataframe)
    testset = test_dataset.construct_testset(test_dataset.raw_ratings)
    return trainset, testset

## Model configuration and training

In [None]:
# Search space for the matrix-factorisation models (SVD / SVDpp).
# It must only contain constructor arguments of those classes: ``sim_options``
# (name / min_support / user_based) is a similarity setting for the KNN family
# and is rejected by the SVD and SVDpp constructors, so it is not listed here.
svd_param_grid = {'n_factors':[50,100,150],
                  'n_epochs': [20, 25],
                  'lr_all': [0.007, 0.009, 0.01],
                  'reg_all': [0.4, 0.6]
                  }

# Neighbourhood sizes tried for the KNN models.
knn_param_grid = {'k': [15, 20, 25, 30, 40, 50, 60]}

# Empty grids: these algorithms are cross-validated with their default settings.
baseline_param_grid = {}
slopeOne_param_grid = {}
clustering_param_grid = {}

# Registry used by collaborative_filtering(): name -> (algorithm class, parameter grid).
algos = {
    'svd' : (SVD, {'n_epochs': [20, 25]}),
    # 'svdpp' : (SVDpp, svd_param_grid),  # disabled
    'baseline' : (BaselineOnly, baseline_param_grid),
    'slopeOne' : (SlopeOne, slopeOne_param_grid),
    'coClustering' : (CoClustering, clustering_param_grid),
    'kNNBasic' : (KNNBasic, knn_param_grid),
}

In [None]:
def collaborative_filtering(trainset, testset, cf_model = "svd"):
  """Grid-search one algorithm from ``algos`` and predict on the training ratings.

  Args:
    trainset: surprise dataset to cross-validate on. It is passed to
      ``GridSearchCV.fit`` and must provide ``raw_ratings``,
      ``construct_testset`` and ``build_full_trainset``.
    testset: unused here; kept so all training helpers share one signature.
    cf_model: key into ``algos`` selecting the algorithm class and its grid.

  Returns:
    tuple: ``(rmse, mae, predictions)`` where ``rmse`` and ``mae`` are the best
    mean cross-validation scores (floats) and ``predictions`` is the list
    produced by the best-RMSE estimator, refitted on all training ratings,
    when asked to predict those same ratings.
  """
  print("\n" + "-" *5 + cf_model+" algorithm using surprise package " + "-" *5)
  algo_class, param_grid = algos[cf_model]
  search = GridSearchCV(algo_class, param_grid, measures=['rmse', 'mae'], cv=5, n_jobs=-1)
  search.fit(trainset)

  best_algo = search.best_estimator['rmse']
  print(best_algo)

  # best_estimator holds an unfitted instance (refit is off), so train it on
  # the complete training data before asking it for predictions.
  best_algo.fit(trainset.build_full_trainset())
  predictions = best_algo.test(trainset.construct_testset(trainset.raw_ratings))

  # Callers store these in rmse_arr / mae_arr and do arithmetic on them, so
  # return the numeric scores rather than the estimator objects.
  return search.best_score['rmse'], search.best_score['mae'], predictions

In [None]:
def hybrid_approach_train(trainset, testset, weights=None):
  """Run every algorithm in ``algos`` and persist its predictions and scores.

  For each algorithm, the predictions returned by ``collaborative_filtering``
  are pickled to ``result_dir + <name> + "train_predictions"``. Afterwards the
  collected MAE and RMSE values are pickled to ``result_dir + 'mae_arr'`` and
  ``result_dir + 'rmse_arr'``, in the iteration order of ``algos``.

  Args:
    trainset: forwarded to ``collaborative_filtering``.
    testset: forwarded to ``collaborative_filtering``.
    weights: unused; accepted for signature parity with ``hybrid_approach_test``.
  """
  rmse_arr = []
  mae_arr = []
  for algo in algos:
    start_time = time.time()
    rmse, mae, predictions = collaborative_filtering(trainset, testset, cf_model = algo)
    # Context managers close the files even if pickling raises.
    with open(result_dir + algo + "train_predictions", 'wb') as file:
      pickle.dump(predictions, file)
    rmse_arr.append(rmse)
    mae_arr.append(mae)
    print("Elapsed Time: ", time.time() - start_time)

  with open(result_dir+'mae_arr', 'wb') as file:
    pickle.dump(mae_arr, file)

  with open(result_dir+'rmse_arr', 'wb') as file:
    pickle.dump(rmse_arr, file)

In [None]:
# Fetch and unpack the dataset only when its folder is not already present.
if not path.exists('ml-latest-small'):
    print("Downloading Files for first time use: ")
    download_file = requests.get('https://files.grouplens.org/datasets/movielens/ml-latest-small.zip')
    # Stop on an HTTP error instead of handing an error page to ZipFile.
    download_file.raise_for_status()
    # The archive is unpacked from memory into the current working directory.
    with zipfile.ZipFile(io.BytesIO(download_file.content)) as zipped_file:
        zipped_file.extractall()

# We do not want the data to be re-sampled on every run, otherwise predictions
# from different runs would not be comparable with each other.
df_train_test, df_val = get_train_test_data(new_sample = False)
trainset, testset = convert_traintest_dataframe_forsurprise(df_train_test, df_val)
# hybrid_approach_train(trainset, testset)

In [None]:
# Parameter space
svd_param_grid = {'n_epochs': [20, 25], 
                  'lr_all': [0.007, 0.009, 0.01],
                  'reg_all': [0.4, 0.6]}

svd_gs = GridSearchCV(SVD, svd_param_grid, measures=['rmse', 'mae'], cv=5, n_jobs=5, refit=True)
svd_gs.fit(trainset)

# Store the SVD search under its own name inside result_dir, so it can never
# overwrite (or be mistaken for) the SVDpp search.
svd_gs_path = result_dir + 'svd_grid_search'
with open(svd_gs_path, "wb") as file:
    pickle.dump(svd_gs, file)

# SVDpp is slow to search, so reuse a cached result when one exists.
# The cached object is only trusted if it really is an SVDpp search: an older
# version of this cell saved the SVD search under the SVDpp file name.
# NOTE: pickle.load must only be used on files produced by this notebook.
svdpp_gs_path = result_dir + 'svdpp_grid_search'
svdpp_gs = None
if path.exists(svdpp_gs_path):
    with open(svdpp_gs_path, "rb") as file:
        cached_search = pickle.load(file)
    if getattr(cached_search, 'algo_class', None) is SVDpp:
        svdpp_gs = cached_search
if svdpp_gs is None:
    # Expensive: runs the full SVDpp grid search once, then caches it.
    svdpp_gs = GridSearchCV(SVDpp, svd_param_grid, measures=['rmse', 'mae'], cv=5, n_jobs=5)
    svdpp_gs.fit(trainset)
    with open(svdpp_gs_path, "wb") as file:
        pickle.dump(svdpp_gs, file)

print('SVDpp - RMSE:', round(svdpp_gs.best_score['rmse'], 4), '; MAE:', round(svdpp_gs.best_score['mae'], 4))

print('SVD   - RMSE:', round(svd_gs.best_score['rmse'], 4), '; MAE:', round(svd_gs.best_score['mae'], 4))

print("------SVDpp-----")
print('RMSE =', svdpp_gs.best_params['rmse'])
print('MAE =', svdpp_gs.best_params['mae'])

print("------SVD-----")
print('RMSE =', svd_gs.best_params['rmse'])
print('MAE =', svd_gs.best_params['mae'])

# Neighbourhood sizes shared by the three KNN variants compared below.
param_grid = {'k': [15, 20, 25, 30, 40, 50, 60]}

knnbasic_gs = GridSearchCV(KNNBasic, param_grid, measures=['rmse', 'mae'], cv=5, n_jobs=5)
knnbasic_gs.fit(trainset)

knnmeans_gs = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse', 'mae'], cv=5, n_jobs=5)
knnmeans_gs.fit(trainset)

knnz_gs = GridSearchCV(KNNWithZScore, param_grid, measures=['rmse', 'mae'], cv=5, n_jobs=5)
knnz_gs.fit(trainset)

# Take the x axis from the grid itself so the plot cannot drift from the search.
x = param_grid['k']
y1 = knnbasic_gs.cv_results['mean_test_rmse']
y2 = knnbasic_gs.cv_results['mean_test_mae']

y3 = knnmeans_gs.cv_results['mean_test_rmse']
y4 = knnmeans_gs.cv_results['mean_test_mae']

y5 = knnz_gs.cv_results['mean_test_rmse']
y6 = knnz_gs.cv_results['mean_test_mae']

# (label, colour, RMSE curve, MAE curve): one colour per algorithm, used in
# both panels so the two charts can be read against each other.
knn_curves = [
    ('KNNBasic', 'lightcoral', y1, y2),
    ('KNNWithMeans', 'indianred', y3, y4),
    ('KNNWithZScore', 'darkred', y5, y6),
]

fig, (ax_rmse, ax_mae) = plt.subplots(1, 2, figsize=(18,5))

for ax, metric, curve_index in ((ax_rmse, 'RMSE', 2), (ax_mae, 'MAE', 3)):
    ax.set_title('K Neighbors vs ' + metric, loc='center', fontsize=15)
    for curve in knn_curves:
        ax.plot(x, curve[curve_index], label=curve[0], color=curve[1], marker='o')
    ax.set_xlabel('K Neighbor', fontsize=15)
    ax.set_ylabel(metric + ' Value', fontsize=15)
    ax.legend()
    ax.grid(ls='dotted')

plt.show()

In [None]:
def compute_error(actual_ratings, estimate_ratings):
    """Return ``(rmse, mae)`` between the true and the estimated ratings.

    Both arguments are converted to numpy arrays; the errors are averaged
    over the number of elements in ``actual_ratings``.
    """
    actual = np.array(actual_ratings)
    predicted = np.array(estimate_ratings)

    residuals = actual - predicted
    count = actual.size

    rmse = np.sqrt((residuals * residuals).sum() / count)
    mae = np.abs(residuals).sum() / count

    return rmse, mae

In [None]:
def precision_recall_calculation(predictions, threshold=3.5, k=10):
    """Compute user-averaged precision@k, recall@k and the resulting F-score.

    Args:
        predictions: iterable of 5-tuples
            ``(user_id, movie_id, true_rating, predicted_rating, details)``.
        threshold: a rating at or above this value counts as relevant (true
            rating) or recommended (predicted rating).
        k: how many of each user's highest-predicted items are considered
            recommended candidates. Defaults to 10.

    Returns:
        list: ``[average_precision, average_recall, F_score]``.

    Notes:
        Per user, precision is 1 when nothing in the top ``k`` is recommended
        and recall is 1 when the user has no relevant item. The F-score is 0
        when average precision and recall are both 0, and all three values are
        0.0 when ``predictions`` is empty.
    """
    # First map the predictions to each user.
    user_predict_true = defaultdict(list)
    for user_id, _movie_id, true_rating, predicted_rating, _ in predictions:
        user_predict_true[user_id].append((predicted_rating, true_rating))

    # Nothing to average over: avoid dividing by a zero user count.
    if not user_predict_true:
        return [0.0, 0.0, 0.0]

    precisions = dict()
    recalls = dict()
    for user_id, user_ratings in user_predict_true.items():

        # Sort user ratings by estimated value, best first.
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        top_k = user_ratings[:k]

        # Number of relevant items (over all of the user's ratings).
        no_of_relevant_items = sum((true_rating >= threshold) for (_, true_rating) in user_ratings)

        # Number of recommended items in the top k.
        no_of_recommended_items = sum((predicted_rating >= threshold) for (predicted_rating, _) in top_k)

        # Number of relevant and recommended items in the top k.
        no_of_relevant_and_recommended_items = sum(
            ((true_rating >= threshold) and (predicted_rating >= threshold))
            for (predicted_rating, true_rating) in top_k
        )

        # Precision: proportion of recommended items that are relevant.
        precisions[user_id] = no_of_relevant_and_recommended_items / no_of_recommended_items if no_of_recommended_items != 0 else 1

        # Recall: proportion of relevant items that are recommended.
        recalls[user_id] = no_of_relevant_and_recommended_items / no_of_relevant_items if no_of_relevant_items != 0 else 1

    # Averaging the values for all users.
    average_precision = sum(precisions.values()) / len(precisions)
    average_recall = sum(recalls.values()) / len(recalls)

    # Both averages being 0 would otherwise raise ZeroDivisionError.
    denominator = average_precision + average_recall
    F_score = (2*average_precision*average_recall) / denominator if denominator != 0 else 0.0

    return [average_precision, average_recall, F_score]

In [None]:
def hybrid_approach_test(trainset, testset, weights=None):
  """Blend the saved per-algorithm predictions and report the hybrid's metrics.

  For every algorithm in ``algos`` the pickled prediction list
  ``result_dir + <name> + "predictions"`` is loaded. The estimates are
  combined as a weighted sum and RMSE, MAE, precision, recall and F-score of
  the blend are printed.

  Args:
    trainset: unused; kept for signature parity with ``hybrid_approach_train``.
    testset: unused; kept for signature parity with ``hybrid_approach_train``.
    weights: optional sequence with one weight per algorithm, in the iteration
      order of ``algos``. When omitted or empty, each algorithm is weighted by
      its inverse RMSE (read from ``result_dir + 'rmse_arr'``), normalised so
      the weights sum to 1. The caller's sequence is never modified.

  Raises:
    ValueError: if no predictions were loaded, or if the number of weights
      differs from the number of loaded prediction lists.

  Notes:
    All prediction lists are assumed to cover the same ratings in the same
    order; ids, true ratings and details are taken from the first list.
    NOTE(review): ``hybrid_approach_train`` writes "<name>train_predictions",
    not "<name>predictions" -- confirm which step produces the files read here.
    Only unpickle files created by this notebook.
  """
  predictions_all = []
  for algo in algos:
    with open(result_dir+algo+"predictions",'rb') as file:
      predictions_all.append(pickle.load(file))

  if not predictions_all:
    raise ValueError("no predictions loaded: 'algos' is empty")

  # One list of estimated ratings per algorithm.
  estimate_arr = [[p[3] for p in predictions] for predictions in predictions_all]
  actual_ratings = [p[2] for p in predictions_all[0]]

  if weights is None or len(weights) == 0:
    # Read the scores from result_dir, the folder hybrid_approach_train writes to.
    with open(result_dir+'rmse_arr', 'rb') as file:
      rmse_arr = pickle.load(file)
    inverse_rmse = [1 / e for e in rmse_arr]
    total = sum(inverse_rmse)
    weights = [w / total for w in inverse_rmse]
  else:
    # Work on a copy so the caller's sequence is left untouched.
    weights = list(weights)

  if len(weights) != len(estimate_arr):
    raise ValueError(
      "expected %d weights (one per algorithm), got %d" % (len(estimate_arr), len(weights))
    )

  # The blend is computed for caller-supplied weights as well as derived ones.
  hybrid_estimates = np.zeros(np.asarray(estimate_arr[0]).shape)
  for estimate, weight in zip(estimate_arr, weights):
    hybrid_estimates += np.multiply(estimate, weight)

  print(weights)

  hybrid_predictions = []

  for p, h in zip(predictions_all[0], hybrid_estimates):
    hybrid_predictions.append((p[0], p[1], p[2], h, p[4]))

  rmse, mae = compute_error(actual_ratings, hybrid_estimates)
  [precision, recall, F_score] = precision_recall_calculation(hybrid_predictions, threshold=3.5)

  print("\n" + "-" *5 + " Hybrid algorithm " + "-" *5)
  print("RMSE: ", rmse)
  print("MAE: ", mae)
  print("Precision: ", precision)
  print("Recall: ", recall)
  print("F-Score: ",F_score)

  print(str(rmse) + "\t" + str(mae) + "\t" + str(precision) + "\t" + str(recall) + "\t" + str(F_score))