In [1]:
# Imports for ensuring same RNG in multiple runs
from random import shuffle, seed

# Data Processing Imports
import numpy as np
import pandas as pd # for reading and performing operations on the dataset

# Recommender System Imports
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import KNNWithMeans, SVD, SVDpp, NMF
from surprise.model_selection import GridSearchCV
from surprise import accuracy

In [2]:
my_seed = 1812 # random number to ensure same RNG
seed(my_seed) # set the seed for python stdlib randomness
np.random.seed(my_seed) # set the same seed for numpy related randomness

In [3]:
df=pd.read_csv('../train.txt',sep=' ',header=None, names=['user_id', 'item_id', 'rating']) # read the dataset into a dataframe
reader = Reader(rating_scale=(1, 5)) # Configure information about the dataset needed later when reading the dataframe
data=Dataset.load_from_df(df, reader) # Loading data from a pandas dataframe `df` to the name `data`

In [4]:
df.head(5) # Preview of the data loaded

Unnamed: 0,user_id,item_id,rating
0,1,18,4
1,1,23,4
2,1,26,4
3,1,37,4
4,1,53,3


In [5]:
rating_matrix=df.pivot_table(index=['item_id'],columns=['user_id'],values='rating') # Create an Item-User Matrix from the given data
rating_matrix=rating_matrix.fillna(0) # fill the unknown ratings with 0
rating_matrix # preview of the 1617 x 943 Item-User Rating Matrix

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1680,3.0,3.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
1681,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# Configuring Grid Search for `KNNWithMeans`

param_grid = {'k': [40,60], # number of neighbors to consider
              'sim_options': {'name': ['cosine', 'pearson'], # distance measure to use in similarity matrix
                              'min_support': [50,60,100], # minimum number of common users to consider
                              'user_based': [False]} # Not doing a user based match so False
              }

# Initialize Grid Search with necessary paramters
# Setting performance measure as RMSE and Cross Validation to 10 Folds
# Enabling Parallel Processing by setting `n_jobs=-1`
grid_search = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse'], cv=10, n_jobs=-1)

# Fitting the replaced data which is the training data into the model
grid_search.fit(data)

In [7]:
algo = grid_search.best_estimator['rmse'] # get the model that gave the best RMSE
print(grid_search.best_params['rmse']) # get the params that yielded the best RMSE
print(grid_search.best_score['rmse']) # get the best RMSE

trainset = data.build_full_trainset() # create a trainset
algo.fit(trainset) # fit the entire trainset to the model

{'k': 40, 'sim_options': {'name': 'pearson', 'min_support': 50, 'user_based': False}}
1.0351689119799294
Computing the pearson similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNWithMeans at 0x11eee9580>

In [8]:
trainset, testset = train_test_split(data, test_size=0.3)
algo_knn = grid_search.best_estimator['rmse']
algo_knn.fit(trainset)
knn_preds = algo_knn.test(testset) # test the model with test data
accuracy.rmse(knn_preds)

Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.0590


1.0590427815747592

In [9]:
# Re-read data for matrix factorization operations
df=pd.read_csv('../train.txt',sep=' ',header=None, names=['user_id', 'item_id', 'rating']) # read the dataset into a dataframe
reader = Reader(rating_scale=(1, 5)) # Configure information about the dataset needed later when reading the dataframe
data=Dataset.load_from_df(df, reader) # Loading data from a pandas dataframe `df` to the name `data`
trainset, testset = train_test_split(data, test_size=0.3) # split to train test again for SVD

In [10]:
param_grid = {'n_factors': [5,150], # number of common users
              'n_epochs':[30,100], # epochs used in SGD iteration
              'lr_all':[0.005, 0.01], # learning rate for all parameters
              'reg_all':[0.1]} # regularization rate for all paramters
              
gs = GridSearchCV(SVD, param_grid, measures = ['rmse'], cv=3) # SVD Grid Search
gs.fit(data)
print(gs.best_score)
print(gs.best_params)

{'rmse': 0.9458325419502596}
{'rmse': {'n_factors': 150, 'n_epochs': 30, 'lr_all': 0.01, 'reg_all': 0.1}}


In [11]:
algo_svd = gs.best_estimator['rmse']
algo_svd.fit(trainset)
svd_preds = algo_svd.test(testset) # test the model with test data
accuracy.rmse(svd_preds) # check the RMSE of the predicted values

RMSE: 0.9500


0.9499886498298932

In [12]:
algo_svd_pp=SVDpp() # SVD++
algo_svd_pp.fit(trainset) # fit the train data
predictions = algo_svd_pp.test(testset) # test the model with test data
accuracy.rmse(predictions) # check the RMSE of the predicted values

RMSE: 0.8168


0.8167695289515158

In [13]:
with open(f'results.txt','w') as f:
    for user_id in range(1,944): # user_id range, exclusive of end limit
        for item_id in sorted(1,1683): # item_id range, exclusive of end limit
            f.writelines(' '.join(map(str,(user_id, item_id, int(algo.predict(user_id,item_id).est))))) # call the predict method of the algorithm and write the lines to a file
            f.write('\n') # add a new line after every line

In [14]:
# Addressing Cold Start
ratings = pd.DataFrame(df.groupby('item_id')['rating'].mean()) # group by average ratings
ratings['num of ratings'] = pd.DataFrame(df.groupby('item_id')['rating'].count()) # create a new column with item frequency
top_items=ratings[ratings['num of ratings'] > 50] # items that are rated above 50 are considered top
top_items.reset_index(level=0, inplace=True) # reset the index to make sure item_id is a column

In [15]:
def cold_start_solver(user_id):
    ratings = [] # empty list to store ratings of the new user
    to_review = 5 # number of items new user must rate
    to_recommend = 5 # number of items to recommend
    for _ in range(to_review): # loop to get input ratings from user
        item = top_items.sample(1)
        rating = input(f'Rate {item["item_id"].values[0]}; Enter a number in the range (1 - 5), 1-low, 5-high')
        ratings.append({'user_id': user_id, 'item_id': item['item_id'].values[0], 'rating': rating})

    new_df = df.append(ratings, ignore_index=True) # build a new sample df with old ratings appended new ratings
    new_data = Dataset.load_from_df(new_df, reader)

    svd_ =  SVDpp() # create a new model as old model is not aware of the new user data
    svd_.fit(new_data.build_full_trainset())

    items = [(int(svd_.predict(user_id, iid).est),iid) for iid in df['item_id'].unique()] # get the ratings using the `predict` method
    ranked_items=sorted(items,reverse=True)[:to_recommend] # do a reverse sort of `items` list and get only `to_recommend` no of elements
    for recommendation, (_,iid) in enumerate(ranked_items,1): # no need to display the predicted score
        print(f'{recommendation=} : {iid}')

In [16]:
cold_start_solver(999) # for a new user with id 999

Rate 635; Enter a number in the range (1 - 5), 1-low, 5-high5
Rate 1123; Enter a number in the range (1 - 5), 1-low, 5-high5
Rate 1585; Enter a number in the range (1 - 5), 1-low, 5-high5
Rate 1069; Enter a number in the range (1 - 5), 1-low, 5-high5
Rate 258; Enter a number in the range (1 - 5), 1-low, 5-high5
recommendation=1 : 37
recommendation=2 : 95
recommendation=3 : 163
recommendation=4 : 181
recommendation=5 : 361


In [17]:
result_df=pd.read_csv('results.txt',sep=' ',header=None, names=['user_id', 'item_id', 'rating']) # read the dataset into a dataframe
result_df.pivot_table(index=['item_id'],columns=['user_id'],values='rating') # Create an Item-User Matrix from the given data

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3,1,3,2,2,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
2,3,1,3,2,2,3,3,3,3,3,...,3,3,3,3,2,3,3,3,3,3
3,2,1,3,2,2,3,3,3,3,3,...,3,3,3,3,2,3,3,3,3,3
4,3,1,3,2,3,3,3,3,3,3,...,3,3,3,3,2,3,3,3,3,3
5,3,1,3,2,3,3,3,3,3,3,...,3,3,3,3,2,3,3,3,3,3
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,3,1,3,2,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
1679,3,1,3,2,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
1680,3,2,3,3,3,3,3,3,3,3,...,3,3,3,3,3,3,3,3,3,3
1681,3,1,3,2,2,3,3,3,3,3,...,3,3,3,3,2,3,3,3,3,3
