In [33]:

import pandas as pd
import numpy as np
from tabulate import tabulate

from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate
from surprise.model_selection import KFold

from surprise import KNNBasic
from surprise import SVD
from surprise import BaselineOnly
from surprise import KNNWithZScore
from surprise import CoClustering
from surprise import NormalPredictor


In [34]:
train_data = pd.read_csv("../data/train-PDA2019.csv")
train_data.head()
#train_data = train_data.iloc[:,0:3]

Unnamed: 0,userID,itemID,rating,timeStamp
0,5,648,5,978297876
1,5,1394,5,978298237
2,5,3534,5,978297149
3,5,104,4,978298558
4,5,2735,5,978297919


In [35]:
test_data = pd.read_csv("../data/test-PDA2019.csv")
test_data.columns = ['userID', 'recommended_itemID']

In [36]:
train_data.sort_values("userID")


Unnamed: 0,userID,itemID,rating,timeStamp
435825,1,919,4,978301467
435827,1,608,4,978301445
435828,1,2797,4,978302039
435829,1,1545,4,978824176
435830,1,594,4,978302292
435826,1,1721,4,978300150
435843,3,3108,3,978299752
435842,3,3735,3,978298845
435841,3,1213,2,978298496
435840,3,1552,3,978300021


In [37]:
# I commented this code out as it takes too long to run
'''
recommenders = (SVD, KNNBasic, BaselineOnly, KNNWithZScore, CoClustering, NormalPredictor)
titles = ('SVD','KNN-Basic','KNN-Baseline','KNNWithZScore','CoClustering','Random')


kf = KFold(random_state=0)

table = []
fold_n = 0
for rec in recommenders:
    out = cross_validate(rec(), data, ['rmse','mae','fcp'],kf)
    mean_rmse = '{:.3f}'.format(np.mean(out['test_rmse']))
    mean_mae = '{:.3f}'.format(np.mean(out['test_mae']))
    mean_fcp = '{:.3f}'.format(np.mean(out['test_fcp']))
    
    new_line = [titles[fold_n], mean_rmse, mean_mae, mean_fcp]
    table.append(new_line)
    fold_n += 1 
'''

"\nrecommenders = (SVD, KNNBasic, BaselineOnly, KNNWithZScore, CoClustering, NormalPredictor)\ntitles = ('SVD','KNN-Basic','KNN-Baseline','KNNWithZScore','CoClustering','Baseline','Random')\n\n\nkf = KFold(random_state=0)\n\ntable = []\nfold_n = 0\nfor rec in recommenders:\n    out = cross_validate(rec(), data, ['rmse','mae','fcp'],kf)\n    mean_rmse = '{:.3f}'.format(np.mean(out['test_rmse']))\n    mean_mae = '{:.3f}'.format(np.mean(out['test_mae']))\n    mean_fcp = '{:.3f}'.format(np.mean(out['test_fcp']))\n    \n    new_line = [titles[fold_n], mean_rmse, mean_mae, mean_fcp]\n    table.append(new_line)\n    fold_n += 1 \n"

In [38]:
'''
header = ['Recommenders','Pred Accuracy (RMSE)','Pred Accuracy (MAE)', 'Rank quality (FCP)']

print(tabulate(table, header, tablefmt='pipe'))

| Recommenders   |   Pred Accuracy (RMSE) |   Pred Accuracy (MAE) |   Rank quality (FCP) | 
|:---------------|-----------------------:|----------------------:|---------------------:|
| SVD            |                  0.885 |                 0.695 |                0.72  |
| KNN-Basic      |                  0.932 |                 0.731 |                0.715 |
| KNN-Baseline   |                  0.903 |                 0.713 |                0.707 |
| KNNWithZScore  |                  0.929 |                 0.734 |                0.704 |
| CoClustering   |                  0.91  |                 0.711 |                0.714 |
| Random         |                  1.48  |                 1.184 |                0.496 |
'''

"\nheader = ['Recommenders','Pred Accuracy (RMSE)','Pred Accuracy (MAE)', 'Rank quality (FCP)']\n\nprint(tabulate(table, header, tablefmt='pipe'))\n\n| Recommenders   |   Pred Accuracy (RMSE) |   Pred Accuracy (MAE) |   Rank quality (FCP) | \n|:---------------|-----------------------:|----------------------:|---------------------:|\n| SVD            |                  0.885 |                 0.695 |                0.72  |\n| KNN-Basic      |                  0.932 |                 0.731 |                0.715 |\n| KNN-Baseline   |                  0.903 |                 0.713 |                0.707 |\n| KNNWithZScore  |                  0.929 |                 0.734 |                0.704 |\n| CoClustering   |                  0.91  |                 0.711 |                0.714 |\n| Baseline       |                  1.48  |                 1.184 |                0.496 |\n"

In [39]:
#Because the Rank is high and the Errors are smaller for SVD I will continue with this recommender

#[U,S,V] = np.linalg.svd(train_data)
#print("---------U-------")
#print(U)
#print("---------S-------")
#print(S)
#print("---------V-------")
#print(V)



In [40]:

#####----------------- SVD ------------------


In [41]:
reader = Reader(rating_scale=(1,5))
data = Dataset.load_from_df(train_data[['userID', 'itemID','rating']], reader)


In [42]:
rating_df = data.build_full_trainset()

In [43]:
user_rating = rating_df.ur
item_rating = rating_df.ir
num_users = rating_df.n_users
num_items = rating_df.n_items
num_ratings = rating_df.n_ratings


mean_ = rating_df.global_mean
mean_ = np.round(mean_, decimals=2)

density_ = num_ratings/(num_users*num_items)
sparsity_ = 1 - density_

In [44]:
print("Users", num_users)
print("items", num_items)
print("ratings ", num_ratings)
print("Mean rating", mean_)
print("density " ,density_ *100, "%")
print("sparsity ", sparsity_ *100 ,"%")

Users 5690
items 1824
ratings  470711
Mean rating 3.64
density  4.5354172447815495 %
sparsity  95.46458275521846 %


In [45]:
recommender = SVD()

In [46]:
recommender.fit(rating_df)
# output <surprise.prediction_algorithms.matrix_factorization.SVD at 0x18664ff0>

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x11b40df0>

In [47]:
print("number of factors used: ", recommender.n_factors)
#output: number of factors used:  100


number of factors used:  100


In [48]:
userid = 1
itemid = 104
true_rating = 4.0

pred = recommender.predict(userid,itemid)
pred_rating = np.around(pred.est, decimals=1)
print(pred_rating)
print (pred)
print(type(pred))

3.4
user: 1          item: 104        r_ui = None   est = 3.37   {'was_impossible': False}
<class 'surprise.prediction_algorithms.predictions.Prediction'>


In [49]:
#Get a set of unique items that were rated in training data
items_train = train_data["itemID"]
items = items_train.sort_values()
items = items.as_matrix(items)
items_set = set(items)
len(items_set)
#len(items)

  after removing the cwd from sys.path.


849

In [50]:
#Get an array of users in the test data
users_test = test_data["userID"].as_matrix()
#test_data.as_matrix(test_data, columns= 0)
users_test

  


array([    1,     3,    11, ..., 12061, 12063, 12073], dtype=int64)

In [51]:
#all_recommendations = pd.DataFrame(columns=["userID","itemID","recommendation estimate"])
#all_recommendations["userID"] = users_test
#all_recommendations

In [52]:
predictions = []

i = 0
for user in users_test:
    for item in items_set:
        predictions += recommender.predict(user,item)
    i += 1

In [53]:
# 0 is the user
# 1 is the item id
# 3 is the prediction
predictions

[1,
 2050,
 None,
 2.6667682349954136,
 {'was_impossible': False},
 1,
 2053,
 None,
 2.2241146727026253,
 {'was_impossible': False},
 1,
 2054,
 None,
 2.903004211596921,
 {'was_impossible': False},
 1,
 2057,
 None,
 3.6242901464265023,
 {'was_impossible': False},
 1,
 2058,
 None,
 3.5653246611867044,
 {'was_impossible': False},
 1,
 2059,
 None,
 3.2866877188468857,
 {'was_impossible': False},
 1,
 2064,
 None,
 4.490379623046693,
 {'was_impossible': False},
 1,
 2065,
 None,
 3.77938464839756,
 {'was_impossible': False},
 1,
 2067,
 None,
 4.17796772393318,
 {'was_impossible': False},
 1,
 2077,
 None,
 3.56633926702054,
 {'was_impossible': False},
 1,
 2078,
 None,
 3.8326461463592034,
 {'was_impossible': False},
 1,
 2080,
 None,
 3.874518949423004,
 {'was_impossible': False},
 1,
 2081,
 None,
 3.8383074119401486,
 {'was_impossible': False},
 1,
 2082,
 None,
 3.0375067018303343,
 {'was_impossible': False},
 1,
 2083,
 None,
 3.899911813903294,
 {'was_impossible': False},
 1,
 

In [54]:
# Cut the string into each individual prediction
ind_pred = []
for i in range(0, len(predictions), 5):
    ind_pred.append(predictions[i:i + 5])
        

In [55]:
columns = ["userID", "itemID", "none", "predEst", "impossible"]
df = pd.DataFrame(ind_pred , columns=columns)

In [56]:
# dataframe containing all individual rating estimations.
df = df.sort_values(by=['userID','predEst'], ascending=False)
df

Unnamed: 0,userID,itemID,none,predEst,impossible
1690864,12073,1148,,4.358928,{'was_impossible': False}
1690629,12073,608,,4.266238,{'was_impossible': False}
1690711,12073,858,,4.232650,{'was_impossible': False}
1690908,12073,1240,,4.201220,{'was_impossible': False}
1690466,12073,260,,4.194182,{'was_impossible': False}
1690666,12073,2762,,4.190969,{'was_impossible': False}
1690812,12073,3091,,4.169999,{'was_impossible': False}
1690890,12073,1207,,4.118689,{'was_impossible': False}
1690595,12073,541,,4.097803,{'was_impossible': False}
1690878,12073,1196,,4.095246,{'was_impossible': False}


In [57]:
df = df.iloc[:,0:4]
len(df["userID"])

1691208

In [58]:
# make a list of all top 10s
top10 = []
for i in range(0, len(df["userID"]), len(items_set)):
    top10.append(df[i:i+10])

top10[0]

Unnamed: 0,userID,itemID,none,predEst
1690864,12073,1148,,4.358928
1690629,12073,608,,4.266238
1690711,12073,858,,4.23265
1690908,12073,1240,,4.20122
1690466,12073,260,,4.194182
1690666,12073,2762,,4.190969
1690812,12073,3091,,4.169999
1690890,12073,1207,,4.118689
1690595,12073,541,,4.097803
1690878,12073,1196,,4.095246


In [59]:
# User ID and 10 Item IDs
cols = ["userID", "recommended_itemIDs"]
top10_df = pd.DataFrame(columns = cols)

top10_df.iloc[:,0:2]

Unnamed: 0,userID,recommended_itemIDs


In [60]:
#---------TESTING
value = top10[0].iloc[2]['userID']

In [61]:
items_string = ""
items_str_list = []
users_list = []
for elem in top10:
    list_items = elem["itemID"].tolist()
    string_items = str(list_items).strip('[]').replace(",", " ")
    items_str_list.append(string_items)
    users_list.append(elem.iloc[2]['userID'])


In [62]:
top10_df["recommended_itemIDs"] = items_str_list
top10_df["userID"] = users_list

In [65]:
top10 = top10_df.sort_values("userID")
top10_df.to_csv("../data/competition.csv", index=False)

In [64]:
top10_df


Unnamed: 0,userID,recommended_itemIDs
0,12073,1148 608 858 1240 260 2762 3091 1207 5...
1,12063,913 912 858 750 904 1207 2324 908 898 ...
2,12061,912 608 926 904 913 1207 1307 1682 119...
3,12051,750 608 318 260 593 2997 908 1148 3091...
4,12047,2329 1949 2324 1223 1207 318 3435 908 ...
5,12043,318 246 1207 858 2762 1148 1949 904 34...
6,12037,750 593 318 913 2959 1206 2997 2692 91...
7,12031,858 2858 923 912 608 593 1299 2692 303...
8,12029,1207 2028 608 318 3091 1136 858 1193 1...
9,12025,1256 913 1207 912 2997 1148 2731 898 3...
