In [4]:
from collections import defaultdict

from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise import accuracy
from surprise.model_selection import cross_validate

In [73]:
def get_top_n(predictions, n=5):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        print(len(top_n[uid]))
        top_n[uid] = user_ratings[:n]

    return top_n



def surprise_train(n_ratings=1000000):
    import pandas as pd
    # First train an SVD algorithm on the movielens dataset and get RMSE from cross validation
    data_df = pd.read_csv("ratings.csv")
    data_df = data_df.iloc[:n_ratings]
    reader = Reader(rating_scale=(0.5,5.0))
    data = Dataset.load_from_df(data_df[['userId', 'movieId', 'rating']], reader)
    cross_validate(SVD(), data, verbose=True, n_jobs=2)
    
    # # Now we split the set into train and test and fit the algorithm ot the train set
    trainset, testset = train_test_split(data, test_size=.25)
    #trainset = data.build_full_trainset()
    algo = SVD()
    algo.fit(trainset)
    
    # predict top recommendations for each user
    predictions = algo.test(testset)
    top_recos = get_top_n(predictions, n=25)
    return trainset, testset, algo, top_recos, predictions





def testing():
    predictions = algo.test(testset)
    accuracy.rmse(predictions)
    
# Than predict ratings for all pairs (u, i) that are NOT in the training set.
#testset = trainset.build_anti_testset()
#predictions = algo.test(testset)

#top_n = get_top_n(predictions, n=10)
# get RMSE
#accuracy.rmse(predictions)

# Print the recommended items for each user
#for uid, user_ratings in top_n.items():
#    print(uid, [iid for (iid, _) in user_ratings])

In [74]:
train,test,model,top_recos, predictions = surprise_train(100000)


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8976  0.9079  0.9049  0.9015  0.9054  0.9035  0.0036  
MAE (testset)     0.6978  0.7019  0.7006  0.6997  0.6988  0.6998  0.0014  
Fit time          6.34    6.39    6.44    6.34    6.24    6.35    0.07    
Test time         0.22    0.23    0.23    0.22    0.22    0.23    0.01    
12
54
67
35
17
22
56
98
171
73
25
119
90
48
62
43
124
112
21
26
60
11
165
26
32
29
25
540
65
49
25
342
61
323
9
6
3
158
83
69
6
196
28
94
23
41
97
24
13
56
104
26
22
25
13
32
10
36
27
74
30
128
49
115
122
174
53
53
84
26
54
88
85
55
50
15
357
300
88
149
266
121
21
16
252
47
19
71
31
147
157
38
11
64
93
28
210
19
55
9
117
38
37
61
19
213
71
145
54
61
27
346
16
26
21
23
168
28
156
73
1
47
147
197
8
33
26
65
350
77
200
46
57
12
43
68
40
73
22
82
46
10
65
30
123
7
26
37
41
19
45
21
8
163
50
39
268
78
28
27
39
11
194
138
10
46
7
17
9
21
32
94
12
22
30
21
118
53
24
116
5

In [72]:
import numpy as np
test.sort(key=lambda tup: tup[2])
test.sort(key=lambda tup: tup[0])
users = [x[0] for x in test]
#print(users)
arr = np.unique(np.asarray(users))
#print(arr)# user 542 does not exist (the only one in this set)


'''
HAY PROBLEMAS CON LA FORMA EN QUE SURPRISE GENERA PREDICCIONES.
NO GENERA LA MISMA CANTIDAD PARA TODOS LOS USUARIO Y ESO NO ES BUENO
VAMOS A DEJAR ESTA COSA ENRREDADA PARA DESPUES
'''


#print(test)
#print(type(top_recos.keys()))
print(len(top_recos[4]))
print(predictions[0])
print(predictions[0][4])

7
user: 104        item: 2340       r_ui = 1.00   est = 3.06   {'was_impossible': False}
{'was_impossible': False}


In [6]:
from surprise import BaselineOnly
from surprise import SVD
from surprise import Dataset
from surprise import Reader
import os
import pandas as pd
from surprise.model_selection import cross_validate

n_ratings = 1000000

# path to dataset file
file_path = os.path.expanduser('~/PROJS/hallmark/surprise_code/12_12_recosys/ratings_new.csv')
#reader = Reader(line_format='user item rating timestamp', sep=',', rating_scale=(1,10),skip_lines=1)
#data = Dataset.load_from_file(file_path, reader=reader)
data_df = pd.read_csv("ratings.csv")
data_df = data_df.iloc[:n_ratings]
reader = Reader(rating_scale=(0.5,5.0))
data = Dataset.load_from_df(data_df[['userId', 'movieId', 'rating']], reader)


# We can now use this dataset as we please, e.g. calling cross_validate
cross_validate(SVD(), data, verbose=True, n_jobs=2)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8368  0.8336  0.8319  0.8355  0.8333  0.8342  0.0017  
MAE (testset)     0.6396  0.6382  0.6365  0.6387  0.6381  0.6382  0.0010  
Fit time          35.58   34.20   36.55   35.26   35.83   35.48   0.77    
Test time         1.39    1.34    1.38    1.43    1.41    1.39    0.03    


{'test_rmse': array([0.83682816, 0.83358649, 0.8318936 , 0.83549357, 0.83332664]),
 'test_mae': array([0.63955108, 0.63824305, 0.63648125, 0.63866097, 0.63813269]),
 'fit_time': (35.58294677734375,
  34.2040376663208,
  36.54542112350464,
  35.25814771652222,
  35.834367513656616),
 'test_time': (1.3891146183013916,
  1.3417341709136963,
  1.3804545402526855,
  1.4315814971923828,
  1.4110774993896484)}

In [None]:
# n_jobs=5

#{'test_rmse': array([0.83286841, 0.83403583, 0.83453788, 0.837163  , 0.83377609]),
# 'test_mae': array([0.63797554, 0.63812647, 0.6379149 , 0.63997025, 0.6375087 ]),
# 'fit_time': (34.25440835952759,
#  37.010011434555054,
#  36.49393129348755,
#  36.3558554649353,
#  33.53095293045044),
# 'test_time': (1.2990314960479736,
#  1.3195006847381592,
#  1.2744128704071045,
#  1.252169132232666,
#  1.2351505756378174)}

In [None]:
# n_jobs=1


#{'test_rmse': array([0.83318748, 0.8343154 , 0.83587293, 0.83275152, 0.83569748]),
# 'test_mae': array([0.63789076, 0.63755536, 0.63905574, 0.63709123, 0.63979902]),
# 'fit_time': (34.5315260887146,
#  35.73594927787781,
#  35.420734882354736,
#  35.074718952178955,
#  35.664207458496094),
# 'test_time': (2.5257678031921387,
#  2.1665899753570557,
#  2.3544838428497314,
#  2.5684831142425537,
#  2.2046148777008057)}

In [None]:
# n_jobs = 2

#{'test_rmse': array([0.83682816, 0.83358649, 0.8318936 , 0.83549357, 0.83332664]),
# 'test_mae': array([0.63955108, 0.63824305, 0.63648125, 0.63866097, 0.63813269]),
# 'fit_time': (35.58294677734375,
#  34.2040376663208,
#  36.54542112350464,
#  35.25814771652222,
#  35.834367513656616),
# 'test_time': (1.3891146183013916,
#  1.3417341709136963,
#  1.3804545402526855,
#  1.4315814971923828,
#  1.4110774993896484)}