In [1]:
import pandas as pd
import numpy as np
# Load the reviews DataFrame; later cells read its userID, attraction and
# user_rating columns.
# NOTE(review): unpickling can execute arbitrary code -- only load 'reviews0'
# when the file comes from a trusted source.
df = pd.read_pickle('reviews0')

In [22]:
from sklearn.preprocessing import LabelEncoder
from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate

In [45]:
# Empty frame; the following cell fills in the userID / attraction / rating columns.
tr = pd.DataFrame()
# Encoder used to map raw identifiers to integer codes.
le = LabelEncoder()

In [46]:
# Build the (user, item, rating) frame, one column at a time so the column
# order is fixed regardless of Python version.
#
# Users get their own encoder: refitting the single `le` on attractions right
# after fitting it on users throws away the user classes, so encoded user ids
# could no longer be mapped back. `le` still ends up fitted on attractions,
# exactly as before.
user_le = LabelEncoder()
tr['userID'] = user_le.fit_transform(df.userID)
tr['attraction'] = le.fit_transform(df.attraction)
# The first character of each user_rating value is parsed as the integer score
# (presumably values look like "5 of 5 ..." -- TODO confirm against the data).
tr['rating'] = df.user_rating.apply(lambda x: int(x[0])).values
# Shuffle all rows with a fixed seed so a fresh kernel reproduces the same order.
tr = tr.sample(frac=1, random_state=0)

In [47]:
# Wrap the ratings frame as a Surprise dataset; ratings are declared to lie on a 1-5 scale.
data = Dataset.load_from_df(
    df=tr,
    reader=Reader(rating_scale=(1, 5)),
)

In [5]:
%%time
cross_validate(SVD(), data, cv=5, verbose=True, n_jobs=-1)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.6356  0.6348  0.6346  0.6348  0.6346  0.6349  0.0003  
RMSE (testset)    0.8370  0.8367  0.8355  0.8390  0.8379  0.8372  0.0012  
Fit time          26.16   27.95   27.19   25.73   24.99   26.40   1.05    
Test time         1.38    1.29    1.33    1.17    1.22    1.28    0.08    
Wall time: 1min 44s


In [6]:
from surprise import AlgoBase
class mrate(AlgoBase):
    """Baseline predictor: every item is estimated by its mean training rating.

    The user argument is ignored. Items that the training set does not know
    fall back to the training set's global mean.
    """

    def __init__(self):
        AlgoBase.__init__(self)

    def fit(self, trainset):
        """Compute the per-item mean rating from ``trainset``.

        The base-class ``fit`` is called first; ``self.trainset`` is then read
        to build ``self.at_r``, a dict mapping each item id yielded by
        ``all_items()`` to the mean of the ratings listed for it in ``ir``.

        Returns:
            self, so the call can be chained.
        """
        AlgoBase.fit(self, trainset)

        ratings_by_item = self.trainset.ir
        self.at_r = {
            item: np.mean([rating for _, rating in ratings_by_item[item]])
            for item in self.trainset.all_items()
        }
        return self

    def estimate(self, u, i):
        """Return the stored mean for item ``i``, or the global mean if ``i`` is unknown."""
        if not self.trainset.knows_item(i):
            return self.trainset.global_mean
        return self.at_r[i]

In [12]:
%%time
algo = mrate()
cross_validate(algo, data, cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm mrate on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.6512  0.6489  0.6487  0.6509  0.6487  0.6497  0.0011  
RMSE (testset)    0.8416  0.8358  0.8366  0.8413  0.8344  0.8379  0.0030  
Fit time          0.16    0.34    0.32    0.33    0.32    0.29    0.07    
Test time         1.02    1.01    1.02    1.21    1.00    1.05    0.08    
Wall time: 15.8 s


In [10]:
from surprise.model_selection import train_test_split
# Hold out 25% of the ratings for testing. The split is random, so a fixed
# seed is passed to make it reproducible on a fresh kernel.
trainset, testset = train_test_split(data, test_size=0.25, random_state=0)

In [15]:
from surprise import accuracy

In [22]:
from surprise.model_selection.split import KFold

In [23]:
# 3-fold splitter. Surprise's KFold shuffles by default, so a fixed seed is
# passed to make the folds (and the reported RMSEs) reproducible.
kf = KFold(n_splits=3, random_state=0)

In [32]:
# Manual cross-validation: refit `algo` on each fold's training part and
# print the RMSE of its predictions on the held-out part.
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    accuracy.rmse(predictions, verbose=True)

RMSE: 0.5000
RMSE: 0.6614
RMSE: 0.5000


In [7]:
# Load the second reviews DataFrame; later cells read its userID, attraction
# and user_rating columns.
# NOTE(review): unpickling can execute arbitrary code -- only load
# 'reviews1.pkl' when the file comes from a trusted source.
t = pd.read_pickle('reviews1.pkl')

In [10]:
# Boolean Series indexed by userID: True where that user has more than 2 rows in `t`.
id2 = t['userID'].value_counts().gt(2)

In [14]:
# Index of the userIDs whose flag in `id2` is True.
u2 = id2[id2].index

In [18]:
# Keep only the rows whose userID is in `u2`. The vectorised `isin` replaces a
# per-row Python lambda doing the same membership test.
t2 = t.loc[t.userID.isin(u2)]

In [39]:
# Rebuild the (user, item, rating) frame from the filtered reviews in `t2`.
# Columns are added one at a time so their order is fixed regardless of
# Python version.
#
# Users get their own encoder: refitting the single `le` on attractions right
# after fitting it on users throws away the user classes, so encoded user ids
# could no longer be mapped back. `le` still ends up fitted on attractions,
# exactly as before.
le = LabelEncoder()
user_le = LabelEncoder()
tr = pd.DataFrame()
tr['userID'] = user_le.fit_transform(t2.userID)
tr['attraction'] = le.fit_transform(t2.attraction)
# The first character of each user_rating value is parsed as the integer score
# (presumably values look like "5 of 5 ..." -- TODO confirm against the data).
tr['rating'] = t2.user_rating.apply(lambda x: int(x[0])).values

In [40]:
# Wrap the filtered ratings frame as a Surprise dataset on a 1-5 rating scale.
data = Dataset.load_from_df(
    df=tr,
    reader=Reader(rating_scale=(1, 5)),
)

In [42]:
%%time
cross_validate(SVD(), data, cv=5, verbose=True, n_jobs=-1)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.5987  0.5957  0.6034  0.5976  0.5978  0.5986  0.0026  
RMSE (testset)    0.7885  0.7860  0.7965  0.7849  0.7928  0.7897  0.0043  
Fit time          16.23   16.32   16.33   16.19   16.05   16.23   0.10    
Test time         0.78    0.73    0.71    0.72    0.76    0.74    0.03    
Wall time: 56.7 s


{u'fit_time': (16.22699999809265,
  16.324999809265137,
  16.328999996185303,
  16.194999933242798,
  16.05299997329712),
 u'test_mae': array([0.5986822 , 0.59569447, 0.60335191, 0.59755149, 0.59783255]),
 u'test_rmse': array([0.78848976, 0.7860169 , 0.796534  , 0.78487375, 0.7927735 ]),
 u'test_time': (0.7810001373291016,
  0.7260000705718994,
  0.7100000381469727,
  0.7179999351501465,
  0.7580001354217529)}

In [52]:
# Same dataset construction, restricted to the first 428683 rows of `tr`.
# NOTE(review): 428683 is an unexplained row count -- presumably chosen to
# match the size of another dataset; confirm and give it a named constant.
data = Dataset.load_from_df(
    df=tr.iloc[:428683],
    reader=Reader(rating_scale=(1, 5)),
)

In [53]:
%%time
cross_validate(SVD(), data, cv=5, verbose=True, n_jobs=-1)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.6372  0.6402  0.6400  0.6394  0.6357  0.6385  0.0018  
RMSE (testset)    0.8365  0.8424  0.8401  0.8426  0.8405  0.8404  0.0022  
Fit time          16.54   16.69   16.79   16.61   16.45   16.62   0.12    
Test time         0.83    0.82    0.81    0.79    0.78    0.80    0.02    
Wall time: 1min 4s


{u'fit_time': (16.538999795913696,
  16.693000078201294,
  16.794999837875366,
  16.61299991607666,
  16.44599986076355),
 u'test_mae': array([0.63716556, 0.64019452, 0.63998636, 0.63935787, 0.63567301]),
 u'test_rmse': array([0.83653499, 0.84244422, 0.84006237, 0.84258364, 0.8405265 ]),
 u'test_time': (0.8250000476837158,
  0.8199999332427979,
  0.8080000877380371,
  0.7860000133514404,
  0.7820000648498535)}