In [1]:
import pandas as pd
import numpy as np
df = pd.read_pickle('reviews0')

In [2]:
from sklearn.preprocessing import LabelEncoder
from surprise import SVD
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate

In [3]:
le = LabelEncoder()
tr = pd.DataFrame()

In [4]:
tr['userID'] = le.fit_transform(df.userID)
tr['attraction'] = le.fit_transform(df.attraction)
tr['rating'] = df.user_rating.apply(lambda x: int(x[0]))
tr = tr.sample(frac=1)

In [11]:
data = Dataset.load_from_df(tr, Reader(rating_scale=(1, 5)))

In [5]:
%%time
cross_validate(SVD(), data, cv=5, verbose=True, n_jobs=-1)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.6356  0.6348  0.6346  0.6348  0.6346  0.6349  0.0003  
RMSE (testset)    0.8370  0.8367  0.8355  0.8390  0.8379  0.8372  0.0012  
Fit time          26.16   27.95   27.19   25.73   24.99   26.40   1.05    
Test time         1.38    1.29    1.33    1.17    1.22    1.28    0.08    
Wall time: 1min 44s


In [6]:
from surprise import AlgoBase
class mrate(AlgoBase):

    def __init__(self):
        AlgoBase.__init__(self)

    def fit(self, trainset):
        # call base method
        AlgoBase.fit(self, trainset)
        # user mean
#         self.user_r = {}
#         for u in self.trainset.all_users():
#             umean = np.mean([x for _,x in self.trainset.ur[u]])
#             self.user_r.update({u:umean})
            
        # attraction mean
        self.at_r = {}
        for i in self.trainset.all_items():
            imean = np.mean([x for _,x in self.trainset.ir[i]])
            self.at_r.update({i:imean})

        return self

    def estimate(self, u, i):
        if self.trainset.knows_item(i):
            return self.at_r[i]
        else:
            return self.trainset.global_mean

In [12]:
%%time
algo = mrate()
cross_validate(algo, data, cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm mrate on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.6512  0.6489  0.6487  0.6509  0.6487  0.6497  0.0011  
RMSE (testset)    0.8416  0.8358  0.8366  0.8413  0.8344  0.8379  0.0030  
Fit time          0.16    0.34    0.32    0.33    0.32    0.29    0.07    
Test time         1.02    1.01    1.02    1.21    1.00    1.05    0.08    
Wall time: 15.8 s


In [10]:
cross_validate(algo, data, cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm mrate on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.3333  0.3333  0.2000  0.3500  1.7000  0.5833  0.5610  
RMSE (testset)    0.4303  0.4303  0.2828  0.4950  1.9723  0.7222  0.6289  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


{u'fit_time': (0.0, 0.0009999275207519531, 0.0, 0.0, 0.0),
 u'test_mae': array([0.33333333, 0.33333333, 0.2       , 0.35      , 1.7       ]),
 u'test_rmse': array([0.43033148, 0.43033148, 0.28284271, 0.49497475, 1.97230829]),
 u'test_time': (0.0, 0.0, 0.0, 0.0009999275207519531, 0.0009999275207519531)}

In [9]:
t.to_pickle('t.pkl')

In [10]:
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.25)

In [11]:
testset

[(131276L, 240L, 5.0), (108291L, 795L, 5.0), (71464L, 124L, 4.0)]

In [13]:
algo.fit(trainset)

<__main__.mrate at 0x27491a90>

In [14]:
algo.test(testset)

[Prediction(uid=131276L, iid=240L, r_ui=5.0, est=4.555555555555555, details={u'was_impossible': False}),
 Prediction(uid=108291L, iid=795L, r_ui=5.0, est=4.555555555555555, details={u'was_impossible': False}),
 Prediction(uid=71464L, iid=124L, r_ui=4.0, est=4.555555555555555, details={u'was_impossible': False})]

In [15]:
from surprise import accuracy

In [22]:
from surprise.model_selection.split import KFold

In [23]:
kf = KFold(n_splits=3)

In [32]:
for trainset, testset in kf.split(data):
    algo.fit(trainset)
    accuracy.rmse(algo.test(testset))

RMSE: 0.5000
RMSE: 0.6614
RMSE: 0.5000


In [None]:
%%time
cross_validate(algo, data, cv=5, verbose=True, n_jobs=-1)

In [104]:
%%time
algo.test(da.build_full_trainset().build_testset())

[Prediction(uid=1L, iid=1L, r_ui=4.0, est=4.166666666666666, details={u'was_impossible': False}),
 Prediction(uid=1L, iid=2L, r_ui=4.0, est=4.25, details={u'was_impossible': False}),
 Prediction(uid=2L, iid=2L, r_ui=5.0, est=4.5, details={u'was_impossible': False}),
 Prediction(uid=2L, iid=1L, r_ui=4.0, est=4.416666666666666, details={u'was_impossible': False}),
 Prediction(uid=3L, iid=1L, r_ui=5.0, est=4.666666666666666, details={u'was_impossible': False})]

In [105]:
a

Unnamed: 0,userID,attraction,rating
0,1,1,4
1,2,2,5
2,3,1,5
3,1,2,4
4,2,1,4


In [None]:
cross_validate(algo, data, cv=5, verbose=True, n_jobs=-1)

In [96]:
dic[3]

4

In [80]:
a

Unnamed: 0,userID,attraction,rating
0,1,1,4
1,2,2,5
2,3,1,5
3,1,2,4
4,2,1,4


In [35]:
algo=MyOwnAlgorithm()
da = Dataset.load_from_df(a, Reader(rating_scale=(1, 5)))
cross_validate(algo, da, verbose=True)

Evaluating RMSE, MAE of algorithm MyOwnAlgorithm on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
MAE (testset)     0.7500  0.7500  0.5000  0.5000  0.5000  0.6000  0.1225  
RMSE (testset)    0.7500  0.7500  0.5000  0.5000  0.5000  0.6000  0.1225  
Fit time          0.00    0.00    0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    0.00    0.00    


{u'fit_time': (0.0, 0.0, 0.0, 0.0, 0.0),
 u'test_mae': array([0.75, 0.75, 0.5 , 0.5 , 0.5 ]),
 u'test_rmse': array([0.75, 0.75, 0.5 , 0.5 , 0.5 ]),
 u'test_time': (0.0, 0.0, 0.0, 0.0, 0.0)}

In [48]:
algo.fit(da.build_full_trainset())

<__main__.MyOwnAlgorithm at 0x277e30b8>

In [65]:
te = da.build_full_trainset().build_testset()

In [67]:
a.rating.mean()

4.4

In [66]:
algo.test(te)

[Prediction(uid=1L, iid=1L, r_ui=4.0, est=4.4, details={u'was_impossible': False}),
 Prediction(uid=1L, iid=2L, r_ui=4.0, est=4.4, details={u'was_impossible': False}),
 Prediction(uid=2L, iid=2L, r_ui=5.0, est=4.4, details={u'was_impossible': False}),
 Prediction(uid=2L, iid=1L, r_ui=4.0, est=4.4, details={u'was_impossible': False}),
 Prediction(uid=3L, iid=1L, r_ui=5.0, est=4.4, details={u'was_impossible': False})]

In [62]:
algo.predict(1,2)

Prediction(uid=1, iid=2, r_ui=None, est=4.4, details={u'was_impossible': False})

In [46]:
from surprise.model_selection import train_test_split
trainset, testset = train_test_split(data, test_size=.25)

In [25]:
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x27834d68>

In [39]:
te = tr.iloc[3:4]

In [38]:
tr.iloc[3:4]

Unnamed: 0,userID,attraction,rating
54052,315764,99,5


In [47]:
tt = Dataset.load_from_df(te, Reader())

In [46]:
te

Unnamed: 0,userID,attraction,rating
54052,22,99,5


In [41]:
tt.build_full_trainset()

<surprise.trainset.Trainset instance at 0x000000004BCC2388>

In [16]:
sum(df.userID.value_counts() == 1)

116322

In [None]:
for i in df.loc[df.userID == 'A TripAdvisor Member'].index:
    df.iloc[i]['userID'] = 'A TripAdvisor Member {}'.format(i)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [23]:
df.loc[df.userID == 'A TripAdvisor Member']

Unnamed: 0,num,userID,attraction,user_rating,user_location,user_contributions,user_upvotes,viamobile,review_quote,review_content,review_link
256,253,A TripAdvisor Member,National Gallery,40,missing,3,2,via mobile,Great,"Always worth a visit!!!\nIf in London, you can...",/ShowUserReviews-g186338-d188862-r573773285-Na...
272,268,A TripAdvisor Member,National Gallery,40,missing,20,3,via mobile,Amazing,I quite literally spent all day in here and di...,/ShowUserReviews-g186338-d188862-r573512784-Na...
475,469,A TripAdvisor Member,National Gallery,50,missing,23,4,via mobile,Always a good idea.,The museums in London are free and is always a...,/ShowUserReviews-g186338-d188862-r568515619-Na...
818,810,A TripAdvisor Member,National Gallery,40,missing,11,4,via mobile,National Gallery,Amazing.\nI need a few more days.\nAll Museums...,/ShowUserReviews-g186338-d188862-r557975808-Na...
950,939,A TripAdvisor Member,National Gallery,50,missing,11,4,via mobile,Some of the greats here,Beautiful gallery with loads of great art. Not...,/ShowUserReviews-g186338-d188862-r553425319-Na...
1154,1146,A TripAdvisor Member,National Gallery,50,missing,21,2,via mobile,Beautiful,What a lovely place. Well kept. The artwork is...,/ShowUserReviews-g186338-d188862-r545829430-Na...
1347,1338,A TripAdvisor Member,National Gallery,50,missing,11,2,via mobile,Unexpectedly great,"The gallery has so much to see, and it’s free....",/ShowUserReviews-g186338-d188862-r539212127-Na...
1363,1354,A TripAdvisor Member,National Gallery,50,missing,5,0,via mobile,Great visit,As we love art we were very impressed with the...,/ShowUserReviews-g186338-d188862-r538766427-Na...
1371,1362,A TripAdvisor Member,National Gallery,50,missing,6,2,via mobile,Monochrome exhibit was excellent,"It sounded dull, and took some young people (1...",/ShowUserReviews-g186338-d188862-r538512791-Na...
2028,2020,A TripAdvisor Member,National Gallery,40,missing,3,1,via mobile,Enjoy both the old and new,"This museum had it all, an exhibit of Devinci,...",/ShowUserReviews-g186338-d188862-r515602860-Na...


In [17]:
df.userID.value_counts()

A TripAdvisor Member    1326
WMIM                     416
David M                  305
David S                  264
John H                   261
BigHugh51                258
sheepygold               256
Richard F                250
John S                   245
John M                   244
David B                  243
Paul B                   242
John B                   234
Paul H                   226
Chris S                  219
Paul S                   219
John W                   208
David R                  205
Borzov                   204
Taxidevil                203
David G                  196
David C                  195
swift314                 195
David H                  189
David W                  184
Ian C                    184
Peter S                  183
John D                   183
buffalojim67             182
Sandygranny              180
                        ... 
Silva A                    1
Rose-Marie A               1
AugustoReisJr              1
Murry W       