In [175]:
from surprise import Dataset
import pandas as pd
import numpy as np
from surprise import Reader
from surprise import KNNBasic
from surprise.model_selection import cross_validate
from collections import defaultdict
from pprint import pprint

In [176]:
# Toy rating data, one (item, user, rating) record per row so each
# observation can be read as a unit.
rating_records = [
    ('Tuote1', 'Maria', 1),
    ('Tuote2', 'Maria', 1),
    ('Tuote3', 'Maria', 5),
    ('Tuote4', 'Maria', 5),
    ('Tuote1', 'Timo', 5),
    ('Tuote2', 'Timo', 5),
    ('Tuote3', 'Timo', 2),
    ('Tuote4', 'Timo', 2),
    ('Tuote1', 'Arttu', 5),
    ('Tuote3', 'Leo', 5),
    ('Tuote2', 'Leo', 1),
    ('Tuote1', 'Anna', 2),
    ('Tuote4', 'Anna', 4),
    ('Tuote1', 'Saara', 1),
    ('Tuote2', 'Saara', 1),
    ('Tuote3', 'Saara', 5),
]

# Column-oriented view of the same records (key order fixes the column order).
ratings = {
    'itemid': [record[0] for record in rating_records],
    'userid': [record[1] for record in rating_records],
    'rating': [record[2] for record in rating_records],
}

df = pd.DataFrame(ratings)
df

Unnamed: 0,itemid,userid,rating
0,Tuote1,Maria,1
1,Tuote2,Maria,1
2,Tuote3,Maria,5
3,Tuote4,Maria,5
4,Tuote1,Timo,5
5,Tuote2,Timo,5
6,Tuote3,Timo,2
7,Tuote4,Timo,2
8,Tuote1,Arttu,5
9,Tuote3,Leo,5


In [177]:
# Count how often each rating value occurs for each item.
ratings_by_item = df.groupby('itemid')[['rating']]
x = ratings_by_item.value_counts()
x

itemid  rating
Tuote1  1         2
        5         2
        2         1
Tuote2  1         3
        5         1
Tuote3  5         3
        2         1
Tuote4  2         1
        4         1
        5         1
Name: count, dtype: int64

In [178]:
# Average rating per item, displayed with the best-rated items first.
grouped_by_item = df.groupby('itemid')
mean_rating = grouped_by_item[['rating']].mean()
mean_rating.sort_values('rating', ascending=False)


Unnamed: 0_level_0,rating
itemid,Unnamed: 1_level_1
Tuote3,4.25
Tuote4,3.666667
Tuote1,2.8
Tuote2,2.0


In [179]:
# Create a Reader that declares the rating scale (1 to 5).
reader = Reader(rating_scale=(1, 5))

# Build the Surprise dataset from the DataFrame, selecting the columns
# in the order user id, item id, rating.
ratings_frame = df[['userid', 'itemid', 'rating']]
data = Dataset.load_from_df(ratings_frame, reader)
print(data)

<surprise.dataset.DatasetAutoFolds object at 0x000002107F0EB1D0>


In [180]:
# No train/test split is made here because the dataset is so small
# from surprise.model_selection import train_test_split
# trainset, testset = train_test_split(data, test_size=0.25)
# print(trainset)
# print(testset)

In [181]:
# Build the training set from the entire dataset (no hold-out split).
trainset = data.build_full_trainset()
trainset

<surprise.trainset.Trainset at 0x2107f0eabd0>

In [182]:
# Create and train the algorithm.
# user_based=True compares users with each other; set it to False to
# compute similarities between items instead.
sim_options = dict(user_based=True)

algo = KNNBasic(sim_options=sim_options)
algo.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x2107689f3b0>

In [183]:
# Sample prediction for a pair that is in the ratings data:
# Leo rated Tuote3 with 5, which is passed as the true rating r_ui.
user_id, item_id = 'Leo', 'Tuote3'

pred = algo.predict(uid=user_id, iid=item_id, r_ui=5, verbose=True)

user: Leo        item: Tuote3     r_ui = 5.00   est = 4.93   {'actual_k': 4, 'was_impossible': False}


In [184]:
# Sample prediction for a pair that is in the ratings data:
# Timo rated Tuote3 with 2, which is passed as the true rating r_ui.
user_id, item_id = 'Timo', 'Tuote3'

pred = algo.predict(uid=user_id, iid=item_id, r_ui=2, verbose=True)

user: Timo       item: Tuote3     r_ui = 2.00   est = 2.53   {'actual_k': 4, 'was_impossible': False}


In [185]:
# Sample prediction for a pair that is in the ratings data:
# Saara rated Tuote1 with 1, which is passed as the true rating r_ui.
user_id, item_id = 'Saara', 'Tuote1'

pred = algo.predict(uid=user_id, iid=item_id, r_ui=1, verbose=True)

user: Saara      item: Tuote1     r_ui = 1.00   est = 1.38   {'actual_k': 5, 'was_impossible': False}


In [186]:
# Sample prediction for a pair that is in the ratings data:
# Maria rated Tuote1 with 1, which is passed as the true rating r_ui.
user_id, item_id = 'Maria', 'Tuote1'

pred = algo.predict(uid=user_id, iid=item_id, r_ui=1, verbose=True)

user: Maria      item: Tuote1     r_ui = 1.00   est = 1.39   {'actual_k': 5, 'was_impossible': False}


In [187]:
# Then predict ratings for all (user, item) pairs that are NOT in the training set.
# These pairs have no real rating, so the r_ui shown for them is only a
# placeholder fill value (by default the global mean of the training ratings).
testset = trainset.build_anti_testset()
predictions = algo.test(testset)

# Example output: the full list of Prediction tuples.
pprint(predictions)

[Prediction(uid='Arttu', iid='Tuote2', r_ui=3.125, est=4.578947368421052, details={'actual_k': 3, 'was_impossible': False}),
 Prediction(uid='Arttu', iid='Tuote3', r_ui=3.125, est=2.31578947368421, details={'actual_k': 3, 'was_impossible': False}),
 Prediction(uid='Arttu', iid='Tuote4', r_ui=3.125, est=2.3248730964467, details={'actual_k': 3, 'was_impossible': False}),
 Prediction(uid='Leo', iid='Tuote1', r_ui=3.125, est=1.1428571428571428, details={'actual_k': 3, 'was_impossible': False}),
 Prediction(uid='Leo', iid='Tuote4', r_ui=3.125, est=4.793103448275862, details={'actual_k': 2, 'was_impossible': False}),
 Prediction(uid='Anna', iid='Tuote2', r_ui=3.125, est=1.4705882352941175, details={'actual_k': 3, 'was_impossible': False}),
 Prediction(uid='Anna', iid='Tuote3', r_ui=3.125, est=4.647058823529412, details={'actual_k': 3, 'was_impossible': False}),
 Prediction(uid='Saara', iid='Tuote4', r_ui=3.125, est=4.55072463768116, details={'actual_k': 3, 'was_impossible': False})]


In [188]:
# This block copied from Surprise documentation at
# http://surprise.readthedocs.io/en/stable/FAQ.html#how-to-get-the-top-n-recommendations-for-each-user

def get_top_n(predictions, n=10):
    '''Group predictions by user and keep each user's n best-estimated items.

    Args:
        predictions(iterable of 5-tuples): Prediction records of the form
            (user id, item id, true rating, estimated rating, details), as
            returned by the test method of an algorithm.
        n(int): Maximum number of recommendations kept per user. Default
            is 10.

    Returns:
    A defaultdict(list) mapping each user (raw) id to a list of
        (raw item id, rating estimation) tuples, ordered from the highest
        to the lowest estimation and cut to at most n entries.
    '''

    # Collect (item, estimate) pairs under the user they belong to.
    by_user = defaultdict(list)
    for prediction in predictions:
        uid, iid, _true_r, est, _details = prediction
        by_user[uid].append((iid, est))

    # Rank each user's pairs by estimate, best first, and keep the first n.
    # Only existing keys are reassigned, so the dict size never changes
    # while it is being iterated.
    for uid in by_user:
        ranked = sorted(by_user[uid], key=lambda pair: pair[1], reverse=True)
        by_user[uid] = ranked[:n]

    return by_user

top_n = get_top_n(predictions, n=10)

# Print, for each user, the recommended item ids (best estimate first).
for uid in top_n:
    recommended_items = [iid for iid, _est in top_n[uid]]
    print(uid, recommended_items)

Arttu ['Tuote2', 'Tuote4', 'Tuote3']
Leo ['Tuote4', 'Tuote1']
Anna ['Tuote3', 'Tuote2']
Saara ['Tuote4']


In [192]:
# Evaluate accuracy with 3-fold cross-validation.
# MAE = mean absolute error, RMSE = root mean squared error

# The folds are produced by a random shuffle, so without a fixed seed the
# scores change on every run. Seeding NumPy's global RNG right before the
# call makes the split (and therefore the reported numbers) repeatable.
np.random.seed(0)

# Evaluate a fresh, unfitted KNNBasic rather than `algo`: cross_validate
# refits the estimator it is given once per fold, which would replace the
# model that was trained on the full training set above.
cross_validate(
    KNNBasic(sim_options=sim_options),
    data,
    measures=['MAE', 'RMSE'],
    cv=3,
    verbose=True,
)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating MAE, RMSE of algorithm KNNBasic on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
MAE (testset)     0.9588  1.1185  1.9000  1.3258  0.4113  
RMSE (testset)    1.3717  1.7128  2.5397  1.8747  0.4904  
Fit time          0.00    0.00    0.00    0.00    0.00    
Test time         0.00    0.00    0.00    0.00    0.00    


{'test_mae': array([0.95875421, 1.11851852, 1.9       ]),
 'test_rmse': array([1.37170161, 1.71277777, 2.53968502]),
 'fit_time': (0.0, 0.0, 0.0),
 'test_time': (0.0010259151458740234, 0.0, 0.0)}