# Surprise homework

In [1]:
import io
import pandas as pd
from surprise import get_dataset_dir

from collections import defaultdict

### Functions

In [2]:
from surprise import Dataset
from surprise import SVD
from surprise import accuracy

from surprise.model_selection import cross_validate
from surprise.model_selection import train_test_split

### Load data

In [3]:
data = Dataset.load_builtin('ml-100k')

In [4]:
K = 30
trainset, testset = train_test_split(data, test_size=.25)

### Define algorithms

In [5]:
from surprise import NormalPredictor
from surprise import SVD
from surprise import KNNWithMeans

algorithms_rsma = {}
algorithms = {
    "n_p": NormalPredictor(),
    "kNN_cos": KNNWithMeans(k = K, sim_options = {'name': 'cosine'}),
    "kNN_MSD": KNNWithMeans(k = K),
    "kNN_Pearson": KNNWithMeans(k = K, sim_options = {'name': 'pearson'}),
    "SVD" : SVD()
}

### Select algorithm

In [6]:
for [name, algo] in algorithms.items():
    cv = cross_validate(algo, data, measures=['RMSE'], verbose=True)
    algorithms_rsma[name] = cv['test_rmse'].mean()

Evaluating RMSE of algorithm NormalPredictor on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.5169  1.5142  1.5280  1.5162  1.5180  1.5187  0.0049  
Fit time          0.12    0.12    0.13    0.10    0.12    0.12    0.01    
Test time         0.16    0.12    0.16    0.15    0.13    0.15    0.02    
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9584  0.9627  0.9534  0.9565  0.9581  0.9578  0.0030  
Fit time          1.50    1.56    1.45    1.50    1

### Calculate precision@k and recall@k

In [7]:
#Определим самый быстрый алгоритм
fastest_algorithm = min(algorithms_rsma.items(), key=lambda i: i[1])
print(fastest_algorithm)
precision_algorithm = algorithms[fastest_algorithm[0]]

def precision_recall_at_k(predictions, k, threshold):
    "Return precision and recall at k metrics for each user"

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        # When n_rec_k is 0, Precision is undefined. We here set it to 0.

        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        # When n_rel is 0, Recall is undefined. We here set it to 0.

        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 0

    return precisions, recalls
precision_algorithm.fit(trainset)
predictions = precision_algorithm.test(testset)

precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=3.52)

# Precision and recall averaged over all users
precision_at_k = sum(prec for prec in precisions.values()) / len(precisions)
recall_at_k = sum(rec for rec in recalls.values()) / len(recalls)

print(precision_at_k)
print(recall_at_k)

('SVD', 0.9356722113886086)
0.7379109225874864
0.3684181101197005


### Predict

In [8]:
# обратите внимание на функцию build_anti_testset
k = 5
user_index = '16'

precision_algorithm.fit(trainset)
testset = trainset.build_anti_testset()
predictions = precision_algorithm.test(testset)

user_predictions = list(filter(lambda x: x.uid == user_index, predictions))
unrated_tuple = sorted(user_predictions, key=lambda x: x.est, reverse=True)[0:k]

def get_movie_info(urated_tuple):
    data = pd.read_csv(get_dataset_dir() + '/ml-100k/ml-100k/u.item', sep='|',encoding='ISO-8859-1', header = None) 
    movie_info = {}
    for movie_id in urated_tuple:
        row = data.iloc[int(movie_id)]
        movie_info[movie_id] = (row[1], row[2])
    return movie_info
movie_info = get_movie_info(list(map(lambda x: x[1], unrated_tuple)))

### Запись в файл

In [9]:
file = open("task2.2_Korop.txt", "w")
file.write('User ' + user_index + "\n")
print('User ' + user_index)
for movie in unrated_tuple:
    file.write('{}, {}, {}'.format(movie[1], movie_info[movie[1]], round(movie[3], 3)))
    file.write("\n")
    print(movie[1],movie_info[movie[1]], movie[3])
file.close()

User 16
408 ('Jack (1996)', '07-Aug-1996') 5
169 ('Cinema Paradiso (1988)', '01-Jan-1988') 5
22 ('Taxi Driver (1976)', '16-Feb-1996') 5
12 ('Mighty Aphrodite (1995)', '30-Oct-1995') 4.995386042950577
651 ('Rosencrantz and Guildenstern Are Dead (1990)', '01-Jan-1990') 4.975425681235239
