# Recommendation System Notebook Sample

0. [Params](#Params)
1. [Acquisitor and Cleaner](#Acquisitor-and-Cleaner)
2. [Training Preparator](#Training-Preparator)
3. [Trainer](#Trainer)
4. [Metrics Evaluator](#Metrics-Evaluator)
5. [Prediction Preparator](#Prediction-Preparator)
6. [Predictor](#Predictor)
7. [Feedback](#Feedback)
8. [Sample Application](#Sample-Application)

# Recommendation System

In [2]:
import marvin_recommendation_system_engine

# Params

In [3]:
from surprise import NormalPredictor
from surprise import BaselineOnly
from surprise import KNNBasic
from surprise import KNNWithMeans
from surprise import KNNBaseline
from surprise import SVD
from surprise import SVDpp
from surprise import NMF
from surprise import SlopeOne
from surprise import CoClustering


params = {
    "param_grid": {
        'n_epochs': [5, 10, 20], 
        'lr_all': [0.002, 0.005],
        'reg_all': [0.4, 0.6]  
    },
    "algo": "SVD",
    "measures": ['rmse', 'mae'],
    "n_cv": 3,
    
    "prediction": {
        "pred_type": "all",
        "n_pred": 10
    }
}


params = {
    "param_grid": {
        "k": [20, 30, 40]
    },
    "sim_options": {
        'name': 'pearson_baseline',
        'user_based': False  # compute  similarities between items
    },
    "algo": "KNNBaseline",
    "measures": ['rmse', 'mae'],
    "n_cv": 3,
    
    "prediction": {
        "pred_type": "all",
        "n_pred": 10
    }
}

# Aquisitor and Cleaner

In [4]:
from surprise import Dataset

# Load the movielens-100k dataset (download it if needed),
data = Dataset.load_builtin('ml-100k')

marvin_initial_dataset = {
    "data": data
}

# Training Preparator

In [5]:
trainset = marvin_initial_dataset["data"].build_full_trainset()
testset = trainset.build_anti_testset()

marvin_dataset = {
    "data": marvin_initial_dataset["data"],
    "trainset": trainset,
    "testset": testset
}

# Trainer

In [14]:
params = {
    "param_grid": {
        "k": [20, 30, 40]
    },
    "sim_options": {
        'name': 'pearson_baseline',
        'user_based': False  # compute  similarities between items
    },
    "algo": "KNNBaseline",
    "measures": ['rmse', 'mae'],
    "n_cv": 3,
    
    "prediction": {
        "pred_type": "all",
        "n_pred": 10
    }
}

from surprise.model_selection import GridSearchCV
from surprise import SVD
from surprise import KNNBaseline

algo_dict = {"SVD": SVD, "KNNBaseline": KNNBaseline}
gs = GridSearchCV(
    algo_dict[params["algo"]],
    params["param_grid"],
    measures=params["measures"],
    cv=params["n_cv"])

gs.fit(marvin_dataset["data"])

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.


In [8]:
from surprise.model_selection import GridSearchCV
from surprise import SVD
from surprise import KNNBaseline
algo_dict = {"SVD": SVD, "KNNBaseline": KNNBaseline}

params = {
    "param_grid": {
        'n_epochs': [5, 10, 20], 
        'lr_all': [0.002, 0.005],
        'reg_all': [0.4, 0.6]  
    },
    "algo": "SVD",
    "measures": ['rmse', 'mae'],
    "n_cv": 3,
    
    "prediction": {
        "pred_type": "all",
        "n_pred": 10
    }
}

gs2 = GridSearchCV(
    algo_dict[params["algo"]],
    params["param_grid"],
    measures=params["measures"],
    cv=params["n_cv"])

gs2.fit(marvin_dataset["data"])

In [360]:
predictions = gs.best_estimator["rmse"].fit(trainset).test(testset)
# RMSE should be low as we are biased
accuracy.rmse(predictions, verbose=True)

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
RMSE: 0.7925


0.79246878945876542

In [9]:
predictions = gs2.best_estimator["rmse"].fit(trainset).test(testset)
# RMSE should be low as we are biased
accuracy.rmse(predictions, verbose=True)

NameError: name 'accuracy' is not defined

In [10]:
from surprise import accuracy
accuracy.rmse(predictions, verbose=True)

RMSE: 0.4570


0.45701905530241249

In [15]:
gs.best_score["rmse"]

0.93545240740713675

In [13]:
gs2.best_score["rmse"]

0.95874267399728408

In [206]:
from surprise.model_selection import GridSearchCV
from surprise import SVD
from surprise import KNNBaseline


gs = GridSearchCV(
    params["algo"],
    params["param_grid"],
    measures=params["measures"],
    cv=params["n_cv"])

gs.fit(marvin_dataset["data"])

# We can now use the algorithm that yields the best rmse:
algo = gs.best_estimator['rmse']
algo.fit(marvin_dataset["trainset"])


# Get the predictions for null values in the set
if params["prediction"]["pred_type"] == "top_n":
    predictions = algo.test(marvin_dataset["testset"])
else:
    predictions = "To generate predictions, set prediction pred_type to top_n"

marvin_model = {
    "grid_search": gs,
    "model": algo,
    "predictions": predictions
}

Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the msd similarity matr

In [230]:
from surprise.model_selection import GridSearchCV
from surprise import SVD
from surprise import KNNBaseline

if params["algo"] == "KNNBaseline":
    algo = KNNBaseline(sim_options=params["sim_options"])
#algo = params["algo"](sim_options=params["sim_options"])
algo.fit(marvin_dataset["trainset"])


# Get the predictions for null values in the set
if params["prediction"]["pred_type"] == "top_n":
    predictions = algo.test(marvin_dataset["testset"])
else:
    predictions = "To generate predictions, set prediction pred_type to top_n"

marvin_model = {
    #"grid_search": gs,
    "model": algo,
    "predictions": predictions
}

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


# Metrics Evaluation

In [231]:
import pandas as pd
df_results = pd.DataFrame.from_dict(marvin_model["grid_search"].cv_results)

# combination of parameters that gave the best RMSE score
print("Best Model: {}".format([key + ": " + str(value) for (key,value) in marvin_model["grid_search"].best_params['rmse'].items()]))

# best RMSE score
print("Best RMSE: {}".format(marvin_model["grid_search"].best_score['rmse']))


df_results[['params', 'mean_test_mae', 'mean_test_rmse', 'mean_test_time']].sort_values('mean_test_rmse')

KeyError: 'grid_search'

# Prediction Preparator

In [232]:
input_message = {
    "User_id": 196,
    "Item_id": 302
}

In [233]:
from surprise import get_dataset_dir
import io
def read_item_names():
    """Read the u.item file from MovieLens 100-k dataset and return two
    mappings to convert raw ids into movie names and movie names into raw ids.
    """

    file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid

def get_top_n_for_user(predictions, userId, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        userId(str): Target User Id 
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''
    # First map the predictions to each user.
    user_predictions = [(iid, est) for (uid, iid, true_r, est, _) in predictions if uid == str(userId)]
    # Read the mappings raw id <-> movie name
    rid_to_name, name_to_rid = read_item_names()
    
    # Then sort the predictions for each user and retrieve the k highest ones.
    user_predictions.sort(key=lambda x: x[1], reverse=True)
    top_n = [(rid_to_name[pid], val)  for (pid, val) in user_predictions[:n]]

    return top_n

def get_top_neighbors(model, targetId, kind="Item", n=10):
    # Read the mappings raw id <-> movie name
    rid_to_name, name_to_rid = read_item_names()
    
    if kind=="Item":
        # Retrieve inner id of the movie Toy Story
        item_raw_id = name_to_rid['Toy Story (1995)']
        print(item_raw_id)
        item_inner_id = model["model"].trainset.to_inner_iid(item_raw_id)
        print(item_inner_id)
        # Retrieve inner ids of the nearest neighbors of Toy Story.
        item_neighbors = model["model"].get_neighbors(item_inner_id, k=n)

        # Convert inner ids of the neighbors into names.
        item_neighbors = (model["model"].trainset.to_raw_iid(inner_id)
                               for inner_id in item_neighbors)
        item_neighbors = (rid_to_name[rid]
                               for rid in item_neighbors)

        print()
        print('The 10 nearest neighbors of Toy Story are:')
        for movie in item_neighbors:
            print(movie)


    
pred = get_top_neighbors(marvin_model, 1, kind="Item", n=10)

1
24
()
The 10 nearest neighbors of Toy Story are:
Beauty and the Beast (1991)
Raiders of the Lost Ark (1981)
That Thing You Do! (1996)
Lion King, The (1994)
Craft, The (1996)
Liar Liar (1997)
Aladdin (1992)
Cool Hand Luke (1967)
Winnie the Pooh and the Blustery Day (1968)
Indiana Jones and the Last Crusade (1989)


In [163]:
# get a prediction for specific users and items.
pred = marvin_model["model"].predict(
    str(input_message["User_id"]), str(input_message["Item_id"]), r_ui=4, verbose=True)

user: 196        item: 302        r_ui = 4.00   est = 4.16   {u'actual_k': 39, u'was_impossible': False}


In [164]:
pred = get_top_n_for_user(
    marvin_model["predictions"], userId=input_message["User_id"], n=params["prediction"]["n_pred"])

ValueError: need more than 1 value to unpack

In [165]:
for i, res in enumerate(pred):
    print("{0}: {1}".format(i+1,res[0]))

1: 1
2: 3


TypeError: 'int' object has no attribute '__getitem__'

In [166]:
pred = get_top_neighbors(marvin_dataset, marvin_model, 1, kind="Item", n=10)

1
24
()
The 10 nearest neighbors of Toy Story are:
So Dear to My Heart (1949)
My Life and Times With Antonin Artaud (En compagnie d'Antonin Artaud) (1993)
Somebody to Love (1994)
Crows and Sparrows (1949)
Total Eclipse (1995)
Mr. Jones (1993)
Convent, The (Convento, O) (1995)
Incognito (1997)
Every Other Weekend (1990)
Homage (1995)


In [167]:
marvin_model

{'model': <surprise.prediction_algorithms.knns.KNNBaseline at 0x7f71b2010250>,
 'predictions': 'To generate predictions, set prediction pred_type to top_n'}

In [226]:
from surprise import accuracy
from surprise import KNNBaseline
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'pearson_baseline', 'user_based': False}
algo = KNNBaseline(sim_options=sim_options)
algo.fit(trainset)

# Read the mappings raw id <-> movie name
rid_to_name, name_to_rid = read_item_names()

# Retrieve inner id of the movie Toy Story
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Retrieve inner ids of the nearest neighbors of Toy Story.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)

# Convert inner ids of the neighbors into names.
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = ((rid, rid_to_name[rid])
                       for rid in toy_story_neighbors)
print()
print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)
    
testset = trainset.build_testset()
predictions = algo.test(testset)
# RMSE should be low as we are biased
accuracy.rmse(predictions, verbose=True)  # ~ 0.68 (which is low)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
()
The 10 nearest neighbors of Toy Story are:
(u'588', u'Beauty and the Beast (1991)')
(u'174', u'Raiders of the Lost Ark (1981)')
(u'845', u'That Thing You Do! (1996)')
(u'71', u'Lion King, The (1994)')
(u'928', u'Craft, The (1996)')
(u'294', u'Liar Liar (1997)')
(u'95', u'Aladdin (1992)')
(u'523', u'Cool Hand Luke (1967)')
(u'969', u'Winnie the Pooh and the Blustery Day (1968)')
(u'210', u'Indiana Jones and the Last Crusade (1989)')
RMSE: 0.4807


0.48071109787164656

In [225]:
from surprise import accuracy
from surprise import KNNBaseline
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
sim_options = {'name': 'cosine', 'user_based': False}
algo = KNNWithMeans(sim_options=sim_options)
algo.fit(trainset)

# Read the mappings raw id <-> movie name
rid_to_name, name_to_rid = read_item_names()

# Retrieve inner id of the movie Toy Story
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Retrieve inner ids of the nearest neighbors of Toy Story.
toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)

# Convert inner ids of the neighbors into names.
toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
                       for inner_id in toy_story_neighbors)
toy_story_neighbors = ((rid, rid_to_name[rid])
                       for rid in toy_story_neighbors)
print()
print('The 10 nearest neighbors of Toy Story are:')
for movie in toy_story_neighbors:
    print(movie)
    
testset = trainset.build_testset()
predictions = algo.test(testset)
# RMSE should be low as we are biased
accuracy.rmse(predictions, verbose=True)  # ~ 0.68 (which is low)

Computing the cosine similarity matrix...
Done computing similarity matrix.
()
The 10 nearest neighbors of Toy Story are:
(u'626', u'So Dear to My Heart (1949)')
(u'1332', u"My Life and Times With Antonin Artaud (En compagnie d'Antonin Artaud) (1993)")
(u'1334', u'Somebody to Love (1994)')
(u'1350', u'Crows and Sparrows (1949)')
(u'1260', u'Total Eclipse (1995)')
(u'1436', u'Mr. Jones (1993)')
(u'1342', u'Convent, The (Convento, O) (1995)')
(u'361', u'Incognito (1997)')
(u'1348', u'Every Other Weekend (1990)')
(u'1320', u'Homage (1995)')
RMSE: 0.7833


0.78332978015773791

In [297]:
from surprise import accuracy
from surprise import KNNBaseline
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()
#sim_options = {'name': 'cosine', 'user_based': False}
algo = SVD()
algo.fit(trainset)

# Read the mappings raw id <-> movie name
rid_to_name, name_to_rid = read_item_names()


    
testset = trainset.build_testset()
predictions = algo.test(testset)
# RMSE should be low as we are biased
accuracy.rmse(predictions, verbose=True)  # ~ 0.68 (which is low)

RMSE: 0.6789


0.67891984957172713

In [298]:
algo.default_prediction()

3.5298600000000002

In [300]:
algo.sim_options['user_based'] = False

In [301]:
sim = algo.compute_similarities()

In [330]:
sim

array([[ 1.        ,  0.47761194,  0.5       , ...,  0.5       ,
         0.5       ,  0.5       ],
       [ 0.47761194,  1.        ,  0.11363636, ...,  0.5       ,
         0.5       ,  0.5       ],
       [ 0.5       ,  0.11363636,  1.        , ...,  0.        ,
         0.        ,  0.        ],
       ..., 
       [ 0.5       ,  0.5       ,  0.        , ...,  1.        ,
         1.        ,  1.        ],
       [ 0.5       ,  0.5       ,  0.        , ...,  1.        ,
         1.        ,  1.        ],
       [ 0.5       ,  0.5       ,  0.        , ...,  1.        ,
         1.        ,  1.        ]])

In [302]:
sim.shape

(1682, 1682)

In [303]:
algo.sim_options

{u'user_based': False}

In [325]:
algo.qi#.shape

array([[-0.01344781, -0.18877988, -0.05081615, ...,  0.23417274,
        -0.03214978,  0.05739335],
       [ 0.25097753,  0.05450006,  0.18308459, ..., -0.16997462,
        -0.08567283,  0.17697557],
       [-0.21345191, -0.22525173, -0.16773123, ..., -0.03289561,
        -0.15182392,  0.05576241],
       ..., 
       [ 0.10187684, -0.09450286, -0.05308913, ...,  0.00173178,
        -0.21610139, -0.09103197],
       [ 0.05438501, -0.05268797,  0.10346942, ...,  0.25860461,
         0.13132637, -0.00351207],
       [ 0.00362663, -0.16417464,  0.07767464, ...,  0.10456627,
         0.04597677,  0.15818502]])

In [324]:
algo.bi#.shape

array([ 0.57284212,  0.73401274, -0.72939167, ...,  0.01660365,
        0.02172976,  0.00815445])

In [329]:
np.dot(algo.qi.T, algo.bi)

array([  4.16615227, -11.46032426,   9.7307177 ,  -4.07564577,
         2.36565003,  -2.46877541,  -5.69297119,  19.08683194,
        -1.24666333,  -1.34587118,   7.03237798,  -9.70966023,
        -8.20497566,  -3.76013903,  -0.38086554,  13.07566109,
        -7.76094279,   8.88517201,   1.41691281, -21.24812826,
        12.80708227,   0.65563962,  -5.87901845,   9.72469786,
        -0.19197726,  -3.81374514,  -6.18407944,  -5.05235358,
        16.61901663,  -7.82536638,  10.86437821,   0.48779208,
         8.0889516 ,   6.15273583,   9.25929054,   6.44486173,
         8.81877031,  -7.8497428 ,   7.45697181,  -8.97651882,
         8.97373707,   8.20836464,  12.19486235,  -6.69904092,
        16.2050095 ,  17.30825229, -18.88283358,  -3.54779538,
         4.20890555,   9.76682864,  13.37786359,   8.08136372,
        -0.06305101,   0.30058104,  -3.92139083,  -1.09542658,
       -10.8164049 ,   0.27290053,  -4.3067889 ,  -0.81976782,
         2.78873308,   7.72437312,   2.77352923,   5.46

In [261]:
toy_story_raw_id

u'1'

In [290]:
import numpy as np
row = sim[int(toy_story_raw_id)-1].copy()
sorted_list = np.argsort(np.array(row))[::-1][:10]
#return sorted_list with values
n_neighbors_list = [(rid_to_name[str(ix+1)], row[ix]) for ix in sorted_list]

In [291]:
n_neighbors_list

[(u'Toy Story (1995)', 1.0),
 (u'To Gillian on Her 37th Birthday (1996)', 1.0),
 (u'Love Jones (1997)', 1.0),
 (u'Saint of Fort Washington, The (1993)', 1.0),
 (u"Margaret's Museum (1995)", 1.0),
 (u'Sleepover (1995)', 1.0),
 (u'Chain Reaction (1996)', 1.0),
 (u'Fear (1996)', 1.0),
 (u'Above the Rim (1994)', 1.0),
 (u'Panther (1995)', 1.0)]

In [280]:
sorted_list

array([   0,  845,  871, 1466, 1465, 1459,  929,  974, 1439, 1437])

In [286]:
rid_to_name['1682']

u'Scream of Stone (Schrei aus Stein) (1991)'

In [268]:
row

array([ 1.        ,  0.47761194,  0.5       , ...,  0.5       ,
        0.5       ,  0.5       ])

In [260]:
# Retrieve inner id of the movie Toy Story
toy_story_raw_id = name_to_rid['Toy Story (1995)']
#toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Retrieve inner ids of the nearest neighbors of Toy Story.
#toy_story_neighbors = algo.get_neighbors(toy_story_inner_id, k=10)

# Convert inner ids of the neighbors into names.
#toy_story_neighbors = (algo.trainset.to_raw_iid(inner_id)
 #                      for inner_id in toy_story_neighbors)
#toy_story_neighbors = ((rid, rid_to_name[rid])
 #                      for rid in toy_story_neighbors)