# Recommendation System Notebook Sample

0. [Params](#Params)
1. [Acquisitor and Cleaner](#Acquisitor-and-Cleaner)
2. [Training Preparator](#Training-Preparator)
3. [Trainer](#Trainer)
4. [Metrics Evaluator](#Metrics-Evaluator)
5. [Prediction Preparator](#Prediction-Preparator)
6. [Predictor](#Predictor)
7. [Feedback](#Feedback)
8. [Sample Application](#Sample-Application)

# Recommendation System

In [21]:
import marvin_recommendation_system_engine

# Params

In [22]:
params = {
    "algo": [
        {
            "name": "SVD",
            "param_grid": {
                "n_epochs": [10], 
                "lr_all": [0.005],
                "reg_all": [0.6]   
            }
        },
        {
            "name": "BaselineOnly",
            "param_grid": {
                "bsl_options": {
                    "method": ["als"],
                    "n_epochs": [5],
                    "reg_u": [12],
                    "reg_i": [5]
               }

            },
            "full_name": "BaselineOnlyALS"
        },
        {
            "name": "BaselineOnly",
            "param_grid": {
                "bsl_options": {
                    "method": ["sgd"],
                    "learning_rate": [.00005]
               }

            },
            "full_name": "BaselineOnlySGD"

        },
        {
            "name": "KNNBasic",
            "param_grid": {
                "k": [20],
                "sim_options": {
                    "name": ["msd", "cosine", "pearson_baseline"],
                    "min_support": [5],
                    "user_based": ["False"]
                }
            },
            "full_name": "KNNBasicItem"
        },
        {
            "name": "KNNBasic",
            "param_grid": {
                "k": [20],
                "sim_options": {
                    "name": ["msd", "cosine", "pearson_baseline"],
                    "min_support": [5],
                    "user_based": ["True"]
                }
            },
            "full_name": "KNNBasicUser"
        },
        {
            "name": "KNNWithMeans",
            "param_grid": {
                "k": [20],
                "sim_options": {
                    "name": ["msd", "cosine", "pearson_baseline"],
                    "min_support": [5],
                    "user_based": ["False"]
                }
            }
        }
            
            
    ],
    "measures": ["rmse", "mae"],
    "n_cv": 3,
    
    "prediction": {
        "pred_type": "all",
        "n_pred": 10
    }
}

# Aquisitor and Cleaner

In [23]:
from surprise import Dataset

In [24]:
# Load the movielens-100k dataset (download it if needed),
data = Dataset.load_builtin('ml-100k')

marvin_initial_dataset = {
    "data": data
}

# Training Preparator

In [25]:
trainset = marvin_initial_dataset["data"].build_full_trainset()
print(trainset.global_mean)
testset = trainset.build_anti_testset()
print(testset[0])
marvin_dataset = {
    "data": marvin_initial_dataset["data"],
    "trainset": trainset,
    "testset": testset
}

3.52986
(u'196', u'302', 3.5298600000000002)


# Trainer

In [26]:
from surprise.model_selection import GridSearchCV
from surprise import SVD
from surprise import KNNBaseline
from surprise import KNNBasic
from surprise import BaselineOnly
from surprise import KNNWithMeans

In [None]:
algo_dict = {"SVD": SVD, "KNNBaseline": KNNBaseline, "KNNBasic": KNNBasic, "BaselineOnly": BaselineOnly, "KNNWithMeans": KNNWithMeans}

model_dict = {}

for algo in params["algo"]:
    
    print(algo)
    
    # Get Name and Initiate Algorithm
    algo_name = algo["name"]

    if algo.get("full_name", False):
        full_name = algo["full_name"]
    else:
        full_name = algo_name
    
    model_dict[full_name] = {}
        
    # Initialize Gridsearch
    gs = GridSearchCV(
        algo_dict[algo_name],
        algo["param_grid"],
        measures=params["measures"],
        cv=params["n_cv"])
    
    gs.fit(marvin_dataset["data"])
    
    # We can now use the algorithm that yields the best rmse:
    best_algo = gs.best_estimator['rmse']
    best_algo.fit(marvin_dataset["trainset"])

    # Get the predictions for null values in the set
    model_dict[full_name]["grid_search"] = gs
    model_dict[full_name]["model"] = best_algo
    
marvin_model = model_dict

{'param_grid': {'lr_all': [0.005], 'reg_all': [0.6], 'n_epochs': [10]}, 'name': 'SVD'}


# Metrics Evaluator

In [None]:
import pandas as pd
from surprise import accuracy

In [26]:
metrics_dict = {}

for algo in params["algo"]:
    
    algo_name = algo["name"]

    if algo.get("full_name", False):
        full_name = algo["full_name"]
    else:
        full_name = algo_name
    print(full_name)
    
    metrics_dict[full_name] = {}
    
    # combination of parameters that gave the best RMSE score
    best_model = [key + ": " + str(value) for (key,value) in marvin_model[full_name]["grid_search"].best_params['rmse'].items()]
    #print("Best Model: {}".format(best_model))

    # best RMSE score
    train_rmse = marvin_model[full_name]["grid_search"].best_score['rmse']
    #print("Train RMSE: {}".format(train_rmse))
    
    # Prediction Score
    # Train the algorithm on the trainset, and predict ratings for the testset
    predictions = marvin_model[full_name]["model"].test(marvin_dataset["testset"])
    #print(len(predictions))
    # Then compute RMSE
    test_rmse = accuracy.rmse(predictions, verbose=False)
    #print("Test Set Score: {}".format(test_rmse))
    
    metrics_dict[full_name]["best_model"] = best_model
    metrics_dict[full_name]["train_rmse"] = train_rmse
    #metrics_dict[full_name]["predictions"] = predictions
    metrics_dict[full_name]["test_rmse"] = test_rmse


marvin_metrics = metrics_dict

SVD
BaselineOnlyALS
BaselineOnlySGD
KNNBasicItem
KNNBasicUser
KNNWithMeans


In [30]:
pd.DataFrame(marvin_metrics).T.sort_values('test_rmse')

Unnamed: 0,best_model,test_rmse,train_rmse
BaselineOnlySGD,"[bsl_options: {'learning_rate': 5e-05, 'method...",0.0657408,1.08465
SVD,"[lr_all: 0.005, reg_all: 0.6, n_epochs: 10]",0.364537,0.974032
BaselineOnlyALS,"[bsl_options: {'n_epochs': 5, 'reg_i': 5, 'met...",0.578061,0.943989
KNNWithMeans,"[sim_options: {'min_support': 5, 'name': 'pear...",0.822605,0.952561
KNNBasicItem,"[sim_options: {'min_support': 5, 'name': 'msd'...",0.925392,0.982872
KNNBasicUser,"[sim_options: {'min_support': 5, 'name': 'msd'...",0.925392,0.984469


In [31]:
from collections import defaultdict
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each user.'''

    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1

        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1

    return precisions, recalls

#predictions = marvin_model["model"].test(marvin_dataset["testset"])
#precisions, recalls = precision_recall_at_k(predictions, k=5, threshold=4)

# Precision and recall can then be averaged over all users
#print(sum(prec for prec in precisions.values()) / float(len(precisions)))
#print(sum(rec for rec in recalls.values()) / float(len(recalls)))

# Prediction Preparator

In [14]:
input_message = {
    "User_id": 196,
    "Item_id": 302
}

In [15]:
# get a prediction for specific users and items.
pred_dict = {}

for algo in params["algo"]:
    
    # Get Name and Initiate Algorithm
    algo_name = algo["name"]

    if algo.get("full_name", False):
        full_name = algo["full_name"]
    else:
        full_name = algo_name
    
    pred_dict[full_name] = marvin_model[full_name]["model"].predict(
        str(input_message["User_id"]), str(input_message["Item_id"]), r_ui=4, verbose=False)[3]
                                           

final_prediction = pred_dict

In [16]:
final_prediction

{'BaselineOnlyALS': 4.1972649922983392,
 'BaselineOnlySGD': 3.6962422165779385,
 'KNNBasicItem': 4.0148182865364941,
 'KNNBasicUser': 4.0148182865364941,
 'KNNWithMeans': 4.2384236870323271,
 'SVD': 3.9518854712543905}

# Further Evaluation

In [None]:
from surprise import get_dataset_dir
import io
def read_item_names():
    """Read the u.item file from MovieLens 100-k dataset and return two
    mappings to convert raw ids into movie names and movie names into raw ids.
    """

    file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'
    rid_to_name = {}
    name_to_rid = {}
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            line = line.split('|')
            rid_to_name[line[0]] = line[1]
            name_to_rid[line[1]] = line[0]

    return rid_to_name, name_to_rid

def get_top_n_for_user(predictions, userId, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        userId(str): Target User Id 
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''
    # First map the predictions to each user.
    user_predictions = [(iid, est) for (uid, iid, true_r, est, _) in predictions if uid == str(userId)]
    # Read the mappings raw id <-> movie name
    rid_to_name, name_to_rid = read_item_names()
    
    # Then sort the predictions for each user and retrieve the k highest ones.
    user_predictions.sort(key=lambda x: x[1], reverse=True)
    top_n = [(rid_to_name[pid], val)  for (pid, val) in user_predictions[:n]]

    return top_n

def get_top_neighbors(model, targetId, kind="Item", n=10):
    # Read the mappings raw id <-> movie name
    rid_to_name, name_to_rid = read_item_names()
    
    # Retrieve inner id of the movie Toy Story
    item_raw_id = name_to_rid['Toy Story (1995)']
    item_inner_id = model["model"].trainset.to_inner_iid(item_raw_id)
    # Retrieve inner ids of the nearest neighbors of Toy Story.
    item_neighbors = model["model"].get_neighbors(item_inner_id, k=n)

    # Convert inner ids of the neighbors into names.
    item_neighbors = (model["model"].trainset.to_raw_iid(inner_id)
                           for inner_id in item_neighbors)
    item_neighbors = (rid_to_name[rid]
                           for rid in item_neighbors)

    print('The 10 nearest neighbors of Toy Story are:')
    for movie in item_neighbors:
        print(movie)

In [None]:
get_top_n_for_user()

In [None]:
get_top_neighbors(marvin_model["KNNBaseline"], 1, n=10)