<a href="https://colab.research.google.com/github/irislqy/recommendation_engine/blob/master/Movie_Recommendations_Algorithm.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Recommendations


### Install Surprise

In [0]:
# Install scikit-surprise, which provides the `surprise` package imported in the cells below (-q keeps pip's output short).
!pip install -q scikit-surprise

[K    100% |████████████████████████████████| 3.3MB 8.4MB/s 
[?25h  Building wheel for scikit-surprise (setup.py) ... [?25ldone
[?25h

In [0]:
import random

import numpy as np
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate

# Seed both RNGs before anything stochastic runs: the cross-validation folds
# and SVD's parameter initialisation are drawn at random, so without a seed
# every run of this cell reports different RMSE/MAE figures.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# Load the movielens-100k dataset (download it if needed).
data = Dataset.load_builtin('ml-100k')

# Use the SVD algorithm with its default parameters.
algo = SVD()

# Run 5-fold cross-validation. verbose=True prints the per-fold table; the
# returned dict of scores and timings is displayed as the cell's output.
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Dataset ml-100k could not be found. Do you want to download it? [Y/n] Y
Trying to download dataset from http://files.grouplens.org/datasets/movielens/ml-100k.zip...
Done! Dataset ml-100k has been saved to /root/.surprise_data/ml-100k
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9266  0.9344  0.9442  0.9360  0.9395  0.9362  0.0058  
MAE (testset)     0.7309  0.7382  0.7430  0.7403  0.7372  0.7379  0.0040  
Fit time          5.87    5.86    5.80    5.86    5.82    5.84    0.03    
Test time         0.15    0.23    0.14    0.23    0.14    0.18    0.04    


{'fit_time': (5.873595714569092,
  5.8569605350494385,
  5.801485538482666,
  5.8643107414245605,
  5.817120790481567),
 'test_mae': array([0.73090646, 0.73821335, 0.74301281, 0.74026596, 0.73723624]),
 'test_rmse': array([0.92661767, 0.93441805, 0.94424307, 0.93602463, 0.93946255]),
 'test_time': (0.14624261856079102,
  0.23040246963500977,
  0.14353108406066895,
  0.23046469688415527,
  0.1409289836883545)}

# Collaborative Filtering Recommendation Exploration
## KNN Exploration of MovieLens with Surprise

In [0]:
import io  # needed because of weird encoding of u.item file
from surprise import KNNBaseline
from surprise import Dataset
from surprise import get_dataset_dir

## Helper Function to Convert IDs to Names

In [0]:
def read_item_names(file_name=None):
    """Build id <-> title lookup tables from a MovieLens ``u.item`` file.

    Each line of the file is pipe-separated: the first field is the raw item
    id and the second field is the movie title. Lines with fewer than two
    fields (e.g. blank lines) are skipped.

    Args:
        file_name: Path of the ``u.item`` file to read. When omitted, the
            MovieLens 100k copy under Surprise's dataset directory is used
            (``get_dataset_dir() + '/ml-100k/ml-100k/u.item'``).

    Returns:
        A ``(rid_to_name, name_to_rid)`` tuple of dicts. ``rid_to_name`` maps
        raw id (str) to title; ``name_to_rid`` maps title to raw id (str).
        If the same title appears on several lines, ``name_to_rid`` keeps the
        id from the last such line.
    """
    if file_name is None:
        file_name = get_dataset_dir() + '/ml-100k/ml-100k/u.item'

    rid_to_name = {}
    name_to_rid = {}
    # The file is decoded as ISO-8859-1 (presumably it holds non-UTF-8 bytes).
    with io.open(file_name, 'r', encoding='ISO-8859-1') as f:
        for line in f:
            fields = line.rstrip('\r\n').split('|')
            if len(fields) < 2:
                # Blank or malformed line: there is no title to map, and
                # indexing fields[1] would raise IndexError.
                continue
            raw_id, name = fields[0], fields[1]
            rid_to_name[raw_id] = name
            name_to_rid[name] = raw_id

    return rid_to_name, name_to_rid

## Train KNN-based model

In [0]:
# Configure an item-item KNN model: similarities are computed between items
# (user_based=False) with the pearson_baseline measure.
sim_options = dict(name='pearson_baseline', user_based=False)
algo = KNNBaseline(sim_options=sim_options)

# Build the training set from the whole ml-100k dataset.
data = Dataset.load_builtin('ml-100k')
trainset = data.build_full_trainset()

# Fit the model; the item similarity matrix is computed during this call.
algo.fit(trainset)



Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBaseline at 0x7f2eb0fd2908>

## Recommendations

In [0]:
# Read the mappings raw id <-> movie name
rid_to_name, name_to_rid = read_item_names()

# The trained model indexes items by its own "inner" ids, so translate
# Toy Story's raw id into the matching inner id first.
toy_story_raw_id = name_to_rid['Toy Story (1995)']
toy_story_inner_id = algo.trainset.to_inner_iid(toy_story_raw_id)

# Inner ids of the 10 nearest neighbors of Toy Story.
neighbor_inner_ids = algo.get_neighbors(toy_story_inner_id, k=10)

# Convert inner ids -> raw ids -> movie names. Lists are used instead of
# generators so the result can be displayed here and re-read by later cells
# (a generator would be empty after a single pass).
neighbor_raw_ids = [algo.trainset.to_raw_iid(inner_id)
                    for inner_id in neighbor_inner_ids]
toy_story_neighbors = [rid_to_name[rid] for rid in neighbor_raw_ids]

# Display the recommended titles as the cell's output.
toy_story_neighbors


In [0]:
# Inspect the raw return value of read_item_names(): the (rid_to_name, name_to_rid) pair of dicts.
# NOTE(review): this echoes both complete mappings into the cell output; consider showing only a few entries.
read_item_names()

({'1': 'Toy Story (1995)',
  '2': 'GoldenEye (1995)',
  '3': 'Four Rooms (1995)',
  '4': 'Get Shorty (1995)',
  '5': 'Copycat (1995)',
  '6': 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)',
  '7': 'Twelve Monkeys (1995)',
  '8': 'Babe (1995)',
  '9': 'Dead Man Walking (1995)',
  '10': 'Richard III (1995)',
  '11': 'Seven (Se7en) (1995)',
  '12': 'Usual Suspects, The (1995)',
  '13': 'Mighty Aphrodite (1995)',
  '14': 'Postino, Il (1994)',
  '15': "Mr. Holland's Opus (1995)",
  '16': 'French Twist (Gazon maudit) (1995)',
  '17': 'From Dusk Till Dawn (1996)',
  '18': 'White Balloon, The (1995)',
  '19': "Antonia's Line (1995)",
  '20': 'Angels and Insects (1995)',
  '21': 'Muppet Treasure Island (1996)',
  '22': 'Braveheart (1995)',
  '23': 'Taxi Driver (1976)',
  '24': 'Rumble in the Bronx (1995)',
  '25': 'Birdcage, The (1996)',
  '26': 'Brothers McMullen, The (1995)',
  '27': 'Bad Boys (1995)',
  '28': 'Apollo 13 (1995)',
  '29': 'Batman Forever (1995)',
  '30': 'Belle de jour