In [1]:
import warnings
warnings.filterwarnings('ignore')

import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# The ratings file is tab-separated and has no header row, so the column
# names are supplied explicitly.
column_names = ['user_id', 'item_id', 'rating', 'timestamp']
df = pd.read_csv(
    'u.data',
    sep='\t',
    names=column_names,
)
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,0,50,5,881250949
1,0,172,5,881250949
2,0,133,1,881250949
3,196,242,3,881250949
4,186,302,3,891717742


In [3]:
# Lookup table from item_id to movie title, read with read_csv defaults
# (comma-separated, first row used as the header).
movie_titles = pd.read_csv("Movie_Id_Titles")
movie_titles.head()

Unnamed: 0,item_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)


In [4]:
# Attach the title to every rating row (inner join on item_id).
# NOTE: this rebinds `df`, so running the cell a second time merges the
# titles again on top of the already-merged frame.
df = df.merge(movie_titles, on='item_id')
df.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title
0,0,50,5,881250949,Star Wars (1977)
1,290,50,5,880473582,Star Wars (1977)
2,79,50,4,891271545,Star Wars (1977)
3,2,50,5,888552084,Star Wars (1977)
4,8,50,5,879362124,Star Wars (1977)


In [5]:
# Highest average rating per title, first five rows.
(
    df.groupby('title')['rating']
    .mean()
    .sort_values(ascending=False)
    .head()
)

title
Marlene Dietrich: Shadow and Light (1996)     5.0
Prefontaine (1997)                            5.0
Santa with Muscles (1996)                     5.0
Star Kid (1997)                               5.0
Someone Else's America (1995)                 5.0
Name: rating, dtype: float64

In [6]:
# Titles with the largest number of ratings, first five rows.
(
    df.groupby('title')['rating']
    .count()
    .sort_values(ascending=False)
    .head()
)

title
Star Wars (1977)             584
Contact (1997)               509
Fargo (1996)                 508
Return of the Jedi (1983)    507
Liar Liar (1997)             485
Name: rating, dtype: int64

In [7]:
# One row per title holding its mean rating; the single column keeps the
# name 'rating'.
ratings = df.groupby('title')['rating'].mean().to_frame()
ratings.head()

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
'Til There Was You (1997),2.333333
1-900 (1994),2.6
101 Dalmatians (1996),2.908257
12 Angry Men (1957),4.344
187 (1997),3.02439


In [8]:
# Number of ratings per title, aligned to `ratings` through the shared
# title index.
ratings['num of ratings'] = df.groupby('title')['rating'].count()
ratings.head()

Unnamed: 0_level_0,rating,num of ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'Til There Was You (1997),2.333333,9
1-900 (1994),2.6,5
101 Dalmatians (1996),2.908257,109
12 Angry Men (1957),4.344,125
187 (1997),3.02439,41


In [9]:
# Movie-user matrix: one row per item_id, one column per user_id, cell = rating.
# Pairs without a rating are filled with 0.
moviemat = (
    df.pivot_table(index='item_id', columns='user_id', values='rating')
    .fillna(0)
)
print(moviemat.head())


user_id  0    1    2    3    4    5    6    7    8    9    ...  934  935  936  \
item_id                                                    ...                  
1        0.0  5.0  4.0  0.0  0.0  4.0  4.0  0.0  0.0  0.0  ...  2.0  3.0  4.0   
2        0.0  3.0  0.0  0.0  0.0  3.0  0.0  0.0  0.0  0.0  ...  4.0  0.0  0.0   
3        0.0  4.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  4.0   
4        0.0  3.0  0.0  0.0  0.0  0.0  0.0  5.0  0.0  0.0  ...  5.0  0.0  0.0   
5        0.0  3.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0   

user_id  937  938  939  940  941  942  943  
item_id                                     
1        0.0  4.0  0.0  0.0  5.0  0.0  0.0  
2        0.0  0.0  0.0  0.0  0.0  0.0  5.0  
3        0.0  0.0  0.0  0.0  0.0  0.0  0.0  
4        0.0  0.0  0.0  2.0  0.0  0.0  0.0  
5        0.0  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 944 columns]


In [10]:
# (number of item_id rows, number of user_id columns) of the pivoted matrix
print(moviemat.shape)

(1682, 944)


In [11]:
# Map each movie title to the positional row index it has in `moviemat`.
# Titles are looked up in the same order as moviemat's item_id index.
# NOTE(review): titles are dict keys, so if two item_ids share a title only
# the last position is kept - confirm whether the data contains duplicates.
movie_to_idx = {
    title: position
    for position, title in enumerate(
        movie_titles.set_index('item_id').loc[moviemat.index, 'title']
    )
}
movie_to_idx

{'Toy Story (1995)': 0,
 'GoldenEye (1995)': 1,
 'Four Rooms (1995)': 2,
 'Get Shorty (1995)': 3,
 'Copycat (1995)': 4,
 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)': 5,
 'Twelve Monkeys (1995)': 6,
 'Babe (1995)': 7,
 'Dead Man Walking (1995)': 8,
 'Richard III (1995)': 9,
 'Seven (Se7en) (1995)': 10,
 'Usual Suspects, The (1995)': 11,
 'Mighty Aphrodite (1995)': 12,
 'Postino, Il (1994)': 13,
 "Mr. Holland's Opus (1995)": 14,
 'French Twist (Gazon maudit) (1995)': 15,
 'From Dusk Till Dawn (1996)': 16,
 'White Balloon, The (1995)': 17,
 "Antonia's Line (1995)": 18,
 'Angels and Insects (1995)': 19,
 'Muppet Treasure Island (1996)': 20,
 'Braveheart (1995)': 21,
 'Taxi Driver (1976)': 22,
 'Rumble in the Bronx (1995)': 23,
 'Birdcage, The (1996)': 24,
 'Brothers McMullen, The (1995)': 25,
 'Bad Boys (1995)': 26,
 'Apollo 13 (1995)': 27,
 'Batman Forever (1995)': 28,
 'Belle de jour (1967)': 29,
 'Crimson Tide (1995)': 30,
 'Crumb (1994)': 31,
 'Desperado (1995)': 32,
 'Doom 

In [12]:
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors

In [13]:
# transform matrix to scipy sparse matrix
# Store the dense movie-user array in CSR form so the zero-filled
# (unrated) cells are not kept explicitly.
movie_user_mat_sparse = csr_matrix(moviemat.to_numpy())
movie_user_mat_sparse

<1682x944 sparse matrix of type '<class 'numpy.float64'>'
	with 100003 stored elements in Compressed Sparse Row format>

In [14]:
# utils import
from fuzzywuzzy import fuzz

In [15]:
# define model: brute-force nearest-neighbour search with cosine distance,
# using all available cores (n_jobs=-1); n_neighbors=20 is the default used
# when a query does not pass its own n_neighbors
model_knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
# fit on the sparse movie-user matrix (one row per movie)
model_knn.fit(movie_user_mat_sparse)

NearestNeighbors(algorithm='brute', metric='cosine', n_jobs=-1, n_neighbors=20)

In [16]:
def fuzzy_matching(mapper, fav_movie, verbose=True, threshold=60):
    """
    return the index of the title that best matches the user's input.
    Titles are compared case-insensitively with fuzz.ratio; if no title
    reaches the threshold, a notice is printed and None is returned

    Parameters
    ----------
    mapper: dict, map movie title name to index of the movie in data

    fav_movie: str, name of user input movie

    verbose: bool, print the list of candidate titles if True

    threshold: int, minimum fuzz.ratio score a title needs in order to be
        kept as a candidate (defaults to 60, the value that used to be
        hard-coded)

    Return
    ------
    index of the closest match, or None if no title reaches the threshold
    """
    # lower-case the query once instead of once per title
    query = fav_movie.lower()
    candidates = []
    # get match
    for title, idx in mapper.items():
        ratio = fuzz.ratio(title.lower(), query)
        if ratio >= threshold:
            candidates.append((title, idx, ratio))
    # best score first; an ascending stable sort followed by a reversal keeps
    # the original tie order (among equal scores the later mapper entry wins)
    candidates.sort(key=lambda match: match[2])
    candidates.reverse()
    if not candidates:
        print('Oops! No match is found')
        return None
    if verbose:
        print('Found possible matches in our database: {0}\n'.format([match[0] for match in candidates]))
    return candidates[0][1]



def make_recommendation(model_knn, data, mapper, fav_movie, n_recommendations):
    """
    print the top n movies most similar to the user's input movie

    Parameters
    ----------
    model_knn: sklearn model, knn model

    data: movie-user matrix

    mapper: dict, map movie title name to index of the movie in data

    fav_movie: str, name of user input movie

    n_recommendations: int, top n recommendations

    Return
    ------
    None. The recommendations are printed, ordered from the farthest to the
    closest of the n nearest neighbours (the most similar movie is printed
    last). If fav_movie cannot be matched to a title, nothing is recommended
    """
    # fit (repeated on every call so the model always matches `data`)
    model_knn.fit(data)
    # get input movie index
    print('You have input movie:', fav_movie)
    movie_idx = fuzzy_matching(mapper, fav_movie, verbose=True)
    if movie_idx is None:
        # fuzzy_matching has already reported the miss; there is no row to query
        return
    # inference
    print('Recommendation system start to make inference')
    print('......\n')
    # ask for one extra neighbour because the query movie is normally its own
    # nearest neighbour
    distances, indices = model_knn.kneighbors(data[movie_idx], n_neighbors=n_recommendations+1)
    # drop the query movie by index rather than by position: with ties at the
    # same distance it is not guaranteed to come first. ravel() keeps the
    # result a 1-D sequence even when only a single neighbour is returned
    neighbours = [
        (neighbour_idx, dist)
        for neighbour_idx, dist in zip(indices.ravel().tolist(), distances.ravel().tolist())
        if neighbour_idx != movie_idx
    ]
    neighbours.sort(key=lambda pair: pair[1])
    # keep the n closest, then list them farthest-first as before
    raw_recommends = neighbours[:n_recommendations][::-1]
    # get reverse mapper
    reverse_mapper = {v: k for k, v in mapper.items()}
    # print recommendations
    print('Recommendations for {}:'.format(fav_movie))
    for rank, (rec_idx, dist) in enumerate(raw_recommends, start=1):
        # mapper is keyed by title, so a row index can be absent from the
        # reverse mapping when several rows share one title
        title = reverse_mapper.get(rec_idx, '<unknown title, index {}>'.format(rec_idx))
        print('{0}: {1}, with distance of {2}'.format(rank, title, dist))

In [17]:
my_favorite = 'Toy Story'

# keyword arguments listed in the order of the function signature
make_recommendation(
    model_knn=model_knn,
    data=movie_user_mat_sparse,
    mapper=movie_to_idx,
    fav_movie=my_favorite,
    n_recommendations=10,
)

You have input movie: Toy Story
Found possible matches in our database: ['Toy Story (1995)']

Recommendation system start to make inference
......

Recommendations for Toy Story:
1: Raiders of the Lost Ark (1981), with distance of 0.3776175050042344
2: Jerry Maguire (1996), with distance of 0.37592529851886347
3: Fargo (1996), with distance of 0.36939923923861095
4: Star Trek: First Contact (1996), with distance of 0.3632726854037742
5: Willy Wonka and the Chocolate Factory (1971), with distance of 0.3618423665130154
6: Mission: Impossible (1996), with distance of 0.3586782396588516
7: Rock, The (1996), with distance of 0.33544521129158
8: Independence Day (ID4) (1996), with distance of 0.31021439592414524
9: Return of the Jedi (1983), with distance of 0.30007502870792213
10: Star Wars (1977), with distance of 0.26622322826178724


In [18]:
my_favorite = 'batman'

# keyword arguments listed in the order of the function signature
make_recommendation(
    model_knn=model_knn,
    data=movie_user_mat_sparse,
    mapper=movie_to_idx,
    fav_movie=my_favorite,
    n_recommendations=10,
)

You have input movie: batman
Found possible matches in our database: ['Batman (1989)']

Recommendation system start to make inference
......

Recommendations for batman:
1: Empire Strikes Back, The (1980), with distance of 0.3561644212720909
2: Indiana Jones and the Last Crusade (1989), with distance of 0.3509174945485136
3: Batman Forever (1995), with distance of 0.3436320608215734
4: True Lies (1994), with distance of 0.33560720799637955
5: Speed (1994), with distance of 0.33463389167836677
6: Die Hard 2 (1990), with distance of 0.33024592599604097
7: Top Gun (1986), with distance of 0.31983252759281455
8: Jurassic Park (1993), with distance of 0.3126321455063157
9: Die Hard: With a Vengeance (1995), with distance of 0.3053461989564902
10: Batman Returns (1992), with distance of 0.290836250228915


In [19]:
# total number of cells in the movie-user matrix (rows * columns)
num_entries = moviemat.size
# number of cells holding 0, i.e. movie/user pairs without a rating
num_zeros = (moviemat == 0).sum().sum()
# share of zero cells among all cells
ratio_zeros = num_zeros / num_entries
print('There is about {:.2%} of ratings in our data is missing'.format(ratio_zeros))

There is about 93.70% of ratings in our data is missing
