##### source: https://towardsdatascience.com/prototyping-a-recommender-system-step-by-step-part-1-knn-item-based-collaborative-filtering

In [None]:
# import sys
# !{sys.executable} -m pip install numpy pandas matplotlib sklearn seaborn
# !{sys.executable} -m pip install fuzzywuzzy

In [110]:
import pprint
pp = pprint.PrettyPrinter(indent=2)

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import warnings
import time

warnings.filterwarnings('ignore')
%matplotlib inline

##### Loading in data

In [111]:
def load_ratings():
    COL_NAME = ['uid','mid','rating','timestamp']
    df = pd.read_csv('data/movie-ratings.dat',sep='::', header=None, engine='python', names=COL_NAME)
    return df

def load_movies():
    COL_NAME = ['mid','movie_name','movie_genre']
    df = pd.read_csv('data/movie-movies.dat',sep='::', header=None, engine='python', names=COL_NAME)
    return df

def load_users():
    COL_NAME = ['uid','user_fea1','user_fea2','user_fea3','user_fea4']
    df = pd.read_csv('data/movie-users.dat',sep='::', header=None, engine='python', names=COL_NAME)
    return df

def text2seq(text, n_genre):
    """ using tokenizer to encoded the multi-level categorical feature """
    tokenizer = Tokenizer(lower=True, split='|',filters='', num_words=n_genre)
    tokenizer.fit_on_texts(text)
    seq = tokenizer.texts_to_sequences(text)
    seq = pad_sequences(seq, maxlen=3,padding='post')
    return seq

In [112]:
users = load_users()
users.head()

Unnamed: 0,uid,user_fea1,user_fea2,user_fea3,user_fea4
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [113]:
users.shape

(6040, 5)

In [114]:
movies = load_movies()
movies.head()

Unnamed: 0,mid,movie_name,movie_genre
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


In [115]:
movies.shape

(3883, 3)

In [116]:
ratings = load_ratings()
ratings.head()

Unnamed: 0,uid,mid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [117]:
ratings.shape

(1000209, 4)

In [118]:
from scipy.sparse import csr_matrix

# pivot ratings into movie features
movie_features = ratings.pivot(index='mid', columns='uid', values='rating').fillna(0)
movie_features.head()

uid,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,5.0,5.0,...,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [119]:
# convert dataframe of movie features to scipy sparse matrix
mat_movie_features = csr_matrix(movie_features.values)

In [120]:
# filter data
movies_cnt = pd.DataFrame(ratings.groupby('mid').size(), columns=['count'])
movies_cnt.head()

Unnamed: 0_level_0,count
mid,Unnamed: 1_level_1
1,2077
2,701
3,478
4,170
5,296


In [121]:
popular_movies = list(set(movies_cnt.query('count >= 1').index))  # noqa
# pp.pprint(popular_movies)
movies_filter = ratings.mid.isin(popular_movies).values
movies_filter

array([ True,  True,  True, ...,  True,  True,  True])

In [122]:
users_cnt = pd.DataFrame(ratings.groupby('uid').size(), columns=['count'])
users_cnt.head()

Unnamed: 0_level_0,count
uid,Unnamed: 1_level_1
1,53
2,129
3,51
4,21
5,198


In [123]:
active_users = list(set(users_cnt.query('count >= 1').index))  # noqa
# pp.pprint(active_users)
users_filter = ratings.uid.isin(active_users).values
users_filter

array([ True,  True,  True, ...,  True,  True,  True])

In [124]:
ratings_filtered = ratings[movies_filter & users_filter]
ratings_filtered.head()

Unnamed: 0,uid,mid,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [125]:
# pivot and create movie-user matrix
movie_user_mat = ratings_filtered.pivot(index='mid', columns='uid', values='rating').fillna(0)
movie_user_mat.head()

uid,1,2,3,4,5,6,7,8,9,10,...,6031,6032,6033,6034,6035,6036,6037,6038,6039,6040
mid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,4.0,0.0,4.0,5.0,5.0,...,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,2.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [126]:
# create mapper from movie title to index
hashmap = {  movie: i for i, movie in enumerate(list(movies.set_index('mid').loc[movie_user_mat.index].movie_name)) }
hashmap

{'Toy Story (1995)': 0,
 'Jumanji (1995)': 1,
 'Grumpier Old Men (1995)': 2,
 'Waiting to Exhale (1995)': 3,
 'Father of the Bride Part II (1995)': 4,
 'Heat (1995)': 5,
 'Sabrina (1995)': 6,
 'Tom and Huck (1995)': 7,
 'Sudden Death (1995)': 8,
 'GoldenEye (1995)': 9,
 'American President, The (1995)': 10,
 'Dracula: Dead and Loving It (1995)': 11,
 'Balto (1995)': 12,
 'Nixon (1995)': 13,
 'Cutthroat Island (1995)': 14,
 'Casino (1995)': 15,
 'Sense and Sensibility (1995)': 16,
 'Four Rooms (1995)': 17,
 'Ace Ventura: When Nature Calls (1995)': 18,
 'Money Train (1995)': 19,
 'Get Shorty (1995)': 20,
 'Copycat (1995)': 21,
 'Assassins (1995)': 22,
 'Powder (1995)': 23,
 'Leaving Las Vegas (1995)': 24,
 'Othello (1995)': 25,
 'Now and Then (1995)': 26,
 'Persuasion (1995)': 27,
 'City of Lost Children, The (1995)': 28,
 'Shanghai Triad (Yao a yao yao dao waipo qiao) (1995)': 29,
 'Dangerous Minds (1995)': 30,
 'Twelve Monkeys (1995)': 31,
 'Wings of Courage (1995)': 32,
 'Babe (1995)'

In [127]:
# transform matrix to scipy sparse matrix
movie_user_mat_sparse = csr_matrix(movie_user_mat.values)

In [128]:
from sklearn.neighbors import NearestNeighbors

model = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=20, n_jobs=-1)
model.fit(mat_movie_features)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
                 metric_params=None, n_jobs=-1, n_neighbors=20, p=2,
                 radius=1.0)

In [137]:
# utils import
from fuzzywuzzy import fuzz

def fuzzy_matching(hashmap, fav_movie):
    """
        return the closest match via fuzzy ratio.
        If no match found, return None
        Parameters
        ----------
        hashmap: dict, map movie title name to index of the movie in data
        fav_movie: str, name of user input movie
        Return
        ------
        index of the closest match
    """
    match_tuple = []
    # get match
    for title, idx in hashmap.items():
        ratio = fuzz.ratio(title.lower(), fav_movie.lower())
        if ratio >= 60:
            match_tuple.append((title, idx, ratio))
    # sort
    match_tuple = sorted(match_tuple, key=lambda x: x[2])[::-1]
    if not match_tuple:
        return -1
    else:
        print('Found possible matches in our database: {0}\n'.format([x[0] for x in match_tuple]))
        return match_tuple[0][1]

In [149]:
def inference_movie(idx):
    # inference
    """
        return top n similar movie recommendations based on user's input movie
        ------
        list of top n similar movie recommendations
    """
    n_recommendations = 10
    print('Recommendation system start to make inference')
    print('......\n')
    t0 = time.time()
    distances, indices = model.kneighbors(mat_movie_features[idx], n_neighbors=n_recommendations+1)

    # get list of raw idx of recommendations
    raw_recommends = \
                sorted(
                    list(
                        zip(
                            indices.squeeze().tolist(),
                            distances.squeeze().tolist()
                        )
                    ),
                    key=lambda x: x[1]
                )[:0:-1]
    print('It took my system {:.2f}s to make inference \n'.format(time.time() - t0))

    # print results
    reverse_hashmap = {v: k for k, v in hashmap.items()}
    print('Recommendations for {}:'.format(fav_movie))
    for i, (idx, dist) in enumerate(raw_recommends):
        print('{0}: {1}, with distance of {2}'.format(i+1, reverse_hashmap[idx], dist))

fav_movie = 'Home Alone'
print('You have input movie:', fav_movie)

# get input movie index
idx = fuzzy_matching(hashmap, fav_movie)
if idx > 0:
    inference_movie(idx)
else:
    print('Oops! No match is found')

You have input movie: Home Alone
Found possible matches in our database: ['Home Alone (1990)', 'Home Alone 3 (1997)']

Recommendation system start to make inference
......

It took my system 0.11s to make inference 

Recommendations for Home Alone:
1: Sleepless in Seattle (1993), with distance of 0.5436423264407959
2: Robin Hood: Men in Tights (1993), with distance of 0.5410273023889198
3: Pretty Woman (1990), with distance of 0.5398136671020306
4: Dumb & Dumber (1994), with distance of 0.5330284245735715
5: Ace Ventura: Pet Detective (1994), with distance of 0.5309142829723836
6: Mighty Ducks, The (1992), with distance of 0.5152500739446273
7: Sister Act (1992), with distance of 0.4999993447115193
8: Liar Liar (1997), with distance of 0.48879571169174985
9: Mrs. Doubtfire (1993), with distance of 0.4720466900528837
10: Home Alone 2: Lost in New York (1992), with distance of 0.4246803937327782
