# Basic Recommendation on Movielens Dataset

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime as dt
%matplotlib inline
sns.set_style('white')

## Load Dataset

In [2]:
col_names = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv(r'C:\Users\Yukawa\datasets\ml-latest-small\ratings.csv', sep=',', header=1, names=col_names)
ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,3,4.0,964981247
1,1,6,4.0,964982224
2,1,47,5.0,964983815
3,1,50,5.0,964982931
4,1,70,3.0,964982400


In [3]:
col_names = ['item_id', 'title', 'genres']
movies = pd.read_csv(r'C:\Users\Yukawa\datasets\ml-latest-small\movies.csv', sep=',', header=0, names=col_names)
movies.head()

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies['year'] = movies.title.str.extract("\((\d{4})\)", expand=True)
movies.year = pd.to_datetime(movies.year, format='%Y')
movies.year = movies.year.dt.year # As there are some NaN years, resulting type will be float (decimals)
movies.title = movies.title.str[:-7]

In [5]:
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s', origin='unix')

In [6]:
movie_ratings = pd.merge(ratings, movies, on='item_id')
movie_ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title,genres,year
0,1,3,4.0,2000-07-30 18:20:47,Grumpier Old Men,Comedy|Romance,1995.0
1,6,3,5.0,1996-10-17 12:11:36,Grumpier Old Men,Comedy|Romance,1995.0
2,19,3,3.0,2000-08-08 04:07:16,Grumpier Old Men,Comedy|Romance,1995.0
3,32,3,3.0,1997-02-23 22:16:12,Grumpier Old Men,Comedy|Romance,1995.0
4,42,3,4.0,2001-07-27 08:04:05,Grumpier Old Men,Comedy|Romance,1995.0


## Dataset Analysis

### Current Dataset Info

In [7]:
print("Raw data size: ", ratings.shape,
    "\nNumber of Unique users: ", len(ratings['user_id'].unique()),
    "\nNumber of Unique movies: ", len(ratings['item_id'].unique()),
    "\nNumber of Unique ratings: ", len(ratings['rating'].unique()),     # 0, 0.5, 1, 1.5, ... 5.0
    "\nUnique ratings: ", ratings['rating'].sort_values().unique(),
     )

Raw data size:  (100835, 4) 
Number of Unique users:  610 
Number of Unique movies:  9724 
Number of Unique ratings:  10 
Unique ratings:  [0.5 1.  1.5 2.  2.5 3.  3.5 4.  4.5 5. ]


### Most Rated Movies 

In [8]:
ratings = pd.DataFrame(movie_ratings.groupby('title')['rating'].mean())
ratings['No_of_ratings'] = pd.DataFrame(movie_ratings.groupby('title')['rating'].count())
ratings.sort_values(by=['No_of_ratings'], ascending=False).head(10)

Unnamed: 0_level_0,rating,No_of_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Forrest Gump,4.164134,329
"Shawshank Redemption, The",4.429022,317
Pulp Fiction,4.197068,307
"Silence of the Lambs, The",4.16129,279
"Matrix, The",4.192446,278
Star Wars: Episode IV - A New Hope,4.231076,251
Jurassic Park,3.75,238
Braveheart,4.031646,237
Terminator 2: Judgment Day,3.970982,224
Schindler's List,4.225,220


## Designing Basic K Nearest Neighbour Function

In [9]:
def print_knn(rating_matrix, movie_name, k=10, min_common_elements=20):
    user_movie_matrix = rating_matrix.pivot_table(index='user_id', columns='title', values='rating')
    corrs = user_movie_matrix.corr(method='pearson',min_periods=min_common_elements)
    movie_corrs = corrs[movie_name]
    movie_corrs.dropna(inplace=True)
    movies_alike = pd.DataFrame(movie_corrs)
    movies_alike.columns= ['correlation']
    movies_alike.sort_values(by='correlation', ascending=False, inplace=True)
    print(movies_alike.head(k))

In [10]:
print_knn(movie_ratings,"Shawshank Redemption, The", k=15)

                                              correlation
title                                                    
Shawshank Redemption, The                        1.000000
Intouchables                                     0.744396
Wallace & Gromit: A Close Shave                  0.619625
Shooter                                          0.601875
Spy Game                                         0.585164
Lives of Others, The (Das leben der Anderen)     0.571373
Rear Window                                      0.545149
Hancock                                          0.530761
My Cousin Vinny                                  0.523016
Bad Boys                                         0.516762
Cube                                             0.515635
Elf                                              0.510485
Swordfish                                        0.503165
Rounders                                         0.500780
To Die For                                       0.493755


## Designing KNN Function With Basic Filters

In [11]:
def print_filtered_knn(rating_matrix, movie_name, k=10, min_common_elements=20, filter_date=dt.strptime('01/01/18 13:55:26', '%m/%d/%y %H:%M:%S')):
    user_movie_matrix = rating_matrix[rating_matrix.timestamp < filter_date].pivot_table(index='user_id', columns='title', values='rating')
    corrs = user_movie_matrix.corr(method='pearson',min_periods=min_common_elements)
    movie_corrs = corrs[movie_name]
    movie_corrs.dropna(inplace=True)
    movies_alike = pd.DataFrame(movie_corrs)
    movies_alike.columns= ['correlation']
    movies_alike.sort_values(by='correlation', ascending=False, inplace=True)
    print(movies_alike.head(k))
print_filtered_knn(movie_ratings,"Shawshank Redemption, The", k=15)

                                              correlation
title                                                    
Shawshank Redemption, The                        1.000000
Intouchables                                     0.806946
Wallace & Gromit: A Close Shave                  0.619625
Shooter                                          0.609221
Lives of Others, The (Das leben der Anderen)     0.608295
Spy Game                                         0.585164
King's Speech, The                               0.550759
Rear Window                                      0.538809
Bad Boys                                         0.521934
My Cousin Vinny                                  0.516932
Madagascar                                       0.509593
Elf                                              0.506449
Swordfish                                        0.503165
Sherlock Holmes                                  0.501900
Rounders                                         0.500780
