# Basic Recommendation on Movielens Dataset

## Import Libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
%matplotlib inline
sns.set_style('white')

## Load Dataset

In [2]:
col_names = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv(r'C:\Users\Yukawa\datasets\ml-latest-small\ratings.csv', sep=',', header=1, names=col_names)
ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,3,4.0,964981247
1,1,6,4.0,964982224
2,1,47,5.0,964983815
3,1,50,5.0,964982931
4,1,70,3.0,964982400


In [3]:
col_names = ['item_id', 'title', 'genres']
movies = pd.read_csv(r'C:\Users\Yukawa\datasets\ml-latest-small\movies.csv', sep=',', header=0, names=col_names)
movies.head()

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies['year'] = movies.title.str.extract("\((\d{4})\)", expand=True)
movies.year = pd.to_datetime(movies.year, format='%Y')
movies.year = movies.year.dt.year # As there are some NaN years, resulting type will be float (decimals)
movies.title = movies.title.str[:-7]

In [5]:
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s', origin='unix')

In [6]:
movie_ratings = pd.merge(ratings, movies, on='item_id')
movie_ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title,genres,year
0,1,3,4.0,2000-07-30 18:20:47,Grumpier Old Men,Comedy|Romance,1995.0
1,6,3,5.0,1996-10-17 12:11:36,Grumpier Old Men,Comedy|Romance,1995.0
2,19,3,3.0,2000-08-08 04:07:16,Grumpier Old Men,Comedy|Romance,1995.0
3,32,3,3.0,1997-02-23 22:16:12,Grumpier Old Men,Comedy|Romance,1995.0
4,42,3,4.0,2001-07-27 08:04:05,Grumpier Old Men,Comedy|Romance,1995.0


## Dataset Analysis

### Current Dataset Info

In [7]:
print("Raw data size: ", ratings.shape,
    "\nNumber of Unique users: ", len(ratings['user_id'].unique()),
    "\nNumber of Unique movies: ", len(ratings['item_id'].unique()),
    "\nNumber of Unique ratings: ", len(ratings['rating'].unique()),     # 0, 0.5, 1, 1.5, ... 5.0
    "\nUnique ratings: ", ratings['rating'].sort_values().unique(),
     )

Raw data size:  (100835, 4) 
Number of Unique users:  610 
Number of Unique movies:  9724 
Number of Unique ratings:  10 
Unique ratings:  [0.5 1.  1.5 2.  2.5 3.  3.5 4.  4.5 5. ]


### Most Rated Movies 

In [8]:
ratings = pd.DataFrame(movie_ratings.groupby('title')['rating'].mean())
ratings['No_of_ratings'] = pd.DataFrame(movie_ratings.groupby('title')['rating'].count())
ratings.sort_values(by=['No_of_ratings'], ascending=False).head(10)

Unnamed: 0_level_0,rating,No_of_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Forrest Gump,4.164134,329
"Shawshank Redemption, The",4.429022,317
Pulp Fiction,4.197068,307
"Silence of the Lambs, The",4.16129,279
"Matrix, The",4.192446,278
Star Wars: Episode IV - A New Hope,4.231076,251
Jurassic Park,3.75,238
Braveheart,4.031646,237
Terminator 2: Judgment Day,3.970982,224
Schindler's List,4.225,220


## Designing KNN Func With Bins

In [9]:
def get_knn_with_bin(rating_matrix, movie_name, k=10, min_common_elements=20, 
                         start_year=95,
                         end_year=20):
    user_movie_matrix = rating_matrix[(rating_matrix.timestamp >= datetime(start_year, 1, 15)) & (rating_matrix.timestamp < datetime(end_year, 1, 15))].pivot_table(index='user_id', columns='title', values='rating')
    corrs = user_movie_matrix.corr(method='pearson',min_periods=min_common_elements)
    movie_corrs = corrs[movie_name]
    movie_corrs.dropna(inplace=True)
    movies_alike = pd.DataFrame(movie_corrs)
    movies_alike.columns= ['correlation']
    movies_alike.sort_values(by='correlation', ascending=False, inplace=True)
    print(movies_alike.head(k))

In [10]:
#get_knn_with_bin(movie_ratings,"Shawshank Redemption, The", k=15)

In [11]:
def print_knn_with_bins(rating_matrix, movie_name, k=10, min_common_elements=20, 
                         start_year=2001,
                         end_year=2021,
                         bin_size_in_years=5):
    while(start_year < end_year):
        bin_end_year = start_year+bin_size_in_years
        bin_start_year = start_year
        print(f"\nRange:{bin_start_year}:{bin_end_year}-->\n")
        get_knn_with_bin(rating_matrix, movie_name, k=15, start_year=start_year, end_year=bin_end_year)
        start_year += bin_size_in_years


In [12]:
print_knn_with_bins(movie_ratings,"Shawshank Redemption, The", k=15)


Range:2001:2006-->

                           correlation
title                                 
Shawshank Redemption, The     1.000000
Matrix, The                   0.000000
Silence of the Lambs, The    -0.086519

Range:2006:2011-->

                                                  correlation
title                                                        
Shawshank Redemption, The                            1.000000
E.T. the Extra-Terrestrial                           0.606933
Schindler's List                                     0.603358
Terminator 2: Judgment Day                           0.587223
Gladiator                                            0.582625
Braveheart                                           0.578764
Die Hard                                             0.571752
Crouching Tiger, Hidden Dragon (Wo hu cang long)     0.567345
Snatch                                               0.552673
Forrest Gump                                         0.543680
Dances with Wolves 