# Basic Recommendation on Movielens Dataset

## Import Libraries

In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from math import ceil
from datetime import datetime
from collections import defaultdict
%matplotlib inline
sns.set_style('white')

## Load Dataset

In [2]:
col_names = ['user_id', 'item_id', 'rating', 'timestamp']
ratings = pd.read_csv(r'C:\Users\Yukawa\datasets\ml-latest-small\ratings.csv', sep=',', header=1, names=col_names)
ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp
0,1,3,4.0,964981247
1,1,6,4.0,964982224
2,1,47,5.0,964983815
3,1,50,5.0,964982931
4,1,70,3.0,964982400


In [3]:
col_names = ['item_id', 'title', 'genres']
movies = pd.read_csv(r'C:\Users\Yukawa\datasets\ml-latest-small\movies.csv', sep=',', header=0, names=col_names)
movies.head()

Unnamed: 0,item_id,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
movies['year'] = movies.title.str.extract("\((\d{4})\)", expand=True)
movies.year = pd.to_datetime(movies.year, format='%Y')
movies.year = movies.year.dt.year # As there are some NaN years, resulting type will be float (decimals)
movies.title = movies.title.str[:-7]

In [5]:
ratings['timestamp'] = pd.to_datetime(ratings['timestamp'], unit='s', origin='unix')

In [6]:
movie_ratings = pd.merge(ratings, movies, on='item_id')
movie_ratings.head()

Unnamed: 0,user_id,item_id,rating,timestamp,title,genres,year
0,1,3,4.0,2000-07-30 18:20:47,Grumpier Old Men,Comedy|Romance,1995.0
1,6,3,5.0,1996-10-17 12:11:36,Grumpier Old Men,Comedy|Romance,1995.0
2,19,3,3.0,2000-08-08 04:07:16,Grumpier Old Men,Comedy|Romance,1995.0
3,32,3,3.0,1997-02-23 22:16:12,Grumpier Old Men,Comedy|Romance,1995.0
4,42,3,4.0,2001-07-27 08:04:05,Grumpier Old Men,Comedy|Romance,1995.0


## Dataset Analysis

### Current Dataset Info

In [7]:
print("Raw data size: ", ratings.shape,
    "\nNumber of Unique users: ", len(ratings['user_id'].unique()),
    "\nNumber of Unique movies: ", len(ratings['item_id'].unique()),
    "\nNumber of Unique ratings: ", len(ratings['rating'].unique()),     # 0, 0.5, 1, 1.5, ... 5.0
    "\nUnique ratings: ", ratings['rating'].sort_values().unique(),
     )

Raw data size:  (100835, 4) 
Number of Unique users:  610 
Number of Unique movies:  9724 
Number of Unique ratings:  10 
Unique ratings:  [0.5 1.  1.5 2.  2.5 3.  3.5 4.  4.5 5. ]


### Most Rated Movies 

In [8]:
ratings = pd.DataFrame(movie_ratings.groupby('title')['rating'].mean())
ratings['No_of_ratings'] = pd.DataFrame(movie_ratings.groupby('title')['rating'].count())
ratings.sort_values(by=['No_of_ratings'], ascending=False).head(10)

Unnamed: 0_level_0,rating,No_of_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Forrest Gump,4.164134,329
"Shawshank Redemption, The",4.429022,317
Pulp Fiction,4.197068,307
"Silence of the Lambs, The",4.16129,279
"Matrix, The",4.192446,278
Star Wars: Episode IV - A New Hope,4.231076,251
Jurassic Park,3.75,238
Braveheart,4.031646,237
Terminator 2: Judgment Day,3.970982,224
Schindler's List,4.225,220


## Designing KNN Func With Basic Decay

In [9]:
def get_bin(movie_ratings_data, movie_name, min_common_elements=20, 
                         start_year=1995,
                         end_year=2000):
    user_movie_matrix = movie_ratings_data[(movie_ratings_data.timestamp >= datetime(start_year, 1, 15)) & (movie_ratings_data.timestamp < datetime(end_year, 1, 15))].pivot_table(index='user_id', columns='title', values='rating')
    corrs = user_movie_matrix.corr(method='pearson', min_periods=min_common_elements)
    movie_corrs = corrs[movie_name]
    movie_corrs.dropna(inplace=True)
    movies_alike = pd.DataFrame(movie_corrs)
    movies_alike.columns= ['correlation']
    movies_alike.sort_values(by='correlation', ascending=False, inplace=True)
    return movies_alike

In [10]:
x = get_bin(movie_ratings,"Shawshank Redemption, The", start_year=2000, end_year=2020)
x.head(15)

Unnamed: 0_level_0,correlation
title,Unnamed: 1_level_1
"Shawshank Redemption, The",1.0
Intouchables,0.744396
Dead Man Walking,0.655406
Crimson Tide,0.633063
Wallace & Gromit: A Close Shave,0.622129
Shooter,0.601875
Four Weddings and a Funeral,0.59596
Leaving Las Vegas,0.574386
"Lives of Others, The (Das leben der Anderen)",0.571045
Outbreak,0.565155


## Designing KNN Func With Bins

In [11]:
print("First Movie Rating: ", movie_ratings['timestamp'].min() )
print("Today: ", datetime.now())

First Movie Rating:  1996-03-29 18:36:55
Today:  2020-03-27 14:09:34.111482


In [107]:
def print_knn_decay(rating_data, movie_name, k=10, min_common_elements=20,decay_rate=1, decay_interval_in_years=5):
    """
    decay_rate = importance of newer bins, 1 means all equal, 2 means new one 2x imporant than the one before itself
    """
    start_dt = movie_ratings['timestamp'].min()
    end_dt = datetime.now()
    n_bins = ceil( (end_dt.year - start_dt.year) / decay_interval_in_years )
    correlation_dict = defaultdict(float)

    for i in range(n_bins):
        bin_start_dt = start_dt
        start_dt = datetime(start_dt.year+decay_interval_in_years,1,1) 
        bin_end_dt = start_dt
        #print(f"\nBin {i}: {bin_start_dt}-{bin_end_dt}")
        
        curr_bin = get_bin(movie_ratings,movie_name, start_year=bin_start_dt.year, end_year=bin_end_dt.year)
        
        for index,row in x.iterrows():
            correlation_dict[index] += row.correlation * ((decay_rate ** (i+1)) / ((decay_rate ** (n_bins+1) - 1)/(decay_rate-1)))
            
        #print(curr_bin.query(f"correlation > 0.2 & title != '{movie_name}'").head(k))
    dictlist = list()
    for key, value in correlation_dict.items():
        temp = [key,value]
        dictlist.append(temp)
    sum_correlations = pd.DataFrame(dictlist, columns = ['title','correlation'])

    print(sum_correlations.head(k))
    
    
    

In [108]:
print_knn_decay(movie_ratings,"Shawshank Redemption, The", k=15, decay_rate=2, decay_interval_in_years=5)

                                           title  correlation
0                      Shawshank Redemption, The     0.984127
1                                   Intouchables     0.732581
2                               Dead Man Walking     0.645003
3                                   Crimson Tide     0.623015
4                Wallace & Gromit: A Close Shave     0.612254
5                                        Shooter     0.592322
6                    Four Weddings and a Funeral     0.586500
7                              Leaving Las Vegas     0.565269
8   Lives of Others, The (Das leben der Anderen)     0.561981
9                                       Outbreak     0.556184
10                                     Quiz Show     0.554034
11                                   Rear Window     0.541410
12                               My Cousin Vinny     0.529676
13                                       Hancock     0.522336
14                                       Sabrina     0.511833


In [109]:
print_knn_decay(movie_ratings,"Shawshank Redemption, The", k=15, decay_rate=2, decay_interval_in_years=3)

                                           title  correlation
0                      Shawshank Redemption, The     0.998043
1                                   Intouchables     0.742940
2                               Dead Man Walking     0.654124
3                                   Crimson Tide     0.631824
4                Wallace & Gromit: A Close Shave     0.620911
5                                        Shooter     0.600698
6                    Four Weddings and a Funeral     0.594794
7                              Leaving Las Vegas     0.573262
8   Lives of Others, The (Das leben der Anderen)     0.569928
9                                       Outbreak     0.564049
10                                     Quiz Show     0.561869
11                                   Rear Window     0.549066
12                               My Cousin Vinny     0.537166
13                                       Hancock     0.529722
14                                       Sabrina     0.519070


In [110]:
print_knn_decay(movie_ratings,"Shawshank Redemption, The", k=15, decay_rate=2, decay_interval_in_years=10)

                                           title  correlation
0                      Shawshank Redemption, The     0.933333
1                                   Intouchables     0.694770
2                               Dead Man Walking     0.611713
3                                   Crimson Tide     0.590859
4                Wallace & Gromit: A Close Shave     0.580653
5                                        Shooter     0.561750
6                    Four Weddings and a Funeral     0.556229
7                              Leaving Las Vegas     0.536094
8   Lives of Others, The (Das leben der Anderen)     0.532976
9                                       Outbreak     0.527478
10                                     Quiz Show     0.525439
11                                   Rear Window     0.513467
12                               My Cousin Vinny     0.502338
13                                       Hancock     0.495377
14                                       Sabrina     0.485416
