# movie recommendation system web application (modelling notebook)

In [30]:
# libraries
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
import warnings
# Silence every warning for the rest of the session: no category or module is
# given, so library deprecation notices are hidden as well.
warnings.filterwarnings("ignore")

## dataset

In [31]:
# Read the movie catalogue and the user ratings from the dataset folder,
# then preview the first rows of the catalogue.
movies, ratings = (
    pd.read_csv("../dataset/movies.csv"),
    pd.read_csv("../dataset/ratings.csv"),
)
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [32]:
# Preview the first five rating records.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [33]:
# Show 10 randomly chosen movies; the fixed random_state makes the preview
# identical on every Restart & Run All instead of changing per execution.
movies.sample(10, random_state=42)

Unnamed: 0,movieId,title,genres
1833,2435,Hurlyburly (1998),Drama
191,224,Don Juan DeMarco (1995),Comedy|Drama|Romance
2322,3076,Irma la Douce (1963),Comedy
2934,3937,Runaway (1984),Sci-Fi|Thriller
3489,4770,"Glass House, The (2001)",Thriller
5384,8974,"SpongeBob SquarePants Movie, The (2004)",Adventure|Animation|Children|Comedy
4000,5644,"Pride of the Yankees, The (1942)",Drama
6649,56788,Charlie Wilson's War (2007),Comedy|Drama|War
917,1216,"Big Blue, The (Grand bleu, Le) (1988)",Adventure|Drama|Romance
1748,2346,"Stepford Wives, The (1975)",Mystery|Sci-Fi|Thriller


In [34]:
# Show 10 randomly chosen ratings; the fixed random_state makes the preview
# identical on every Restart & Run All instead of changing per execution.
ratings.sample(10, random_state=42)

Unnamed: 0,userId,movieId,rating,timestamp
19365,125,58559,4.0,1474296807
92447,597,2133,3.0,941641183
32776,222,64614,4.0,1391350017
67596,438,110,4.5,1105666133
49375,318,27773,3.5,1293980280
4113,27,1250,4.0,962686417
46525,305,106491,3.0,1460366218
80062,502,3101,3.0,1111757332
39293,274,327,3.5,1239122854
25793,177,112175,4.5,1449720916


In [35]:
# Print the (rows, columns) shape of both frames on one line.
print(*(frame.shape for frame in (movies, ratings)))

(9742, 3) (100836, 4)


## exploratory data analysis

In [36]:
# count the missing values in each column of the movie catalogue
movies_missing_mask = movies.isnull()
movies_missing_mask.sum()

movieId    0
title      0
genres     0
dtype: int64

In [37]:
# count the missing values in each column of the ratings table
ratings_missing_mask = ratings.isnull()
ratings_missing_mask.sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [38]:
# show the column dtypes of both frames, separated by a 50-character rule
separator = "-"*50
print(movies.dtypes, separator, ratings.dtypes, sep="\n")

movieId     int64
title      object
genres     object
dtype: object
--------------------------------------------------
userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object


In [39]:
# extracting the release year from the movie title
# Both patterns are anchored to the end of the title, so `year` is read from
# exactly the "(YYYY)" suffix that `clean_title` strips. A parenthesised
# 4-digit number earlier in the title (e.g. "Foo (1984) (2005)") is ignored
# instead of being mistaken for the release year.
# expand=False returns a Series rather than a one-column DataFrame.
movies["year"] = movies["title"].str.extract(r'\((\d{4})\)\s*$', expand=False)
# titles without a trailing "(YYYY)" get the sentinel year 0
movies["year"] = movies["year"].fillna(0).astype(int)
# title with the trailing "(YYYY)" suffix (and surrounding whitespace) removed
movies["clean_title"] = movies["title"].str.replace(r'\s*\(\d{4}\)\s*$', '', regex=True)

In [40]:
# Print the raw title next to the derived year and cleaned title (first 10 rows).
title_columns = ['title', 'year', 'clean_title']
title_preview = movies[title_columns].head(10)
print("Original titles with years:", title_preview, sep="\n")

Original titles with years:
                                title  year                  clean_title
0                    Toy Story (1995)  1995                    Toy Story
1                      Jumanji (1995)  1995                      Jumanji
2             Grumpier Old Men (1995)  1995             Grumpier Old Men
3            Waiting to Exhale (1995)  1995            Waiting to Exhale
4  Father of the Bride Part II (1995)  1995  Father of the Bride Part II
5                         Heat (1995)  1995                         Heat
6                      Sabrina (1995)  1995                      Sabrina
7                 Tom and Huck (1995)  1995                 Tom and Huck
8                 Sudden Death (1995)  1995                 Sudden Death
9                    GoldenEye (1995)  1995                    GoldenEye


In [41]:
# Preview the catalogue again, now including the `year` and `clean_title` columns.
movies.head(n=5)

Unnamed: 0,movieId,title,genres,year,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,Toy Story
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,Jumanji
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,1995,Father of the Bride Part II


In [42]:
# how many times each rating value was given, ordered by rating value
rating_counts = ratings["rating"].value_counts()
rating_counts.sort_index()

rating
0.5     1370
1.0     2811
1.5     1791
2.0     7551
2.5     5550
3.0    20047
3.5    13136
4.0    26818
4.5     8551
5.0    13211
Name: count, dtype: int64

In [43]:
# Restrict the ratings to frequently rated movies and active users.
# NOTE(review): both comparisons below are strict, so a movie or user with
# exactly 50 ratings is dropped even though the thresholds are named "min_*"
# — confirm this is intended.
min_ratings = 50
min_user_ratings = 50

# ids of movies rated more than `min_ratings` times
ratings_per_movie = ratings['movieId'].value_counts()
filter_movies = ratings_per_movie[ratings_per_movie > min_ratings].index.tolist()

# ids of users who gave more than `min_user_ratings` ratings
ratings_per_user = ratings['userId'].value_counts()
filter_users = ratings_per_user[ratings_per_user > min_user_ratings].index.tolist()

# a rating is kept only when both its movie and its user passed the thresholds
keep_movie = ratings["movieId"].isin(filter_movies)
keep_user = ratings["userId"].isin(filter_users)
data_ratings = ratings[keep_movie & keep_user]
print(f"Original ratings: {ratings.shape}, Filtered ratings: {data_ratings.shape}")

Original ratings: (100836, 4), Filtered ratings: (36214, 4)


In [44]:
# Reshape the filtered ratings into a movie x user grid: one row per movieId,
# one column per userId, the cell holding that user's rating of that movie.
rating_grid = data_ratings.pivot(index="movieId", columns="userId", values="rating")
# movie/user pairs without a rating are filled with 0
movie_features = rating_grid.fillna(0)
print(f"Pivot table shape: {movie_features.shape}")

Pivot table shape: (436, 378)


In [45]:
# Preview the first five movie rows of the movie x user rating grid.
movie_features.head(n=5)

userId,1,4,6,7,10,11,15,16,17,18,19,20,21,22,23,24,27,28,29,32,33,34,36,38,39,40,41,42,43,45,47,50,51,52,57,58,59,62,63,64,...,559,560,561,562,563,564,566,567,570,571,572,573,577,579,580,582,583,584,585,586,587,588,590,591,592,593,594,596,597,599,600,601,602,603,604,605,606,607,608,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,4.0,0.0,0.0,4.5,0.0,0.0,2.5,0.0,4.5,3.5,4.0,0.0,3.5,0.0,0.0,0.0,3.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,5.0,4.0,0.0,3.0,0.0,0.0,5.0,0.0,0.0,0.0,5.0,4.0,...,5.0,3.0,4.0,4.5,0.0,0.0,0.0,3.5,4.0,0.0,4.0,5.0,0.0,4.0,3.0,0.0,0.0,5.0,0.0,0.0,5.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,4.0,3.0,2.5,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,5.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,3.0,3.5,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,4.0,0.0,4.0,0.0,2.5,0.0,4.0,0.0,3.5,0.0,0.0,4.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,2.5,0.0,4.0,0.0,4.0,0.0,0.0,2.5,4.0,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0
3,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,3.0,0.0,0.0,0.0,3.5,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,3.0,0.0,0.0,0.0,4.0,0.0,0.0,1.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
6,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,4.5,0.0,3.5,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,3.0,0.0,0.0,4.5,0.0,4.5,...,5.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,4.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,3.5,0.0,3.0,0.0,0.0,0.0,3.0,4.5,0.0,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,5.0
7,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,5.0,3.0,0.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2.5,3.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0


## building the recommendation models
### collaborative filtering with k-nearest-neighbors

In [46]:
# converting the movie x user rating grid to a compressed sparse row matrix
dense_movie_features = movie_features.to_numpy()
mat_movie_features = csr_matrix(dense_movie_features)
mat_movie_features

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 36214 stored elements and shape (436, 378)>