# movie recomendation system webapplication (modelling notebook)

In [60]:
# libraries
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import TruncatedSVD
import warnings
warnings.filterwarnings("ignore")

## dataset

In [61]:
movies = pd.read_csv("../dataset/movies.csv")
ratings = pd.read_csv("../dataset/ratings.csv")
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [62]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [63]:
movies.sample(10)

Unnamed: 0,movieId,title,genres
6967,66198,"International, The (2009)",Drama|Thriller
8681,122882,Mad Max: Fury Road (2015),Action|Adventure|Sci-Fi|Thriller
1968,2613,Night of the Comet (1984),Comedy|Horror|Sci-Fi
3050,4084,Beverly Hills Cop II (1987),Action|Comedy|Crime|Thriller
2297,3044,Dead Again (1991),Mystery|Romance|Thriller
3214,4340,"Animal, The (2001)",Comedy
7840,93320,Trailer Park Boys (1999),Comedy|Crime
1942,2574,"Out-of-Towners, The (1999)",Comedy
1577,2116,"Lord of the Rings, The (1978)",Adventure|Animation|Children|Fantasy
6495,53280,"Breed, The (2006)",Horror|Thriller


In [64]:
ratings.sample(10)

Unnamed: 0,userId,movieId,rating,timestamp
36261,247,8368,3.5,1467644475
85450,555,1275,5.0,978821670
260,2,131724,5.0,1445714851
14662,91,7090,4.5,1112712945
22087,144,6591,3.5,1136812912
34136,232,5299,3.5,1076952256
77710,483,8798,4.0,1263605292
77005,480,43708,3.5,1179160902
61789,409,2699,4.0,968978080
85867,555,4086,4.0,980125895


In [65]:
print(movies.shape, ratings.shape)

(9742, 3) (100836, 4)


## exploratory data analysis

In [66]:
# checking for missing values
movies.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

In [67]:
ratings.isnull().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [68]:
# checking the datatypes
print(movies.dtypes)
print("-"*50)
print(ratings.dtypes)

movieId     int64
title      object
genres     object
dtype: object
--------------------------------------------------
userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object


In [69]:
# extracting year from movies title
movies["year"] = movies["title"].str.extract(r'\((\d{4})\)')
movies["year"] = movies["year"].fillna(0).astype(int)
movies["clean_title"] = movies["title"].str.replace(r'\s*\(\d{4}\)\s*$', '', regex=True)

In [70]:
print("Original titles with years:")
print(movies[['title', 'year', 'clean_title']].head(10))

Original titles with years:
                                title  year                  clean_title
0                    Toy Story (1995)  1995                    Toy Story
1                      Jumanji (1995)  1995                      Jumanji
2             Grumpier Old Men (1995)  1995             Grumpier Old Men
3            Waiting to Exhale (1995)  1995            Waiting to Exhale
4  Father of the Bride Part II (1995)  1995  Father of the Bride Part II
5                         Heat (1995)  1995                         Heat
6                      Sabrina (1995)  1995                      Sabrina
7                 Tom and Huck (1995)  1995                 Tom and Huck
8                 Sudden Death (1995)  1995                 Sudden Death
9                    GoldenEye (1995)  1995                    GoldenEye


In [71]:
movies.head()

Unnamed: 0,movieId,title,genres,year,clean_title
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,Toy Story
1,2,Jumanji (1995),Adventure|Children|Fantasy,1995,Jumanji
2,3,Grumpier Old Men (1995),Comedy|Romance,1995,Grumpier Old Men
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,1995,Waiting to Exhale
4,5,Father of the Bride Part II (1995),Comedy,1995,Father of the Bride Part II


In [72]:
# exploring the distribution of ratings
ratings["rating"].value_counts().sort_index()

rating
0.5     1370
1.0     2811
1.5     1791
2.0     7551
2.5     5550
3.0    20047
3.5    13136
4.0    26818
4.5     8551
5.0    13211
Name: count, dtype: int64

In [73]:
# filter movies with sufficient ratings
min_ratings = 50
filter_movies = ratings['movieId'].value_counts() > min_ratings
filter_movies = filter_movies[filter_movies].index.tolist()

# filter users with sufficient ratings
min_user_ratings = 50
filter_users = ratings['userId'].value_counts() > min_user_ratings
filter_users = filter_users[filter_users].index.tolist()

# applying filters
data_ratings = ratings[
    (ratings["movieId"].isin(filter_movies)) &
    (ratings["userId"].isin(filter_users))
]
print(f"Original ratings: {ratings.shape}, Filtered ratings: {data_ratings.shape}")

Original ratings: (100836, 4), Filtered ratings: (36214, 4)


In [74]:
# creating pivot table
movie_features = data_ratings.pivot(
    index = "movieId",
    columns = "userId",
    values = "rating"
).fillna(0)
print(f"Pivot table shape: {movie_features.shape}")

Pivot table shape: (436, 378)


In [75]:
movie_features.head()

userId,1,4,6,7,10,11,15,16,17,18,...,600,601,602,603,604,605,606,607,608,610
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,0.0,4.5,0.0,0.0,2.5,0.0,4.5,3.5,...,2.5,4.0,0.0,4.0,3.0,4.0,2.5,4.0,2.5,5.0
2,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,3.0,...,4.0,0.0,4.0,0.0,5.0,3.5,0.0,0.0,2.0,0.0
3,4.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
6,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,4.0,...,0.0,0.0,3.0,4.0,3.0,0.0,0.0,0.0,0.0,5.0
7,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,3.5,0.0,0.0,0.0,0.0,0.0,2.5,0.0,0.0,0.0


## building the recomendation models
### collaborative filtering with k-nearest-neighbors

In [76]:
# coverting to matrix
mat_movie_features = csr_matrix(movie_features)
mat_movie_features

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 36214 stored elements and shape (436, 378)>

In [77]:
# building the k-nearest neighbors model
knn_model = NearestNeighbors(
    metric="cosine",
    algorithm="brute",
    n_neighbors=20,
    n_jobs=-1
)

In [78]:
# fitting the model
knn_model.fit(mat_movie_features)

0,1,2
,n_neighbors,20
,radius,1.0
,algorithm,'brute'
,leaf_size,30
,metric,'cosine'
,p,2
,metric_params,
,n_jobs,-1


In [79]:
# function too get movie recomendation based on collaborative filtering
def recomendation_knn(movie_id, n_recomendations=10):
    # finding k-nearest neighbors
    distances, indices = knn_model.kneighbors(
        movie_features.loc[movie_id].values.reshape(1, -1),
        n_neighbors = n_recomendations
    )

    # get movie id'set
    raw_recomends = sorted(
        list(zip(indices.squeeze().tolist(), distances.squeeze().tolist())),
        key=lambda x: x[1]
    )

    # preparing recomendation results
    recomends = []
    for i, (idx, dist) in enumerate(raw_recomends):
        movie_idx = movie_features.index[idx]
        recomends.append({
            "rank": i+1,
            "movieId": movie_idx,
            "title": movies[movies["movieId"] == movie_idx]["title"].values[0],
            "genres": movies[movies["movieId"] == movie_idx]["genres"].values[0],
            "distance": dist
        })
    return recomends

### matrix factorization with svd

In [80]:
# fitting the truncated svd
svd = TruncatedSVD(n_components=20, random_state=21)
matrix_svd = svd.fit_transform(movie_features)

In [81]:
# function to get movie recomendation based on svd
def recomendations_svd(movie_id, n_recomendations=10):
    # index of the movie
    if movie_id not in movie_features.index:
        return []
    movie_index = movie_features.index.get_loc(movie_id)

    # alculating cosine similarty
    movie_vector = matrix_svd[movie_index].reshape(1, -1)
    similarities = np.dot(matrix_svd, movie_vector.T).flatten()

    # top similar movies
    similar_indices = similarities.argsort()[::-1][1:n_recomendations+1]

    # preparing recomendation results
    recommends = []
    for i, idx in enumerate(similar_indices):
        movie_idx = movie_features.index[idx]
        recommends.append({
            'rank': i+1,
            'movieId': movie_idx,
            'title': movies[movies['movieId'] == movie_idx]['title'].values[0],
            'genres': movies[movies['movieId'] == movie_idx]['genres'].values[0],
            'similarity': similarities[idx]
        })
    return recommends

In [82]:
def test_recommendation_models():
    """Test both recommendation models with a sample movie"""
    print("=== TESTING RECOMMENDATION MODELS ===\n")
    
    # Let's test with a popular movie - Toy Story (ID: 1)
    test_movie_id = 1
    test_movie_title = movies[movies['movieId'] == test_movie_id]['title'].values[0]
    
    print(f"Testing with movie: {test_movie_title} (ID: {test_movie_id})\n")
    
    # Test K-NN recommendations
    print("--- Collaborative Filtering (K-NN) Recommendations ---")
    
    knn_recs = recomendation_knn(test_movie_id, 5)
    if knn_recs:
        for rec in knn_recs:
            print(f"{rec['rank']}. {rec['title']} (distance: {rec['distance']:.3f})")
    else:
        print("No recommendations found.")
    
    print("\n")
    
    # Test SVD recommendations
    print("--- Matrix Factorization (SVD) Recommendations ---")
    svd_recs = recomendations_svd(test_movie_id, 5)
    if svd_recs:
            for rec in svd_recs:
                print(f"{rec['rank']} {rec['title']} (similarity: {rec['similarity']:.3f})")
    else:
        print("No recommendations found.")
    
    return knn_recs, svd_recs

In [83]:
# Run the interactive tester
test_recommendation_models()

=== TESTING RECOMMENDATION MODELS ===

Testing with movie: Toy Story (1995) (ID: 1)

--- Collaborative Filtering (K-NN) Recommendations ---
1. Toy Story (1995) (distance: 0.000)
2. Jurassic Park (1993) (distance: 0.335)
3. Forrest Gump (1994) (distance: 0.357)
4. Toy Story 2 (1999) (distance: 0.372)
5. Star Wars: Episode IV - A New Hope (1977) (distance: 0.381)


--- Matrix Factorization (SVD) Recommendations ---
1 Shawshank Redemption, The (1994) (similarity: 2175.797)
2 Pulp Fiction (1994) (similarity: 2104.323)
3 Star Wars: Episode IV - A New Hope (1977) (similarity: 2023.263)
4 Silence of the Lambs, The (1991) (similarity: 1965.983)
5 Matrix, The (1999) (similarity: 1942.568)


([{'rank': 1,
   'movieId': np.int64(1),
   'title': 'Toy Story (1995)',
   'genres': 'Adventure|Animation|Children|Comedy|Fantasy',
   'distance': 0.0},
  {'rank': 2,
   'movieId': np.int64(480),
   'title': 'Jurassic Park (1993)',
   'genres': 'Action|Adventure|Sci-Fi|Thriller',
   'distance': 0.33488418488394356},
  {'rank': 3,
   'movieId': np.int64(356),
   'title': 'Forrest Gump (1994)',
   'genres': 'Comedy|Drama|Romance|War',
   'distance': 0.3565415477121445},
  {'rank': 4,
   'movieId': np.int64(3114),
   'title': 'Toy Story 2 (1999)',
   'genres': 'Adventure|Animation|Children|Comedy|Fantasy',
   'distance': 0.37163656407393986},
  {'rank': 5,
   'movieId': np.int64(260),
   'title': 'Star Wars: Episode IV - A New Hope (1977)',
   'genres': 'Action|Adventure|Sci-Fi',
   'distance': 0.3807886031957386}],
 [{'rank': 1,
   'movieId': np.int64(318),
   'title': 'Shawshank Redemption, The (1994)',
   'genres': 'Crime|Drama',
   'similarity': np.float64(2175.7973410929535)},
  {'r