Movie Recommendation System with Machine Learning

# Data Preprocessing

## Importing libraries

In [1]:
import numpy as np
import pandas as pd

## Loading datasets

In [5]:
# Read the two TMDB 5000 CSV exports: per-movie credits and movie metadata.
credits = pd.read_csv("Dataset/tmdb_5000_credits.csv")
movies = pd.read_csv("Dataset/tmdb_5000_movies.csv")
print(f"Credits: {credits.shape}")
print(f"Movies Dataframe: {movies.shape}")

# Merge the datasets and drop the columns that are not needed.
# The credits frame calls its key "movie_id"; rename it to "id" so both frames
# share the join column (index=str additionally turns the row labels into strings).
credits_column_renamed = credits.rename(index=str, columns={"movie_id": "id"})
movies_merge = pd.merge(movies, credits_column_renamed, on='id')
# title_x / title_y are the suffixed copies of the "title" column produced by the merge.
unused_columns = ['homepage', 'title_x', 'title_y', 'status', 'production_countries']
movies_cleaned = movies_merge.drop(columns=unused_columns)

Credits: (4803, 4)
Movies Dataframe: (4803, 20)


In [141]:
# Text substituted for any movie whose overview is missing. The value is kept
# as "bla" so downstream results stay the same as with the former manual patch.
# NOTE(review): with min_df=3 in the vectorizer, this token becomes a TF-IDF
# feature once three or more overviews are missing — consider "" instead.
MISSING_OVERVIEW_PLACEHOLDER = "bla"

# One overview string per movie, in row order. Despite the `_df` suffix this is
# a plain Python list, which is what the vectorizer is fitted on later.
# Missing values are filled wherever they occur rather than by overwriting the
# hard-coded rows 2656, 4140 and 4431, so the list contains only strings even
# if the dataset changes.
movies_cleaned_df = (
    movies_cleaned['overview']
    .fillna(MISSING_OVERVIEW_PLACEHOLDER)
    .tolist()
)

# Toy example corpus; not referenced by any other cell shown in this notebook.
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]

# Sanity check: row 4431 is one of the rows that used to be patched by hand
# (presumably a missing overview — TODO confirm), followed by the list length.
print(movies_cleaned_df[4431])
print(len(movies_cleaned_df))

bla
4803


# Training the model

In [143]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import sigmoid_kernel

# Word-level TF-IDF over unigrams to trigrams, with accents stripped and
# English stop words removed; terms found in fewer than 3 documents are ignored.
tfv = TfidfVectorizer(
    min_df=3,
    max_features=None,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)

# Learn the vocabulary from the overview texts and encode each overview as a
# sparse TF-IDF row.
tfv_matrix = tfv.fit_transform(movies_cleaned_df)
print(tfv_matrix)
print(tfv_matrix.shape)

# Pairwise sigmoid-kernel scores between every pair of overviews; row i holds
# the scores of movie i against all movies.
sig = sigmoid_kernel(tfv_matrix, tfv_matrix)
print(sig[0])

  (0, 148)	0.3091311403250014
  (0, 1671)	0.2781519888709605
  (0, 431)	0.2108476222845202
  (0, 7056)	0.26867652924731417
  (0, 6448)	0.25667782792456906
  (0, 3583)	0.21788257757629861
  (0, 9394)	0.24144219475319859
  (0, 5908)	0.17992707015426695
  (0, 9718)	0.24435395917916744
  (0, 6544)	0.2959108637414298
  (0, 5973)	0.2747332388389473
  (0, 2635)	0.2818968058308858
  (0, 5659)	0.2610479764815685
  (0, 1515)	0.20118856027389756
  (0, 147)	0.3091311403250014
  (1, 1811)	0.36793229072333994
  (1, 7160)	0.3031029042939378
  (1, 2917)	0.30082125582474917
  (1, 9610)	0.3355175975086389
  (1, 2849)	0.21556897928560054
  (1, 2873)	0.3232482593626707
  (1, 4206)	0.30803003793564465
  (1, 5264)	0.1333016217342793
  (1, 1807)	0.2104623271234315
  (1, 2319)	0.21892404835497967
  :	:
  (4802, 671)	0.15980375315748296
  (4802, 2361)	0.15010502652571864
  (4802, 677)	0.1348896650905315
  (4802, 3737)	0.14859055527694667
  (4802, 3481)	0.14340155530934393
  (4802, 1253)	0.1517317799962251
  (4

## Reverse mapping of indices and movie titles

In [None]:
# Reverse mapping from movie title to row label of `movies_cleaned`.
indices = pd.Series(movies_cleaned.index, index=movies_cleaned['original_title'])

# Keep only the first row for each title. Series.drop_duplicates() compares the
# Series *values* (the row labels), not the title index, so it would leave
# repeated titles in place and `indices[title]` could return several rows.
indices = indices[~indices.index.duplicated(keep='first')]
print(indices)

# Spot-check the lookup with one title.
newlyweds_idx = indices['Newlyweds']
print(newlyweds_idx)

# NOTE(review): 1799 is presumably the row of 'Newlyweds' — confirm against the
# value printed above.
print(sig[1799])

# Preview the (row, score) pairs for that movie: first in row order, then the
# highest scores. Only 10 entries are shown to keep the cell output readable.
newlyweds_scores = list(enumerate(sig[newlyweds_idx]))
print(newlyweds_scores[:10])
print(sorted(newlyweds_scores, key=lambda x: x[1], reverse=True)[:10])



In [146]:
def give_recomendations(title, sig=sig):
    """Return the titles of the 10 movies most similar to ``title``.

    Parameters
    ----------
    title : str
        Value of ``original_title`` to look up in the module-level ``indices``
        mapping.
    sig : array-like, optional
        Pairwise similarity scores indexed as ``sig[row]``. Defaults to the
        module-level ``sig`` as it was when this function was defined.

    Returns
    -------
    pandas.Series
        ``original_title`` of up to 10 movies, highest score first. The
        queried movie itself is never included.

    Raises
    ------
    KeyError
        If ``title`` is not present in ``indices``.
    """
    # Get the index corresponding to original_title
    idx = indices[title]

    # A title shared by several movies yields a Series of rows; use the first
    # one so that sig[idx] is always a single row of scores.
    if isinstance(idx, pd.Series):
        idx = idx.iloc[0]

    # Rank every movie by its similarity score, best first. sorted() is stable,
    # so tied scores keep their row order.
    ranked = sorted(enumerate(sig[idx]), key=lambda pair: pair[1], reverse=True)

    # Drop the queried movie by its index rather than assuming it sorts first
    # (a tie with an earlier row would otherwise leave it in the results),
    # then keep the 10 best remaining rows.
    movie_indices = [row for row, _ in ranked if row != idx][:10]

    # Top 10 most similar movies
    return movies_cleaned['original_title'].iloc[movie_indices]

In [147]:
# Example query: the movies whose overviews score highest against 'Avatar'.
avatar_recommendations = give_recomendations('Avatar')
print(avatar_recommendations)

1341                Obitaemyy Ostrov
634                       The Matrix
3604                       Apollo 18
2130                    The American
775                        Supernova
529                 Tears of the Sun
151                          Beowulf
311     The Adventures of Pluto Nash
847                         Semi-Pro
942                 The Book of Life
Name: original_title, dtype: object
