Movie Recommendation System with Machine Learning

# Data Preprocessing

## Importing libraries

In [69]:
import numpy as np
import pandas as pd

## Loading datasets

In [70]:
credits = pd.read_csv("Dataset/tmdb_5000_credits.csv")
movies = pd.read_csv("Dataset/tmdb_5000_movies.csv")
print("Credits:",credits.shape)
print("Movies Dataframe:",movies.shape)

# Mergeing the datasets and dropping useless columns
credits_column_renamed = credits.rename(index=str, columns={"movie_id": "id"})
movies_merge = movies.merge(credits_column_renamed, on='id')
movies_cleaned = movies_merge.drop(columns=['homepage', 'title_x', 'title_y', 'status','production_countries'])

Credits: (4803, 4)
Movies Dataframe: (4803, 20)


In [71]:
movies_cleaned_df = movies_cleaned['overview'].tolist()
# 2656
movies_cleaned_df[2656] = "bla"
movies_cleaned_df[4140] = "bla"
movies_cleaned_df[4431] = "bla"
print(movies_cleaned_df[4431])
print(len(movies_cleaned_df))

bla
4803


# Training the model

In [80]:
######################################################
## Trying CountVectorizer... it's ok
# import re
# import nltk
# nltk.download('stopwords')
# from nltk.corpus import stopwords
# from nltk.stem.porter import PorterStemmer
# corpus = []
# for i in range(0, 4803):
#   overview = re.sub('[^a-zA-Z]', ' ', movies_cleaned_df[i])
#   overview = overview.lower()
#   overview = overview.split()
#   ps = PorterStemmer()
#   all_stopwords = stopwords.words('english')
#   all_stopwords.remove('not')
#   overview = [ps.stem(word) for word in overview if not word in set(all_stopwords)]
#   overview = ' '.join(overview)
#   corpus.append(overview)

# from sklearn.feature_extraction.text import CountVectorizer
# cv = CountVectorizer()

# matrix = cv.fit_transform(corpus)

###############################################################################

from sklearn.feature_extraction.text import TfidfVectorizer
tfv = TfidfVectorizer(min_df=3,  max_features=None,
            strip_accents='unicode', analyzer='word',token_pattern=r'\w{1,}',
            ngram_range=(1, 3),
            stop_words = 'english')


# Fitting the TF-IDF on the 'overview' text
matrix = tfv.fit_transform(movies_cleaned_df)

# Trying other kernels: laplacian_kernel, linear_kernel, pairwise_kernels, polynomial_kernel
from sklearn.metrics.pairwise import polynomial_kernel

# Compute the sigmoid kernel
euc_dist = polynomial_kernel(matrix, matrix)
print(euc_dist[0])

[1.00028799 1.         1.         ... 1.         1.         1.        ]


## Reverse mapping of indices and movie titles

In [81]:
# Reverse mapping of indices and movie titles
indices = pd.Series(movies_cleaned.index, index=movies_cleaned['original_title']).drop_duplicates()
print(indices)
print(indices['Newlyweds'])
print(euc_dist[1799])
# print(list(enumerate(sig[indices['Newlyweds']])))
# print(sorted(list(enumerate(sig[indices['Newlyweds']])), key=lambda x: x[1], reverse=True))



original_title
Avatar                                         0
Pirates of the Caribbean: At World's End       1
Spectre                                        2
The Dark Knight Rises                          3
John Carter                                    4
                                            ... 
El Mariachi                                 4798
Newlyweds                                   4799
Signed, Sealed, Delivered                   4800
Shanghai Calling                            4801
My Date with Drew                           4802
Length: 4803, dtype: int64
4799
[1.         1.00002681 1.00001702 ... 1.         1.         1.        ]


In [82]:
def give_recomendations(title, model=euc_dist):
    # Get the index corresponding to original_title
    idx = indices[title]

    # Get the pairwsie similarity scores
    model_scores = list(enumerate(model[idx]))

    # Sort the movies
    model_scores = sorted(model_scores, key=lambda x: x[1], reverse=True)

    # Scores of the 10 most similar movies
    model_scores = model_scores[1:11]

    # Movie indices
    movie_indices = [i[0] for i in model_scores]

    # Top 10 most similar movies
    return movies_cleaned['original_title'].iloc[movie_indices]

In [83]:
print(give_recomendations('Newlyweds'))

616                       Ted 2
2689         Our Family Wedding
869          You, Me and Dupree
3969           Something Wicked
1576                 Bride Wars
2290               Just Married
1032      America's Sweethearts
3145                      Amour
2531     Why Did I Get Married?
504     The Secret Life of Pets
Name: original_title, dtype: object
