In [1]:
import pandas as pd
import numpy as np
import ast
import nltk
import pickle
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kashy\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [4]:
movies.isnull().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64

In [5]:
movies = movies.merge(credits, on='title')
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew', 'popularity', 'vote_count', 'vote_average']]

In [6]:
movies.dropna(inplace=True)

In [7]:
# Utility functions to clean and extract data
def get_name(obj):
    name = []
    for i in ast.literal_eval(obj):
        name.append(i['name'])
    return name

def get_top_cast(obj):
    counter = 0
    name = []
    for i in ast.literal_eval(obj):
        if counter != 3:
            name.append(i['name'])
            counter += 1
        else:
            break
    return name

def get_director_name(obj):
    name = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            name.append(i['name'])
            break
    return name

In [8]:
# Apply functions to clean the dataset
movies['genres'] = movies['genres'].apply(get_name)
movies['keywords'] = movies['keywords'].apply(get_name)
movies['cast'] = movies['cast'].apply(get_top_cast)
movies['crew'] = movies['crew'].apply(get_director_name)
movies['overview'] = movies['overview'].apply(lambda x: x.split())

In [9]:
# Clean strings by removing spaces
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

In [10]:
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [11]:
new_df = movies[['movie_id', 'title', 'tags', 'popularity', 'vote_count', 'vote_average']]
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x).lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x).lower())


In [12]:
# Lemmatization
lemmatizer = WordNetLemmatizer()

def lemmatize_tags(text):
    return ' '.join([lemmatizer.lemmatize(word) for word in text.split()])

new_df['tags'] = new_df['tags'].apply(lemmatize_tags)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lemmatize_tags)


In [13]:
# TF-IDF for textual data (tags)
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
tag_vectors = tfidf.fit_transform(new_df['tags']).toarray()

# Normalize numerical features (popularity, vote_count, vote_average)
scaler = MinMaxScaler()
new_df[['popularity', 'vote_count', 'vote_average']] = scaler.fit_transform(new_df[['popularity', 'vote_count', 'vote_average']])

TEXTUAL_WEIGHT = 0.7
POPULARITY_WEIGHT = 0.1
VOTE_COUNT_WEIGHT = 0.1
VOTE_AVERAGE_WEIGHT = 0.1

textual_similarity = cosine_similarity(tag_vectors)
popularity_similarity = cosine_similarity(new_df[['popularity']])
vote_count_similarity = cosine_similarity(new_df[['vote_count']])
vote_average_similarity = cosine_similarity(new_df[['vote_average']])

# Combine all similarities with weights
combined_similarity = (TEXTUAL_WEIGHT * textual_similarity +
                       POPULARITY_WEIGHT * popularity_similarity +
                       VOTE_COUNT_WEIGHT * vote_count_similarity +
                       VOTE_AVERAGE_WEIGHT * vote_average_similarity)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[['popularity', 'runtime', 'vote_average']] = scaler.fit_transform(new_df[['popularity', 'vote_count', 'vote_average']])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df[['popularity', 'runtime', 'vote_average']] = scaler.fit_transform(new_df[['popularity', 'vote_count', 'vote_average']])


In [14]:
pickle.dump(new_df.to_dict(), open('movies_dict.pkl', 'wb'))
pickle.dump(combined_similarity, open('similarity.pkl', 'wb'))