Project overview: building a content-based recommendation system that recommends movies to users based on similar tags and keywords extracted from features such as plot, cast, and taglines.

In [1]:
#Import Libraries 
import numpy as np 
import pandas as pd

In [2]:
# Loading Dataset
# Read the credits table and the movie-metadata table from the two CSV files
# under ../input/tmdb-movie-metadata (paths are relative to the working directory).
credits = pd.read_csv("../input/tmdb-movie-metadata/tmdb_5000_credits.csv")
movies = pd.read_csv("../input/tmdb-movie-metadata/tmdb_5000_movies.csv")

In [3]:
# Column names of both raw tables, movies first and credits second
print(movies.columns, credits.columns, sep="\n")

In [4]:
# Combine the two datasets on the movie identifier rather than on the title.
# Titles are not guaranteed to be unique, so joining on 'title' pairs each movie
# with the credits of every other movie sharing its name and yields spurious
# duplicate rows.
# NOTE(review): assumes the credits table exposes the identifier as 'movie_id'
# (as in the Kaggle TMDB 5000 credits file) -- confirm against the column
# printout of the previous cell.
combined_data = movies.merge(
    credits.drop(columns=['title']),  # avoid a second title column so 'title' keeps its name
    left_on='id',
    right_on='movie_id',
)

In [5]:
# Summarise the merged frame: column names, dtypes and non-null counts
combined_data.info()

In [6]:
# Feature selection: keep the identifier, the title and the text-like columns
# from which tags will be extracted.
features_cols = ['id', 'title', 'tagline', 'genres', 'keywords', 'cast', 'crew', 'overview']

# Take an explicit copy so that later column assignments modify an independent
# frame instead of a slice of combined_data (avoids SettingWithCopyWarning).
final_df = combined_data[features_cols].copy()
final_df.head()

We select the features from which relevant keywords or tags can be extracted. Keywords from string features such as genres, taglines, and overview help us find similar movies, so we keep these features. Movies can have similar budgets, revenues, etc. yet be completely different from each other, so we drop those features.

In [7]:
# Count the missing entries in each selected column
final_df.isna().sum()

In [8]:
# Fill missing taglines with the 'NA' placeholder, drop rows that still have a
# missing value in any other column, and rebuild the index so that row labels
# equal row positions again (later cells index the similarity matrix by
# position). A new frame is produced instead of mutating in place, which keeps
# the cell re-runnable and avoids SettingWithCopyWarning.
final_df = (
    final_df
    .assign(tagline=final_df['tagline'].fillna('NA'))
    .dropna()
    .reset_index(drop=True)
)
final_df.isnull().sum()

In [9]:
# Preview the first rows after the missing-value handling
final_df.head()

In [10]:
# Overview column: show the overview text of the row labelled 0
final_df.loc[0, 'overview']

In [11]:
# Create the "movie_tags" column: the overview text, lower-cased and split on
# whitespace into a list of word tags.
# `x.lower() and x.split()` must not be used here: the `and` discards the
# lower-cased string (so nothing is lower-cased) and evaluates to '' rather
# than [] for empty text, which breaks the list concatenations in later cells.
final_df['movie_tags'] = final_df['overview'].astype(str).str.lower().str.split()

In [12]:
# Preview the frame with the new movie_tags column
final_df.head()

In [13]:
# Genre column: raw value stored for the first row
final_df['genres'].iloc[0]

In [14]:
# Keywords column: raw value stored for the first row
final_df['keywords'].iloc[0]

In [15]:
#Defining a function to convert string values (content) to list (tags)
import ast
def contenttotags(x):
    """Turn a stringified list of dicts into a list of tag strings.

    `x` is parsed with ast.literal_eval; for every parsed entry the value under
    'name' is taken, its spaces are removed and it is lower-cased.
    Returns the resulting list of tags in the original order.
    """
    parsed_entries = ast.literal_eval(x)
    return [entry['name'].replace(" ", "").lower() for entry in parsed_entries]

In [16]:
# Append the genre tags, then the keyword tags, to each movie's tag list
for source_col in ('genres', 'keywords'):
    final_df['movie_tags'] = final_df['movie_tags'] + final_df[source_col].apply(contenttotags)
final_df.loc[0, 'movie_tags']

In [17]:
# Extract the first few names from the cast column
def extractcast(X, limit=5):
    """Return the normalised names of the first `limit` cast entries.

    Parameters
    ----------
    X : str
        String holding a Python-literal list of dicts, each with a 'name' key.
    limit : int, default 5
        Maximum number of names to keep, taken in the order they appear.

    Returns
    -------
    list of str
        Names with spaces removed and lower-cased.
    """
    # Slice before looping so entries beyond `limit` are never visited.
    return [
        member['name'].replace(" ", "").lower()
        for member in ast.literal_eval(X)[:limit]
    ]

In [18]:
# Append the tags extracted from the cast column to each movie's tag list
cast_tags = final_df['cast'].apply(extractcast)
final_df['movie_tags'] = final_df['movie_tags'] + cast_tags
final_df.loc[0, 'movie_tags']

In [19]:
# Raw crew value of the row labelled 0
final_df.loc[0, 'crew']

In [20]:
# Function to extract the director name(s) from a crew string
def director(x):
    """Collect the names of all crew entries whose job is "Director".

    `x` is parsed with ast.literal_eval into a list of dicts; for every entry
    whose 'job' equals "Director" the 'name' is returned with spaces removed
    and lower-cased. Returns an empty list when no entry matches.
    """
    crew_members = ast.literal_eval(x)
    return [
        member['name'].replace(" ", "").lower()
        for member in crew_members
        if member['job'] == "Director"
    ]

In [21]:
# Append the director tags to each movie's tag list
director_tags = final_df['crew'].apply(director)
final_df['movie_tags'] = final_df['movie_tags'] + director_tags
final_df.loc[0, 'movie_tags']

In [22]:
# Lower-case each title, split it into words and append those words to the tags.
# (`x.lower() and x.split()` would split the *original* string, because `and`
# throws the lower-cased value away.)
final_df['movie_tags'] = final_df['movie_tags'] + final_df['title'].str.lower().str.split()

In [23]:
# Make sure taglines are strings, then preview them as lists of lower-cased
# words. The preview is only displayed; it is not stored.
final_df['tagline'] = final_df['tagline'].astype(str)
final_df['tagline'].str.lower().str.split()

In [24]:
# Append the tagline words (lower-cased) to each movie's tag list.
# Missing taglines were filled with the literal placeholder 'NA' earlier in the
# notebook; blank it out first so it contributes no tags. Otherwise every movie
# without a tagline would share an artificial "na" tag and look more similar to
# each other than they really are.
tagline_tags = (
    final_df['tagline']
    .where(final_df['tagline'] != 'NA', '')
    .str.lower()
    .str.split()
)
final_df['movie_tags'] = final_df['movie_tags'] + tagline_tags
final_df['movie_tags'][0]

In [25]:
# List the columns now present in final_df
final_df.columns

In [26]:
# Create a new DataFrame with the required columns only.
# .copy() makes it independent of final_df, so the assignments to
# final['movie_tags'] in the following cells do not act on a slice
# (no SettingWithCopyWarning, no accidental write-through).
final = final_df[['id', 'title', 'movie_tags']].copy()
final.head()

In [27]:
# Collapse each tag list into one space-separated, lower-case string
final['movie_tags'] = final['movie_tags'].map(" ".join).str.lower()
final.head()

In [28]:
import nltk
from nltk.stem.porter import PorterStemmer
# Single stemmer instance reused by the stemming cell below
porter = PorterStemmer()

In [29]:
# Apply stemming to every word of the tag string.
# PorterStemmer.stem() works on one word at a time; handing it the whole
# string treats the string as a single token, so the individual words would
# stay unstemmed. Split, stem each word, and join back.
final['movie_tags'] = final['movie_tags'].apply(
    lambda text: " ".join(porter.stem(word) for word in text.split())
)
final['movie_tags'][0]

In [30]:
# Configure the bag-of-words vectorizer: English stop words are excluded and
# max_features is set to 10000. Nothing is fitted or removed in this cell.
from sklearn.feature_extraction.text import CountVectorizer
countVec = CountVectorizer(max_features=10000, stop_words='english')

In [31]:
# Convert tags into vectors: fit the vocabulary and count terms (sparse),
# then expand the result into a dense array
tag_counts_sparse = countVec.fit_transform(final['movie_tags'])
Vectors = tag_counts_sparse.toarray()
Vectors

In [32]:
# Count vector of the first row of `final`
Vectors[0]

In [33]:
# Inspect a slice of the learned vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# get_feature_names_out() and fall back only on releases that predate it.
if hasattr(countVec, "get_feature_names_out"):
    feature_names = countVec.get_feature_names_out()
else:
    feature_names = countVec.get_feature_names()
list(feature_names[130:250])

In [34]:
# Import cosine_similarity to measure how alike two tag-count vectors are
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between all rows of Vectors, i.e. between every
# pair of movies (a similarity score: higher means more alike, not a distance)
similarity = cosine_similarity(Vectors)

# Shape
similarity.shape

In [35]:
# Display the full similarity matrix (one row of scores per movie)
similarity

In [36]:
# Similarity of the first movie with all movies.
# similarity[0] is the row for the first movie; its entry at index 0 is that
# movie's similarity with itself.

print("1st Sub Array Shape: ",similarity[0].shape)
print("\n", "-"*35, "\n Similarity of First Movie [Avatar] ","\n","-"*35, "\n", similarity[0])

In [37]:
# Recommender function: returns movie names only
def recommend(Movie, top_n=5):
    """Return the titles of the movies most similar to `Movie`.

    Parameters
    ----------
    Movie : str
        Exact title to look up in ``final['title']``. If several rows carry
        this title, the first one is used.
    top_n : int, default 5
        Number of recommendations to return.

    Returns
    -------
    list of str
        Titles ordered from most to least similar, excluding the query row.

    Raises
    ------
    IndexError
        If no row of ``final`` has the given title.
    """
    # Work with row *positions*: `similarity` is a plain array aligned with the
    # row order of `final`, while final.index may hold labels with gaps.
    title_matches = np.flatnonzero((final['title'] == Movie).to_numpy())
    if title_matches.size == 0:
        raise IndexError(f"Movie title not found: {Movie!r}")
    movie_pos = int(title_matches[0])

    scores = similarity[movie_pos]
    # Exclude the query row explicitly instead of assuming it sorts first;
    # another movie with an identical score could otherwise take its place.
    ranked_positions = sorted(
        (pos for pos in range(len(scores)) if pos != movie_pos),
        key=lambda pos: scores[pos],
        reverse=True,
    )

    # Read titles from `final`, the frame the similarity matrix was built from;
    # the raw `movies` table has a different row order and count.
    return [final['title'].iloc[pos] for pos in ranked_positions[:top_n]]

In [38]:
# Movie recommendation: query the recommender with an example title

movies1 = 'Avatar'

recommend(movies1)


In [None]:
import pickle
# Persist the dense tag-count matrix to Vectors.pkl in the working directory
with open('Vectors.pkl', 'wb') as files:
    pickle.dump(Vectors, files)

In [None]:
# Persist the pairwise similarity matrix to similarity.pkl
# (uses the pickle module imported in the previous cell)
with open('similarity.pkl', 'wb') as files:
    pickle.dump(similarity, files)

In [None]:
from numpy import savetxt
# Also write the tag-count matrix as comma-separated text to Vectors.csv
savetxt('Vectors.csv', Vectors, delimiter=',')