In [34]:
import numpy as np
import pandas as pd

### Read & Merge the tables 
### Column Discard

In [35]:
movies, credits = pd.read_csv('tmdb_5000_movies.csv'), pd.read_csv('tmdb_5000_credits.csv')
movies = movies.merge(credits, on='title')
# movies.head(1)

movies = movies[['id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]
movies['title'] = movies['title'].str.lower()  # Convert all movie titles to lowercase

# movies.head(1)

### Data Preprocessing

In [36]:
movies.dropna(inplace=True)
movies.drop_duplicates(inplace=True)

In [37]:
import ast
# Fetching genres, keywords, cast and crew from the json string
movies['genres'] = movies['genres'].apply(
    lambda text: [i['name'] for i in ast.literal_eval(text)])


movies['keywords'] = movies['keywords'].apply(
    lambda x: [i['name'] for i in ast.literal_eval(x)])

movies['cast'] = movies['cast'].apply(
    lambda text: [i['name'] for idx, i in enumerate(ast.literal_eval(text)) if idx < 3])

movies['crew'] = movies['crew'].apply(
    lambda text: [i['name'] for i in ast.literal_eval(text) if i['job'] == 'Director'])


# Converting the overview to a list of words
movies['overview'] = movies['overview'].apply(lambda x: x.split())

In [38]:
# Removing spaces from the genres, keywords, cast and crew

movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])

### Creating a new column - combining genres, overview, keywords, cast, crew

In [39]:
movies['tags'] = movies['overview'] + movies['genres'] + \
    movies['keywords'] + movies['cast'] + movies['crew']
updated_movies = movies[['id', 'title', 'tags']]

updated_movies['tags'] = updated_movies['tags'].apply(lambda x: " ".join(x))

updated_movies['tags'] = updated_movies['tags'].apply(lambda x: x.lower())

updated_movies.head(1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_movies['tags'] = updated_movies['tags'].apply(lambda x: " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_movies['tags'] = updated_movies['tags'].apply(lambda x: x.lower())


Unnamed: 0,id,title,tags
0,19995,avatar,"in the 22nd century, a paraplegic marine is di..."


### Stem() function to trim words

In [40]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
def stem (text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y)

updated_movies['tags'] = updated_movies['tags'].apply(stem)
updated_movies.head(1)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_movies['tags'] = updated_movies['tags'].apply(stem)


Unnamed: 0,id,title,tags
0,19995,avatar,"in the 22nd century, a parapleg marin is dispa..."


### Model Creation

In [41]:
# Creating a count vectorizer to convert the text to vectors
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=3000, stop_words='english')
movie_vectors = cv.fit_transform(updated_movies['tags']).toarray()

In [42]:
# distance between the movies
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(movie_vectors)

In [43]:
def recommend(movie):
    index = updated_movies[updated_movies['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda x: x[1])
    
    for i in distances[1:6]:
        print(updated_movies.iloc[i[0]].title)

In [45]:
recommend('the matrix')

the matrix revolutions
hackers
the matrix reloaded
wargames
the thirteenth floor


In [46]:
import pickle
pickle.dump(updated_movies, open('movies.pkl', 'wb'))
pickle.dump(similarity,open('similarity.pkl','wb'))