In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
###### helper functions #######
def get_title_from_index(index, frame=None):
    """Return the title of the movie whose DataFrame index label equals `index`.

    Args:
        index: Index label of the row to look up.
        frame: DataFrame with a "title" column. When omitted, the
            notebook-level `df` is used; it is resolved at call time, so the
            cell that loads `df` must have run first.

    Returns:
        The "title" value of the first row whose index label matches.

    Raises:
        IndexError: If no row carries the given index label.
    """
    if frame is None:
        frame = df  # notebook global, created by the CSV-loading cell
    titles = frame.loc[frame.index == index, "title"].values
    if len(titles) == 0:
        # Same exception type as the bare `.values[0]` lookup, but with a
        # message that names the value that was not found.
        raise IndexError("no movie with index {!r}".format(index))
    return titles[0]

def get_index_from_title(title, frame=None):
    """Return the "index" column value of the movie with the given title.

    Args:
        title: Exact title to look up (compared with `==`, so the match is
            case-sensitive).
        frame: DataFrame with "title" and "index" columns. When omitted, the
            notebook-level `df` is used; it is resolved at call time, so the
            cell that loads `df` must have run first.

    Returns:
        The "index" value of the first row whose title matches. If several
        rows share the title, only the first one is returned.

    Raises:
        IndexError: If no row has the given title.
    """
    if frame is None:
        frame = df  # notebook global, created by the CSV-loading cell
    matches = frame.loc[frame["title"] == title, "index"].values
    if len(matches) == 0:
        # Same exception type as the bare `.values[0]` lookup, but with a
        # message that names the title that was not found.
        raise IndexError("no movie titled {!r}".format(title))
    return matches[0]
##################################################



In [None]:
##Step 1: Read CSV File
df = pd.read_csv("movie_dataset.csv")
# Show the first rows, then the column labels.
for preview in (df.head(), df.columns):
  print(preview)


   index  ...           director
0      0  ...      James Cameron
1      1  ...     Gore Verbinski
2      2  ...         Sam Mendes
3      3  ...  Christopher Nolan
4      4  ...     Andrew Stanton

[5 rows x 24 columns]
Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')


In [None]:
##Step 2: Select Features
features = ['keywords', 'cast', 'genres', 'director']
# Replace missing values in the selected columns with empty strings so they
# can be concatenated as text later.
df[features] = df[features].fillna('')



In [None]:
##Step 3: Create a column in DF which combines all selected features
def combine_features(row):
  """Join the row's keywords, cast, genres and director with single spaces."""
  ordered_columns = ('keywords', 'cast', 'genres', 'director')
  return " ".join(row[column] for column in ordered_columns)

# Build the combined text row by row and store it as a new column.
df['combined_features'] = df.apply(combine_features, axis=1)
combined_preview = df['combined_features'].head()
print(combined_preview)


0    culture clash future space war space colony so...
1    ocean drug abuse exotic island east india trad...
2    spy based on novel secret agent sequel mi6 Dan...
3    dc comics crime fighter terrorist secret ident...
4    based on novel mars medallion space travel pri...
Name: combined_features, dtype: object


In [None]:
##Step 4: Create count matrix from this new combined column
corpus = df['combined_features']
cv = CountVectorizer()
# Fit the vectorizer on the combined text and transform it in one pass.
count_matrix = cv.fit_transform(corpus)


In [None]:
##Step 5: Compute the Cosine Similarity based on the count_matrix
# Title that the following cells look up and compare against.
movie_user_likes = "The Avengers"
# With a single argument, similarities are computed between all rows of count_matrix.
cos_sim = cosine_similarity(count_matrix)



In [None]:
## Step 6: Get index of this movie from its title
movie_index = get_index_from_title(movie_user_likes)
# Pair every entry of the liked movie's similarity row with its position.
similar_movies = [
  (position, score) for position, score in enumerate(cos_sim[movie_index])
]



In [None]:
## Step 7: Get a list of similar movies in descending order of similarity score
# Sort a copy so similar_movies keeps its original order.
sorted_similar_movies = list(similar_movies)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)


In [None]:
## Step 8: Print titles of first 20 movies
top_n = 20  # how many recommendations to list
print("Top {} movies found similar to: {}".format(top_n, movie_user_likes))
rank = 0
for row_position, _score in sorted_similar_movies:
  # Skip the query movie by its row position rather than by its title, so a
  # different film that happens to share the title is still listed.
  if row_position == movie_index:
    continue
  rank += 1
  print("{} {}".format(rank, get_title_from_index(row_position)))
  if rank >= top_n:
    break

Top 20 movies found similar to: The Avengers
1 Avengers: Age of Ultron
2 Iron Man 2
3 Captain America: The Winter Soldier
4 Captain America: Civil War
5 Thor: The Dark World
6 The Incredible Hulk
7 Ant-Man
8 X-Men
9 X2
10 Captain America: The First Avenger
11 X-Men: The Last Stand
12 X-Men: Days of Future Past
13 Iron Man 3
14 Iron Man
15 X-Men: Apocalypse
16 Thor
17 Deadpool
18 The Amazing Spider-Man 2
19 Man of Steel
20 Superman II
