In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Load the movie metadata. Lines the parser cannot read are skipped
# (on_bad_lines='skip'); low_memory=False parses the file in one pass so
# column dtypes are not inferred chunk by chunk.
df = pd.read_csv("movies_metadata.csv",low_memory=False, on_bad_lines='skip')

# Work on the first 2000 rows so the later steps run faster.
# .copy() makes the sample an independent frame: the column assignments in
# later cells then modify this frame only, never a slice of the full data.
df = df.head(2000).copy()
df.shape

(2000, 24)

In [None]:
# Peek at the first 20 movie titles to sanity-check the loaded data
print(df['title'].head(20))

0                          Toy Story
1                            Jumanji
2                   Grumpier Old Men
3                  Waiting to Exhale
4        Father of the Bride Part II
5                               Heat
6                            Sabrina
7                       Tom and Huck
8                       Sudden Death
9                          GoldenEye
10            The American President
11       Dracula: Dead and Loving It
12                             Balto
13                             Nixon
14                  Cutthroat Island
15                            Casino
16             Sense and Sensibility
17                        Four Rooms
18    Ace Ventura: When Nature Calls
19                       Money Train
Name: title, dtype: object


In [None]:
# Peek at the first 20 plot overviews (the text the recommender is built on)
print(df['overview'].head(20))

0     Led by Woody, Andy's toys live happily in his ...
1     When siblings Judy and Peter discover an encha...
2     A family wedding reignites the ancient feud be...
3     Cheated on, mistreated and stepped on, the wom...
4     Just when George Banks has recovered from his ...
5     Obsessive master thief, Neil McCauley leads a ...
6     An ugly duckling having undergone a remarkable...
7     A mischievous young boy, Tom Sawyer, witnesses...
8     International action superstar Jean Claude Van...
9     James Bond must unmask the mysterious head of ...
10    Widowed U.S. president Andrew Shepherd, one of...
11    When a lawyer shows up at the vampire's doorst...
12    An outcast half-wolf risks his life to prevent...
13    An all-star cast powers this epic look at Amer...
14    Morgan Adams and her slave, William Shaw, are ...
15    The life of the gambling paradise – Las Vegas ...
16    Rich Mr. Dashwood dies, leaving his second wif...
17    It's Ted the Bellhop's first night on th

In [None]:
#for simplicity, let's just use "overview" and "genres" for now.
#df = df[["overview","genres"]]
#df.head()

In [None]:
# Replace missing values (NaN) in both text columns with empty strings,
# so that string concatenation below never meets a NaN.
df[['overview', 'genres']] = df[['overview', 'genres']].fillna('')

# "soup" is the overview text followed by the genres text, separated by one space.
df['soup'] = df['overview'] + ' ' + df['genres']

In [None]:
# TF-IDF (Term Frequency-Inverse Document Frequency) weights a term by how often it
# occurs in a document, discounted by how many documents contain that term.
# Define a TF-IDF vectorizer object.
# stop_words='english' removes common English stop words such as 'the' and 'a'.
tfidf = TfidfVectorizer(stop_words='english')

In [None]:
# Build the TF-IDF matrix by fitting the vectorizer on the "soup" column and transforming it
tfidf_matrix = tfidf.fit_transform(df['soup'])
# Display the matrix shape: (number of movies, number of distinct terms)
tfidf_matrix.shape

(2000, 13929)

In [None]:
# Calculate the similarity score between every pair of movies.
# With the second argument omitted, cosine_similarity compares the rows of
# the matrix against themselves, giving a square (movies x movies) matrix.
cosine_sim = cosine_similarity(tfidf_matrix)



In [None]:
# Construct a reverse map from lower-cased movie title to row index.
# This helps us find the index of a movie from its title.
indices = pd.Series(df.index, index=df['title'].str.lower())
# Keep only the first row for each title. Series.drop_duplicates() would not
# do this: it compares the *values* (the row indices, which are already
# unique), so repeated titles would stay in the index and a lookup such as
# indices[title] would return a Series instead of a single row index.
indices = indices[~indices.index.duplicated(keep='first')]

In [None]:
# This function takes a movie title as input and outputs the most similar movies
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
  """Return the titles of the movies most similar to ``title``.

  Parameters
  ----------
  title : str
      Movie title to look up. The title is tried exactly as given and then
      lower-cased, because ``indices`` is keyed by lower-cased titles.
  cosine_sim : array-like, optional
      Square matrix of pairwise similarity scores; row ``i`` holds the scores
      of movie ``i`` against every movie. The default is bound when this cell
      is executed, so re-run this cell after recomputing the matrix.
  top_n : int, optional
      Number of recommendations to return. Defaults to 10.

  Returns
  -------
  pandas.Series
      Titles of the ``top_n`` most similar movies, best match first. The
      queried movie itself is never included.

  Raises
  ------
  KeyError
      If ``title`` is not present in ``indices``.
  """
  # resolve the lookup key: exact match first, then the lower-cased form
  key = title if title in indices.index else str(title).lower()
  if key not in indices.index:
    raise KeyError(f"Movie title not found: {title!r}")

  # get the index of the movie that matches the title
  idx = indices[key]
  # a title shared by several movies yields a Series; use the first match
  if isinstance(idx, pd.Series):
    idx = idx.iloc[0]
  # NOTE: idx is used as a row position in cosine_sim and with .iloc below;
  # this relies on df having its default 0..n-1 index.
  idx = int(idx)

  # pair every other movie with its similarity score. The queried movie is
  # excluded explicitly instead of assuming it sorts first: that assumption
  # fails when its row is all zeros (empty text) or another movie ties with it.
  sim_scores = [
    (i, score) for i, score in enumerate(cosine_sim[idx]) if i != idx
  ]

  # sort the movies by similarity, highest first (stable, so ties keep row order)
  sim_scores = sorted(sim_scores, key=lambda pair: pair[1], reverse=True)

  # keep the top_n most similar movies and extract their row positions
  movie_indices = [i for i, _ in sim_scores[:max(int(top_n), 0)]]

  # return the titles of the most similar movies
  return df['title'].iloc[movie_indices]


**Test the recommender**

In [None]:
# Recommendations for "Jumanji"; the title is given in lower case to match the keys of `indices`
print(get_recommendations('jumanji'))

1978                      Peter Pan
8                      Sudden Death
1506             The Innocent Sleep
363                        Maverick
976            D3: The Mighty Ducks
1951                    BASEketball
32                 Wings of Courage
1970                      Kidnapped
1611    The Man Who Knew Too Little
96                         Shopping
Name: title, dtype: object


In [None]:
#print(get_recommendations('the dark knight rises'))


In [None]:
# Recommendations for "Sense and Sensibility"; lower case to match the keys of `indices`
print(get_recommendations('sense and sensibility'))

891          Meet Me in St. Louis
1550             A Thousand Acres
228           Eat Drink Man Woman
1057               Reservoir Dogs
644     Und keiner weint mir nach
802                     Diebinnen
1022               The Proprietor
1835        From Here to Eternity
32               Wings of Courage
833            Crows and Sparrows
Name: title, dtype: object
