In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import difflib # to get the best match

In [2]:
# Read the movie metadata CSV into a DataFrame and preview the first five rows.
df = pd.read_csv(filepath_or_buffer='movies.csv')
df.head(n=5)

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton


In [3]:
# Text columns that are merged to describe each movie.
selected_features = [
    'genres',
    'keywords',
    'tagline',
    'cast',
    'director',
]
selected_features

['genres', 'keywords', 'tagline', 'cast', 'director']

In [4]:
# Replace missing values in each selected column with an empty string.

for column_name in selected_features:
    df[column_name] = df[column_name].fillna(value='')

In [5]:
# Join the selected text columns into one space-separated string per movie.

combined_features = (
    df['genres'] + ' '
    + df['keywords'] + ' '
    + df['tagline'] + ' '
    + df['cast'] + ' '
    + df['director']
)
combined_features

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance  A newlywed couple's honeymoon ...
4800    Comedy Drama Romance TV Movie date love at fir...
4801      A New Yorker in Shanghai Daniel Henney Eliza...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object

In [6]:
# Fit a TF-IDF vocabulary on the combined text and encode every movie
# as a sparse feature vector.

vectorizer = TfidfVectorizer()
feature_vectors = vectorizer.fit_transform(raw_documents=combined_features)
feature_vectors

<4803x17318 sparse matrix of type '<class 'numpy.float64'>'
	with 124266 stored elements in Compressed Sparse Row format>

In [7]:
# Getting similarity scores using cosine similarity.
#
# Cosine similarity takes one movie vector and compares it with every movie
# vector, so row i holds how similar movie i is to each movie in the dataset.
# A movie compared with itself scores 1, which is why the diagonal shows 1.

similarity = cosine_similarity(X=feature_vectors)
similarity

array([[1.        , 0.07219487, 0.037733  , ..., 0.        , 0.        ,
        0.        ],
       [0.07219487, 1.        , 0.03281499, ..., 0.03575545, 0.        ,
        0.        ],
       [0.037733  , 0.03281499, 1.        , ..., 0.        , 0.05389661,
        0.        ],
       ...,
       [0.        , 0.03575545, 0.        , ..., 1.        , 0.        ,
        0.02651502],
       [0.        , 0.        , 0.05389661, ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.02651502, 0.        ,
        1.        ]])

In [8]:
# Dimensions of the pairwise similarity matrix
similarity.shape

(4803, 4803)

In [17]:
# Read the movie name typed by the user (free text; matched against titles later)

movie_name = input('Enter a Movie name: ')

In [18]:
# Collect every title in the dataset as a plain Python list.

list_of_all_movie_titles = df['title'].to_list()

In [19]:
# Fuzzy-match the user's text against the known titles.
# n and cutoff are difflib's documented defaults, written out for clarity.

find_close_match = difflib.get_close_matches(
    movie_name,
    list_of_all_movie_titles,
    n=3,
    cutoff=0.6,
)
find_close_match

['Man of Steel', 'Man of the Year', 'Man of the House']

In [20]:
# Use the best fuzzy match (difflib lists the most similar title first).
# get_close_matches returns an empty list when nothing is similar enough,
# so fail with a clear message instead of an opaque IndexError.
if not find_close_match:
    raise ValueError(f'No movie title resembles {movie_name!r}')

close_match = find_close_match[0]
close_match

'Man of Steel'

In [21]:
# Finding the row position of the movie with the matched title.
#
# The similarity matrix is addressed by row position, so the position is
# looked up directly instead of trusting the CSV's 'index' column to equal it.
# If several rows share the title, the first one is used.

index_of_movie = np.flatnonzero((df['title'] == close_match).to_numpy())[0]
print(index_of_movie)

14


In [None]:
# Getting a list of similar movies

# enumerate() pairs each entry of the chosen movie's similarity row with its
# position, giving a list of (position, cosine similarity) tuples:
#   first value  - row position of a movie
#   second value - that movie's similarity to the chosen movie
similarity_score = list(enumerate(similarity[index_of_movie]))

# Preview only the first few pairs; the full list has one entry per movie
# and is too long to display usefully.
similarity_score[:10]

In [24]:
# Number of (position, score) pairs in the list
len(similarity_score)

4803

In [25]:
# Order the (position, score) pairs from most to least similar.
# A copy is sorted so similarity_score keeps its original order.

sorted_similar_movies = list(similarity_score)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)
sorted_similar_movies

[(14, 1.0000000000000002),
 (9, 0.48698623700238786),
 (870, 0.3305170345458134),
 (813, 0.3188610536407353),
 (163, 0.2776787456260836),
 (10, 0.27040776537299577),
 (511, 0.2617021903908788),
 (203, 0.23670092244128663),
 (79, 0.22470205886208158),
 (33, 0.22210509182646504),
 (64, 0.2134054970282666),
 (101, 0.2068290987771686),
 (182, 0.1947190294206222),
 (4759, 0.19414012176997106),
 (7, 0.19376847075221765),
 (1296, 0.18840976163775702),
 (16, 0.1837336996264891),
 (174, 0.1824865764671004),
 (46, 0.1821476828532997),
 (1420, 0.17375726844658407),
 (126, 0.17333278382282674),
 (547, 0.16983426378745428),
 (531, 0.16937727616583928),
 (41, 0.16492284897151058),
 (788, 0.16253489417746317),
 (4401, 0.15368125868182766),
 (149, 0.15332672386823998),
 (302, 0.15282728827370734),
 (38, 0.15050295691505747),
 (484, 0.14794603703075423),
 (428, 0.14618736987340838),
 (72, 0.14364470473582439),
 (668, 0.1382004303651948),
 (4232, 0.136416149871048),
 (1785, 0.13430617721669724),
 (1192,

In [26]:
# Print the names of the most similar movies, best match first.

print('Movies suggested for you: \n')

# Number of suggestions to print.
top_n = 10

# Only the first top_n entries are printed, so slice the sorted list up front
# instead of looking up a title for every movie in the dataset.
# The positions come from enumerate() over a similarity row, so each title is
# resolved by row position with iloc.
# NOTE: the first entry is normally the chosen movie itself (similarity 1).
for i, (index, _score) in enumerate(sorted_similar_movies[:top_n], start=1):
    title_from_index = df['title'].iloc[index]
    print(i,'.', title_from_index)


Movies suggested for you: 

1 . Man of Steel
2 . Batman v Superman: Dawn of Justice
3 . Superman II
4 . Superman
5 . Watchmen
6 . Superman Returns
7 . X-Men
8 . X2
9 . Iron Man 2
10 . X-Men: The Last Stand


Movie Recommendation system

In [27]:

# End-to-end recommendation: read a movie name, fuzzy-match it against the
# dataset titles, and print the most similar movies.

# Number of suggestions to print.
top_n = 20

movie_name = input('Enter a Movie name: ')

print("You selected the Movie: ", movie_name, '\n\n')
list_of_all_movie_titles = df['title'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_of_all_movie_titles)

# get_close_matches returns an empty list when nothing is similar enough;
# report that clearly instead of failing with an IndexError below.
if not find_close_match:
    raise ValueError(f'No movie title resembles {movie_name!r}')

# difflib lists the most similar title first.
close_match = find_close_match[0]

# The similarity matrix is addressed by row position, so look up the position
# of the matched title directly (first row if the title is duplicated).
index_of_movie = np.flatnonzero((df['title'] == close_match).to_numpy())[0]

# (position, cosine similarity) for every movie relative to the chosen one.
similarity_score = list(enumerate(similarity[index_of_movie]))

sorted_similar_movies = sorted(similarity_score, key = lambda x: x[1], reverse = True)

print(f'Top {top_n} Movies suggested for you: \n')

# Slice to the entries that are actually printed instead of resolving a title
# for every movie. NOTE: the first entry is normally the chosen movie itself.
for i, (index, _score) in enumerate(sorted_similar_movies[:top_n], start=1):
    title_from_index = df['title'].iloc[index]
    print(i,'.', title_from_index)


You selected the Movie:  Quantum of Solace 


Top 20 Movies suggested for you: 

1 . Quantum of Solace
2 . Troy
3 . Skyfall
4 . Die Another Day
5 . Puss in Boots
6 . The Blue Room
7 . Courage Under Fire
8 . Patriot Games
9 . Oblivion
10 . Need for Speed
11 . The November Man
12 . Hitman
13 . The Tuxedo
14 . Casino Royale
15 . 47 Ronin
16 . Jefferson in Paris
17 . Men of War
18 . Action Jackson
19 . The Damned United
20 . Spectre
