import pandas as pd
import numpy as np
import difflib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.metrics.pairwise import cosine_similarity

# Load the movie metadata table; the path is resolved against the current
# working directory.
data = pd.read_csv("movies.csv")

# Preview the first rows (only rendered when run as a notebook cell).
data.head()

In [4]:
# Per-column count of missing entries, used to decide which columns need
# filling before their text is combined.
null_values = data.isna().sum()
null_values

index                      0
budget                     0
genres                    28
homepage                3091
id                         0
keywords                 412
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                      43
crew                       0
director                  30
dtype: int64

In [5]:
# Text columns that are later concatenated into one string per movie.
selected_column = ['genres', 'keywords', 'tagline', 'cast', 'director']
print(selected_column)

# Replace missing text with a single blank in all of those columns at once,
# so the later string concatenation never produces NaN.
data[selected_column] = data[selected_column].fillna(' ')


['genres', 'keywords', 'tagline', 'cast', 'director']


In [6]:
# Build one space-separated text blob per movie by appending the text
# columns left to right.
text_columns = ['genres', 'keywords', 'tagline', 'cast', 'director']
combined_column = data[text_columns[0]]
for column in text_columns[1:]:
    combined_column = combined_column + ' ' + data[column]
print(combined_column)

0       Action Adventure Fantasy Science Fiction cultu...
1       Adventure Fantasy Action ocean drug abuse exot...
2       Action Adventure Crime spy based on novel secr...
3       Action Crime Drama Thriller dc comics crime fi...
4       Action Adventure Science Fiction based on nove...
                              ...                        
4798    Action Crime Thriller united states\u2013mexic...
4799    Comedy Romance   A newlywed couple's honeymoon...
4800    Comedy Drama Romance TV Movie date love at fir...
4801        A New Yorker in Shanghai Daniel Henney Eli...
4802    Documentary obsession camcorder crush dream gi...
Length: 4803, dtype: object


In [6]:
# Bag-of-words token counter, created with scikit-learn's default settings.
vectorizer = CountVectorizer()


In [7]:
# Learn the vocabulary from the combined text and encode every movie as a
# row of token counts (printed as (row, column) count entries).
feature_extraction = vectorizer.fit_transform(combined_column)
print(feature_extraction)

  (0, 201)	1
  (0, 274)	1
  (0, 5274)	1
  (0, 13599)	1
  (0, 5437)	1
  (0, 3678)	1
  (0, 3065)	1
  (0, 5836)	1
  (0, 14378)	2
  (0, 16587)	1
  (0, 3225)	1
  (0, 14271)	1
  (0, 4945)	1
  (0, 15261)	1
  (0, 16998)	1
  (0, 11192)	1
  (0, 11503)	1
  (0, 13349)	1
  (0, 17007)	1
  (0, 17290)	1
  (0, 13319)	1
  (0, 14064)	1
  (0, 16668)	1
  (0, 14608)	1
  (0, 8756)	1
  :	:
  (4801, 403)	1
  (4801, 4835)	1
  (4801, 17266)	1
  (4801, 13835)	1
  (4801, 13175)	1
  (4801, 17150)	1
  (4801, 3511)	1
  (4801, 13948)	1
  (4801, 7269)	1
  (4802, 11161)	1
  (4802, 4518)	1
  (4802, 2129)	2
  (4802, 4980)	1
  (4802, 6155)	1
  (4802, 3436)	1
  (4802, 4528)	1
  (4802, 1316)	1
  (4802, 12989)	1
  (4802, 4371)	1
  (4802, 6417)	1
  (4802, 4608)	1
  (4802, 2425)	1
  (4802, 3654)	1
  (4802, 5367)	1
  (4802, 6996)	2


In [8]:
# Cosine similarity between every pair of rows of the count matrix; entry
# [i, j] compares movie row i with movie row j.
similarity = cosine_similarity(feature_extraction)
print(similarity.shape)

(4803, 4803)


In [9]:
# Free-text title typed by the user; it is matched against the dataset's
# titles with difflib further down, so it need not be spelled exactly.
movie_name = input("Enter Your Favourite Movie Name : ")



Enter Your Favourite Movie Name : iron man


In [10]:
# Plain Python list of every title, the candidate pool for fuzzy matching.
list_of_all_names = list(data['title'])
print(list_of_all_names)

['Avatar', "Pirates of the Caribbean: At World's End", 'Spectre', 'The Dark Knight Rises', 'John Carter', 'Spider-Man 3', 'Tangled', 'Avengers: Age of Ultron', 'Harry Potter and the Half-Blood Prince', 'Batman v Superman: Dawn of Justice', 'Superman Returns', 'Quantum of Solace', "Pirates of the Caribbean: Dead Man's Chest", 'The Lone Ranger', 'Man of Steel', 'The Chronicles of Narnia: Prince Caspian', 'The Avengers', 'Pirates of the Caribbean: On Stranger Tides', 'Men in Black 3', 'The Hobbit: The Battle of the Five Armies', 'The Amazing Spider-Man', 'Robin Hood', 'The Hobbit: The Desolation of Smaug', 'The Golden Compass', 'King Kong', 'Titanic', 'Captain America: Civil War', 'Battleship', 'Jurassic World', 'Skyfall', 'Spider-Man 2', 'Iron Man 3', 'Alice in Wonderland', 'X-Men: The Last Stand', 'Monsters University', 'Transformers: Revenge of the Fallen', 'Transformers: Age of Extinction', 'Oz: The Great and Powerful', 'The Amazing Spider-Man 2', 'TRON: Legacy', 'Cars 2', 'Green Lant

In [11]:
# Fuzzy-match the typed name against all titles. n and cutoff are spelled
# out at difflib's documented defaults: at most 3 candidates, best first,
# each with a similarity ratio of at least 0.6.
find_close_match = difflib.get_close_matches(
    word=movie_name,
    possibilities=list_of_all_names,
    n=3,
    cutoff=0.6,
)
print(find_close_match)

['Iron Man', 'Iron Man 3', 'Iron Man 2']


In [12]:
# get_close_matches returns an empty list when no title clears its
# similarity cutoff; fail with a message that names the input instead of a
# bare IndexError from the [0] below.
if not find_close_match:
    raise ValueError(
        f"No movie title resembles {movie_name!r}; try a different spelling."
    )

# Candidates are ordered best-first, so the first one is the closest title.
close_match = find_close_match[0]
print(close_match)

Iron Man


In [14]:
# Rows of `similarity` follow the row order of `data`, so take the matched
# title's row *position* directly instead of trusting the CSV's 'index'
# column to mirror that order. When a title occurs more than once the first
# occurrence is used, as before.
# NOTE(review): assumes `data` has not been filtered or reordered since
# `similarity` was computed — confirm.
title_positions = np.flatnonzero(data['title'].to_numpy() == close_match)
index_of_movie = title_positions[0]
print(index_of_movie)

68


In [15]:
# Pair every movie's row number with its similarity to the chosen movie.
scores_for_movie = similarity[index_of_movie]
similarity_score = list(enumerate(scores_for_movie))
print(similarity_score)

[(0, 0.1270001270001905), (1, 0.10263160115815709), (2, 0.06913011298202835), (3, 0.033351867298253506), (4, 0.12320822072673948), (5, 0.06788442333021306), (6, 0.1408939907352694), (7, 0.33882260699853367), (8, 0.03592106040535498), (9, 0.13340746919301402), (10, 0.1796053020267749), (11, 0.0535479552601666), (12, 0.0704469953676347), (13, 0.06558258357839529), (14, 0.18215302221567486), (15, 0.032791291789197645), (16, 0.33882260699853367), (17, 0.06913011298202835), (18, 0.11503946170861017), (19, 0.11810771907149853), (20, 0.13576884666042613), (21, 0.05986843400892497), (22, 0.03225806451612904), (23, 0.029934217004462485), (24, 0.056796183424706485), (25, 0.0), (26, 0.3225806451612904), (27, 0.12903225806451615), (28, 0.16971105832553265), (29, 0.0704469953676347), (30, 0.12320822072673948), (31, 0.3751832396884334), (32, 0.060717674071891624), (33, 0.22953904252438354), (34, 0.0), (35, 0.14664711502135333), (36, 0.13116516715679058), (37, 0.03456505649101418), (38, 0.16675933649

In [16]:
# Sanity check: there is one (row, score) pair per movie row.
len(similarity_score)

4803

In [17]:
# Rank the (row, score) pairs from most to least similar. The sort is
# stable, so equal scores keep their original row order.
sorted_similar_movies = list(similarity_score)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)
print(sorted_similar_movies)

[(68, 0.9999999999999998), (79, 0.4237017130659009), (31, 0.3751832396884334), (7, 0.33882260699853367), (16, 0.33882260699853367), (26, 0.3225806451612904), (85, 0.3170114791543561), (182, 0.28575028575042866), (4401, 0.26940795304016235), (511, 0.26681493838602804), (94, 0.23346307108777456), (33, 0.22953904252438354), (101, 0.22953904252438354), (169, 0.22953904252438354), (46, 0.22718473369882594), (174, 0.22718473369882594), (203, 0.22718473369882594), (661, 0.21885688981825285), (242, 0.2155263624321299), (122, 0.2113409861029041), (618, 0.20131905799006777), (91, 0.20011120378952105), (39, 0.19674775073518588), (607, 0.19674775073518588), (166, 0.19596545041740515), (64, 0.19354838709677422), (126, 0.19050019050028574), (2442, 0.18759161984421674), (3623, 0.1872514715682846), (788, 0.18481233109010925), (1740, 0.18481233109010925), (353, 0.18338688097178615), (14, 0.18215302221567486), (10, 0.1796053020267749), (131, 0.1796053020267749), (4542, 0.1796053020267749), (232, 0.17960

In [18]:
print("Movies Suggested for you : ")

# Only the 30 best-ranked entries are shown, so slice them off up front
# instead of walking the whole ranking and running a full-table boolean mask
# for every entry. The list is ordered best-first and its first entry is the
# queried movie itself.
# Each pair holds a row position into `data`, so read the title by position.
# NOTE(review): same result as the old `data.index == index` lookup as long
# as `data` keeps its default 0..n-1 index — confirm.
for i, (index, _score) in enumerate(sorted_similar_movies[:30]):
    name_from_index = data['title'].iloc[index]
    print(i, '.', name_from_index)

Movies Suggested for you : 
0 . Iron Man
1 . Iron Man 2
2 . Iron Man 3
3 . Avengers: Age of Ultron
4 . The Avengers
5 . Captain America: Civil War
6 . Captain America: The Winter Soldier
7 . Ant-Man
8 . The Helix... Loaded
9 . X-Men
10 . Guardians of the Galaxy
11 . X-Men: The Last Stand
12 . X-Men: First Class
13 . Captain America: The First Avenger
14 . X-Men: Days of Future Past
15 . The Incredible Hulk
16 . X2
17 . Zathura: A Space Adventure
18 . Fantastic Four
19 . X-Men Origins: Wolverine
20 . Mystery Men
21 . Independence Day: Resurgence
22 . TRON: Legacy
23 . Sky Captain and the World of Tomorrow
24 . G.I. Joe: Retaliation
25 . X-Men: Apocalypse
26 . Thor: The Dark World
27 . Southland Tales
28 . Made
29 . Deadpool


In [22]:
# Features are the raw titles; the label is each movie's whole genres string.
x, y = data['title'], data['genres']

# NOTE(review): test_size=0.9 holds out 90% of the rows and trains on only
# 10% — confirm this is intended and not a swapped 0.1.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.9,
    random_state=42,
)
print("testing set size  : " , x_test.shape[0])
print("trainig set size  : " , x_train.shape[0])

testing set size  :  4323
trainig set size  :  480


In [23]:

# Learn a title vocabulary from the training split only, then encode the
# test split with that same vocabulary.
# NOTE(review): this re-fits the shared `vectorizer`, replacing the
# vocabulary that produced `feature_extraction`; from here on the vectorizer
# no longer matches that matrix. Consider a dedicated vectorizer for titles.
x_train_transformed = vectorizer.fit_transform(x_train)
x_test_transformed = vectorizer.transform(x_test)

In [24]:
# Multinomial naive Bayes over title token counts; fit() returns the
# estimator itself, so construction and training are chained.
model = MultinomialNB().fit(x_train_transformed, y_train)

In [25]:
# Predict a genres string for every held-out title.
y_pred = model.predict(x_test_transformed)

# Share of test rows whose predicted genres string equals the true one
# exactly.
accuracy = accuracy_score(y_test, y_pred)

# Plain string literal: the previous f-string had no placeholders, so the
# `f` prefix did nothing. The printed text is unchanged.
print("Accuracy : " , accuracy)

Accuracy :  0.07587323617857969


In [26]:
# End-to-end recommendation run: read a title, fuzzy-match it against the
# dataset, and list the 30 most similar movies.
movie_name = input("Enter Your Favourite Movie Name : ")

list_of_all_names = data['title'].tolist()

find_close_match = difflib.get_close_matches(movie_name, list_of_all_names)

# get_close_matches returns an empty list when no title clears its
# similarity cutoff; fail with a message that names the input instead of a
# bare IndexError from the [0] below.
if not find_close_match:
    raise ValueError(
        f"No movie title resembles {movie_name!r}; try a different spelling."
    )

# Candidates are ordered best-first, so the first one is the closest title.
close_match = find_close_match[0]

# Rows of `similarity` follow the row order of `data`, so take the matched
# title's row position directly instead of relying on the CSV's 'index'
# column to mirror that order.
# NOTE(review): assumes `data` has not been filtered or reordered since
# `similarity` was computed — confirm.
index_of_movie = np.flatnonzero(data['title'].to_numpy() == close_match)[0]

similarity_score = list(enumerate(similarity[index_of_movie]))

sorted_similar_movies = sorted(similarity_score, key=lambda x: x[1], reverse=True)

print("Movies Suggested for you : ")

# Only the 30 best-ranked entries are shown, so slice them off up front
# instead of scanning the whole ranking with a full-table mask per entry.
# The first entry is the queried movie itself.
for i, (index, _score) in enumerate(sorted_similar_movies[:30]):
    name_from_index = data['title'].iloc[index]
    print(i, '.', name_from_index)


Enter Your Favourite Movie Name : spider man
Movies Suggested for you : 
0 . Spider-Man
1 . Spider-Man 3
2 . Spider-Man 2
3 . The Notebook
4 . The Count of Monte Cristo
5 . Highlander: Endgame
6 . The Beastmaster
7 . Oz: The Great and Powerful
8 . The Queen
9 . Clear and Present Danger
10 . Vampires
11 . Cold Mountain
12 . Seabiscuit
13 . Top Gun
14 . Wanted
15 . Whale Rider
16 . Bambi
17 . Avatar
18 . The Musketeer
19 . Hancock
20 . The One
21 . Daybreakers
22 . Kung Pow: Enter the Fist
23 . The Purge: Election Year
24 . Horrible Bosses
25 . The League of Extraordinary Gentlemen
26 . Hesher
27 . Inception
28 . Salvador
29 . Things We Lost in the Fire
