In [1]:
#Importing Dependencies
import numpy as np
import pandas as pd
import difflib
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
df = pd.read_csv("movies2.csv")

In [3]:
df.head(3)

Unnamed: 0,id,title,genre,original_language,overview,popularity,release_date,vote_average,vote_count
0,624963,A Babysitter's Guide to Monster Hunting,"Family,Fantasy,Adventure,Comedy",en,"Recruited by a secret society of babysitters, ...",39.595,14-10-2020,6.2,243
1,431530,A Bad Moms Christmas,Comedy,en,"Amy, Kiki and Carla – three under-appreciated ...",17.821,04-08-2017,6.3,1759
2,5486,A Bay of Blood,"Horror,Thriller",it,An elderly heiress is killed by her husband wh...,9.955,08-09-1971,6.7,286


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8000 entries, 0 to 7999
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   id                 8000 non-null   int64  
 1   title              8000 non-null   object 
 2   genre              8000 non-null   object 
 3   original_language  8000 non-null   object 
 4   overview           8000 non-null   object 
 5   popularity         8000 non-null   float64
 6   release_date       8000 non-null   object 
 7   vote_average       8000 non-null   float64
 8   vote_count         8000 non-null   int64  
dtypes: float64(2), int64(2), object(5)
memory usage: 562.6+ KB


In [5]:
#df.shape
df.isnull().sum()

id                   0
title                0
genre                0
original_language    0
overview             0
popularity           0
release_date         0
vote_average         0
vote_count           0
dtype: int64

## Selecting only Important Columns

In [6]:
important_features = ['genre', 'id', 'overview', 'title']

In [7]:
df[important_features].head(3)

Unnamed: 0,genre,id,overview,title
0,"Family,Fantasy,Adventure,Comedy",624963,"Recruited by a secret society of babysitters, ...",A Babysitter's Guide to Monster Hunting
1,Comedy,431530,"Amy, Kiki and Carla – three under-appreciated ...",A Bad Moms Christmas
2,"Horror,Thriller",5486,An elderly heiress is killed by her husband wh...,A Bay of Blood


In [8]:
#important_features = important_features.fillna('')

In [9]:
df[important_features].isnull().any().sum()

0

In [10]:
# replacing empty cells with null string
for i in important_features:
    df[important_features]=df[important_features].fillna('')

In [11]:
df[important_features].isnull().any().sum()

0

In [12]:
important_features

['genre', 'id', 'overview', 'title']

In [13]:
# Combining all important cols into a single col
combined_features = df[important_features]['genre']+' '+df[important_features]['overview']+' '+df[important_features]['title']

In [14]:
combined_features

0       Family,Fantasy,Adventure,Comedy Recruited by a...
1       Comedy Amy, Kiki and Carla – three under-appre...
2       Horror,Thriller An elderly heiress is killed b...
3       Drama,History An award-winning cynical journal...
4       Drama,Romance John Nash is a brilliant but aso...
                              ...                        
7995    Comedy Derek and Hansel are modelling again wh...
7996    Family,Comedy,Adventure,Action,Science Fiction...
7997    Animation,Adventure,Family,Comedy Determined t...
7998    Drama An uptight English writer traveling to C...
7999    Action,Drama,History,War In 1879, during the A...
Length: 8000, dtype: object

### Vectorizing the words of the important coulmns

In [15]:
# changing the text into vectors
Vectorizer = TfidfVectorizer()
feature_vectors = Vectorizer.fit_transform(combined_features)

In [16]:
print(feature_vectors)

  (0, 11385)	0.2250384966089167
  (0, 15347)	0.19519027784934098
  (0, 23547)	0.04355788568269363
  (0, 10294)	0.23643405188580377
  (0, 2000)	0.2651750083682307
  (0, 10449)	0.22002874258885285
  (0, 16462)	0.07643203355228011
  (0, 25290)	0.23220344868207976
  (0, 20899)	0.10602501460496305
  (0, 3091)	0.15019158198912708
  (0, 15708)	0.30672015299723654
  (0, 23339)	0.0957062161839008
  (0, 25459)	0.07863773400871059
  (0, 15350)	0.20634975434788008
  (0, 10998)	0.06393591051854441
  (0, 1193)	0.04589458160518645
  (0, 2966)	0.296793190142064
  (0, 23298)	0.07751177099513201
  (0, 2305)	0.229646226118392
  (0, 20402)	0.27748268847382357
  (0, 10924)	0.13779774275386045
  (0, 2001)	0.296793190142064
  (0, 16374)	0.04618374570103651
  (0, 21590)	0.2000584366214543
  (0, 20550)	0.14123318260110368
  :	:
  (7999, 6161)	0.185928945811724
  (7999, 11187)	0.18949313013591232
  (7999, 16689)	0.17547840291034608
  (7999, 16688)	0.2055196849203321
  (7999, 7186)	0.18949313013591232
  (7999, 2

### Getting the Similarity Score

In [17]:
#Taking Similarity Values 
cs = cosine_similarity(feature_vectors)
#print(cs)

In [18]:
cs.shape

(8000, 8000)

## Getting Movie Name from User

In [19]:
movie_name = input("Enter a Movie Name: ")

Enter a Movie Name: Batman


In [20]:
# getting list of all movies to check close match to the entered movie name
all_titles = df['title'].tolist()
#print(all_titles)

In [21]:
close_matches = difflib.get_close_matches(movie_name, all_titles)
print(close_matches)

['Batman', 'Batman', 'Fatman']


In [22]:
# getting the id/index of the top matched movie
movie_id = df[df['title']==close_matches[0]].index.values[0]
# without difflib have to enter exact name : movie_id = df[df['title']== movie_name]['index'].values
movie_id    

647

### Now, getting similar movies with respect to this movie id

In [23]:
similarity_score = list(enumerate(cs[movie_id]))
print(similarity_score)

[(0, 0.05885673429659029), (1, 0.05661046366549935), (2, 0.027016790331243636), (3, 0.011653160023494025), (4, 0.03361195531527873), (5, 0.030085325306606844), (6, 0.012506417524650067), (7, 0.047329162625879856), (8, 0.04296574284465621), (9, 0.021768893571006715), (10, 0.03270704708147841), (11, 0.027646811211445875), (12, 0.09239732535624713), (13, 0.04354896062105776), (14, 0.022299731331902738), (15, 0.053693993031741605), (16, 0.01739760799012267), (17, 0.008168744782326579), (18, 0.04314423332481827), (19, 0.03709311452876887), (20, 0.04428694891999924), (21, 0.028936881691257334), (22, 0.011667304969363891), (23, 0.03393138352526319), (24, 0.026389087747361772), (25, 0.0309251772589191), (26, 0.0346592275008827), (27, 0.04186965840272755), (28, 0.03429041539962618), (29, 0.03736512328054923), (30, 0.037334214464980885), (31, 0.01908092506789507), (32, 0.010169727429276474), (33, 0.024791495914396628), (34, 0.054628119065714034), (35, 0.04153203105279082), (36, 0.046678041078274

In [24]:
len(similarity_score)

8000

In [25]:
# Sorting them in Descending order to find highest similarity scores
sorted_scores = sorted(similarity_score, key=lambda x:x[1] ,reverse=True)
#sorted_scores = sorted_scores[1:]
print(sorted_scores)

[(647, 1.0), (5515, 0.1743137949302428), (669, 0.17383436958869639), (7710, 0.1696173860148532), (4501, 0.1681554139746652), (664, 0.16373404825135013), (5759, 0.15604841279934328), (667, 0.1501148959411755), (6038, 0.14876535625171444), (665, 0.14639327224871856), (670, 0.14568164901263772), (5514, 0.13913588416447403), (4786, 0.13781670857973793), (653, 0.13555587174116246), (4185, 0.1348946571224993), (662, 0.1332538051917207), (7714, 0.1326165094128802), (5263, 0.1322888269520851), (5991, 0.1308143235512361), (2991, 0.13055753691412322), (4462, 0.12949912867641003), (6978, 0.12827011305796585), (673, 0.1278343596306849), (672, 0.12593981226747564), (666, 0.12479617131504343), (655, 0.12464822945344135), (2021, 0.12416664875272451), (6043, 0.12323456681167932), (661, 0.12287230458879142), (3147, 0.12203583540688466), (668, 0.12196067842073323), (651, 0.12141530405636405), (6738, 0.12105362692736353), (656, 0.1203917792776362), (3407, 0.11802413351196551), (7068, 0.1173083452981798),

## Getting the top 5 matched movies

In [26]:
num = 0
print("The recommended movies are: ")
for i in sorted_scores:
    suggested = df[df.index==i[0]]['title'].values[0]
    print(suggested)
    num+=1
    if(num==5):
        break

The recommended movies are: 
Batman
Superman/Batman: Public Enemies
Batman: The Dark Knight Returns, Part 2
War of Likes
Police Story 2


## Or we can do it in a single function

In [27]:
def recommend(moviename):
    #movie_name = input("Enter a Movie Name: ")
    close_matches = difflib.get_close_matches(moviename, all_titles)
    movie_id = df[df['title']==close_matches[0]].index.values[0]
    similarity_score = list(enumerate(cs[movie_id]))
    sorted_scores = sorted(similarity_score, key=lambda x:x[1] ,reverse=True)
    num = 0
    print("The recommended movies are: ")
    for i in sorted_scores:
        suggested = df[df.index==i[0]]['title'].values[0]
        print(suggested)
        num+=1
        if(num==5):
            break

# Optput is :

In [34]:
recommend("iron man")

The recommended movies are: 
Iron Man
Iron Man 2
Iron Man 3
Clown
Spider-Man: Homecoming


# Now, saving this using pickle to use it to create streamit Website

In [29]:
import pickle

In [30]:
#making it easier to get movie list with index -- to use this in vscode streamlit coding
movie_cols = ['id', 'title']
Movies = df[movie_cols]
#Movies

In [31]:
pickle.dump(Movies, open('movies_list.pkl', 'wb'))

In [32]:
pickle.dump(cs, open('similarity.pkl', 'wb'))

In [33]:
#pickle.load(open('similarity.pkl', 'rb'))
# now that it is working, continuing creating a streamlet webpage in vscode