In [3]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

In [4]:
# Load and clean the dataset
df = pd.read_csv('../IMDB-Movie-Dataset(2023-1951).csv', low_memory=False)

In [5]:
df

Unnamed: 0.1,Unnamed: 0,movie_id,movie_name,year,genre,overview,director,cast
0,0,tt15354916,Jawan,2023,"Action, Thriller",A high-octane action thriller which outlines t...,Atlee,"Shah Rukh Khan, Nayanthara, Vijay Sethupathi, ..."
1,1,tt15748830,Jaane Jaan,2023,"Crime, Drama, Mystery",A single mother and her daughter who commit a ...,Sujoy Ghosh,"Kareena Kapoor, Jaideep Ahlawat, Vijay Varma, ..."
2,2,tt11663228,Jailer,2023,"Action, Comedy, Crime",A retired jailer goes on a manhunt to find his...,Nelson Dilipkumar,"Rajinikanth, Mohanlal, Shivarajkumar, Jackie S..."
3,3,tt14993250,Rocky Aur Rani Kii Prem Kahaani,2023,"Comedy, Drama, Family",Flamboyant Punjabi Rocky and intellectual Beng...,Karan Johar,"Ranveer Singh, Alia Bhatt, Dharmendra, Shabana..."
4,4,tt15732324,OMG 2,2023,"Comedy, Drama",An unhappy civilian asks the court to mandate ...,Amit Rai,"Pankaj Tripathi, Akshay Kumar, Yami Gautam, Pa..."
...,...,...,...,...,...,...,...,...
2194,2195,tt11112474,Heeriye,,Thriller,Add a Plot,Subhash Ghai,"Shatrughan Sinha, Reena Roy, Ajit Khan, Premna..."
2195,2196,tt0332766,Sur: The Melody of Life,2002,"Drama, Musical, Romance",A renowned music teacher mentors a promising y...,Tanuja Chandra,"Lucky Ali, Simone Singh, Achint Kaur, Ehsan Khan"
2196,2197,tt8622232,Time to Dance,2021,"Musical, Romance",When a ballroom dancer's shot at a crucial tou...,Stanley D'Costa,"Sooraj Pancholi, Isabelle Kaif, Waluscha D'Sou..."
2197,2198,tt0187351,Nigahen: Nagina Part II,1989,"Drama, Family, Fantasy",After the tragic deaths of his son Ajit and da...,Harmesh Malhotra,"Sunny Deol, Sridevi, Anupam Kher, Gulshan Grover"


In [6]:
df[df.duplicated('director')]['director'].value_counts()

director
David Dhawan                                                         28
Ram Gopal Varma                                                      20
Priyadarshan                                                         19
Rohit Shetty                                                         17
Anurag Kashyap                                                       16
                                                                     ..
Abbas Alibhai Burmawalla, Mastan Alibhai Burmawalla, Abinash Rout     1
Jagan Shakti                                                          1
Pushpendra Singh                                                      1
Farogh Siddique                                                       1
Bumpy                                                                 1
Name: count, Length: 349, dtype: int64

In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2199 entries, 0 to 2198
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  2199 non-null   int64 
 1   movie_id    2199 non-null   object
 2   movie_name  2199 non-null   object
 3   year        2134 non-null   object
 4   genre       2199 non-null   object
 5   overview    2199 non-null   object
 6   director    2199 non-null   object
 7   cast        2199 non-null   object
dtypes: int64(1), object(7)
memory usage: 137.6+ KB


In [8]:
df = df.drop(columns=['Unnamed: 0', 'movie_id'])

In [9]:
# df = df[['movie_name', 'year']].dropna().drop_duplicates().reset_index(drop=True)

In [10]:
# df.info()

In [11]:
# Convert overviews to TF-IDF vectors
vectorizer = TfidfVectorizer(stop_words='english')
tfidf_matrix_name = vectorizer.fit_transform(df['overview'])
tfidf_matrix_genre = vectorizer.fit_transform(df['genre'])

In [12]:
tfidf_matrix_genre

<2199x20 sparse matrix of type '<class 'numpy.float64'>'
	with 5337 stored elements in Compressed Sparse Row format>

In [13]:
# Compute cosine similarity matrix
cos_sim = cosine_similarity(tfidf_matrix_name)
cos_sim_genre = cosine_similarity(tfidf_matrix_genre)

In [14]:
cos_sim_genre

array([[1.        , 0.        , 0.30289947, ..., 0.        , 0.        ,
        0.36085961],
       [0.        , 1.        , 0.34443508, ..., 0.        , 0.06109089,
        0.11857699],
       [0.30289947, 0.34443508, 1.        , ..., 0.        , 0.        ,
        0.69633943],
       ...,
       [0.        , 0.        , 0.        , ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.06109089, 0.        , ..., 0.        , 1.        ,
        0.08779798],
       [0.36085961, 0.11857699, 0.69633943, ..., 0.        , 0.08779798,
        1.        ]])

In [15]:
# Choose the movie
movie_title = "Jawan"
movie_index = df[df['movie_name'] == movie_title].index[0]

In [16]:
movie_index

0

In [17]:
# Get similarity scores for the chosen movie
similarities = cos_sim[movie_index]
similarities_genre = cos_sim_genre[movie_index]

In [18]:
print(similarities.shape)

(2199,)


In [19]:
similarities_genre

array([1.        , 0.        , 0.30289947, ..., 0.        , 0.        ,
       0.36085961])

In [20]:
cosine_df=pd.DataFrame(cos_sim,index=df['movie_name'],columns=df['movie_name'])
cosine_df_genre=pd.DataFrame(cos_sim_genre,index=df['movie_name'],columns=df['movie_name'])

In [21]:
cosine_df_genre

movie_name,Jawan,Jaane Jaan,Jailer,Rocky Aur Rani Kii Prem Kahaani,OMG 2,Sukhee,The Great Indian Family,The BFG,Pathaan,Mastaney,...,Souten: The Other Woman,Humko Tumse Pyaar Hai,Faraar,Taxi Driver,Kalicharan,Heeriye,Sur: The Melody of Life,Time to Dance,Nigahen: Nagina Part II,Kyo Kii... Main Jhuth Nahin Bolta
movie_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Jawan,1.000000,0.000000,0.302899,0.000000,0.000000,0.000000,0.000000,0.000000,0.668184,0.217891,...,0.811484,0.000000,0.337758,0.000000,0.235127,0.811484,0.000000,0.000000,0.000000,0.360860
Jaane Jaan,0.000000,1.000000,0.344435,0.080307,0.150754,0.287241,0.000000,0.000000,0.000000,0.071598,...,0.000000,0.140405,0.495060,0.000000,0.267369,0.000000,0.085577,0.000000,0.061091,0.118577
Jailer,0.302899,0.344435,1.000000,0.254826,0.478366,0.000000,0.000000,0.000000,0.202393,0.193266,...,0.000000,0.000000,0.762908,0.000000,0.531089,0.000000,0.000000,0.000000,0.000000,0.696339
Rocky Aur Rani Kii Prem Kahaani,0.000000,0.080307,0.254826,1.000000,0.532701,0.279580,0.846303,0.473831,0.000000,0.069689,...,0.000000,0.136660,0.108026,0.000000,0.000000,0.000000,0.083294,0.000000,0.604311,0.419002
OMG 2,0.000000,0.150754,0.478366,0.532701,1.000000,0.524834,0.000000,0.000000,0.000000,0.130821,...,0.000000,0.256541,0.202789,0.000000,0.000000,0.000000,0.156362,0.000000,0.111623,0.786560
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Heeriye,0.811484,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.542220,0.000000,...,1.000000,0.000000,0.000000,0.000000,0.000000,1.000000,0.000000,0.000000,0.000000,0.000000
Sur: The Melody of Life,0.000000,0.085577,0.000000,0.083294,0.156362,0.297926,0.000000,0.000000,0.000000,0.074262,...,0.000000,0.609500,0.115115,0.954589,0.000000,0.000000,1.000000,0.954589,0.063363,0.122988
Time to Dance,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.485939,0.000000,1.000000,0.000000,0.000000,0.954589,1.000000,0.000000,0.000000
Nigahen: Nagina Part II,0.000000,0.061091,0.000000,0.604311,0.111623,0.212682,0.643799,0.830317,0.000000,0.053013,...,0.000000,0.103960,0.082177,0.000000,0.000000,0.000000,0.063363,0.000000,1.000000,0.087798


In [22]:
cosine_df['Jawan'].sort_values(ascending=False)

movie_name
Jawan                                          1.000000
Anek                                           0.214802
Lost                                           0.161868
Satya 2                                        0.158907
Untitled SRK-Suhana-Marflix movie              0.157197
                                                 ...   
Laaga Chunari Mein Daag: Journey of a Woman    0.000000
Khuda Gawah                                    0.000000
Julie 2                                        0.000000
U Turn                                         0.000000
Kyo Kii... Main Jhuth Nahin Bolta              0.000000
Name: Jawan, Length: 2199, dtype: float64

In [23]:
similarities[movie_index] = 1  # So it doesn't get picked
top_indices = np.argsort(similarities)[-5:][::-1]  # Top 5, highest first

# Show titles of similar movies
for i in top_indices:
    print(df['movie_name'][i],'-->', df['overview'][i])
    print('=============================')

Jawan --> A high-octane action thriller which outlines the emotional journey of a man who is set to rectify the wrongs in the society.
Anek --> A socio political, action thriller set against the geopolitical backdrop of Northeast India.
Lost --> An emotional thriller that represents a higher quest, a search for lost values of empathy and integrity.
Satya 2 --> To build a strong underworld, a man comes to Mumbai to rectify the errors made by previous mafia leaders.
Untitled SRK-Suhana-Marflix movie --> A Father-Daughter action thriller that delves into the intense and complex relationship between a father and daughter.


In [24]:
cosine_df_genre['Jawan'].sort_values(ascending=False)

movie_name
Jawan                     1.0
Bellbottom                1.0
Om - The Battle Within    1.0
Saaho                     1.0
Collar Bomb               1.0
                         ... 
Badmaa$h Company          0.0
Sharaabi                  0.0
Tutak Tutak Tutiya        0.0
What's Your Raashee?      0.0
Dosti: Friends Forever    0.0
Name: Jawan, Length: 2199, dtype: float64

In [61]:
similarities_genre[movie_index] = -1  # So it doesn't get picked
top_indices = np.argsort(similarities_genre)[-55:][::-1]  # Top 5, highest first

# Show titles of similar movies
for i in top_indices:
    print(df['movie_name'][i],'-->', df['genre'][i])
    print('=============================')

Collar Bomb --> Action, Thriller
D-Day --> Action, Thriller
Vishwaroopam 2 --> Action, Thriller
Vishwaroopam --> Action, Thriller
Kartoos --> Action, Thriller
Taish --> Action, Thriller
Thugs --> Action, Thriller
Ruslaan --> Action, Thriller
Dhaakad --> Action, Thriller
Sanak --> Action, Thriller
Kaala Sona --> Action, Thriller
Tehran --> Action, Thriller
The Burning Train --> Action, Thriller
IB 71 --> Action, Thriller
Chalta Purza --> Action, Thriller
Saaho --> Action, Thriller
Hijack --> Action, Thriller
Blank --> Action, Thriller
Rakshak --> Action, Thriller
Teesri Aankh --> Action, Thriller
Spy --> Action, Thriller
Genius --> Action, Thriller
Farz --> Action, Thriller
Amir Garib --> Action, Thriller
Akira --> Action, Thriller
Om - The Battle Within --> Action, Thriller
Bellbottom --> Action, Thriller
Dial 100 --> Action, Thriller
Aag Ke Sholay --> Action, Thriller
Don 3: The Final Chapter --> Action, Thriller
State of Siege: Temple Attack --> Action, Thriller
Faraaz --> Action, Th

In [26]:
df = df[['movie_name', 'overview', 'genre']]

import pickle
pickle.dump(df, open('movies.pkl', 'wb'))

In [27]:
pickle.dump(cos_sim, open('similarity_overview.pkl', 'wb'))

In [28]:
pickle.dump(cos_sim_genre, open('similarity_genre.pkl', 'wb'))