Data Loading & Exploration

In [3]:
import pandas as pd

# Dataset ko load karna
df = pd.read_csv('dataset.csv')

# Dataset ka initial inspection
df.head()  # Pehli 5 rows ko dekhein
df.info()  # Data types aur missing values ko dekhein


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

Data Cleaning (Missing Data Handling)

In [4]:
# Check for missing values
df.isnull().sum()

# Agar missing values hain, toh unhe drop ya fill karein
df = df.dropna(subset=['overview'])  # Overview ke missing values ko drop kar rahe hain


Text Preprocessing (Tokenization & Cleaning)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
import string

# Overview column ko clean karte hain
def clean_text(text):
    text = text.lower()  # Lowercase conversion
    text = ''.join([char for char in text if char not in string.punctuation])  # Remove punctuation
    return text

df['overview'] = df['overview'].apply(clean_text)

# TF-IDF Vectorizer setup
tfidf = TfidfVectorizer(stop_words='english')

# Overview column ko vectorize karna
tfidf_matrix = tfidf.fit_transform(df['overview'])


Cosine Similarity Calculation

In [6]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine Similarity calculate karna
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)


Recommendation Function

In [7]:
def recommend_movie(title, cosine_sim=cosine_sim):
    # Movie index find karna
    idx = df[df['title'] == title].index[0]

    # Similarity scores ke saath movies ko sort karna
    sim_scores = list(enumerate(cosine_sim[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

    # Top 5 similar movies ko select karna
    sim_scores = sim_scores[1:6]
    movie_indices = [i[0] for i in sim_scores]

    return df['title'].iloc[movie_indices]

# Example usage
recommend_movie('The Dark Knight')


428                              Batman Returns
3                         The Dark Knight Rises
3854    Batman: The Dark Knight Returns, Part 2
119                               Batman Begins
299                              Batman Forever
Name: title, dtype: object

Final Testing

In [9]:
# Test with different movie titles
print(recommend_movie('Toy Story'))
print(recommend_movie('The Lego Movie'))
print(recommend_movie('Frozen'))


343                Toy Story 2
42                 Toy Story 3
2869    For Your Consideration
3383                 Losin' It
2569               Match Point
Name: title, dtype: object
4387                     A LEGO Brickumentary
2612                                Boat Trip
368            Percy Jackson: Sea of Monsters
1415                             Flash Gordon
1001    Street Fighter: The Legend of Chun-Li
Name: title, dtype: object
591         Stardust
3999             Ida
2299       Leap Year
1669     The Promise
809     Forrest Gump
Name: title, dtype: object
