# **MOVIE RECOMMENDATION SYSTEM**

## Importing Libraries and Listing the Input Files

In [3]:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import os
# Walk the Kaggle input directory recursively and print the full path of
# every file found, so the CSV paths used by later cells can be confirmed.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv
/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv


## Data Loading and Exploratory Data Analysis

In [4]:
import pandas as pd

# Read the two TMDB CSV files into DataFrames
movies_df = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_movies.csv')
credits_df = pd.read_csv('/kaggle/input/tmdb-movie-metadata/tmdb_5000_credits.csv')

# Dimensions (rows, columns) of each dataset
print("Movies CSV Shape:", movies_df.shape)
print("Credits CSV Shape:", credits_df.shape)

# Column names: heading and list are emitted on separate lines via sep="\n"
print("\nMovies CSV Columns:", movies_df.columns.tolist(), sep="\n")

print("\nCredits CSV Columns:", credits_df.columns.tolist(), sep="\n")

# First 3 rows of each dataset
print("\nSample rows from Movies CSV:", movies_df.head(3), sep="\n")

print("\nSample rows from Credits CSV:", credits_df.head(3), sep="\n")


Movies CSV Shape: (4803, 20)
Credits CSV Shape: (4803, 4)

Movies CSV Columns:
['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language', 'original_title', 'overview', 'popularity', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'vote_average', 'vote_count']

Credits CSV Columns:
['movie_id', 'title', 'cast', 'crew']

Sample rows from Movies CSV:
      budget                                             genres  \
0  237000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   
1  300000000  [{"id": 12, "name": "Adventure"}, {"id": 14, "...   
2  245000000  [{"id": 28, "name": "Action"}, {"id": 12, "nam...   

                                       homepage      id  \
0                   http://www.avatarmovie.com/   19995   
1  http://disney.go.com/disneypictures/pirates/     285   
2   http://www.sonypictures.com/movies/spectre/  206647   

                                           

## Merging the Data

In [5]:
import pandas as pd

# Merge the frames loaded in the data-loading cell above; re-reading the CSVs
# here would only repeat the same disk I/O.
# 'id' (movies_df) and 'movie_id' (credits_df) are the join keys.
# validate='one_to_one' makes pandas raise if either key is duplicated,
# instead of silently multiplying rows.
# The chain builds a new frame in one expression rather than mutating
# merged_df in place, so re-running the cell always gives the same result.
merged_df = (
    movies_df
    .merge(credits_df, left_on='id', right_on='movie_id', validate='one_to_one')
    # Drop columns that are now redundant or unnecessary
    .drop(columns=[
        'title_y',         # duplicate of title_x
        'movie_id',        # same as id
        'original_title',  # same as title for English movies
    ])
    # Rename 'title_x' to just 'title' for clarity
    .rename(columns={'title_x': 'title'})
)

# Final merged shape and columns
print("Merged DataFrame Shape:", merged_df.shape)
print("\n Merged Columns:")
print(merged_df.columns.tolist())

# Display a sample row for verification
print("\nSample Merged Row:")
print(merged_df.iloc[0])


✅ Merged DataFrame Shape: (4803, 21)

📌 Merged Columns:
['budget', 'genres', 'homepage', 'id', 'keywords', 'original_language', 'overview', 'popularity', 'production_companies', 'production_countries', 'release_date', 'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title', 'vote_average', 'vote_count', 'cast', 'crew']

🔍 Sample Merged Row:
budget                                                          237000000
genres                  [{"id": 28, "name": "Action"}, {"id": 12, "nam...
homepage                                      http://www.avatarmovie.com/
id                                                                  19995
keywords                [{"id": 1463, "name": "culture clash"}, {"id":...
original_language                                                      en
overview                In the 22nd century, a paraplegic Marine is di...
popularity                                                     150.437577
production_companies    [{"name": "Ingenious Film 

## Selecting Important Columns

In [23]:
# Select the required columns from merged_df.
# .copy() makes `movies` an independent frame, so assigning to its columns
# later neither triggers SettingWithCopyWarning nor depends on merged_df.
movies = merged_df[['id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew', 'tagline', 'spoken_languages', 'vote_average', 'production_companies']].copy()



1. **Feature Selection**
2. **Cleaning the Dataset**
3. **Creating the tags Column**

In [24]:
import pandas as pd
import ast

# Step 1: Feature selection — take an independent copy of the columns of interest
movies = merged_df.loc[
    :,
    [
        'id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew',
        'tagline', 'spoken_languages', 'vote_average', 'production_companies',
    ],
].copy()

# Step 2: Helper functions
def convert(obj):
    """Extract every ``name`` value from a stringified list of dicts.

    Parameters
    ----------
    obj : str
        Text such as ``'[{"id": 28, "name": "Action"}]'``; it is parsed with
        ``ast.literal_eval``.

    Returns
    -------
    list of str
        The ``name`` of each entry, in order. An empty list is returned when
        ``obj`` cannot be parsed (for example NaN or malformed text), is not
        a sequence of mappings, or an entry has no ``name`` key.
    """
    try:
        return [entry['name'] for entry in ast.literal_eval(obj)]
    except (ValueError, SyntaxError, TypeError, KeyError):
        # Only data problems are treated as "no values"; a bare ``except``
        # would also hide KeyboardInterrupt and genuine programming errors.
        return []

def get_top_cast(obj, n=3):
    """Return the ``name`` of the first ``n`` entries of a stringified cast list.

    Parameters
    ----------
    obj : str
        Text holding a list of dicts, parsed with ``ast.literal_eval``.
    n : int, default 3
        Number of leading entries to keep.

    Returns
    -------
    list of str
        Names of the first ``n`` entries, in their original order. An empty
        list is returned when ``obj`` cannot be parsed (for example NaN or
        malformed text), is not sliceable, or a kept entry has no ``name`` key.
    """
    try:
        return [member['name'] for member in ast.literal_eval(obj)[:n]]
    except (ValueError, SyntaxError, TypeError, KeyError):
        # Only data problems are treated as "no cast"; a bare ``except``
        # would also hide KeyboardInterrupt and genuine programming errors.
        return []

def get_director(obj):
    """Return the first crew member whose ``job`` is ``'Director'``.

    Parameters
    ----------
    obj : str
        Text holding a list of dicts, parsed with ``ast.literal_eval``.

    Returns
    -------
    list of str
        A one-element list with the director's ``name``, or an empty list
        when no entry has ``job == 'Director'`` or ``obj`` cannot be parsed
        (for example NaN or malformed text).
    """
    try:
        for member in ast.literal_eval(obj):
            if member.get('job') == 'Director':
                # Only the first matching entry is used.
                return [member['name']]
        return []
    except (ValueError, SyntaxError, TypeError, KeyError, AttributeError):
        # Only data problems are treated as "no director"; a bare ``except``
        # would also hide KeyboardInterrupt and genuine programming errors.
        return []

def clean_list(l):
    """Return a new list with spaces removed from each string and the result lower-cased.

    Removing the spaces turns a multi-word value such as ``"Sam Worthington"``
    into the single token ``"samworthington"``.
    """
    cleaned = []
    for item in l:
        without_spaces = item.replace(" ", "")
        cleaned.append(without_spaces.lower())
    return cleaned

# Step 3: Parse each stringified column into a list of normalised tokens
# (.loc is used for the assignment to avoid warnings)
movies.loc[:, 'genres'] = movies['genres'].apply(lambda raw: clean_list(convert(raw)))
movies.loc[:, 'keywords'] = movies['keywords'].apply(lambda raw: clean_list(convert(raw)))
movies.loc[:, 'cast'] = movies['cast'].apply(lambda raw: clean_list(get_top_cast(raw)))
movies.loc[:, 'crew'] = movies['crew'].apply(lambda raw: clean_list(get_director(raw)))

# Step 4: Missing overviews become empty strings, then every overview is
# lower-cased and split on whitespace into a list of words
movies['overview'] = (
    movies['overview']
    .fillna('')
    .apply(lambda text: text.lower().split())
)

# Step 5: Concatenate the per-movie token lists and join them into one
# space-separated 'tags' string
movies['tags'] = (
    movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
).apply(" ".join)

# ✅ Preview
print(movies[['title', 'tags']].head())


                                      title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   
3                     The Dark Knight Rises   
4                               John Carter   

                                                tags  
0  in the 22nd century, a paraplegic marine is di...  
1  captain barbossa, long believed to be dead, ha...  
2  a cryptic message from bond’s past sends him o...  
3  following the death of district attorney harve...  
4  john carter is a war-weary, former military ca...  


## Converting Text to Numbers (Vectorization)

In [25]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorize the 'tags' column using CountVectorizer.
# max_features=5000 caps the vocabulary size; stop_words='english' selects
# scikit-learn's built-in English stop-word list.
cv = CountVectorizer(max_features=5000, stop_words='english')

# Fit the vocabulary on 'tags' and convert the result to a dense array
# (one row per movie, one column per vocabulary term)
vectors = cv.fit_transform(movies['tags']).toarray()

# Check the shape of the vectorized data
print(f"Shape of the vectorized data: {vectors.shape}")


Shape of the vectorized data: (4803, 5000)


## Calculating Similarity

In [26]:
from sklearn.metrics.pairwise import cosine_similarity

# Calculate cosine similarity between every pair of movie vectors.
# Called with a single argument, cosine_similarity compares the rows of
# `vectors` with each other, so row i holds movie i's scores against all movies.
similarity = cosine_similarity(vectors)

# Check the similarity between the first movie and the rest
print(similarity[0])


[1.         0.08980265 0.05986843 ... 0.0248452  0.02777778 0.        ]


## **Recommending Movies**

1. Find the movie’s index by its title.
2. Get its similarity scores with all movies.
3. Sort the movies by similarity.
4. Pick the top 10 most similar ones.
5. Return their titles.

In [27]:
def recommend(movie_title, top_n=10):
    """Return the titles of the movies most similar to ``movie_title``.

    Parameters
    ----------
    movie_title : str
        Exact title to look up in ``movies['title']``. When several rows
        share the title, the first one is used.
    top_n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of up to ``top_n`` other movies, ordered by decreasing
        similarity score (ties keep their original row order).

    Raises
    ------
    IndexError
        If no row of ``movies`` has the given title.
    """
    # Locate the movie by *position*, not index label: ``similarity`` is
    # indexed positionally, so a label is only correct for a default RangeIndex.
    positions = np.flatnonzero((movies['title'] == movie_title).to_numpy())
    if positions.size == 0:
        raise IndexError(f"Movie title not found: {movie_title!r}")
    idx = int(positions[0])

    # Rank all movies by descending score; the stable sort keeps tied movies
    # in row order, matching sorted(..., reverse=True).
    scores = np.asarray(similarity[idx])
    ranked = np.argsort(-scores, kind='stable')

    # Exclude the queried movie explicitly. Dropping "the first result" is not
    # enough: another movie tied at the top score may sort ahead of it.
    ranked = ranked[ranked != idx][:top_n]

    return movies['title'].iloc[ranked]


## **Output**

A list of 10 movies similar to the given input.

In [38]:
# Try the recommender on one example title
recommended_movies = recommend('Jack the Giant Slayer')

# Show the heading and the resulting titles on separate lines
print("Top 10 recommended movies : ", recommended_movies, sep="\n")


Top 10 recommended movies : 
3087                Nicholas Nickleby
3663                       Henry & Me
1277                            Delgo
803                       DragonHeart
138                The Last Airbender
3544                          Animals
14                       Man of Steel
1669                      The Promise
1942    Cirque du Soleil: Worlds Away
812                        Pocahontas
Name: title, dtype: object
