In [None]:
!pip install tmdbv3api

In [2]:
# Importing relevant functions
from tmdbv3api import TMDb, Movie
import pandas as pd

# Read the movie dataset from disk into a DataFrame
df = pd.read_csv(filepath_or_buffer="movies.csv")

In [3]:
import getpass
import os
import time
from typing import Dict, Optional

import requests

# API Initialization
# The TMDB key is a credential, so it is read from the TMDB_API_KEY environment
# variable instead of being stored in the notebook. When the variable is not
# set, the key is requested through a hidden prompt so it never appears in a
# cell or its output.
# NOTE(review): a key was previously committed in this cell in plain text; it
# should be treated as leaked and rotated in the TMDB account settings.
API_KEY = os.environ.get("TMDB_API_KEY") or getpass.getpass("TMDB API key: ")
BASE_URL = "https://api.themoviedb.org/3"

# Looks up a movie on TMDB by title (and optionally release year) and returns the best match
def search_movie(title: str, year: Optional[int] = None, timeout: float = 10.0) -> Optional[Dict]:
    """Search TMDB for a movie by title and, optionally, release year.

    Args:
        title: Movie title to search for.
        year: Release year used to narrow the search. Missing values
            (``None``, ``NaN``), zero, and values that cannot be read as a
            whole number are ignored, in which case only the title is used.
        timeout: Seconds to wait for TMDB before giving up on the request.

    Returns:
        The first result dict from the search response, or ``None`` when the
        request fails (network error, timeout, non-200 status, unreadable
        body) or no result is returned.
    """
    search_url = f"{BASE_URL}/search/movie"  # building the API URL to search for a movie
    # Parameters sent with the request
    params = {
        'api_key': API_KEY,
        'query': title
    }

    # Years coming out of a DataFrame can be NaN or floats such as 2010.0.
    # NaN is truthy, so a plain `if year:` would send "year=nan"; normalise to
    # an int and drop anything that is not a usable year.
    release_year = None
    if year is not None:
        try:
            release_year = int(float(year))
        except (TypeError, ValueError, OverflowError):
            release_year = None
    if release_year:
        params['year'] = release_year

    # Send GET request to TMDB API; a timeout keeps one stalled connection
    # from hanging a long enrichment run indefinitely.
    try:
        response = requests.get(search_url, params=params, timeout=timeout)
    except requests.RequestException:
        # Connection problems are reported like any other failed lookup
        return None

    # Anything other than HTTP 200 counts as a failed lookup
    if response.status_code != 200:
        return None

    try:
        data = response.json()  # Converts JSON response to Python dictionary
    except ValueError:
        return None

    results = data.get('results') if isinstance(data, dict) else None
    # Return first match, or None when the result list is empty or missing
    return results[0] if results else None

In [4]:
# Fetches the detailed TMDB record for a movie ID found by the search
def get_movie_details(movie_id: int, timeout: float = 10.0) -> Optional[Dict]:
    """Get detailed movie information (budget, revenue, runtime, keywords, ...).

    Args:
        movie_id: TMDB identifier of the movie.
        timeout: Seconds to wait for TMDB before giving up on the request.

    Returns:
        The decoded JSON body of the details response, or ``None`` when the
        request fails (network error, timeout, non-200 status, unreadable
        body).
    """
    details_url = f"{BASE_URL}/movie/{movie_id}"  # Building URL for specific movie information
    params = {
        'api_key': API_KEY,
        # Keywords are not part of the plain details payload; asking TMDB to
        # append them places them under the 'keywords' key of this response,
        # which is the key the caller reads.
        'append_to_response': 'keywords'
    }

    # A timeout keeps one stalled connection from hanging the whole run
    try:
        response = requests.get(details_url, params=params, timeout=timeout)
    except requests.RequestException:
        # Connection problems are reported like any other failed lookup
        return None

    # Anything other than HTTP 200 counts as a failed lookup
    if response.status_code != 200:
        return None

    try:
        return response.json()  # Converts JSON response to Python dictionary
    except ValueError:
        return None

def extract_movie_data(title: str, year: Optional[int] = None) -> Dict:
    """Look a movie up on TMDB and collect the fields used for enrichment.

    The returned dict always has the same keys, in the same order. Its
    'status' entry records how far the lookup got:

    * 'not_found'      - the search returned nothing; every TMDB field is None.
    * 'details_failed' - the search matched (so 'tmdb_id' is set) but the
                         details request returned nothing.
    * 'success'        - the detail fields were copied from the TMDB record.
    """
    detail_fields = ('budget', 'revenue', 'runtime', 'genres', 'keywords', 'tagline', 'overview')
    # budget and revenue fall back to 0 when absent from the details payload;
    # every other field falls back to None.
    fallbacks = {'budget': 0, 'revenue': 0}

    # Start from the "nothing found" record and fill it in as lookups succeed
    record = {'title': title, 'tmdb_id': None}  # 'title' is the title as passed in
    for field in detail_fields:
        record[field] = None
    record['status'] = 'not_found'

    # Step 1: search by title and year
    match = search_movie(title, year)
    if not match:
        return record

    # Step 2: fetch the detailed record for the matched ID
    record['tmdb_id'] = match['id']
    details = get_movie_details(match['id'])
    if not details:
        record['status'] = 'details_failed'
        return record

    # Step 3: copy the requested fields across
    for field in detail_fields:
        record[field] = details.get(field, fallbacks.get(field))
    record['status'] = 'success'
    return record

# Iterates through each movie, calls the extraction function, adds to original dataset
def enrich_movie_dataset(df: pd.DataFrame, batch_size: int = 50) -> pd.DataFrame:
    """Enrich the movie dataset with TMDB data.

    Each row's 'Title' and 'Year' are passed to ``extract_movie_data`` and the
    returned fields are merged into a copy of the row. Lookups are paced with
    a short delay after every movie and a longer pause after every
    ``batch_size`` movies.

    Args:
        df: Movies to enrich; must have 'Title' and 'Year' columns. The index
            may be anything - progress and batching are counted by row
            position, not by index label.
        batch_size: Number of movies between long pauses. A value of 0 or
            less disables the long pause (only the short delay is used).

    Returns:
        A new DataFrame with one row per input row, holding the original
        columns plus the TMDB fields.
    """
    total = len(df)
    enriched_data = []  # one merged dict per movie

    # Count rows by position: index labels are not guaranteed to be 0..n-1
    # (e.g. after filtering) and may not even be integers.
    for position, (_, row) in enumerate(df.iterrows(), start=1):
        # Progress message showing the movie being processed
        print(f"Processing {position}/{total}: {row['Title']} ({row['Year']})")

        # Extract movie data from TMDB by calling the extraction function
        movie_data = extract_movie_data(row['Title'], row['Year'])

        # Combine original row data with TMDB data (TMDB keys win on clashes)
        enriched_row = row.to_dict()
        enriched_row.update(movie_data)
        enriched_data.append(enriched_row)

        # Rate limiting. NOTE(review): the pacing was chosen for a limit of
        # 40 requests per 10 seconds - confirm against current TMDB limits.
        if batch_size > 0 and position % batch_size == 0:
            print(f"Processed {position} movies. Pausing for rate limiting...")
            time.sleep(10)  # longer pause once a full batch has been processed
        else:
            time.sleep(0.25)  # small delay between movies inside a batch

    # Convert back to a dataframe
    return pd.DataFrame(enriched_data)

In [None]:
# Enrich all 9,515 movies from the original Kaggle dataset with TMDB data.
# Every row triggers TMDB lookups plus a rate-limiting delay, so this cell is slow.
print("=== TMDB API Movie Data Enrichment for All Movies ===\n")

# Read the Kaggle dataset from disk
df = pd.read_csv('movies.csv')

# Look up every movie via enrich_movie_dataset (defined above)
enriched_df = enrich_movie_dataset(df)

# Summary: total rows, then how many lookups ended in each status
print("\n=== Final Results ===")
print(f"Movies processed: {len(enriched_df)}")  # total number of movies iterated over
print(f"Successfully matched: {(enriched_df['status'] == 'success').sum()}")  # full details retrieved
print(f"Not found: {(enriched_df['status'] == 'not_found').sum()}")  # no match in the TMDb database
print(f"Details failed: {(enriched_df['status'] == 'details_failed').sum()}")  # matched, but details could not be retrieved

# Persist the enriched dataset
enriched_df.to_csv('movies_with_tmdb_data_complete.csv', index=False)
print(f"\nComplete enriched dataset saved to 'movies_with_tmdb_data_complete.csv'")

# Preview the first ten enriched rows
print("\nSample of enriched data:")
columns_to_show = ['Title', 'Year', 'budget', 'revenue', 'runtime', 'genres', 'keywords', 'tagline', 'overview', 'status']
print(enriched_df.loc[:, columns_to_show].head(10).to_string(index=False))

# Completion banner: blank line, 60 equals signs, message, 60 equals signs
print("\n" + "=" * 60)
print("PROCESSING COMPLETE!")
print("=" * 60)

In [None]:
# Reading in data with TMDb info
df = pd.read_csv("movies_with_tmdb_data_complete.csv")

# Dropping columns that are not needed downstream.
# errors="ignore" keeps this cell re-runnable: the last cell of the notebook
# writes the cleaned data back to this same file, so on a second pass these
# columns are already gone.
df = df.drop(columns=["title", "keywords", "status", "tmdb_id"], errors="ignore")

# Filtering the data to only movies with a reported budget and revenue.
# .copy() makes the result an independent frame so the column assignment below
# does not raise SettingWithCopyWarning.
df = df[(df["budget"] > 0) & (df["revenue"] > 0)].copy()

# Creating release age feature (years between release and 2025)
df['Release_age'] = 2025 - df['Year']

# Previewing data
df.head(5)

In [None]:
# Retain only the movies whose tagline is present (not NaN/None)
has_tagline = df['tagline'].notna()
df_filtered = df.loc[has_tagline]
df_filtered.head()

In [None]:
# Write the final, tagline-only dataframe to CSV without the index column.
# NOTE(review): this is the same path the raw enriched dataset was saved to and
# read back from earlier in the notebook, so that file is replaced by the
# filtered data - confirm the overwrite is intended.
df_filtered.to_csv(path_or_buf='movies_with_tmdb_data_complete.csv', index=False)