In [3]:
pip install fpdf

Collecting fpdfNote: you may need to restart the kernel to use updated packages.

  Downloading fpdf-1.7.2.tar.gz (39 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: fpdf
  Building wheel for fpdf (setup.py): started
  Building wheel for fpdf (setup.py): finished with status 'done'
  Created wheel for fpdf: filename=fpdf-1.7.2-py2.py3-none-any.whl size=40713 sha256=6c57c9f93237979e8ef2a8d8c5ae53e5166fd24b21032f9b7dcd0f956b8bea2b
  Stored in directory: c:\users\dell\appdata\local\pip\cache\wheels\6e\62\11\dc73d78e40a218ad52e7451f30166e94491be013a7850b5d75
Successfully built fpdf
Installing collected packages: fpdf
Successfully installed fpdf-1.7.2


In [2]:
pip install requests

Note: you may need to restart the kernel to use updated packages.


In [6]:
pip install beautifulsoup4

Note: you may need to restart the kernel to use updated packages.


In [8]:
pip install fpdf

Note: you may need to restart the kernel to use updated packages.


In [18]:
pip install pandas

Note: you may need to restart the kernel to use updated packages.


In [20]:
pip install datetime

Collecting datetime
  Downloading DateTime-5.5-py3-none-any.whl.metadata (33 kB)
Downloading DateTime-5.5-py3-none-any.whl (52 kB)
   ---------------------------------------- 0.0/52.6 kB ? eta -:--:--
   ---------------------------------------- 52.6/52.6 kB 2.8 MB/s eta 0:00:00
Installing collected packages: datetime
Successfully installed datetime-5.5
Note: you may need to restart the kernel to use updated packages.


In [23]:
import requests
from bs4 import BeautifulSoup
import csv
from fpdf import FPDF
import re
from datetime import datetime

API_KEY = "d2446cb5"

# ------------------------- #
# 1️⃣ Web scrapping Phase   #
# ------------------------- #

def fetch_movie_titles():
    print("Fetch movie titles from Rotten tomatoes and Box Office Mojo...\n")

    try:
        #Website1: Rotten Tomatoes
        rotten_tomatoes_url = "https://www.rottentomatoes.com/browse/movies_in_theaters"
        #Mimic the browser
        headers = {"User-Agent": "Mozilla/5.0"}
        response_rotten_tomatoes = requests.get(rotten_tomatoes_url, headers = headers)
        #Check the url is working or not
        if response_rotten_tomatoes.status_code != 200:
            print(f"❌failed to fetch the data from Rotten Tomatoes. Status Code: {response_rotten_tomatoes.status_code}")
            return []       
        #Parsing the HTML content from Box Office Mojo
        soup1 = BeautifulSoup(response_rotten_tomatoes.content, "html.parser")
        #Extract movie titles from Rotten Tomatoes website
        rotten_tomatoes_titles = [tag.get("href").split("/")[-1].replace("-", " ").title() 
                                  for tag in soup1.select("a.js-tile-link")]
        print(f"Total ({len(rotten_tomatoes_titles)}) movie titles found in Rotten Tomatos\n")
        ##print(f"✅ Fetched Movie Titles from 🍅 Rotten Tomatoes website...\n")
        print(rotten_tomatoes_titles[:10], "\n") #It is not loading more than 10 movies

        #Website2: Box Office Mojo
        box_office_mojo_url = "https://www.boxofficemojo.com/weekend/"
        response_box_office_mojo = requests.get(box_office_mojo_url, headers = headers)

        #Check the url is working or not
        if response_box_office_mojo.status_code != 200:
            print(f"❌failed to fetch the data from Box Office Mojo. Status Code: {response_box_office_mojo.status_code}")
            return []
        
        #Parsing the HTML content from Box Office Mojo
        soup2 = BeautifulSoup(response_box_office_mojo.text, "html.parser")
        ##Extract movie titles from Box office mojo
        box_office_mojo_titles = [tag.get_text(strip=True)
                              for tag in soup2.select("td.a-text-left.mojo-field-type-release a.a-link-normal")]
        print(f"Total ({len(box_office_mojo_titles)}) movie titles found in Box Office Mojo Website\n")
        #print(soup.prettify())
        print(set(box_office_mojo_titles[:20]),"\n")
        #return box_office_mojo_titles
        total_movie_titles = list(set(rotten_tomatoes_titles + box_office_mojo_titles))
        ##print(f"Total ✅ Fetched movies from Rotten Tomatoes and Box Office Mojo is {total_movie_titles}\n")
        print("Combining the movie titles from Box Office Mojo and Rotten Tomatoes websites....\n")
        print(total_movie_titles,"\n")
        return total_movie_titles

    except requests.exceptions.RequestException as e:
        print(f"❌ Error while scapping Rotten Tomatoes: {e}")
        return[]

# Test the function
titles = fetch_movie_titles()
print(titles)

# --------------------- #
# 2️⃣ Extract Phase     #
# --------------------- #

def extract_movies(titles):
    print("🚀 Fetching Data from OMDB API....🎬")
    movies = []

    for title in titles:
        url = f"http://www.omdbapi.com/?t={title}&apikey={API_KEY}"

        try:
            response = requests.get(url, timeout=10)
            data = response.json()

            if data.get("Response") == "True":
                movies.append(data)
            else:
                print(f"❌ Movie not found: {titles}")
        except requests.exceptions.RequestException as e:
            print(f"❌ Request error for {title}: {e}")

    print(f"✅ Fetched {len(movies)} movies successfully!\n")
    return movies

# --------------------- #
# 3️⃣ Transform Phase   #
# --------------------- #

def transform_data(movies):
    print("🔄 Transforming Data...🧹")
    transformed_movies = []

    for movie in movies:
        #Task1 - Movie Title Transformation
            #Remove Special Characters: Use regular expressions to eliminate non-alphanumeric characters. 
            #Standardized Case: Convert all titles to Title Case for consistency. 
        #Trim Whitespaces: Remove leading/trailing spaces to maintain uniformity.

        title_clean = re.sub(r'[^\w\s]', '',movie.get('Title', "").strip()).title()

        #Task2 - Release Date Transformation
            #Date Formatting: Replace spaces with hyphens to standardize the date format (e.g., DD MMM YYYY to DD-MMM-YYYY).
            #Convert to Date Object: Use date parsing to convert text dates into proper date formats.
            #Handle Missing Dates: Replace missing dates with "Unknown." 

        release_date = movie.get("Released", "N/A")
        cleaned_release_date = (datetime.strptime(release_date, "%d %b %Y").strftime("%Y-%m-%d")
                                if release_date != "N/A" else "Unknown")

        #Task3 - Genre Transformation
            #Convert to Lowercase: Ensure all genres are in lowercase for consistency.
            #Split Genres: If multiple genres are present, separate them into a list.
            #Remove Duplicates: Ensure unique genre entries.

        genres = list(set([genre.strip().lower() for genre in movie.get("Genre", "").split(",")]))

        #Task4 -  IMDb Rating Transformation
            #Convert to Numeric: Change rating from text to a floating-point number.
            #Round Off: Round ratings to one decimal place. 
            #Normalize: Optionally, normalize ratings on a scale of 0 to 1.

        imdb_rating = round(float(movie.get("imdbRating",0)), 1) if movie.get("imdbRating") != "N/A" else None
        imdb_rating_normalized = imdb_rating / 10 if imdb_rating else None

        #Task5 -  Actors Transformation
            #Limit to Top 3: Display only the top three actors.
            #Trim Spaces: Remove extra spaces around names. 
            #Sort Alphabetically (Optional): For consistency in display.

        actors =  ", ".join(sorted([actor.strip() for actor in movie.get("Actors", "").split(",")[:3]]))

        #Task6 - Box Office Transformation 
            #Remove Currency Symbols: Eliminate $, ,, and other non-numeric characters.
            #Convert to Numeric: Store as an integer for analysis. 
            #Handle Missing Data: Replace missing values with 0 or N/A.

        box_office = re.sub(r'[^\d]', '', movie.get("BoxOffice", ""))
        cleaned_box_office = int(box_office) if box_office else "N/A"

        #Task7 - Awards Transformation
            #Extract Numbers: Identify and sum all numeric values related to awards won.
            #Standardize Format: Display total awards won. 
            #Handle Missing Awards: Set to 0 if no data is available.

        awards_text = movie.get("Awards", "")
        cleaned_awards = re.sub(r'[^\d]', '', awards_text)
        numbers_in_awards = [int(num) for num in cleaned_awards.split()] if cleaned_awards else []
        total_awards = sum(numbers_in_awards) if numbers_in_awards else 0

        #Task8 - Metascore Transformation
            #Convert to Integer: Change metascore to an integer for calculations.
            #Normalize: Convert to a 0-1 scale by dividing by 100.
            #Handle Missing Values: Replace "N/A" with None. 

        metascore = re.sub(r'[^\d]', '', movie.get("Metascore", ""))
        cleaned_metascore = round(100/(int(metascore))) if metascore.isdigit() else "N/A"

        #Task9 - Language Transformation
            #Convert to Lowercase: Ensure all language names are in lowercase.
            #Standardize Codes: Optionally convert to ISO language codes.
            #Handle Missing Data: Replace missing languages with "Unknown."

        language = re.sub(r'[^\w\s]', '', movie.get("Language", ""))
        cleaned_language = list(language.strip().lower() if language else "Unknown.")

        #Task10 - Production Transformation
            #Remove Special Characters: Clean the production company names.
            #Standardize Names: Correct common misspellings and standardize abbreviations. 
            #Handle Missing Data: Replace missing production companies with "Independent."

        production = movie.get("Production", "")
        cleaned_production = re.sub(r'[^\w\s]', '', production if production else "Independent.")

        #Appending the movies data to the movies
        transformed_movies.append({
            "Title": title_clean,
            "Released Date": cleaned_release_date,
            "Genre": genres,
            "IMDB Rating": imdb_rating,
            "Normalized IMDB Rating": imdb_rating_normalized,
            "Top 3 Actors": actors,
            "Box Office": cleaned_box_office,
            "Total Awards": total_awards,
            "Metascore": cleaned_metascore,
            "Language": cleaned_language,
            "Production": cleaned_production
        })

    print(f"✨ Transformation is complete for {len(transformed_movies)} movies!\n")
    return transformed_movies

# --------------------- #
# 4️⃣ Load Phase   #
# --------------------- #

def load_to_csv(data):
    print("💾 Saving data to csv...📊")
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
    filename = f"movies_{timestamp}.csv" #Add Time Stamp to Overwrite

    with open(filename, mode="w", newline="", encoding="utf-8") as file:
        writer = csv.DictWriter(file, fieldnames=data[0].keys())
        writer.writeheader()
        writer.writerows(data)

    print(f"✅ Data successfully saved to {filename}!\n")

# --------------------- #
# 🚀 Execute ETL Pipeline   #
# --------------------- #
if __name__ == "__main__":
    titles = fetch_movie_titles()
    if titles:
        movies = extract_movies(titles)
        if movies:
            transformed_data = transform_data(movies)
            load_to_csv(transformed_data)

        else:
            print("⚠️ No Movie data fetched from the API.")
    
    else:
        print("⚠️ No Movie titles found during scapping. ")

Fetch movie titles from Rotten tomatoes and Box Office Mojo...

Total (11) movie titles found in Rotten Tomatos

['Purple_Rain', 'My_Motherland', 'Open_Your_Eyes_Jeffrey', 'Guns_Of_Redemption', 'The_Visitor_2024', 'The_Way_My_Way', 'Play_It_As_It_Lays', 'The_Corpse', 'A_Match', 'You_Burn_Me'] 

Total (11) movie titles found in Box Office Mojo Website

{'Flight Risk', 'Dog Man', 'Den of Thieves: Pantera', 'Mufasa: The Lion King', 'Captain America: Brave New World'} 

Combining the movie titles from Box Office Mojo and Rotten Tomatoes websites....

['Purple_Rain', 'My_Motherland', 'The_Way_My_Way', 'Play_It_As_It_Lays', 'The_Visitor_2024', 'You_Burn_Me', 'Flight Risk', 'Dog Man', 'The_Corpse', 'Den of Thieves: Pantera', 'Mufasa: The Lion King', 'A_Match', 'Captain America: Brave New World', 'Ufc_313_Pereira_Vs_Ankalaev', 'Open_Your_Eyes_Jeffrey', 'Guns_Of_Redemption'] 

['Purple_Rain', 'My_Motherland', 'The_Way_My_Way', 'Play_It_As_It_Lays', 'The_Visitor_2024', 'You_Burn_Me', 'Flight Ris