## Extract

In [13]:
# Imports
import os
import time
from datetime import datetime

import pandas as pd
from imdb import IMDb
from simplejustwatchapi.justwatch import search

**Get Oscar Nominees/Winners**

In [14]:
# https://www.kaggle.com/datasets/unanimad/the-oscar-award Can be done with API, updated yearly
data_path = "the-oscar-award/full_data.csv"

oscar_awards_df = pd.read_csv(data_path, sep="\t") 
oscar_awards_df.head()

Unnamed: 0,Ceremony,Year,Class,CanonicalCategory,Category,NomId,Film,FilmId,Name,Nominees,NomineeIds,Winner,Detail,Note,Citation,MultifilmNomination
0,1,1927/28,Acting,ACTOR IN A LEADING ROLE,ACTOR,an0051251,The Noose,tt0019217,Richard Barthelmess,Richard Barthelmess,nm0001932,,Nickie Elkins,,,True
1,1,1927/28,Acting,ACTOR IN A LEADING ROLE,ACTOR,an0051252,The Patent Leather Kid,tt0018253,Richard Barthelmess,Richard Barthelmess,nm0001932,,The Patent Leather Kid,,,True
2,1,1927/28,Acting,ACTOR IN A LEADING ROLE,ACTOR,an0051250a,The Last Command,tt0019071,Emil Jannings,Emil Jannings,nm0417837,True,General Dolgorucki [Grand Duke Sergius Alexander],,,True
3,1,1927/28,Acting,ACTOR IN A LEADING ROLE,ACTOR,an0051250b,The Way of All Flesh,tt0019553,Emil Jannings,Emil Jannings,nm0417837,True,August Schilling,,,True
4,1,1927/28,Acting,ACTRESS IN A LEADING ROLE,ACTRESS,an0051255,A Ship Comes In,tt0018389,Louise Dresser,Louise Dresser,nm0237571,,Mrs. Pleznik,,,


**Get Film's Information**

In [15]:
current_year = datetime.now().year 
# Obtain all the films from 2024
year_start = 2024
oscar_awards_df["YearStart"] = (
    oscar_awards_df["Year"].astype(str).str.split("/").str[0].astype(int))

recent_oscars = oscar_awards_df[ (oscar_awards_df["YearStart"] >= year_start) & (oscar_awards_df["YearStart"] <= current_year - 2) & (oscar_awards_df["FilmId"].notna()) ]
recent_oscars.head()

Unnamed: 0,Ceremony,Year,Class,CanonicalCategory,Category,NomId,Film,FilmId,Name,Nominees,NomineeIds,Winner,Detail,Note,Citation,MultifilmNomination,YearStart
11876,97,2024,Acting,ACTOR IN A LEADING ROLE,ACTOR IN A LEADING ROLE,fake_nomid012,The Brutalist,tt8999762,Adrien Brody,Adrien Brody,nm0004778,True,László Tóth,,,,2024
11877,97,2024,Acting,ACTOR IN A LEADING ROLE,ACTOR IN A LEADING ROLE,fake_nomid010,A Complete Unknown,tt11563598,Timothée Chalamet,Timothée Chalamet,nm3154303,,Bob Dylan,,,,2024
11878,97,2024,Acting,ACTOR IN A LEADING ROLE,ACTOR IN A LEADING ROLE,fake_nomid013,Sing Sing,tt28479262,Colman Domingo,Colman Domingo,nm0231458,,Divine G,,,,2024
11879,97,2024,Acting,ACTOR IN A LEADING ROLE,ACTOR IN A LEADING ROLE,fake_nomid011,Conclave,tt20215234,Ralph Fiennes,Ralph Fiennes,nm0000146,,Lawrence,,,,2024
11880,97,2024,Acting,ACTOR IN A LEADING ROLE,ACTOR IN A LEADING ROLE,fake_nomid014,The Apprentice,tt8368368,Sebastian Stan,Sebastian Stan,nm1659221,,Donald Trump,,,,2024


In [16]:
# Get unique FilmId from Oscar List
mask = ~recent_oscars["FilmId"].astype(str).str.startswith("tt")
recent_oscars.loc[mask, "FilmId"].unique()

# check to only get data that we don't have currently have
file_path = 'extracted_data/films_data.csv'
if os.path.exists(file_path):
    stored_film_ids = pd.read_csv(file_path)['imdb_id'].astype(str).unique()
    print(f"Loaded {len(stored_film_ids)} film IDs")
else:
    print(f"File not found: {file_path}")
    stored_film_ids = []  # Or initialize as empty
unique_films_ids = recent_oscars['FilmId'].str[2:].astype(str).unique()

combined_unique = list(set(stored_film_ids) ^ set(unique_films_ids))

# Movie web scraper
# Use the online IMDb interface
ia = IMDb('web')

film_rows = []

for film_id in combined_unique:
    film = ia.get_movie(film_id)
    time.sleep(0.5) # half‑second pause

    if film:
        # Extract budget from business info
        budget = None
        if 'business' in film.keys():
            business = film.get('business', {})
            # Budget is stored as a list in the 'budget' key
            budget_list = business.get('budget', [])
            if budget_list:
                budget = budget_list[0]  # Take first budget entry   

    if film:
        basic_info = {
            "title": film.get("title"),
            "year": film.get("year"),
            "rating": film.get("rating"),
            "genres": film.get("genres"),
            "runtime": film.get("runtimes"),
            "director": [d["name"] for d in film.get("directors", [])],
            "budget": budget,
            "imdb_id": film.movieID
        }
        print("Appending: ", basic_info['title'])
        film_rows.append(basic_info)
    else:
        print("Film was not found:", film_id)

films_information = pd.DataFrame(film_rows)
films_information.head(2)

File not found: extracted_data/films_data.csv
Appending:  Soundtrack to a Coup d'Etat
Appending:  Beautiful Men
Appending:  Elton John: Never Too Late
Appending:  Nickel Boys
Appending:  Instruments of a Beating Heart
Appending:  Gladiator II
Appending:  Anuja
Appending:  Nosferatu
Appending:  Wallace & Gromit: Vengeance Most Fowl
Appending:  Better Man
Appending:  I'm Not a Robot
Appending:  A Real Pain
Appending:  Sing Sing
Appending:  Inside Out 2
Appending:  The Apprentice
Appending:  Death by Numbers
Appending:  The Brutalist
Appending:  Beurk !
Appending:  Emilia Pérez
Appending:  Memoir of a Snail
Appending:  I am Ready, Warden
Appending:  Alien: Romulus
Appending:  Flow
Appending:  Dune: Part Two
Appending:  Kingdom of the Planet of the Apes
Appending:  Porcelain War
Appending:  The Substance
Appending:  The Last Ranger
Appending:  A Lien
Appending:  Anora
Appending:  The Only Girl in the Orchestra
Appending:  In the Shadow of the Cypress
Appending:  Black Box Diaries
Appending

Unnamed: 0,title,year,rating,genres,runtime,director,budget,imdb_id
0,Soundtrack to a Coup d'Etat,2024,7.8,"[Documentary, Music]",[150],[Johan Grimonprez],,14452174
1,Beautiful Men,2023,6.0,"[Animation, Short, Comedy, Drama]",[19],[Nicolas Keppens],,30835281


**Get Streaming Services where Films are Available**

In [17]:
film_identifiers = recent_oscars[["Film", "FilmId"]].drop_duplicates()

film_streaming_options = []

for row in film_identifiers.itertuples(index=False):
    film_name = row.Film
    film_id = row.FilmId

    results = search(film_name, "US", "en", 5)
    # Filter for specific IMDB ID
    target_imdb = film_id
    found = False
    for movie in results:
        if movie.imdb_id == target_imdb:
            found = True
            film_streaming_option = {
                "Name": movie.title,
                "Release Year": movie.release_year,
                "Poster": movie.poster,
                "IMdbId": movie.imdb_id,
                "Streaming": [offer.package.name for offer in movie.offers if offer.monetization_type == 'FLATRATE'],
                "Rent": [f"{offer.package.name} (${offer.price_value})" for offer in movie.offers if offer.monetization_type == 'RENT'],
                "Buy": [f"{offer.package.name} (${offer.price_value})" for offer in movie.offers if offer.monetization_type == 'BUY']
            }
            film_streaming_options.append(film_streaming_option)
    if not found: 
        print(f"FilmId was not found: {film_id}")        
    print(f"Film: {film_name}, ID: {film_id}")

film_streaming_options = pd.DataFrame(film_streaming_options)

Film: The Brutalist, ID: tt8999762
Film: A Complete Unknown, ID: tt11563598
Film: Sing Sing, ID: tt28479262
Film: Conclave, ID: tt20215234
Film: The Apprentice, ID: tt8368368
Film: Anora, ID: tt28607951
Film: A Real Pain, ID: tt21823606
Film: Wicked, ID: tt1262426
Film: Emilia Pérez, ID: tt20221436
Film: The Substance, ID: tt17526714
Film: I'm Still Here, ID: tt14961016
Film: Flow, ID: tt4772188
Film: Inside Out 2, ID: tt22022452
Film: Memoir of a Snail, ID: tt23770030
Film: Wallace & Gromit: Vengeance Most Fowl, ID: tt17163970
Film: The Wild Robot, ID: tt29623480
Film: Beautiful Men, ID: tt30835281
FilmId was not found: tt28768883
Film: In the Shadow of the Cypress, ID: tt28768883
FilmId was not found: tt31888603
Film: Magic Candies, ID: tt31888603
Film: Wander to Wonder, ID: tt28768679
Film: Yuck!, ID: tt28356173
Film: Dune: Part Two, ID: tt15239678
Film: Maria, ID: tt22893404
Film: Nosferatu, ID: tt5040012
Film: Gladiator II, ID: tt9218128
Film: Black Box Diaries, ID: tt30227076
Fil

In [18]:
# Paths
films_path = "extracted_data/films_data.csv"
oscars_path = "extracted_data/oscars_data.csv"
streaming_path = "extracted_data/streaming_data.csv"

# --- FILMS DATA ---
if not os.path.exists(films_path):
    # File doesn't exist → create it
    films_information.to_csv(films_path, index=False)
    print(f"Films CSV file has been created. Path: {films_path}")
else:
    # File exists → update only if needed
    if not films_information.empty:
        existing_ids = pd.read_csv(films_path)['imdb_id'].astype(str).unique()
        new_rows = films_information[~films_information['imdb_id'].astype(str).isin(existing_ids)]

        if not new_rows.empty:
            new_rows.to_csv(films_path, mode='a', header=False, index=False)
            print(f"Films CSV file has been upated with new films information. Path: {films_path}")
    else:
        print("Film CSV requires no new information.")

# --- OSCARS DATA ---
recent_oscars.to_csv(oscars_path, index=False)

# --- FILM STREAMING DATA ---
film_streaming_options.to_csv(streaming_path, index=False)

Films CSV file has been created. Path: extracted_data/films_data.csv
