## Extract

In [1]:
# Imports
import pandas as pd

**Get Oscar Winners**

In [2]:
# Source dataset: https://www.kaggle.com/datasets/unanimad/the-oscar-award
# (updated yearly; it can also be downloaded through the Kaggle API).
# The file is read as tab-delimited even though it carries a .csv extension.
oscar_awards_df = pd.read_csv(
    filepath_or_buffer="../../the-oscar-award/full_data.csv",
    delimiter="\t",
)
oscar_awards_df.head()

Unnamed: 0,Ceremony,Year,Class,CanonicalCategory,Category,NomId,Film,FilmId,Name,Nominees,NomineeIds,Winner,Detail,Note,Citation,MultifilmNomination
0,1,1927/28,Acting,ACTOR IN A LEADING ROLE,ACTOR,an0051251,The Noose,tt0019217,Richard Barthelmess,Richard Barthelmess,nm0001932,,Nickie Elkins,,,True
1,1,1927/28,Acting,ACTOR IN A LEADING ROLE,ACTOR,an0051252,The Patent Leather Kid,tt0018253,Richard Barthelmess,Richard Barthelmess,nm0001932,,The Patent Leather Kid,,,True
2,1,1927/28,Acting,ACTOR IN A LEADING ROLE,ACTOR,an0051250a,The Last Command,tt0019071,Emil Jannings,Emil Jannings,nm0417837,True,General Dolgorucki [Grand Duke Sergius Alexander],,,True
3,1,1927/28,Acting,ACTOR IN A LEADING ROLE,ACTOR,an0051250b,The Way of All Flesh,tt0019553,Emil Jannings,Emil Jannings,nm0417837,True,August Schilling,,,True
4,1,1927/28,Acting,ACTRESS IN A LEADING ROLE,ACTRESS,an0051255,A Ship Comes In,tt0018389,Louise Dresser,Louise Dresser,nm0237571,,Mrs. Pleznik,,,


**Get movies**

In [3]:
# IMDb client used to look up film details.
# Install if needed: %pip install git+https://github.com/alberanid/imdbpy
from imdb import IMDb

# Build the client with the 'web' access system (online IMDb interface).
ia = IMDb(accessSystem='web')

In [4]:
from datetime import datetime

year_start = 2024
current_year = datetime.now().year

# "Year" is either a single year ("2024") or a span ("1927/28");
# keep the part before the slash as an integer.
oscar_awards_df["YearStart"] = (
    oscar_awards_df["Year"].astype(str).str.split("/").str[0].astype(int)
)

# Keep nominations whose first year lies in [year_start, current_year - 2]
# (both bounds inclusive) and that reference a film.
in_year_window = oscar_awards_df["YearStart"].between(year_start, current_year - 2)
has_film_id = oscar_awards_df["FilmId"].notna()

recent_oscars = oscar_awards_df[in_year_window & has_film_id]
recent_oscars.head()

Unnamed: 0,Ceremony,Year,Class,CanonicalCategory,Category,NomId,Film,FilmId,Name,Nominees,NomineeIds,Winner,Detail,Note,Citation,MultifilmNomination,YearStart
11876,97,2024,Acting,ACTOR IN A LEADING ROLE,ACTOR IN A LEADING ROLE,fake_nomid012,The Brutalist,tt8999762,Adrien Brody,Adrien Brody,nm0004778,True,László Tóth,,,,2024
11877,97,2024,Acting,ACTOR IN A LEADING ROLE,ACTOR IN A LEADING ROLE,fake_nomid010,A Complete Unknown,tt11563598,Timothée Chalamet,Timothée Chalamet,nm3154303,,Bob Dylan,,,,2024
11878,97,2024,Acting,ACTOR IN A LEADING ROLE,ACTOR IN A LEADING ROLE,fake_nomid013,Sing Sing,tt28479262,Colman Domingo,Colman Domingo,nm0231458,,Divine G,,,,2024
11879,97,2024,Acting,ACTOR IN A LEADING ROLE,ACTOR IN A LEADING ROLE,fake_nomid011,Conclave,tt20215234,Ralph Fiennes,Ralph Fiennes,nm0000146,,Lawrence,,,,2024
11880,97,2024,Acting,ACTOR IN A LEADING ROLE,ACTOR IN A LEADING ROLE,fake_nomid014,The Apprentice,tt8368368,Sebastian Stan,Sebastian Stan,nm1659221,,Donald Trump,,,,2024


In [5]:
# Sanity check: a later cell strips the first two characters of every FilmId,
# which is only valid for ids that carry the "tt" prefix. Report any id that
# does not, instead of computing the check and silently dropping the result.
mask = ~recent_oscars["FilmId"].astype(str).str.startswith("tt")
unprefixed_film_ids = recent_oscars.loc[mask, "FilmId"].unique()
if len(unprefixed_film_ids) > 0:
    print("FilmId values without the 'tt' prefix:", unprefixed_film_ids)

# Ids of the films already stored, so that only missing films get fetched.
# imdb_id is read as text: with the default type inference an id such as
# "0019217" is parsed as the integer 19217 and never matches the nominee ids.
try:
    stored_film_ids = (
        pd.read_csv('extracted_data/films_data.csv', dtype={'imdb_id': str})['imdb_id']
        .astype(str)
        .unique()
    )
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Nothing has been stored yet (first run, or an empty file): no known ids.
    stored_film_ids = pd.Series([], dtype=str).unique()
stored_film_ids

array(['8999762', '11563598', '28479262', '20215234', '8368368',
       '28607951', '21823606', '1262426', '20221436', '17526714',
       '14961016', '4772188', '22022452', '23770030', '17163970',
       '29623480', '30835281', '28768883', '31888603', '28768679',
       '28356173', '15239678', '22893404', '5040012', '9218128',
       '30227076', '30953759', '30294282', '14452174', '30319854',
       '33385063', '31556921', '27990245', '32280470', '29497240',
       '10236164', '32178949', '21097228', '24458622', '20316978',
       '23055660', '27655666', '27654431', '19837932', '15802124',
       '20519854', '18412256', '14260836', '11389872', '28082769'],
      dtype=object)

In [6]:
import time

from imdb import IMDbError

# Columns of the films table. They are passed explicitly so that an empty
# result still yields a frame with the expected columns.
film_columns = ["title", "year", "rating", "genres", "runtime", "director", "imdb_id"]

# FilmId looks like "tt8999762"; drop the two-character prefix to get the
# numeric id used for the lookup.
unique_films_ids = recent_oscars['FilmId'].str[2:].astype(str).unique()

# Fetch only the nominated films that are not stored yet. This must be a plain
# set difference: a symmetric difference (^) would also re-fetch every stored
# film that is not part of the current nominations. Sorted for a stable order.
combined_unique = sorted(set(unique_films_ids) - set(stored_film_ids))

film_rows = []

for film_id in combined_unique:
    try:
        film = ia.get_movie(film_id)
    except IMDbError as error:
        # One failed lookup should not discard the films fetched so far.
        print("Film could not be fetched:", film_id, error)
        continue
    finally:
        time.sleep(0.5)  # half-second pause between requests

    if not film:
        print("Film was not found:", film_id)
        continue

    basic_info = {
        "title": film.get("title"),
        "year": film.get("year"),
        "rating": film.get("rating"),
        "genres": film.get("genres"),
        "runtime": film.get("runtimes"),
        "director": [d["name"] for d in film.get("directors", [])],
        "imdb_id": film.movieID,
    }
    print("Appending: ", basic_info['title'])
    film_rows.append(basic_info)

films_information = pd.DataFrame(film_rows, columns=film_columns)


In [7]:
# Preview the first two rows of the freshly fetched film data.
films_information.head(2)

In [None]:
import os
import pandas as pd

# Paths
films_path = "extracted_data/films_data.csv"
oscars_path = "extracted_data/oscars_data.csv"

# Make sure the output folder exists before anything is written.
for output_path in (films_path, oscars_path):
    os.makedirs(os.path.dirname(output_path), exist_ok=True)

# --- FILMS DATA ---
if films_information.empty:
    # Nothing was fetched. Do not create or touch the file: writing an empty
    # frame would leave a CSV that later reads cannot parse.
    print("Film CSV requires no new information.")
elif not os.path.exists(films_path):
    # File doesn't exist -> create it
    films_information.to_csv(films_path, index=False)
    print(f"Films CSV file has been created. Path: {films_path}")
else:
    # File exists -> append only the films it does not contain yet.
    # imdb_id is read as text so ids with leading zeros still compare equal.
    existing_ids = (
        pd.read_csv(films_path, dtype={'imdb_id': str})['imdb_id'].astype(str).unique()
    )
    new_rows = films_information[~films_information['imdb_id'].astype(str).isin(existing_ids)]

    if new_rows.empty:
        print("Film CSV requires no new information.")
    else:
        # Appended without a header: assumes the column order matches the file.
        new_rows.to_csv(films_path, mode='a', header=False, index=False)
        print(f"Films CSV file has been upated with new films information. Path: {films_path}")

# --- OSCARS DATA ---
# The filtered nominations are rewritten in full on every run.
recent_oscars.to_csv(oscars_path, index=False)


Film CSV requires no new information.
