# **Dataset**

In [None]:
import pandas as pd

# Raw Oscars export; the file is tab-separated despite its .csv extension.
file_path = "oscars.csv"
df = pd.read_csv(file_path, sep="\t")

## **Exploring the dataset:**

In [None]:
# Preview the first five rows of the raw table.
df.head()

Unnamed: 0,Ceremony,Year,Class,CanonicalCategory,Category,NomId,Film,FilmId,Name,Nominees,NomineeIds,Winner,Detail,Note,Citation,MultifilmNomination
0,1,1927/28,Acting,ACTOR IN A LEADING ROLE,ACTOR,an0051251,The Noose,tt0019217,Richard Barthelmess,Richard Barthelmess,nm0001932,,Nickie Elkins,,,True
1,1,1927/28,Acting,ACTOR IN A LEADING ROLE,ACTOR,an0051252,The Patent Leather Kid,tt0018253,Richard Barthelmess,Richard Barthelmess,nm0001932,,The Patent Leather Kid,,,True
2,1,1927/28,Acting,ACTOR IN A LEADING ROLE,ACTOR,an0051250a,The Last Command,tt0019071,Emil Jannings,Emil Jannings,nm0417837,True,General Dolgorucki [Grand Duke Sergius Alexander],,,True
3,1,1927/28,Acting,ACTOR IN A LEADING ROLE,ACTOR,an0051250b,The Way of All Flesh,tt0019553,Emil Jannings,Emil Jannings,nm0417837,True,August Schilling,,,True
4,1,1927/28,Acting,ACTRESS IN A LEADING ROLE,ACTRESS,an0051255,A Ship Comes In,tt0018389,Louise Dresser,Louise Dresser,nm0237571,,Mrs. Pleznik,,,


In [None]:
# (row count, column count) of the raw table.
df.shape

(11996, 16)

In [None]:
# Data type pandas inferred for each raw column.
df.dtypes

Ceremony                int64
Year                   object
Class                  object
CanonicalCategory      object
Category               object
NomId                  object
Film                   object
FilmId                 object
Name                   object
Nominees               object
NomineeIds             object
Winner                 object
Detail                 object
Note                   object
Citation               object
MultifilmNomination    object
dtype: object

In [None]:
# Column labels of the raw table.
df.columns

Index(['Ceremony', 'Year', 'Class', 'CanonicalCategory', 'Category', 'NomId',
       'Film', 'FilmId', 'Name', 'Nominees', 'NomineeIds', 'Winner', 'Detail',
       'Note', 'Citation', 'MultifilmNomination'],
      dtype='object')

In [None]:
# Summary statistics for the numeric columns of the raw table.
df.describe()

Unnamed: 0,Ceremony
count,11996.0
mean,50.785929
std,27.480915
min,1.0
25%,26.0
50%,51.0
75%,75.0
max,97.0


In [None]:
# Number of missing values in each column.
df.isna().sum()

Ceremony                   0
Year                       0
Class                      0
CanonicalCategory          0
Category                   0
NomId                    514
Film                    1243
FilmId                  1243
Name                    1169
Nominees                 353
NomineeIds               866
Winner                  8561
Detail                  8845
Note                   11394
Citation               10829
MultifilmNomination    11956
dtype: int64

In [None]:
# Every row that has at least one exact copy elsewhere in the table
# (keep=False marks all members of a duplicate group, not just the repeats).
duplicates = df.loc[df.duplicated(keep=False)]

## **Cleaning the dataset**

In [None]:
# Handling missing values

# A missing Winner flag is treated as "did not win".
df['Winner'] = df['Winner'].fillna(False).astype(bool)

# Text and identifier columns receive an explicit "Unknown" placeholder.
for placeholder_col in ('Film', 'FilmId', 'Name', 'NomineeIds'):
    df[placeholder_col] = df[placeholder_col].fillna("Unknown")

# Nominees falls back to Name, which has already been placeholder-filled above.
df['Nominees'] = df['Nominees'].fillna(df['Name'])

In [None]:
# Remove the columns that are not used further on, then discard rows without a
# NomId: it is an identifier, so a missing value cannot be inferred.
df = (
    df
    .drop(columns=['Detail', 'Note', 'Citation', 'MultifilmNomination', 'Ceremony', 'Class', 'CanonicalCategory', 'Nominees'])
    .dropna(subset=['NomId'])
)

In [None]:
# Year: keep only the text before the first '/' (e.g. "1927/28" -> 1927) as an integer.
df['Year'] = df['Year'].astype(str).str.partition('/')[0].astype(int)

# Category: title-case the values written entirely in upper case; leave the rest as-is.
df['Category'] = df['Category'].mask(df['Category'].str.isupper(), df['Category'].str.title())

# Strip surrounding whitespace from the text columns.
text_columns = ['Category', 'NomId', 'Film', 'FilmId', 'Name', 'NomineeIds']
df = df.assign(**{name: df[name].str.strip() for name in text_columns})

In [None]:
# Keep the first occurrence of each fully identical row; original index labels are retained.
df = df.drop_duplicates(keep='first', ignore_index=False)

In [None]:
# Column labels become lower case, with spaces replaced by underscores.
df = df.rename(columns=lambda label: label.lower().replace(' ', '_'))

In [None]:
# Accepted spellings of each award category of interest, keyed by the
# standardized name. Variants are lower case because the lookup below
# lower-cases the dataset value before matching.
category_variants = {
    "Best Picture": [
        "best picture",
        "outstanding picture",
        "outstanding production",
        "best motion picture",
    ],
    "Best Actor": [
        "actor in a leading role",
        "actor",
    ],
    "Best Actress": [
        "actress in a leading role",
        "actress",
    ],
    "Best Supporting Actor": [
        "actor in a supporting role",
        "supporting actor",
    ],
    "Best Supporting Actress": [
        "actress in a supporting role",
        "supporting actress",
    ],
    "Best Director": [
        "directing",
        "directing (comedy picture)",
        "directing (dramatic picture)",
        "best director",
    ],
    "Best Original Screenplay": [
        "writing (original screenplay)",
        "original screenplay",
        "writing (screenplay--original)",
        "writing (story and screenplay--written directly for the screen)",
        "writing (screenplay written directly for the screen)",
        "writing (screenplay written directly for the screen--based on factual material or on story material not previously published or produced)",
    ],
    "Best Adapted Screenplay": [
        "writing (adapted screenplay)",
        "writing (screenplay adapted from other material)",
        "writing (screenplay--adapted)",
        "writing (screenplay based on material from another medium)",
        "writing (screenplay based on material previously produced or published)",
    ],
    "Best Cinematography": [
        "cinematography",
        "cinematography (black-and-white)",
        "cinematography (color)",
    ],
}

# Flat lookup: lower-case variant -> standardized category name.
category_mapping = {
    variant: standard_name
    for standard_name, variants in category_variants.items()
    for variant in variants
}


def _standardize_category(raw_category):
    """Return the standardized name for a category, or None if it is not of interest."""
    if not isinstance(raw_category, str):
        return None
    return category_mapping.get(raw_category.lower(), None)


# Standardize category names in the dataset
df['category_standardized'] = df['category'].map(_standardize_category)

# Drop the 'nomid' and 'nomineeids' identifier columns (ignored if already absent)
df = df.drop(columns=['nomid', 'nomineeids'], errors='ignore')

# Keep only rows that matched a relevant category
filtered_df = df.dropna(subset=['category_standardized'])

# Save the filtered dataset
filtered_file_path = "oscars_standardized.csv"
filtered_df.to_csv(filtered_file_path, index=False)

In [None]:
# Reload the filtered dataset, overwrite 'category' with its standardized
# value and discard the now-redundant helper column.
file_path = "oscars_standardized.csv"
df = (
    pd.read_csv(file_path)
    .assign(category=lambda frame: frame['category_standardized'])
    .drop(columns=['category_standardized'])
)

# Save the cleaned dataset
cleaned_file_path = "oscars_clean_1.csv"
df.to_csv(cleaned_file_path, index=False)

# Show where the cleaned file was written
cleaned_file_path

'oscars_clean_1.csv'

# **Data after cleaning**

In [None]:
# Preview the first five rows of the cleaned table.
df.head()

Unnamed: 0,year,category,film,filmid,name,winner
0,1927,Best Actor,The Noose,tt0019217,Richard Barthelmess,False
1,1927,Best Actor,The Patent Leather Kid,tt0018253,Richard Barthelmess,False
2,1927,Best Actor,The Last Command,tt0019071,Emil Jannings,True
3,1927,Best Actor,The Way of All Flesh,tt0019553,Emil Jannings,True
4,1927,Best Actress,A Ship Comes In,tt0018389,Louise Dresser,False


In [None]:
# Column dtypes, non-null counts and memory usage of the cleaned table.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4168 entries, 0 to 4167
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   year      4168 non-null   int64 
 1   category  4168 non-null   object
 2   film      4168 non-null   object
 3   filmid    4168 non-null   object
 4   name      4168 non-null   object
 5   winner    4168 non-null   bool  
dtypes: bool(1), int64(1), object(4)
memory usage: 167.0+ KB


In [None]:
# (row count, column count) of the cleaned table.
df.shape

(4168, 6)

In [None]:
# Summary statistics for the numeric columns of the cleaned table.
df.describe()

Unnamed: 0,year
count,4168.0
mean,1978.56238
std,27.650584
min,1927.0
25%,1955.0
50%,1980.0
75%,2003.0
max,2024.0


In [None]:
# Data type of each column in the cleaned table.
df.dtypes

year         int64
category    object
film        object
filmid      object
name        object
winner        bool
dtype: object

# **TMDb API**

In [None]:
import requests
import pandas as pd
import time

import os

# The TMDb key is taken from the TMDB_API_KEY environment variable when set.
# NOTE(review): the literal fallback keeps the notebook runnable as before, but
# a credential committed to a notebook should be rotated and removed.
TMDB_API_KEY = os.environ.get("TMDB_API_KEY", "0c2cf3ce66cd4072df3d2a7e0bde934f")

# Base of the TMDb "find" endpoint; the external (IMDb) id is appended to it.
TMDB_BASE_URL = "https://api.themoviedb.org/3/find/"

def fetch_tmdb_data(imdb_id, timeout=10):
    """Fetch TMDb metrics for a film identified by its IMDb ID.

    Queries ``TMDB_BASE_URL + imdb_id`` with ``external_source=imdb_id`` and
    reads the first entry of ``movie_results`` in the JSON response.

    Args:
        imdb_id: IMDb identifier appended to the endpoint URL.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout a stalled connection would block the caller indefinitely.

    Returns:
        dict with keys ``tmdb_id``, ``popularity``, ``vote_count`` and
        ``vote_average``. Every value is None when the request fails, the
        status code is not 200, the body is not valid JSON, or no movie matched.
    """
    empty_result = {
        "tmdb_id": None,
        "popularity": None,
        "vote_count": None,
        "vote_average": None
    }
    params = {
        "api_key": TMDB_API_KEY,
        "external_source": "imdb_id"
    }

    try:
        response = requests.get(f"{TMDB_BASE_URL}{imdb_id}", params=params, timeout=timeout)
    except requests.RequestException:
        # Connection errors and timeouts are reported like any other failed
        # lookup so that one bad request does not abort a long fetch loop.
        return empty_result

    if response.status_code != 200:
        return empty_result

    try:
        data = response.json()
    except ValueError:
        # The body was not valid JSON.
        return empty_result

    movie_results = data.get("movie_results")
    if not movie_results:
        return empty_result

    movie = movie_results[0]
    return {
        "tmdb_id": movie.get("id"),
        "popularity": movie.get("popularity"),
        "vote_count": movie.get("vote_count"),
        "vote_average": movie.get("vote_average"),
    }


In [None]:
file_path = "oscars_clean_1.csv"
df = pd.read_csv(file_path)

# Add new (object-typed) columns to store TMDb data
tmdb_columns = ["tmdb_id", "popularity", "vote_count", "vote_average"]
for tmdb_column in tmdb_columns:
    df[tmdb_column] = None

# Many nominations share one film, so each IMDb ID is requested only once and
# the response is reused for every row that refers to it.
tmdb_cache = {}
for index, imdb_id in df["filmid"].items():
    if pd.isna(imdb_id):  # No IMDb ID, nothing to look up
        continue

    if imdb_id not in tmdb_cache:
        tmdb_cache[imdb_id] = fetch_tmdb_data(imdb_id)
        time.sleep(0.5)  # Pause only after a real request

    for tmdb_column in tmdb_columns:
        df.at[index, tmdb_column] = tmdb_cache[imdb_id][tmdb_column]

# Saving the updated dataset. The enriched frame is `df`; `tmdb_df` is only
# created later in the notebook, so referring to it here raised a NameError.
df.to_csv("oscars_with_tmdb.csv", index=False)

# **Exploring the TMDb dataset**

In [None]:
# Reload the TMDb-enriched dataset from disk and preview its first five rows.
tmdb_df = pd.read_csv("oscars_with_tmdb.csv")
tmdb_df.head()

Unnamed: 0,year,category,film,filmid,name,winner,tmdb_id,popularity,vote_count,vote_average
0,1927,Best Actor,The Noose,tt0019217,Richard Barthelmess,False,113167,0.894,1,7.0
1,1927,Best Actor,The Patent Leather Kid,tt0018253,Richard Barthelmess,False,102541,1.557,11,6.182
2,1927,Best Actor,The Last Command,tt0019071,Emil Jannings,True,52679,5.954,84,7.3
3,1927,Best Actor,The Way of All Flesh,tt0019553,Emil Jannings,True,85499,1.397,6,6.9
4,1927,Best Actress,A Ship Comes In,tt0018389,Louise Dresser,False,104212,0.995,8,5.813


In [None]:
# Column dtypes, non-null counts and memory usage of the TMDb-enriched table.
tmdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4168 entries, 0 to 4167
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year          4168 non-null   int64  
 1   category      4168 non-null   object 
 2   film          4168 non-null   object 
 3   filmid        4168 non-null   object 
 4   name          4168 non-null   object 
 5   winner        4168 non-null   bool   
 6   tmdb_id       4168 non-null   int64  
 7   popularity    4168 non-null   float64
 8   vote_count    4168 non-null   int64  
 9   vote_average  4168 non-null   float64
dtypes: bool(1), float64(2), int64(3), object(4)
memory usage: 297.3+ KB


In [None]:
# (row count, column count) of the TMDb-enriched table.
tmdb_df.shape

(4168, 10)

In [None]:
# Summary statistics for the numeric columns of the TMDb-enriched table.
tmdb_df.describe()

Unnamed: 0,year,tmdb_id,popularity,vote_count,vote_average
count,4168.0,4168.0,4168.0,4168.0,4168.0
mean,1978.56238,88205.72,27.064698,2781.202015,7.121955
std,27.650584,182926.9,71.28021,4910.699146,0.663743
min,1927.0,11.0,0.263,1.0,2.0
25%,1955.0,3083.0,8.603,137.0,6.7
50%,1980.0,18089.5,14.1685,687.0,7.1675
75%,2003.0,52859.0,24.72,3005.0,7.562
max,2024.0,1211472.0,1520.298,36973.0,8.708


In [None]:
# Data type of each column in the TMDb-enriched table.
tmdb_df.dtypes

year              int64
category         object
film             object
filmid           object
name             object
winner             bool
tmdb_id           int64
popularity      float64
vote_count        int64
vote_average    float64
dtype: object

# **OMDb API**

In [None]:
import pandas as pd
import requests
import time
import numpy as np
import concurrent.futures

import os

# Load dataset
df = pd.read_csv("oscars_clean_1.csv")

# OMDb API keys: a comma-separated OMDB_API_KEYS environment variable takes
# precedence over the built-in list.
# NOTE(review): the literal fallback keeps the notebook runnable as before, but
# credentials committed to a notebook should be rotated and removed.
api_keys = [
    key.strip()
    for key in os.environ.get("OMDB_API_KEYS", "4060e300,4525a990,a7a585dd,954c1ba4").split(",")
    if key.strip()
]

# Add missing columns
for col in ["Genre", "Director", "IMDb_Rating"]:
    if col not in df.columns:
        df[col] = None

# Split the dataset into one chunk per API key for parallel fetching.
# Row positions are split instead of the DataFrame itself, because
# np.array_split on a DataFrame goes through the deprecated
# DataFrame.swapaxes. Each chunk is an independent copy, so a worker can
# write into it safely.
df_split = [
    df.iloc[positions].copy()
    for positions in np.array_split(np.arange(len(df)), len(api_keys))
]

def fetch_omdb_data(row, api_key, timeout=10):
    """Fetch genre, director and IMDb rating for one row from OMDb.

    Args:
        row: Mapping-like record whose ``'filmid'`` entry is sent as the
            ``i`` query parameter.
        api_key: OMDb API key sent as the ``apikey`` query parameter.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        dict with keys ``Genre``, ``Director`` and ``IMDb_Rating``. Every value
        is None when the request fails, the status code is not 200, the body is
        not valid JSON, or OMDb does not answer with ``Response == "True"``.
    """
    empty_result = {"Genre": None, "Director": None, "IMDb_Rating": None}

    try:
        # HTTPS keeps the API key out of plain-text traffic; `params` lets
        # requests build and escape the query string.
        response = requests.get(
            "https://www.omdbapi.com/",
            params={"i": row["filmid"], "apikey": api_key},
            timeout=timeout,
        )
    except requests.RequestException:
        # Connection errors and timeouts are reported like any other failed
        # lookup so that one bad request does not abort a whole chunk.
        return empty_result

    if response.status_code != 200:
        return empty_result

    try:
        data = response.json()
    except ValueError:
        # The body was not valid JSON.
        return empty_result

    if data.get("Response") != "True":
        return empty_result

    return {
        "Genre": data.get("Genre", None),
        "Director": data.get("Director", None),
        "IMDb_Rating": data.get("imdbRating", None)
    }

def process_chunk(df_chunk, api_key):
    """Fill the OMDb columns of a dataset chunk using one API key.

    Rows where any of ``Genre``, ``Director`` or ``IMDb_Rating`` is missing are
    looked up through ``fetch_omdb_data``. A film id that repeats within the
    chunk is requested only once and the response is reused, which avoids
    spending requests (and the pause after each) on repeated films.

    Args:
        df_chunk: DataFrame with ``filmid``, ``Genre``, ``Director`` and
            ``IMDb_Rating`` columns. It is modified in place.
        api_key: OMDb API key used for every request made for this chunk.

    Returns:
        The same ``df_chunk`` object, with the fetched values written in.
    """
    omdb_columns = ["Genre", "Director", "IMDb_Rating"]
    fetched_by_film = {}

    for index, row in df_chunk.iterrows():
        if not any(pd.isnull(row[column]) for column in omdb_columns):
            continue  # Row already complete, nothing to fetch

        film_id = row["filmid"]
        if film_id not in fetched_by_film:
            fetched_by_film[film_id] = fetch_omdb_data(row, api_key)
            time.sleep(0.5)  # Delay to prevent rate limits

        for column in omdb_columns:
            df_chunk.at[index, column] = fetched_by_film[film_id][column]

    return df_chunk

# Run parallel fetching: one worker per API key, each handling its own chunk.
# The results are collected into a list while the executor is still open.
with concurrent.futures.ThreadPoolExecutor(max_workers=len(api_keys)) as executor:
    results = list(executor.map(process_chunk, df_split, api_keys))

# Combine the results back into a single DataFrame
df_updated = pd.concat(results, ignore_index=True)

# Save the updated dataset
df_updated.to_csv("omdb_data.csv", index=False)

# The message names the file that was actually written above.
print("Updated dataset saved as 'omdb_data.csv'")

# **Exploring the OMDb dataset:**

In [None]:
# Reload the OMDb-enriched dataset from disk and preview its first five rows.
omdb_df = pd.read_csv("omdb_data.csv")
omdb_df.head()

Unnamed: 0,year,category,film,filmid,name,winner,Genre,Director,IMDb_Rating
0,1927,Best Actor,The Noose,tt0019217,Richard Barthelmess,False,Drama,John Francis Dillon,7.0
1,1927,Best Actor,The Patent Leather Kid,tt0018253,Richard Barthelmess,False,"Drama, Romance, Sport",Alfred Santell,6.1
2,1927,Best Actor,The Last Command,tt0019071,Emil Jannings,True,"Drama, Romance, War",Josef von Sternberg,7.9
3,1927,Best Actor,The Way of All Flesh,tt0019553,Emil Jannings,True,Drama,Victor Fleming,6.8
4,1927,Best Actress,A Ship Comes In,tt0018389,Louise Dresser,False,Drama,William K. Howard,5.5


In [None]:
# Column dtypes, non-null counts and memory usage of the OMDb-enriched table.
omdb_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4168 entries, 0 to 4167
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   year         4168 non-null   int64  
 1   category     4168 non-null   object 
 2   film         4168 non-null   object 
 3   filmid       4168 non-null   object 
 4   name         4168 non-null   object 
 5   winner       4168 non-null   bool   
 6   Genre        3126 non-null   object 
 7   Director     3125 non-null   object 
 8   IMDb_Rating  3123 non-null   float64
dtypes: bool(1), float64(1), int64(1), object(6)
memory usage: 264.7+ KB


In [None]:
# (row count, column count) of the OMDb-enriched table.
omdb_df.shape

(4168, 9)

In [None]:
# Summary statistics for the numeric columns of the OMDb-enriched table.
omdb_df.describe()

Unnamed: 0,year,IMDb_Rating
count,4168.0,3123.0
mean,1978.56238,7.410631
std,27.650584,0.603435
min,1927.0,4.6
25%,1955.0,7.1
50%,1980.0,7.4
75%,2003.0,7.8
max,2024.0,9.3


In [None]:
# Data type of each column in the OMDb-enriched table.
omdb_df.dtypes

year             int64
category        object
film            object
filmid          object
name            object
winner            bool
Genre           object
Director        object
IMDb_Rating    float64
dtype: object

# **Combining both datasets**

In [None]:
tmdb_df = pd.read_csv("oscars_with_tmdb.csv")
omdb_df = pd.read_csv("omdb_data.csv")

# Both files hold one row per nomination, so a film nominated several times
# repeats its 'filmid' on both sides. Merging them directly is many-to-many and
# multiplies rows. Reduce the OMDb side to one row per film first; `first()`
# takes the first non-null value of every column, so a film keeps its OMDb data
# even if one of its rows failed to fetch. All columns are kept so the merged
# frame still has the `_x` / `_y` columns the cleaning step expects.
omdb_per_film = omdb_df.groupby("filmid", as_index=False, sort=False).first()

# Merge on 'filmid' (IMDb ID); `validate` raises if the right side is not unique.
merged_df = tmdb_df.merge(omdb_per_film, on="filmid", how="left", validate="many_to_one")

# A many-to-one left merge must keep exactly one row per nomination.
assert len(merged_df) == len(tmdb_df), "merge changed the number of nominations"

# Save merged dataset
merged_file_path = "oscars_with_tmdb_omdb.csv"
merged_df.to_csv(merged_file_path, index=False)

## **Exploring the merged dataset**

In [None]:
# Reload the merged dataset from disk and preview its first five rows.
merged_df = pd.read_csv("oscars_with_tmdb_omdb.csv")
merged_df.head()

Unnamed: 0,year_x,category_x,film_x,filmid,name_x,winner_x,tmdb_id,popularity,vote_count,vote_average,year_y,category_y,film_y,name_y,winner_y,Genre,Director,IMDb_Rating
0,1927,Best Actor,The Noose,tt0019217,Richard Barthelmess,False,113167,0.894,1,7.0,1927,Best Actor,The Noose,Richard Barthelmess,False,Drama,John Francis Dillon,7.0
1,1927,Best Actor,The Patent Leather Kid,tt0018253,Richard Barthelmess,False,102541,1.557,11,6.182,1927,Best Actor,The Patent Leather Kid,Richard Barthelmess,False,"Drama, Romance, Sport",Alfred Santell,6.1
2,1927,Best Actor,The Last Command,tt0019071,Emil Jannings,True,52679,5.954,84,7.3,1927,Best Actor,The Last Command,Emil Jannings,True,"Drama, Romance, War",Josef von Sternberg,7.9
3,1927,Best Actor,The Way of All Flesh,tt0019553,Emil Jannings,True,85499,1.397,6,6.9,1927,Best Actor,The Way of All Flesh,Emil Jannings,True,Drama,Victor Fleming,6.8
4,1927,Best Actress,A Ship Comes In,tt0018389,Louise Dresser,False,104212,0.995,8,5.813,1927,Best Actress,A Ship Comes In,Louise Dresser,False,Drama,William K. Howard,5.5


In [None]:
# Column dtypes, non-null counts and memory usage of the merged table.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14530 entries, 0 to 14529
Data columns (total 18 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   year_x        14530 non-null  int64  
 1   category_x    14530 non-null  object 
 2   film_x        14530 non-null  object 
 3   filmid        14530 non-null  object 
 4   name_x        14530 non-null  object 
 5   winner_x      14530 non-null  bool   
 6   tmdb_id       14530 non-null  int64  
 7   popularity    14530 non-null  float64
 8   vote_count    14530 non-null  int64  
 9   vote_average  14530 non-null  float64
 10  year_y        14530 non-null  int64  
 11  category_y    14530 non-null  object 
 12  film_y        14530 non-null  object 
 13  name_y        14530 non-null  object 
 14  winner_y      14530 non-null  bool   
 15  Genre         10749 non-null  object 
 16  Director      10748 non-null  object 
 17  IMDb_Rating   10740 non-null  float64
dtypes: bool(2), float64(3), in

In [None]:
# (row count, column count) of the merged table.
merged_df.shape

(14530, 18)

In [None]:
# Summary statistics for the numeric columns of the merged table.
merged_df.describe()

Unnamed: 0,year_x,tmdb_id,popularity,vote_count,vote_average,year_y,IMDb_Rating
count,14530.0,14530.0,14530.0,14530.0,14530.0,14530.0,10740.0
mean,1981.325052,92755.79,31.022847,3386.02106,7.264549,1981.325052,7.542691
std,26.817444,193945.7,71.083027,5258.903537,0.612222,26.817444,0.549194
min,1927.0,11.0,0.263,1.0,2.0,1927.0,4.6
25%,1959.0,1715.0,10.236,257.0,6.9,1959.0,7.2
50%,1982.0,13847.0,17.871,1214.0,7.3,1982.0,7.6
75%,2005.0,45269.0,29.017,3843.0,7.7,2005.0,7.9
max,2024.0,1211472.0,1520.298,36973.0,8.708,2024.0,9.3


In [None]:
# Data type of each column in the merged table.
merged_df.dtypes

year_x            int64
category_x       object
film_x           object
filmid           object
name_x           object
winner_x           bool
tmdb_id           int64
popularity      float64
vote_count        int64
vote_average    float64
year_y            int64
category_y       object
film_y           object
name_y           object
winner_y           bool
Genre            object
Director         object
IMDb_Rating     float64
dtype: object

# **Cleaning the merged dataset**

In [None]:
import pandas as pd

file_path = "oscars_with_tmdb_omdb.csv"

# Load the merged file, drop exact duplicate rows, discard the right-hand
# (`_y`) copies of the shared columns and give the remaining columns their
# final names. Director and IMDb_Rating already carry their final names.
df = (
    pd.read_csv(file_path)
    .drop_duplicates()
    .drop(columns=['year_y', 'category_y', 'film_y', 'name_y', 'winner_y'])
    .rename(columns={
        'year_x': 'Year',
        'category_x': 'Category',
        'film_x': 'Film',
        'filmid': 'Film_ID',
        'name_x': 'Nominee',
        'winner_x': 'Winner',
        'tmdb_id': 'TMDb_ID',
        'popularity': 'Popularity',
        'vote_count': 'Vote_Count',
        'vote_average': 'Vote_Average',
        'Genre': 'Genres',
    })
)

# Missing genres and directors become 'Unknown'; missing IMDb ratings take the
# mean of the known ratings.
df = df.fillna({
    'Genres': 'Unknown',
    'Director': 'Unknown',
    'IMDb_Rating': df['IMDb_Rating'].mean(),
})

cleaned_file_path = "cleaned_oscars_dataset.csv"
df.to_csv(cleaned_file_path, index=False)

In [None]:
# Reload the cleaned file and drop the rows that became identical once the
# redundant columns were removed.
df_cleaned = pd.read_csv("cleaned_oscars_dataset.csv").drop_duplicates()

# Save the re-cleaned dataset
cleaned_file_path_final = "final_cleaned_oscars.csv"
df_cleaned.to_csv(cleaned_file_path_final, index=False)

# Number of duplicate rows still present (expected to be 0)
duplicate_count_after = df_cleaned.duplicated().sum()

# Display the remaining duplicate count and the output path
duplicate_count_after, cleaned_file_path_final

(0, 'final_cleaned_oscars.csv')

# **Exploring the Final dataset**

In [None]:
import pandas as pd

# Reload the final cleaned dataset from disk and preview its first five rows.
cleaned_df = pd.read_csv("final_cleaned_oscars.csv")
cleaned_df.head()

Unnamed: 0,Year,Category,Film,Film_ID,Nominee,Winner,TMDb_ID,Popularity,Vote_Count,Vote_Average,Genres,Director,IMDb_Rating
0,1972,Best Cinematography,1776,tt0068156,"Harry Stradling, Jr.",False,14902,6.822,75,6.8,"Drama, History, Musical",Peter H. Hunt,7.2
1,2019,Best Cinematography,1917,tt8579674,Roger Deakins,True,530915,45.996,12542,8.0,"Action, Drama, War",Sam Mendes,8.2
2,2019,Best Director,1917,tt8579674,Sam Mendes,False,530915,45.996,12542,8.0,"Action, Drama, War",Sam Mendes,8.2
3,2019,Best Picture,1917,tt8579674,"Sam Mendes, Pippa Harris, Jayne-Ann Tenggren a...",False,530915,45.996,12542,8.0,"Action, Drama, War",Sam Mendes,8.2
4,2019,Best Original Screenplay,1917,tt8579674,Written by Sam Mendes & Krysty Wilson-Cairns,False,530915,45.996,12542,8.0,"Action, Drama, War",Sam Mendes,8.2


In [None]:
# Column dtypes, non-null counts and memory usage of the final table.
cleaned_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4118 entries, 0 to 4117
Data columns (total 13 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Year          4118 non-null   int64  
 1   Category      4118 non-null   object 
 2   Film          4118 non-null   object 
 3   Film_ID       4118 non-null   object 
 4   Nominee       4118 non-null   object 
 5   Winner        4118 non-null   bool   
 6   TMDb_ID       4118 non-null   int64  
 7   Popularity    4118 non-null   float64
 8   Vote_Count    4118 non-null   int64  
 9   Vote_Average  4118 non-null   float64
 10  Genres        4118 non-null   object 
 11  Director      4118 non-null   object 
 12  IMDb_Rating   4118 non-null   float64
dtypes: bool(1), float64(3), int64(3), object(6)
memory usage: 390.2+ KB


In [None]:
# (row count, column count) of the final table.
cleaned_df.shape

(4118, 13)

In [None]:
# Summary statistics for the numeric columns of the final table.
cleaned_df.describe()

Unnamed: 0,Year,TMDb_ID,Popularity,Vote_Count,Vote_Average,IMDb_Rating
count,4118.0,4118.0,4118.0,4118.0,4118.0,4118.0
mean,1978.720981,88987.67,27.23896,2801.691841,7.121164,7.426889
std,27.746959,183881.4,71.689654,4934.05901,0.664544,0.601777
min,1927.0,11.0,0.263,1.0,2.0,4.6
25%,1955.0,3083.0,8.639,137.0,6.7,7.1
50%,1980.0,18163.0,14.316,693.5,7.164,7.5
75%,2003.0,53149.0,24.843,3012.5,7.56425,7.8
max,2024.0,1211472.0,1520.298,36973.0,8.708,9.3


In [None]:
# Data type of each column in the final table.
cleaned_df.dtypes

Unnamed: 0,0
Year,int64
Category,object
Film,object
Film_ID,object
Nominee,object
Winner,bool
TMDb_ID,int64
Popularity,float64
Vote_Count,int64
Vote_Average,float64


In [None]:
# Start from the saved, de-duplicated dataset. The in-memory `df` left over
# from the cleaning cell was never de-duplicated after its redundant columns
# were dropped, so counting on it would count the same winning nomination
# several times.
df = pd.read_csv("final_cleaned_oscars.csv")

# Turn the comma-separated genre string into a list of trimmed genre names.
df['Genres'] = df['Genres'].str.split(',').apply(
    lambda genres: [genre.strip() for genre in genres] if isinstance(genres, list) else genres
)

# One row per (winning nomination, genre), then count the wins per genre.
genre_counts = df.loc[df['Winner'], 'Genres'].explode().value_counts()

# NOTE(review): 'Genres' now holds Python lists, which to_csv writes as their
# string representation; confirm that this is what readers of the file expect.
df.to_csv('final_cleaned.csv', index=False)

In [None]:
# Ten most frequent genres among winning rows (value_counts sorts descending).
genre_counts.head(10)

Unnamed: 0_level_0,count
Genres,Unnamed: 1_level_1
Drama,726
Romance,234
Comedy,162
Biography,160
Crime,110
War,78
Adventure,76
History,75
Thriller,59
Action,43
