This notebook puts our chosen model in action and uses it to see if we can predict the winner of the Best Picture Award at the 2024 Oscars. We also have a bit of fun and see if a few movies of our choosing that were never nominated actually deserved the award. 

In [1]:
# Import our dependencies
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression


In [2]:
# Check what unique values we have for our three Genre columns
def genre_list(dataframe):
    genres = []
    genres.extend(dataframe["Genre1"].values)
    genres.extend(dataframe["Genre2"].values)
    genres.extend(dataframe["Genre3"].values)
    global genres_list
    genres_list = list(set(genres))


In [3]:
# Perform one-hot encoding on Genres
def genre_encoding(dataframe):
    for g in genres_list:
        # Create a column for each genre
        dataframe[g] = 0
        # Columns will have a 0 or 1 if the movie is of the column's genre
        dataframe[g] = ((dataframe["Genre1"] == g) | (dataframe["Genre2"] == g) | (dataframe["Genre3"] == g)).astype(int)
        

In [4]:
def prepare_input_data(dataframe):
    # Save the movie titles to a series that will be joined to the prediction results later
    global nominees
    nominees = pd.DataFrame(dataframe["Title"])
    # Use previous functions to one-hot encode Genres
    genre_list(dataframe)
    genre_encoding(dataframe)
    dataframe.drop(columns=["Genre1", "Genre2", "Genre3"], inplace=True)
    # Run get_dummies on our Rated Column
    dataframe = pd.get_dummies(dataframe, columns=["Rated"])
    # Scaling the numeric columns
    scaled_data = StandardScaler().fit_transform(dataframe[["Runtime", "Metascore", "imdbRating", "imdbVotes", "BoxOffice"]])
    # Create a DataFrame of the scaled data
    scaled_data = pd.DataFrame(scaled_data, columns=["Runtime", "Metascore", "imdbRating", "imdbVotes", "BoxOffice"])
    # Replace the original data with the columns of information from the scaled Data
    dataframe["Runtime"] = scaled_data["Runtime"]
    dataframe["Metascore"] = scaled_data["Metascore"]
    dataframe["imdbRating"] = scaled_data["imdbRating"]
    dataframe["imdbVotes"] = scaled_data["imdbVotes"]
    dataframe["BoxOffice"] = scaled_data["BoxOffice"]
    global input_data
    input_data = dataframe.copy()


In [5]:
def run_prediction(dataframe):
    # Run model on our potential nominees
    predictions = classifier.predict(dataframe)
    predictions_df = pd.DataFrame({"Prediction": predictions})
    predictions_df["Prediction"] = predictions_df["Prediction"].replace({0: "It's an honor to be nominated", 1: "WINNER"})
    global oscars_prediction
    oscars_prediction = pd.concat([nominees, predictions_df], axis=1)
    return oscars_prediction


## Upload Training data and recreate the model

In [6]:
# Load in our training dataframe
model_training_data = pd.read_csv("Resources/model_training_data.csv")
model_training_data.head()


Unnamed: 0,OscarsWinner,Runtime,Metascore,imdbRating,imdbVotes,BoxOffice,GlobesWinner,Fantasy,Action,Adventure,...,year_ceremony_2011,year_ceremony_2012,year_ceremony_2013,year_ceremony_2014,year_ceremony_2015,year_ceremony_2016,year_ceremony_2017,year_ceremony_2018,year_ceremony_2019,year_ceremony_2020
0,1,-0.255022,0.66598,-1.441614,-0.768565,-0.804997,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,-1.250497,-3.331114,0.463857,-0.7086,-0.804997,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,-0.573574,-3.331114,-1.018176,-0.758132,-0.804997,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,-1.210678,-3.331114,0.040419,-0.78583,-0.804997,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,-1.170859,-3.331114,-0.383019,-0.770181,-0.804997,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Recreate the model
X = model_training_data.drop(columns = "OscarsWinner")
y = model_training_data["OscarsWinner"]
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=29)
classifier = LogisticRegression(random_state=29)
classifier.fit(X_train, y_train)


## Now we will use the model to see if we can predict what movie will win the best picture Oscar in 2024

In [8]:
# Load in our potential nominees dataframe
potential_nominees = pd.read_csv("Resources/potential_nominees_clean.csv")
potential_nominees


Unnamed: 0,Title,Rated,Released,Runtime,Director,Writer,Actors,Language,Country,Awards,Metascore,imdbRating,imdbVotes,BoxOffice,Production,Genre1,Genre2,Genre3
0,Barbie,PG-13,21 Jul 2023,114,Greta Gerwig,"Greta Gerwig, Noah Baumbach","Margot Robbie, Ryan Gosling, Issa Rae","English, Spanish","United States, United Kingdom",2 wins & 1 nomination,80,7.4,238040,541907382,,Adventure,Comedy,Fantasy
1,Oppenheimer,R,21 Jul 2023,180,Christopher Nolan,"Christopher Nolan, Kai Bird, Martin Sherwin","Cillian Murphy, Emily Blunt, Matt Damon","English, German, Italian","United States, United Kingdom",2 wins & 1 nomination,88,8.6,402067,302215760,,Biography,Drama,History
2,Past Lives,PG-13,23 Jun 2023,105,Celine Song,Celine Song,"Greta Lee, Teo Yoo, John Magaro","English, Korean","United States, South Korea",4 wins & 8 nominations,94,8.2,15336,10787719,,Drama,Romance,
3,Spider-Man: Across the Spider-Verse,PG,02 Jun 2023,140,"Joaquim Dos Santos, Kemp Powers, Justin K. Tho...","Phil Lord, Christopher Miller, Dave Callaham","Shameik Moore, Hailee Steinfeld, Brian Tyree H...",English,United States,3 wins & 1 nomination,86,8.7,252325,381281287,,Animation,Action,Adventure
4,Air,R,05 Apr 2023,111,Ben Affleck,Alex Convery,"Matt Damon, Jason Bateman, Ben Affleck",English,United States,3 wins & 5 nominations,73,7.4,140226,52460106,,Drama,Sport,


In [9]:
# Run the function to prepare the testing data
prepare_input_data(potential_nominees)


In [10]:
# Find out which columns are missing that the model was trained on
model_training_data
missing_columns = list(set(list(model_training_data.columns)) - set(list(input_data.columns)))
# Add missing columns to dataframe
input_data[missing_columns]=0
# Find out which columns are extra that our model doesn't use
extra_columns = list(set(list(input_data.columns))-set(list(model_training_data.columns)))
# Remove the extra columns
input_data.drop(columns=extra_columns, inplace=True)
# Reorder the columns so that they're in the same order as the model is trained on
input_data = input_data[list(model_training_data.columns)]
# Since this data is going to be used in place of the testing data, we drop the "OscarsWinner" column
input_data.drop("OscarsWinner",axis=1, inplace=True)
input_data


Unnamed: 0,Runtime,Metascore,imdbRating,imdbVotes,BoxOffice,GlobesWinner,Fantasy,Action,Adventure,Animation,...,year_ceremony_2011,year_ceremony_2012,year_ceremony_2013,year_ceremony_2014,year_ceremony_2015,year_ceremony_2016,year_ceremony_2017,year_ceremony_2018,year_ceremony_2019,year_ceremony_2020
0,-0.5772,-0.586053,-1.169654,0.221779,1.416934,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
1,1.80375,0.530238,0.95699,1.500829,0.221808,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-0.901875,1.367457,0.248108,-1.514823,-1.23128,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0.36075,0.251166,1.13421,0.333171,0.616036,0,0,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,-0.685425,-1.562808,-1.169654,-0.540956,-1.023498,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [11]:
# Run the function to get prediction results
run_prediction(input_data)


Unnamed: 0,Title,Prediction
0,Barbie,It's an honor to be nominated
1,Oppenheimer,It's an honor to be nominated
2,Past Lives,It's an honor to be nominated
3,Spider-Man: Across the Spider-Verse,It's an honor to be nominated
4,Air,It's an honor to be nominated


## According to the model, none of these movies will win the best picture Oscar.
## Just like the 2024 Oscars, the 2024 Golden Globes haven't happened yet.
## So would the prediction change if we said that the movies win at the globes?

In [12]:
# Change the input_data to show the prediction for each movie if it won the Golden Globe
input_data["GlobesWinner"] = 1


In [13]:
# Run the function to get prediction results
run_prediction(input_data)


Unnamed: 0,Title,Prediction
0,Barbie,It's an honor to be nominated
1,Oppenheimer,WINNER
2,Past Lives,It's an honor to be nominated
3,Spider-Man: Across the Spider-Verse,It's an honor to be nominated
4,Air,It's an honor to be nominated


## According to our model, if Oppenheimer wins the Golden Globe for best picture, it will win the Oscar too!

## Now we will use model attempt #5 to see if any of our "Overlooked" movies could win an Oscar

In [14]:
# Load in our overlooked nominees dataframe
overlooked = pd.read_csv("Resources/overlooked_clean.csv")
overlooked


Unnamed: 0,Title,Rated,Released,Runtime,Director,Writer,Actors,Language,Country,Awards,Metascore,imdbRating,imdbVotes,BoxOffice,Production,Genre1,Genre2,Genre3
0,Hocus Pocus,PG,16 Jul 1993,96,Kenny Ortega,"David Kirschner, Mick Garris, Neil Cuthbert","Bette Midler, Sarah Jessica Parker, Kathy Najimy","English, French",United States,2 wins & 11 nominations,43,6.9,147242,44342713,,Comedy,Family,Fantasy
1,Mean Girls,PG-13,30 Apr 2004,97,Mark Waters,"Rosalind Wiseman, Tina Fey","Lindsay Lohan, Jonathan Bennett, Rachel McAdams","English, German, Vietnamese, Swahili","United States, Canada",7 wins & 25 nominations,66,7.1,406583,86058055,,Comedy,,
2,Dazed and Confused,R,24 Sep 1993,103,Richard Linklater,Richard Linklater,"Jason London, Wiley Wiggins, Matthew McConaughey",English,United States,1 win & 4 nominations,80,7.6,193521,7993039,,Comedy,,
3,Die Hard,R,20 Jul 1988,132,John McTiernan,"Roderick Thorp, Jeb Stuart, Steven E. de Souza","Bruce Willis, Alan Rickman, Bonnie Bedelia","English, German, Italian, Japanese",United States,Nominated for 4 Oscars. 8 wins & 6 nominations...,72,8.2,911376,83844093,,Action,Thriller,
4,The Rocky Horror Picture Show,R,31 Aug 1975,100,Jim Sharman,"Richard O'Brien, Jim Sharman","Tim Curry, Susan Sarandon, Barry Bostwick",English,"United Kingdom, United States",3 wins & 4 nominations,65,7.4,160258,112892319,,Comedy,Horror,Musical
5,The Princess Bride,PG,09 Oct 1987,98,Rob Reiner,William Goldman,"Cary Elwes, Mandy Patinkin, Robin Wright",English,United States,Nominated for 1 Oscar. 7 wins & 10 nominations...,77,8.0,440060,30857814,,Adventure,Comedy,Family
6,Singin' in the Rain,G,10 Apr 1952,103,"Stanley Donen, Gene Kelly","Betty Comden, Adolph Green","Gene Kelly, Donald O'Connor, Debbie Reynolds",English,United States,Nominated for 2 Oscars. 8 wins & 9 nominations...,99,8.3,252643,1884537,,Comedy,Musical,Romance


In [15]:
# Run the function to prepare the testing data
prepare_input_data(overlooked)


In [16]:
# Find out which columns are missing that the model was trained on
model_training_data
missing_columns = list(set(list(model_training_data.columns)) - set(list(input_data.columns)))
# Add missing columns to dataframe
input_data[missing_columns]=0
# Find out which columns are extra that our model doesn't use
extra_columns = list(set(list(input_data.columns))-set(list(model_training_data.columns)))
# Remove the extra columns
input_data.drop(columns=extra_columns, inplace=True)
# Reorder the columns so that they're in the same order as the model is trained on
input_data = input_data[list(model_training_data.columns)]
# Since this data is going to be used in place of the testing data, we drop the "OscarsWinner" column
input_data.drop("OscarsWinner",axis=1, inplace=True)
input_data


Unnamed: 0,Runtime,Metascore,imdbRating,imdbVotes,BoxOffice,GlobesWinner,Fantasy,Action,Adventure,Animation,...,year_ceremony_2011,year_ceremony_2012,year_ceremony_2013,year_ceremony_2014,year_ceremony_2015,year_ceremony_2016,year_ceremony_2017,year_ceremony_2018,year_ceremony_2019,year_ceremony_2020
0,-0.698664,-1.81947,-1.474325,-0.846774,-0.208706,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,-0.612863,-0.362084,-1.077391,0.191196,0.85167,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,-0.098058,0.525021,-0.085057,-0.66155,-1.13269,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2.390165,0.018104,1.105744,2.211549,0.795393,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
4,-0.35546,-0.425448,-0.481991,-0.79468,1.533779,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,-0.527062,0.334927,0.70881,0.325183,-0.551483,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6,-0.098058,1.728949,1.30421,-0.424924,-1.287964,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
# Run the function to get prediction results
run_prediction(input_data)


Unnamed: 0,Title,Prediction
0,Hocus Pocus,It's an honor to be nominated
1,Mean Girls,It's an honor to be nominated
2,Dazed and Confused,It's an honor to be nominated
3,Die Hard,It's an honor to be nominated
4,The Rocky Horror Picture Show,It's an honor to be nominated
5,The Princess Bride,It's an honor to be nominated
6,Singin' in the Rain,WINNER


In [None]:
## According to our model, Singin' in the Rain should have won an Oscar for best picture. 
## Of 