This notebook is for trialing different learning models. The models we've tested include logistic regression and random forest. For a summary of iterative changes made to the model and the resulting changes in model performance please see the excel file, oscars_model_comparison.xlsx

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report, accuracy_score
import numpy as np


In [2]:
# Declare a random state for all models
random_state=29

In [3]:
# Read in main dataset for model creation
df = pd.read_csv("Resources/combined_clean.csv")
df.head()

Unnamed: 0,year_film,year_ceremony,category,name,film,OscarsWinner,Rated,Released,Runtime,Director,...,Country,Awards,Metascore,imdbRating,imdbVotes,BoxOffice,Production,Genre1,Genre2,Genre3
0,1929,1930,OUTSTANDING PRODUCTION,Warner Bros.,Disraeli,False,Passed,01 Nov 1929,90,Alfred E. Green,...,United States,Won 1 Oscar. 4 wins & 2 nominations total,0.0,6.1,1338,0,A & E,Biography,Drama,History
1,1929,1930,OUTSTANDING PRODUCTION,Paramount Famous Lasky,The Love Parade,False,Passed,18 Jan 1930,107,Ernst Lubitsch,...,United States,Nominated for 6 Oscars. 1 win & 6 nominations ...,0.0,7.0,2500,0,,Comedy,Musical,Romance
2,1931,1932,OUTSTANDING PRODUCTION,Samuel Goldwyn Productions,Arrowsmith,False,Approved,26 Dec 1931,108,John Ford,...,United States,Nominated for 4 Oscars. 4 nominations total,0.0,6.2,1862,0,,Drama,,
3,1931,1932,OUTSTANDING PRODUCTION,Fox,Bad Girl,False,Passed,13 Sep 1931,90,Frank Borzage,...,United States,Won 2 Oscars. 2 wins & 1 nomination total,0.0,6.5,1504,0,,Drama,Romance,
4,1931,1932,OUTSTANDING PRODUCTION,Metro-Goldwyn-Mayer,The Champ,False,Passed,21 Nov 1931,86,King Vidor,...,United States,Won 2 Oscars. 2 wins & 3 nominations total,0.0,7.3,3416,0,,Drama,Family,Sport


In [4]:
# Check unique values for each column to help decide which to include in each model attempt
df.nunique()

year_film         92
year_ceremony     92
category           4
name             367
film             521
OscarsWinner       2
Rated             13
Released         504
Runtime          116
Director         305
Writer           514
Actors           526
Language         135
Country           78
Awards           494
Metascore         53
imdbRating        33
imdbVotes        527
BoxOffice        341
Production         4
Genre1            12
Genre2            18
Genre3            16
dtype: int64

In [5]:
# Drop columns for first attemt
df_1 = df.drop(["year_film","category","name","film","Released", "Director", "Writer",
               "Actors", "Language", "Country", "Awards", "Production"],
              axis=1)


In [None]:
df_1.nunique()

In [None]:
# Check what unique values we have for our three Genre columns
def genre_list(dataframe):
    genres = []
    genres.extend(dataframe["Genre1"].values)
    genres.extend(dataframe["Genre2"].values)
    genres.extend(dataframe["Genre3"].values)
    global genres_list
    genres_list = list(set(genres))
    print(genres_list)
genre_list(df_1)

In [None]:
# Perform one-hot encoding on Genres
def genre_encoding(dataframe):
    for g in genres_list:
        # Create a column for each genre
        dataframe[g] = 0
        # Columns will have a 0 or 1 if the movie is of the column's genre
        dataframe[g] = ((dataframe["Genre1"] == g) | (dataframe["Genre2"] == g) | (dataframe["Genre3"] == g)).astype(int)
genre_encoding(df_1)
df_1.drop(columns=["Genre1", "Genre2", "Genre3"], inplace=True)
list(df_1.columns)

In [None]:
# Get rid of the nan column that was created for when a movie had less than 3 genres
df_1 = df_1[['year_ceremony',
 'OscarsWinner',
 'Rated',
 'Runtime',
 'Metascore',
 'imdbRating',
 'imdbVotes',
 'BoxOffice',
 'Fantasy',
 'Family',
 'Thriller',
 'Crime',
 'Western',
 'Musical',
 'Drama',
 'War',
 'Mystery',
 'Film-Noir',
 'Action',
 'Horror',
 'Adventure',
 'Sport',
 'Short',
 'Biography',
 'History',
 'Music',
 'Comedy',
 'Sci-Fi',
 'Romance',
 'Animation']]

In [None]:
list(df_1.columns)

In [None]:
df_1.dtypes

In [None]:
# Change the boolean values of True/False to 1/0 for the OscarsWinner column
df_1["OscarsWinner"] = df_1["OscarsWinner"].astype(int)

In [None]:
df_1.dtypes

In [None]:
# Run get_dummies on our Rated Column
df_1 = pd.get_dummies(df_1, columns=["Rated"])

In [None]:
list(df_1.columns)

In [None]:
df_1_copy = df_1.copy()

In [None]:
list(df_1_copy.columns)

In [None]:
# Change year_ceremony to string since these will not have math done on them
df_1 = df_1.astype({"year_ceremony":"str"})

In [None]:
# Run get_dummies on our year_ceremony Column
df_1 = pd.get_dummies(df_1, columns=["year_ceremony"])

In [None]:
df_1.shape

In [None]:
df_1.head()

## Attempt 1: logistic regression
Accuracy Score : 0.84

Balanced Accuracy Score : 0.5

In [None]:
# define model results as y and features as X
y = df_1["OscarsWinner"]
X = df_1.drop(columns = "OscarsWinner")


In [None]:
# Look at how results are split between winners (1) and losers (0)
y.value_counts()

In [None]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

In [None]:
# Create a Logistic Regression Model and fit (train) or model using the training data
classifier = LogisticRegression(random_state=random_state)
classifier.fit(X_train, y_train)

In [None]:
# Run model on testing data
predictions = classifier.predict(X_test)
pred_1_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).tail(10)
pred_1_df

In [None]:
pred_1_df["Prediction"].value_counts()

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

## Attempt 2: logistic regression, limiting to past 50 years of data
Our theory is that movies that are >50 years old are more likely to have NaN values for some of our features, such as critical reviews and Box Office.

Accuracy Score : 0.85

Balanced Accuracy Score : 0.5

In [None]:
# Create the dataframe by selecting only data from 1973 and beyond
df_2 = df_1_copy.loc[df_1_copy["year_ceremony"]>=1973]
df_2.head()

In [None]:
# Change year_ceremony to string since these will not have math done on them
df_2 = df_2.astype({"year_ceremony":"str"})

In [None]:
# Run get_dummies on our year_ceremony Column
df_2 = pd.get_dummies(df_2, columns=["year_ceremony"])

In [None]:
# Attempt 2 has 238 less rows of data than Attempt 1
df_2.shape

In [None]:
df_2.head()

In [None]:
# define model results as y and features as X
y = df_2["OscarsWinner"]
X = df_2.drop(columns = "OscarsWinner")


In [None]:
# Look at how results are split between winners (1) and losers (0)
y.value_counts()

In [None]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

In [None]:
# Create a Logistic Regression Model and fit (train) or model using the training data
classifier = LogisticRegression(random_state=random_state)
classifier.fit(X_train, y_train)

In [None]:
# Run model on testing data
predictions = classifier.predict(X_test)
pred_2_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).tail(10)
pred_2_df

In [None]:
pred_2_df["Prediction"].value_counts()

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

# Attempt 3: logistic regression. Data added: boolean value for if movie won best picture at the Golden Globes. Data from the Golden Globes are for the years 1944-2020, so all other years will be eliminated from this attempt.

Accuracy Score : 0.65

Balanced Accuracy Score : 0.51

In [None]:
df_3 = pd.read_csv("Resources/combined_with_globes_clean.csv")
df_3.head()

In [None]:
# Drop columns for first attempt
df_3 = df_3.drop(["Title/Year","year_film","category","name","film","Released", "Director", "Writer",
               "Actors", "Language", "Country", "Awards", "Production"],
              axis=1)

In [None]:
df_3.columns

In [None]:
# Run previously defined function to one-hot encode the genres
genre_list(df_3)

In [None]:
# Run previously defined function to one-hot encode the genres
genre_encoding(df_3)
df_3.drop(columns=["Genre1", "Genre2", "Genre3"], inplace=True)
list(df_3.columns)

In [None]:
# Drop the nan column
df_3 = df_3[['year_ceremony',
 'OscarsWinner',
 'Rated',
 'Runtime',
 'Metascore',
 'imdbRating',
 'imdbVotes',
 'BoxOffice',
 'GlobesWinner',
 'Fantasy',
 'Action',
 'Adventure',
 'Animation',
 'Romance',
 'Musical',
 'Biography',
 'Family',
 'Comedy',
 'Sci-Fi',
 'Film-Noir',
 'War',
 'Sport',
 'Music',
 'Drama',
 'Mystery',
 'Thriller',
 'History',
 'Western',
 'Crime',
 'Horror']]

In [None]:
df_3.dtypes

In [None]:
# Adjust datatypes and get_dummies on necessary columns
df_3 = df_3.astype({"OscarsWinner": "int", "GlobesWinner":"int",
                                                   "year_ceremony":"str"})
df_3 = pd.get_dummies(df_3, columns=["Rated", "year_ceremony"])
df_3.dtypes

In [None]:
# define model results as y and features as X
y = df_3["OscarsWinner"]
X = df_3.drop(columns = "OscarsWinner")


In [None]:
# Look at how results are split between winners (1) and losers (0)
y.value_counts()

In [None]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

In [None]:
# Create a Logistic Regression Model and fit (train) or model using the training data
classifier = LogisticRegression(random_state=random_state)
classifier.fit(X_train, y_train)

In [None]:
# Run model on testing data
predictions = classifier.predict(X_test)
pred_3_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).tail(10)
pred_3_df

In [None]:
pred_3_df["Prediction"].value_counts()

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

# Attempt 4: logistic regression. Data added: boolean value for if movie won best picture at the Golden Globes. Data removed: BoxOffice column.
Data from the Golden Globes are for the years 1944-2020, so all other years will be eliminated from this attempt.

Box Office values not great to compare across all movies due to inflation

Accuracy Score : 0.75

Balanced Accuracy Score : 0.52

In [None]:
# Copy the 3rd attempt (includes Globes data) but remove BoxOffice column
df_4 = df_3.drop(columns = "BoxOffice")

In [None]:
df_4.head()

In [None]:
# define model results as y and features as X
y = df_4["OscarsWinner"]
X = df_4.drop(columns = "OscarsWinner")

In [None]:
# Look at how results are split between winners (1) and losers (0)
y.value_counts()

In [None]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

In [None]:
# Create a Logistic Regression Model and fit (train) or model using the training data
classifier = LogisticRegression(random_state=random_state)
classifier.fit(X_train, y_train)

In [None]:
# Run model on testing data
predictions = classifier.predict(X_test)
pred_4_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).tail(10)
pred_4_df

In [None]:
pred_4_df["Prediction"].value_counts()

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

# Attempt 5: logistic regression with data scaling. Data added: boolean value for if movie won best picture at the Golden Globes.

Accuracy Score : 0.78

Balanced Accuracy Score : 0.64

In [None]:
# Copy the 3rd attempt (includes Globes data)
df_5 = df_3.copy()


In [None]:
# Scaling the numeric columns
df_5_scaled = StandardScaler().fit_transform(df_5[["Runtime", "Metascore", "imdbRating", "imdbVotes", "BoxOffice"]])

# Review the scaled data
df_5_scaled

In [None]:
# Create a DataFrame of the scaled data
df_5_scaled = pd.DataFrame(df_5_scaled, columns=["Runtime", "Metascore", "imdbRating", "imdbVotes", "BoxOffice"])

# Replace the original data with the columns of information from the scaled Data
df_5["Runtime"] = df_5_scaled["Runtime"]
df_5["Metascore"] = df_5_scaled["Metascore"]
df_5["imdbRating"] = df_5_scaled["imdbRating"]
df_5["imdbVotes"] = df_5_scaled["imdbVotes"]
df_5["BoxOffice"] = df_5_scaled["BoxOffice"]

# Review the DataFrame
df_5.head()

In [None]:
# define model results as y and features as X
y = df_5["OscarsWinner"]
X = df_5.drop(columns = "OscarsWinner")

In [None]:
# Look at how results are split between winners (1) and losers (0)
y.value_counts()

In [None]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

In [None]:
# Create a Logistic Regression Model and fit (train) or model using the training data
classifier = LogisticRegression(random_state=random_state)
classifier.fit(X_train, y_train)

In [None]:
# Run model on testing data
predictions = classifier.predict(X_test)
pred_5_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).tail(10)
pred_5_df

In [None]:
pred_5_df["Prediction"].value_counts()

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

## Attempt 6: logistic regression with data scaling. Data added: boolean value for if movie won best picture at the Golden Globes. Data removed: BoxOffice column.

Accuracy Score : 0.78

Balanced Accuracy Score : 0.64

In [None]:
df_6 = df_3.drop("BoxOffice", axis=1)
df_6.head()

In [None]:
# Scaling the numeric columns
df_6_scaled = StandardScaler().fit_transform(df_6[["Runtime", "Metascore", "imdbRating", "imdbVotes"]])

# Review the scaled data
df_6_scaled

In [None]:
# Create a DataFrame of the scaled data
df_6_scaled = pd.DataFrame(df_6_scaled, columns=["Runtime", "Metascore", "imdbRating", "imdbVotes"])

# Replace the original data with the columns of information from the scaled Data
df_6["Runtime"] = df_6_scaled["Runtime"]
df_6["Metascore"] = df_6_scaled["Metascore"]
df_6["imdbRating"] = df_6_scaled["imdbRating"]
df_6["imdbVotes"] = df_6_scaled["imdbVotes"]


# Review the DataFrame
df_6.head()

In [None]:
# define model results as y and features as X
y = df_6["OscarsWinner"]
X = df_6.drop(columns = "OscarsWinner")

In [None]:
# Look at how results are split between winners (1) and losers (0)
y.value_counts()

In [None]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

In [None]:
# Create a Logistic Regression Model and fit (train) or model using the training data
classifier = LogisticRegression(random_state=random_state)
classifier.fit(X_train, y_train)

In [None]:
# Run model on testing data
predictions = classifier.predict(X_test)
pred_6_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).tail(10)
pred_6_df

In [None]:
pred_6_df["Prediction"].value_counts()

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

## Attempt 7: logistic regression with data scaling. Data added: boolean value for if movie won best picture at the Golden Globes. Data removed: BoxOffice column limiting to past 50 years of data

Accuracy Score : 0.77

Balanced Accuracy Score : 0.53

In [None]:
df_7 = pd.read_csv("Resources/combined_with_globes_clean.csv")
df_7 = df_7.loc[df_7["year_ceremony"]>=1973]
df_7.head()

In [None]:
df_7 = df_7.drop(["Title/Year","year_film","category","name","film","Released", "Director", "Writer",
               "Actors", "Language", "Country", "Awards", "Production", "BoxOffice"],
              axis=1)

In [None]:
df_7.columns

In [None]:
# Run previously defined function to one-hot encode the genres
genre_list(df_7)

In [None]:
# Run previously defined function to one-hot encode the genres
genre_encoding(df_7)
df_7.drop(columns=["Genre1", "Genre2", "Genre3"], inplace=True)
list(df_7.columns)

In [None]:
# Drop the nan column
df_7 = df_7[['year_ceremony',
 'OscarsWinner',
 'Rated',
 'Runtime',
 'Metascore',
 'imdbRating',
 'imdbVotes',
 'GlobesWinner',
 'Family',
 'Animation',
 'War',
 'Musical',
 'Horror',
 'Romance',
 'Fantasy',
 'Comedy',
 'Thriller',
 'Crime',
 'Adventure',
 'Music',
 'Sci-Fi',
 'Western',
 'Drama',
 'Mystery',
 'Sport',
 'Action',
 'Biography',
 'History']]

In [None]:
df_7.dtypes

In [None]:
# Adjust datatypes and get_dummies on necessary columns
df_7 = df_7.astype({"OscarsWinner": "int", "GlobesWinner":"int",
                                                   "year_ceremony":"str"})
df_7 = pd.get_dummies(df_7, columns=["Rated", "year_ceremony"])
df_7.dtypes

In [None]:
# Scaling the numeric columns
df_7_scaled = StandardScaler().fit_transform(df_7[["Runtime", "Metascore", "imdbRating", "imdbVotes"]])

# Review the scaled data
df_7_scaled

In [None]:
df_7.shape

In [None]:
# define model results as y and features as X
y = df_7["OscarsWinner"]
X = df_7.drop(columns = "OscarsWinner")


In [None]:
# Look at how results are split between winners (1) and losers (0)
y.value_counts()

In [None]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

In [None]:
# Create a Logistic Regression Model and fit (train) or model using the training data
classifier = LogisticRegression(random_state=random_state)
classifier.fit(X_train, y_train)

In [None]:
# Run model on testing data
predictions = classifier.predict(X_test)
pred_7_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).tail(10)
pred_7_df

In [None]:
pred_7_df["Prediction"].value_counts()

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

## Attempt 8: random forest with data scaling

Accuracy Score : 0.86

Balanced Accuracy Score : 0.59

In [None]:
df_8 = df_1.copy()

In [None]:
# Define features set
X = df_8.drop("OscarsWinner", axis=1)
y = df_8["OscarsWinner"].ravel()

In [None]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

In [None]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [None]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [None]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=50, random_state=random_state)

In [None]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [None]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

In [None]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_
# We can sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

## Attempt 9: logistic regression with data scaling
Accuracy Score : 0.86

Balanced Accuracy Score : 0.59

In [None]:
df_9 = df_1.copy()
df_9.head()

In [None]:
# Scaling the numeric columns
df_9_scaled = StandardScaler().fit_transform(df_9[["Runtime", "Metascore", "imdbRating", "imdbVotes", "BoxOffice"]])

# Review the scaled data
df_9_scaled

In [None]:
# Create a DataFrame of the scaled data
df_9_scaled = pd.DataFrame(df_9_scaled, columns=["Runtime", "Metascore", "imdbRating", "imdbVotes", "BoxOffice"])

# Replace the original data with the columns of information from the scaled Data
df_9["Runtime"] = df_9_scaled["Runtime"]
df_9["Metascore"] = df_9_scaled["Metascore"]
df_9["imdbRating"] = df_9_scaled["imdbRating"]
df_9["imdbVotes"] = df_9_scaled["imdbVotes"]
df_9["BoxOffice"] = df_9_scaled["BoxOffice"]

# Review the DataFrame
df_9.head()

In [None]:
# define model results as y and features as X
y = df_9["OscarsWinner"]
X = df_9.drop(columns = "OscarsWinner")

In [None]:
# Look at how results are split between winners (1) and losers (0)
y.value_counts()

In [None]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

In [None]:
# Create a Logistic Regression Model and fit (train) or model using the training data
classifier = LogisticRegression(random_state=random_state)
classifier.fit(X_train, y_train)

In [None]:
# Run model on testing data
predictions = classifier.predict(X_test)
pred_9_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).tail(10)
pred_9_df

In [None]:
pred_9_df["Prediction"].value_counts()

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

## Attempt 10: random forest with data scaling, Data added: boolean value for if movie won best picture at the Golden Globes.

Accuracy Score : 0.75

Balanced Accuracy Score : 0.52

In [None]:
df_10 = df_3.copy()

In [None]:
# Define features set
X = df_10.drop("OscarsWinner", axis=1)
y = df_10["OscarsWinner"].ravel()

In [None]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

In [None]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [None]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [None]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [None]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=50, random_state=random_state)

In [None]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [None]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

In [None]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_
# We can sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

## Attempt 11: random forest with data scaling, Data added: boolean value for if movie won best picture at the Golden Globes. Data removed: BoxOffice column.

Accuracy Score : 0.75

Balanced Accuracy Score : 0.5

In [None]:
df_11 = df_6.copy()

In [None]:
# Define features set
X = df_11.drop("OscarsWinner", axis=1)
y = df_11["OscarsWinner"].ravel()

In [None]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

In [None]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=50, random_state=random_state)

In [None]:
# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

In [None]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test)

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

In [None]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_
# We can sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

## Attempt 12: logistic regression with data scaling, Data Added: boolean value for if movie won best picture at the Golden Globes, director, country, and producer

Accuracy Score : 0.78

Balanced Accuracy Score : 0.64

In [None]:
df_12 = df_5.copy()
df_12.head()

In [None]:
# Add in director, country, and productioncompany columns
df_12["Director"] = df["Director"]
df_12["Producer"] = df["name"]
df_12["Country"] = df["Country"]
df_12.head()

In [None]:
# Split director column into 3 columns since up to 3 directors can be featured
df_12[["Director1", "Director2", "Director3"]] = df_12["Director"].str.split(', ', expand=True)
df_12.drop(columns=["Director"], axis=1, inplace=True)
list(df_12.columns)

In [None]:
# Check what unique values we have for our three directors columns
directors = []
directors.extend(df_12["Director1"].values)
directors.extend(df_12["Director2"].values)
directors.extend(df_12["Director3"].values)
directors_list = list(set(directors))
print(directors_list)


In [None]:
# Perform one-hot encoding on directors
for d in directors_list:
    # Create a column for each genre
    df_12[d] = 0
    # Columns will have a 0 or 1 if the movie is of the column's genre
    df_12[d] = ((df_12["Director1"] == d) | (df_12["Director2"] == d) | (df_12["Director3"] == d)).astype(int)
df_12.drop(columns=["Director1", "Director2", "Director3"], inplace=True)
list(df_12.columns)

In [None]:
# Remove the 'None' column
df_12 = df_12[['OscarsWinner',
 'Runtime',
 'Metascore',
 'imdbRating',
 'imdbVotes',
 'BoxOffice',
 'GlobesWinner',
 'Fantasy',
 'Action',
 'Adventure',
 'Animation',
 'Romance',
 'Musical',
 'Biography',
 'Family',
 'Comedy',
 'Sci-Fi',
 'Film-Noir',
 'War',
 'Sport',
 'Music',
 'Drama',
 'Mystery',
 'Thriller',
 'History',
 'Western',
 'Crime',
 'Horror',
 'Rated_Approved',
 'Rated_G',
 'Rated_Not Rated',
 'Rated_PG',
 'Rated_PG-13',
 'Rated_Passed',
 'Rated_R',
 'Rated_Unrated',
 'Rated_X',
 'year_ceremony_1945',
 'year_ceremony_1946',
 'year_ceremony_1948',
 'year_ceremony_1949',
 'year_ceremony_1951',
 'year_ceremony_1953',
 'year_ceremony_1955',
 'year_ceremony_1957',
 'year_ceremony_1958',
 'year_ceremony_1959',
 'year_ceremony_1960',
 'year_ceremony_1961',
 'year_ceremony_1962',
 'year_ceremony_1963',
 'year_ceremony_1964',
 'year_ceremony_1965',
 'year_ceremony_1966',
 'year_ceremony_1967',
 'year_ceremony_1968',
 'year_ceremony_1969',
 'year_ceremony_1970',
 'year_ceremony_1971',
 'year_ceremony_1972',
 'year_ceremony_1973',
 'year_ceremony_1974',
 'year_ceremony_1975',
 'year_ceremony_1976',
 'year_ceremony_1977',
 'year_ceremony_1978',
 'year_ceremony_1979',
 'year_ceremony_1980',
 'year_ceremony_1981',
 'year_ceremony_1982',
 'year_ceremony_1983',
 'year_ceremony_1984',
 'year_ceremony_1985',
 'year_ceremony_1986',
 'year_ceremony_1987',
 'year_ceremony_1988',
 'year_ceremony_1989',
 'year_ceremony_1990',
 'year_ceremony_1991',
 'year_ceremony_1992',
 'year_ceremony_1993',
 'year_ceremony_1994',
 'year_ceremony_1995',
 'year_ceremony_1996',
 'year_ceremony_1997',
 'year_ceremony_1998',
 'year_ceremony_1999',
 'year_ceremony_2000',
 'year_ceremony_2001',
 'year_ceremony_2002',
 'year_ceremony_2003',
 'year_ceremony_2004',
 'year_ceremony_2005',
 'year_ceremony_2006',
 'year_ceremony_2007',
 'year_ceremony_2008',
 'year_ceremony_2009',
 'year_ceremony_2010',
 'year_ceremony_2011',
 'year_ceremony_2012',
 'year_ceremony_2013',
 'year_ceremony_2014',
 'year_ceremony_2015',
 'year_ceremony_2016',
 'year_ceremony_2017',
 'year_ceremony_2018',
 'year_ceremony_2019',
 'year_ceremony_2020',
 'Producer',
 'Country',
 'Fred Fleck',
 'Joseph L. Mankiewicz',
 'Sidney Lanfield',
 'Charles Walters',
 'John Schlesinger',
 'Edward Dmytryk',
 'Frank Borzage',
 'Michael Curtiz',
 'Vincente Minnelli',
 'J. Lee Thompson',
 'Ralph Nelson',
 'Victor Schertzinger',
 'Laurence Olivier',
 'Ernst Lubitsch',
 'Clarence Brown',
 'John Huston',
 'Leo McCarey',
 'Mark Robson',
 'Andrew Marton',
 'Arthur Penn',
 'John Ford',
 'Fred Coe',
 'John Farrow',
 'Stanley Kramer',
 'Jerome Robbins',
 'Orson Welles',
 'W.S. Van Dyke',
 'Jack Cardiff',
 'Elia Kazan',
 'Richard Thorpe',
 'George Sidney',
 'Carol Reed',
 'William Wyler',
 'Joseph Barbera',
 'David Lean',
 'Anthony Harvey',
 'Compton Bennett',
 'Billy Wilder',
 'Max Reinhardt',
 'John Wayne',
 'Cecil B. DeMille',
 'Robert Z. Leonard',
 'Alexander Hall',
 'Richard Fleischer',
 'Alfred E. Green',
 'Daniel Mann',
 'Howard Hawks',
 'Walter Lang',
 'Robert Mulligan',
 'George Seaton',
 'Gregory La Cava',
 'Henry Koster',
 'Norman Taurog',
 'Tony Richardson',
 'Emeric Pressburger',
 'Jack Conway',
 'Fred Zinnemann',
 'William Dieterle',
 'Robert Wise',
 'Anthony Asquith',
 'Michael Cacoyannis',
 'Gustav Machatý',
 'Joshua Logan',
 'Robert Stevenson',
 'King Vidor',
 'Gerd Oswald',
 'Tay Garnett',
 'Sidney Franklin',
 'Sam Wood',
 'Mitchell Leisen',
 'Morton DaCosta',
 'Michael Anderson',
 'Edward F. Cline',
 'Henry Hathaway',
 'Delbert Mann',
 'John M. Stahl',
 'Jean Negulesco',
 'William Keighley',
 'William A. Wellman',
 'Stanley Donen',
 'Alfred L. Werker',
 'Victor Fleming',
 'Leslie Howard',
 'Anthony Mann',
 'Michael Powell',
 'Anatole Litvak',
 'Charles Chaplin',
 'John Cromwell',
 'Stanley Kubrick',
 'Irving Cummings',
 'Mervyn LeRoy',
 'Albert Lewin',
 'Hal Mohr',
 'Roy Del Ruth',
 'Edmund Goulding',
 'Lloyd Bacon',
 'Sidney Lumet',
 'Robert Rossen',
 'Peter Glenville',
 'Frank Capra',
 'Otto Lang',
 'Alfred Hitchcock',
 'Frank Lloyd',
 'Otto Preminger',
 'Irving Rapper',
 'Norman Jewison',
 'Henry King',
 'Mike Nichols',
 'Lewis Gilbert',
 'Irving Pichel',
 'Richard Brooks',
 'Mark Sandrich',
 'George Cukor',
 'Herman Shumlin',
 'George Stevens',
 'William Hanna',
 'Lewis Milestone',
 'Ken Annakin']]

In [None]:
# Run get_dummies on our ProductionCompany Column
df_12 = pd.get_dummies(df_12, columns=["Producer", "Country"])


In [None]:
# define model results as y and features as X
y = df_12["OscarsWinner"]
X = df_12.drop(columns = "OscarsWinner")

In [None]:
# Look at how results are split between winners (1) and losers (0)
y.value_counts()

In [None]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=random_state)

In [None]:
# Create a Logistic Regression Model and fit (train) or model using the training data
classifier = LogisticRegression(random_state=random_state)
classifier.fit(X_train, y_train)

In [None]:
# Run model on testing data
predictions = classifier.predict(X_test)
pred_12_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).tail(10)
pred_12_df

In [None]:
pred_12_df["Prediction"].value_counts()

In [None]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

## The best attempt was Attempt 5, which used logistic regression, scaled the data, and included Golden globes results. For the random state that we used throughout this project (random_state = 29), attempt 6 and 12 had the exact classification result, confusion matrix, and accuracy scores as Attempt 5. We ran the models with 10 other random seeds, and in those Attempt 12 was worse than Attempt 5 but Attempts 6 and 5 were always very similar.

## We'll use attempt 5 as our model since, unlike attempt 6, it still includes the Box Office column, which the random forest models we created seemed to give importance to.

In [None]:
model_training_df = df_5.copy()
model_training_df.to_csv("Resources/model_training_data.csv", index=False)