This notebook is for trialing different learning models. The models we've tested include logistic regression and random forest.

In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score, confusion_matrix, classification_report, accuracy_score
import numpy as np


In [2]:
# Read in main dataset for model creation
df = pd.read_csv("Resources/combined_clean.csv")
df.head()

Unnamed: 0,year_film,year_ceremony,category,name,film,OscarsWinner,Rated,Released,Runtime,Director,...,Country,Awards,Metascore,imdbRating,imdbVotes,BoxOffice,Production,Genre1,Genre2,Genre3
0,1929,1930,OUTSTANDING PRODUCTION,Warner Bros.,Disraeli,False,Passed,01 Nov 1929,90,Alfred E. Green,...,United States,Won 1 Oscar. 4 wins & 2 nominations total,0.0,6.1,1338,0,A & E,Biography,Drama,History
1,1929,1930,OUTSTANDING PRODUCTION,Paramount Famous Lasky,The Love Parade,False,Passed,18 Jan 1930,107,Ernst Lubitsch,...,United States,Nominated for 6 Oscars. 1 win & 6 nominations ...,0.0,7.0,2500,0,,Comedy,Musical,Romance
2,1931,1932,OUTSTANDING PRODUCTION,Samuel Goldwyn Productions,Arrowsmith,False,Approved,26 Dec 1931,108,John Ford,...,United States,Nominated for 4 Oscars. 4 nominations total,0.0,6.2,1862,0,,Drama,,
3,1931,1932,OUTSTANDING PRODUCTION,Fox,Bad Girl,False,Passed,13 Sep 1931,90,Frank Borzage,...,United States,Won 2 Oscars. 2 wins & 1 nomination total,0.0,6.5,1504,0,,Drama,Romance,
4,1931,1932,OUTSTANDING PRODUCTION,Metro-Goldwyn-Mayer,The Champ,False,Passed,21 Nov 1931,86,King Vidor,...,United States,Won 2 Oscars. 2 wins & 3 nominations total,0.0,7.3,3416,0,,Drama,Family,Sport


In [3]:
# Check unique values for each column to help decide which to include in each model attempt
df.nunique()

year_film         92
year_ceremony     92
category           4
name             367
film             521
OscarsWinner       2
Rated             13
Released         504
Runtime          116
Director         305
Writer           514
Actors           526
Language         135
Country           78
Awards           494
Metascore         53
imdbRating        33
imdbVotes        527
BoxOffice        341
Production         4
Genre1            12
Genre2            18
Genre3            16
dtype: int64

In [4]:
# Drop columns for first attemt
df_1 = df.drop(["year_film","category","name","film","Released", "Director", "Writer",
               "Actors", "Language", "Country", "Awards", "Production"],
              axis=1)


In [5]:
df_1.nunique()

year_ceremony     92
OscarsWinner       2
Rated             13
Runtime          116
Metascore         53
imdbRating        33
imdbVotes        527
BoxOffice        341
Genre1            12
Genre2            18
Genre3            16
dtype: int64

In [6]:
# Check what unique values we have for our three Genre columns
def genre_list(dataframe):
    genres = []
    genres.extend(dataframe["Genre1"].values)
    genres.extend(dataframe["Genre2"].values)
    genres.extend(dataframe["Genre3"].values)
    global genres_list
    genres_list = list(set(genres))
    print(genres_list)
genre_list(df_1)

['Romance', 'Sport', 'Film-Noir', 'Western', 'Musical', 'Music', 'Comedy', 'War', 'Drama', 'Thriller', 'Crime', 'Animation', 'Horror', 'Adventure', 'Biography', nan, 'Short', 'Mystery', 'Action', 'History', 'Fantasy', 'Sci-Fi', 'Family']


In [7]:
# Perform one-hot encoding on Genres
def genre_encoding(dataframe):
    for g in genres_list:
        # Create a column for each genre
        dataframe[g] = 0
        # Columns will have a 0 or 1 if the movie is of the column's genre
        dataframe[g] = ((dataframe["Genre1"] == g) | (dataframe["Genre2"] == g) | (dataframe["Genre3"] == g)).astype(int)
genre_encoding(df_1)
df_1.drop(columns=["Genre1", "Genre2", "Genre3"], inplace=True)
list(df_1.columns)

['year_ceremony',
 'OscarsWinner',
 'Rated',
 'Runtime',
 'Metascore',
 'imdbRating',
 'imdbVotes',
 'BoxOffice',
 'Romance',
 'Sport',
 'Film-Noir',
 'Western',
 'Musical',
 'Music',
 'Comedy',
 'War',
 'Drama',
 'Thriller',
 'Crime',
 'Animation',
 'Horror',
 'Adventure',
 'Biography',
 nan,
 'Short',
 'Mystery',
 'Action',
 'History',
 'Fantasy',
 'Sci-Fi',
 'Family']

In [8]:
# Get rid of the nan column that was created for when a movie had less than 3 genres
df_1 = df_1[['year_ceremony',
 'OscarsWinner',
 'Rated',
 'Runtime',
 'Metascore',
 'imdbRating',
 'imdbVotes',
 'BoxOffice',
 'Fantasy',
 'Family',
 'Thriller',
 'Crime',
 'Western',
 'Musical',
 'Drama',
 'War',
 'Mystery',
 'Film-Noir',
 'Action',
 'Horror',
 'Adventure',
 'Sport',
 'Short',
 'Biography',
 'History',
 'Music',
 'Comedy',
 'Sci-Fi',
 'Romance',
 'Animation']]

In [9]:
list(df_1.columns)

['year_ceremony',
 'OscarsWinner',
 'Rated',
 'Runtime',
 'Metascore',
 'imdbRating',
 'imdbVotes',
 'BoxOffice',
 'Fantasy',
 'Family',
 'Thriller',
 'Crime',
 'Western',
 'Musical',
 'Drama',
 'War',
 'Mystery',
 'Film-Noir',
 'Action',
 'Horror',
 'Adventure',
 'Sport',
 'Short',
 'Biography',
 'History',
 'Music',
 'Comedy',
 'Sci-Fi',
 'Romance',
 'Animation']

In [10]:
df_1.dtypes

year_ceremony      int64
OscarsWinner        bool
Rated             object
Runtime            int64
Metascore        float64
imdbRating       float64
imdbVotes          int64
BoxOffice          int64
Fantasy            int32
Family             int32
Thriller           int32
Crime              int32
Western            int32
Musical            int32
Drama              int32
War                int32
Mystery            int32
Film-Noir          int32
Action             int32
Horror             int32
Adventure          int32
Sport              int32
Short              int32
Biography          int32
History            int32
Music              int32
Comedy             int32
Sci-Fi             int32
Romance            int32
Animation          int32
dtype: object

In [11]:
# Change the boolean values of True/False to 1/0 for the OscarsWinner column
df_1["OscarsWinner"] = df_1["OscarsWinner"].astype(int)

In [12]:
df_1.dtypes

year_ceremony      int64
OscarsWinner       int32
Rated             object
Runtime            int64
Metascore        float64
imdbRating       float64
imdbVotes          int64
BoxOffice          int64
Fantasy            int32
Family             int32
Thriller           int32
Crime              int32
Western            int32
Musical            int32
Drama              int32
War                int32
Mystery            int32
Film-Noir          int32
Action             int32
Horror             int32
Adventure          int32
Sport              int32
Short              int32
Biography          int32
History            int32
Music              int32
Comedy             int32
Sci-Fi             int32
Romance            int32
Animation          int32
dtype: object

In [13]:
# Run get_dummies on our Rated Column
df_1 = pd.get_dummies(df_1, columns=["Rated"])

In [14]:
list(df_1.columns)

['year_ceremony',
 'OscarsWinner',
 'Runtime',
 'Metascore',
 'imdbRating',
 'imdbVotes',
 'BoxOffice',
 'Fantasy',
 'Family',
 'Thriller',
 'Crime',
 'Western',
 'Musical',
 'Drama',
 'War',
 'Mystery',
 'Film-Noir',
 'Action',
 'Horror',
 'Adventure',
 'Sport',
 'Short',
 'Biography',
 'History',
 'Music',
 'Comedy',
 'Sci-Fi',
 'Romance',
 'Animation',
 'Rated_Approved',
 'Rated_G',
 'Rated_GP',
 'Rated_M/PG',
 'Rated_Not Rated',
 'Rated_PG',
 'Rated_PG-13',
 'Rated_Passed',
 'Rated_R',
 'Rated_TV-MA',
 'Rated_TV-PG',
 'Rated_Unrated',
 'Rated_X']

In [15]:
df_1_copy = df_1.copy()

In [16]:
list(df_1_copy.columns)

['year_ceremony',
 'OscarsWinner',
 'Runtime',
 'Metascore',
 'imdbRating',
 'imdbVotes',
 'BoxOffice',
 'Fantasy',
 'Family',
 'Thriller',
 'Crime',
 'Western',
 'Musical',
 'Drama',
 'War',
 'Mystery',
 'Film-Noir',
 'Action',
 'Horror',
 'Adventure',
 'Sport',
 'Short',
 'Biography',
 'History',
 'Music',
 'Comedy',
 'Sci-Fi',
 'Romance',
 'Animation',
 'Rated_Approved',
 'Rated_G',
 'Rated_GP',
 'Rated_M/PG',
 'Rated_Not Rated',
 'Rated_PG',
 'Rated_PG-13',
 'Rated_Passed',
 'Rated_R',
 'Rated_TV-MA',
 'Rated_TV-PG',
 'Rated_Unrated',
 'Rated_X']

In [17]:
# Change year_ceremony to string since these will not have math done on them
df_1 = df_1.astype({"year_ceremony":"str"})

In [18]:
# Run get_dummies on our year_ceremony Column
df_1 = pd.get_dummies(df_1, columns=["year_ceremony"])

In [19]:
df_1.shape

(527, 133)

In [20]:
df_1.head()

Unnamed: 0,OscarsWinner,Runtime,Metascore,imdbRating,imdbVotes,BoxOffice,Fantasy,Family,Thriller,Crime,...,year_ceremony_2014,year_ceremony_2015,year_ceremony_2016,year_ceremony_2017,year_ceremony_2018,year_ceremony_2019,year_ceremony_2020,year_ceremony_2021,year_ceremony_2022,year_ceremony_2023
0,0,90,0.0,6.1,1338,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,107,0.0,7.0,2500,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,108,0.0,6.2,1862,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,90,0.0,6.5,1504,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,86,0.0,7.3,3416,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


## Attempt 1: logistic regression
Accuracy Score : 0.84

Balanced Accuracy Score : 0.5

In [21]:
# define model results as y and features as X
y = df_1["OscarsWinner"]
X = df_1.drop(columns = "OscarsWinner")


In [22]:
# Look at how results are split between winners (1) and losers (0)
y.value_counts()

0    443
1     84
Name: OscarsWinner, dtype: int64

In [23]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=29)

In [24]:
# Create a Logistic Regression Model and fit (train) or model using the training data
classifier = LogisticRegression(random_state=29)
classifier.fit(X_train, y_train)

In [25]:
# Run model on testing data
predictions = classifier.predict(X_test)
pred_1_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).tail(10)
pred_1_df

Unnamed: 0,Prediction,Actual
377,0,1
450,0,1
226,0,1
434,0,0
122,0,0
332,0,0
31,0,0
445,0,0
129,0,1
427,0,0


In [26]:
pred_1_df["Prediction"].value_counts()

0    10
Name: Prediction, dtype: int64

In [27]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

Confusion Matrix


Unnamed: 0,Predicted loser,Predicted winner
Actual loser,111,0
Actual winner,21,0


Accuracy Score : 0.8409090909090909
Balanced Accuracy Score : 0.5

Classification Report
              precision    recall  f1-score   support

       loser       0.84      1.00      0.91       111
      winner       0.00      0.00      0.00        21

    accuracy                           0.84       132
   macro avg       0.42      0.50      0.46       132
weighted avg       0.71      0.84      0.77       132



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Attempt 2: logistic regression, limiting to past 50 years of data
Our theory is that movies that are >50 years old are more likely to have NaN values for some of our features, such as critical reviews and Box Office.

Accuracy Score : 0.85

Balanced Accuracy Score : 0.5

In [28]:
# Create the dataframe by selecting only data from 1973 and beyond
df_2 = df_1_copy.loc[df_1_copy["year_ceremony"]>=1973]
df_2.head()

Unnamed: 0,year_ceremony,OscarsWinner,Runtime,Metascore,imdbRating,imdbVotes,BoxOffice,Fantasy,Family,Thriller,...,Rated_M/PG,Rated_Not Rated,Rated_PG,Rated_PG-13,Rated_Passed,Rated_R,Rated_TV-MA,Rated_TV-PG,Rated_Unrated,Rated_X
238,1973,0,124,80.0,7.8,57679,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
239,1973,0,109,80.0,7.7,115756,0,0,0,1,...,0,0,0,0,0,1,0,0,0,0
240,1973,1,175,100.0,9.2,1945639,136381073,0,0,0,...,0,0,0,0,0,1,0,0,0,0
241,1973,0,105,80.0,7.5,4638,3100601,0,1,0,...,0,0,0,0,0,0,0,0,0,0
242,1974,0,110,97.0,7.4,95294,115000000,0,0,0,...,0,0,1,0,0,0,0,0,0,0


In [29]:
# Change year_ceremony to string since these will not have math done on them
df_2 = df_2.astype({"year_ceremony":"str"})

In [30]:
# Run get_dummies on our year_ceremony Column
df_2 = pd.get_dummies(df_2, columns=["year_ceremony"])

In [31]:
# Attempt 2 has 238 less rows of data than Attempt 1
df_2.shape

(289, 92)

In [32]:
df_2.head()

Unnamed: 0,OscarsWinner,Runtime,Metascore,imdbRating,imdbVotes,BoxOffice,Fantasy,Family,Thriller,Crime,...,year_ceremony_2014,year_ceremony_2015,year_ceremony_2016,year_ceremony_2017,year_ceremony_2018,year_ceremony_2019,year_ceremony_2020,year_ceremony_2021,year_ceremony_2022,year_ceremony_2023
238,0,124,80.0,7.8,57679,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
239,0,109,80.0,7.7,115756,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
240,1,175,100.0,9.2,1945639,136381073,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
241,0,105,80.0,7.5,4638,3100601,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
242,0,110,97.0,7.4,95294,115000000,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [33]:
# define model results as y and features as X
y = df_2["OscarsWinner"]
X = df_2.drop(columns = "OscarsWinner")


In [34]:
# Look at how results are split between winners (1) and losers (0)
y.value_counts()

0    241
1     48
Name: OscarsWinner, dtype: int64

In [35]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=29)

In [36]:
# Create a Logistic Regression Model and fit (train) or model using the training data
classifier = LogisticRegression(random_state=29)
classifier.fit(X_train, y_train)

In [37]:
# Run model on testing data
predictions = classifier.predict(X_test)
pred_2_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).tail(10)
pred_2_df

Unnamed: 0,Prediction,Actual
521,0,1
243,0,0
421,0,0
246,0,0
470,0,0
509,0,0
497,0,0
516,0,0
268,0,0
334,0,0


In [38]:
pred_2_df["Prediction"].value_counts()

0    10
Name: Prediction, dtype: int64

In [39]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

Confusion Matrix


Unnamed: 0,Predicted loser,Predicted winner
Actual loser,62,0
Actual winner,11,0


Accuracy Score : 0.8493150684931506
Balanced Accuracy Score : 0.5

Classification Report
              precision    recall  f1-score   support

       loser       0.85      1.00      0.92        62
      winner       0.00      0.00      0.00        11

    accuracy                           0.85        73
   macro avg       0.42      0.50      0.46        73
weighted avg       0.72      0.85      0.78        73



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Attempt 3: logistic regression. Data added: boolean value for if movie won best picture at the Golden Globes. Data from the Golden Globes are for the years 1944-2020, so all other years will be eliminated from this attempt.

Accuracy Score : 0.65

Balanced Accuracy Score : 0.51

In [40]:
df_3 = pd.read_csv("Resources/combined_with_globes_clean.csv")
df_3.head()

Unnamed: 0,year_film,year_ceremony,category,name,film,OscarsWinner,Title/Year,Rated,Released,Runtime,...,Awards,Metascore,imdbRating,imdbVotes,BoxOffice,Production,Genre1,Genre2,Genre3,GlobesWinner
0,1944,1945,BEST MOTION PICTURE,Paramount,Going My Way,True,Going My Way-1944,Passed,02 Oct 1944,126,...,Won 7 Oscars. 19 wins & 5 nominations total,90.0,7.0,12703,0,,Comedy,Drama,Music,True
1,1945,1946,BEST MOTION PICTURE,Paramount,The Lost Weekend,True,The Lost Weekend-1945,Passed,01 Jan 1946,101,...,Won 4 Oscars. 16 wins & 3 nominations total,0.0,7.9,39085,0,,Drama,Film-Noir,,True
2,1947,1948,BEST MOTION PICTURE,20th Century-Fox,Gentleman's Agreement,True,Gentleman's Agreement-1947,Not Rated,01 Mar 1948,118,...,Won 3 Oscars. 12 wins & 8 nominations total,0.0,7.2,17293,0,,Drama,Romance,,True
3,1948,1949,BEST MOTION PICTURE,Warner Bros.,Johnny Belinda,False,Johnny Belinda-1948,Unrated,14 Sep 1948,102,...,Won 1 Oscar. 6 wins & 13 nominations total,0.0,7.7,5107,0,,Drama,,,True
4,1950,1951,BEST MOTION PICTURE,Columbia,Born Yesterday,False,Born Yesterday-1950,Not Rated,01 Feb 1951,103,...,Won 1 Oscar. 5 wins & 10 nominations total,0.0,7.5,11992,0,,Comedy,Drama,Romance,False


In [41]:
# Drop columns for first attempt
df_3 = df_3.drop(["Title/Year","year_film","category","name","film","Released", "Director", "Writer",
               "Actors", "Language", "Country", "Awards", "Production"],
              axis=1)

In [42]:
df_3.columns

Index(['year_ceremony', 'OscarsWinner', 'Rated', 'Runtime', 'Metascore',
       'imdbRating', 'imdbVotes', 'BoxOffice', 'Genre1', 'Genre2', 'Genre3',
       'GlobesWinner'],
      dtype='object')

In [43]:
# Run previously defined function to one-hot encode the genres
genre_list(df_3)

['Romance', 'Sport', 'Film-Noir', 'Western', 'Musical', 'Music', 'Comedy', 'War', 'Thriller', 'Drama', 'Crime', 'Animation', 'Horror', 'Adventure', 'Biography', nan, 'Mystery', 'Action', 'History', 'Fantasy', 'Sci-Fi', 'Family']


In [44]:
# Run previously defined function to one-hot encode the genres
genre_encoding(df_3)
df_3.drop(columns=["Genre1", "Genre2", "Genre3"], inplace=True)
list(df_3.columns)

['year_ceremony',
 'OscarsWinner',
 'Rated',
 'Runtime',
 'Metascore',
 'imdbRating',
 'imdbVotes',
 'BoxOffice',
 'GlobesWinner',
 'Romance',
 'Sport',
 'Film-Noir',
 'Western',
 'Musical',
 'Music',
 'Comedy',
 'War',
 'Thriller',
 'Drama',
 'Crime',
 'Animation',
 'Horror',
 'Adventure',
 'Biography',
 nan,
 'Mystery',
 'Action',
 'History',
 'Fantasy',
 'Sci-Fi',
 'Family']

In [45]:
# Drop the nan column
df_3 = df_3[['year_ceremony',
 'OscarsWinner',
 'Rated',
 'Runtime',
 'Metascore',
 'imdbRating',
 'imdbVotes',
 'BoxOffice',
 'GlobesWinner',
 'Fantasy',
 'Action',
 'Adventure',
 'Animation',
 'Romance',
 'Musical',
 'Biography',
 'Family',
 'Comedy',
 'Sci-Fi',
 'Film-Noir',
 'War',
 'Sport',
 'Music',
 'Drama',
 'Mystery',
 'Thriller',
 'History',
 'Western',
 'Crime',
 'Horror']]

In [46]:
df_3.dtypes

year_ceremony      int64
OscarsWinner        bool
Rated             object
Runtime            int64
Metascore        float64
imdbRating       float64
imdbVotes          int64
BoxOffice          int64
GlobesWinner        bool
Fantasy            int32
Action             int32
Adventure          int32
Animation          int32
Romance            int32
Musical            int32
Biography          int32
Family             int32
Comedy             int32
Sci-Fi             int32
Film-Noir          int32
War                int32
Sport              int32
Music              int32
Drama              int32
Mystery            int32
Thriller           int32
History            int32
Western            int32
Crime              int32
Horror             int32
dtype: object

In [47]:
# Adjust datatypes and get_dummies on necessary columns
df_3 = df_3.astype({"OscarsWinner": "int", "GlobesWinner":"int",
                                                   "year_ceremony":"str"})
df_3 = pd.get_dummies(df_3, columns=["Rated", "year_ceremony"])
df_3.dtypes

OscarsWinner            int32
Runtime                 int64
Metascore             float64
imdbRating            float64
imdbVotes               int64
                       ...   
year_ceremony_2016      uint8
year_ceremony_2017      uint8
year_ceremony_2018      uint8
year_ceremony_2019      uint8
year_ceremony_2020      uint8
Length: 108, dtype: object

In [48]:
# define model results as y and features as X
y = df_3["OscarsWinner"]
X = df_3.drop(columns = "OscarsWinner")


In [49]:
# Look at how results are split between winners (1) and losers (0)
y.value_counts()

0    170
1     50
Name: OscarsWinner, dtype: int64

In [50]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=29)

In [51]:
# Create a Logistic Regression Model and fit (train) or model using the training data
classifier = LogisticRegression(random_state=29)
classifier.fit(X_train, y_train)

In [52]:
# Run model on testing data
predictions = classifier.predict(X_test)
pred_3_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).tail(10)
pred_3_df

Unnamed: 0,Prediction,Actual
180,0,0
25,1,1
141,0,0
7,0,1
80,0,1
48,1,0
76,0,0
61,0,0
166,0,0
124,0,1


In [53]:
pred_3_df["Prediction"].value_counts()

0    8
1    2
Name: Prediction, dtype: int64

In [54]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

Confusion Matrix


Unnamed: 0,Predicted loser,Predicted winner
Actual loser,33,8
Actual winner,11,3


Accuracy Score : 0.6545454545454545
Balanced Accuracy Score : 0.509581881533101

Classification Report
              precision    recall  f1-score   support

       loser       0.75      0.80      0.78        41
      winner       0.27      0.21      0.24        14

    accuracy                           0.65        55
   macro avg       0.51      0.51      0.51        55
weighted avg       0.63      0.65      0.64        55



# Attempt 4: logistic regression. Data added: boolean value for if movie won best picture at the Golden Globes. Data removed: BoxOffice column.
Data from the Golden Globes are for the years 1944-2020, so all other years will be eliminated from this attempt.

Box Office values not great to compare across all movies due to inflation

Accuracy Score : 0.75

Balanced Accuracy Score : 0.52

In [55]:
# Copy the 3rd attempt (includes Globes data) but remove BoxOffice column
df_4 = df_3.drop(columns = "BoxOffice")

In [56]:
df_4.head()

Unnamed: 0,OscarsWinner,Runtime,Metascore,imdbRating,imdbVotes,GlobesWinner,Fantasy,Action,Adventure,Animation,...,year_ceremony_2011,year_ceremony_2012,year_ceremony_2013,year_ceremony_2014,year_ceremony_2015,year_ceremony_2016,year_ceremony_2017,year_ceremony_2018,year_ceremony_2019,year_ceremony_2020
0,1,126,90.0,7.0,12703,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,101,0.0,7.9,39085,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,118,0.0,7.2,17293,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,102,0.0,7.7,5107,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,103,0.0,7.5,11992,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [57]:
# define model results as y and features as X
y = df_4["OscarsWinner"]
X = df_4.drop(columns = "OscarsWinner")

In [58]:
# Look at how results are split between winners (1) and losers (0)
y.value_counts()

0    170
1     50
Name: OscarsWinner, dtype: int64

In [59]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=29)

In [60]:
# Create a Logistic Regression Model and fit (train) or model using the training data
classifier = LogisticRegression(random_state=29)
classifier.fit(X_train, y_train)

In [61]:
# Run model on testing data
predictions = classifier.predict(X_test)
pred_4_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).tail(10)
pred_4_df

Unnamed: 0,Prediction,Actual
180,0,0
25,0,1
141,0,0
7,0,1
80,0,1
48,0,0
76,0,0
61,0,0
166,0,0
124,0,1


In [62]:
pred_4_df["Prediction"].value_counts()

0    10
Name: Prediction, dtype: int64

In [63]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

Confusion Matrix


Unnamed: 0,Predicted loser,Predicted winner
Actual loser,40,1
Actual winner,13,1


Accuracy Score : 0.7454545454545455
Balanced Accuracy Score : 0.5235191637630662

Classification Report
              precision    recall  f1-score   support

       loser       0.75      0.98      0.85        41
      winner       0.50      0.07      0.12        14

    accuracy                           0.75        55
   macro avg       0.63      0.52      0.49        55
weighted avg       0.69      0.75      0.67        55



# Attempt 5: logistic regression with data scaling. Data added: boolean value for if movie won best picture at the Golden Globes.

Accuracy Score : 0.78

Balanced Accuracy Score : 0.64

In [64]:
# Copy the 3rd attempt (includes Globes data)
df_5 = df_3.copy()


In [65]:
# Scaling the numeric columns
df_5_scaled = StandardScaler().fit_transform(df_5[["Runtime", "Metascore", "imdbRating", "imdbVotes", "BoxOffice"]])

# Review the scaled data
df_5_scaled

array([[-0.25502248,  0.66598049, -1.44161403, -0.76856495, -0.80499656],
       [-1.25049704, -3.33111367,  0.46385712, -0.70860014, -0.80499656],
       [-0.57357434, -3.33111367, -1.018176  , -0.75813214, -0.80499656],
       ...,
       [ 0.18298632,  0.84362912,  0.46385712, -0.04439133, -0.80499656],
       [-0.53375536,  0.1330346 ,  1.09901417,  0.65735061,  0.73867953],
       [-0.01610859,  0.93245343,  1.73417122,  1.21156938, -0.28758888]])

In [66]:
# Create a DataFrame of the scaled data
df_5_scaled = pd.DataFrame(df_5_scaled, columns=["Runtime", "Metascore", "imdbRating", "imdbVotes", "BoxOffice"])

# Replace the original data with the columns of information from the scaled Data
df_5["Runtime"] = df_5_scaled["Runtime"]
df_5["Metascore"] = df_5_scaled["Metascore"]
df_5["imdbRating"] = df_5_scaled["imdbRating"]
df_5["imdbVotes"] = df_5_scaled["imdbVotes"]
df_5["BoxOffice"] = df_5_scaled["BoxOffice"]

# Review the DataFrame
df_5.head()

Unnamed: 0,OscarsWinner,Runtime,Metascore,imdbRating,imdbVotes,BoxOffice,GlobesWinner,Fantasy,Action,Adventure,...,year_ceremony_2011,year_ceremony_2012,year_ceremony_2013,year_ceremony_2014,year_ceremony_2015,year_ceremony_2016,year_ceremony_2017,year_ceremony_2018,year_ceremony_2019,year_ceremony_2020
0,1,-0.255022,0.66598,-1.441614,-0.768565,-0.804997,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,-1.250497,-3.331114,0.463857,-0.7086,-0.804997,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,-0.573574,-3.331114,-1.018176,-0.758132,-0.804997,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,-1.210678,-3.331114,0.040419,-0.78583,-0.804997,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,-1.170859,-3.331114,-0.383019,-0.770181,-0.804997,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [67]:
# define model results as y and features as X
y = df_5["OscarsWinner"]
X = df_5.drop(columns = "OscarsWinner")

In [68]:
# Look at how results are split between winners (1) and losers (0)
y.value_counts()

0    170
1     50
Name: OscarsWinner, dtype: int64

In [69]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=29)

In [70]:
# Create a Logistic Regression Model and fit (train) or model using the training data
classifier = LogisticRegression(random_state=29)
classifier.fit(X_train, y_train)

In [71]:
# Run model on testing data
predictions = classifier.predict(X_test)
pred_5_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).tail(10)
pred_5_df

Unnamed: 0,Prediction,Actual
180,0,0
25,0,1
141,0,0
7,0,1
80,1,1
48,1,0
76,0,0
61,0,0
166,0,0
124,0,1


In [72]:
pred_5_df["Prediction"].value_counts()

0    8
1    2
Name: Prediction, dtype: int64

In [73]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

Confusion Matrix


Unnamed: 0,Predicted loser,Predicted winner
Actual loser,38,3
Actual winner,9,5


Accuracy Score : 0.7818181818181819
Balanced Accuracy Score : 0.64198606271777

Classification Report
              precision    recall  f1-score   support

       loser       0.81      0.93      0.86        41
      winner       0.62      0.36      0.45        14

    accuracy                           0.78        55
   macro avg       0.72      0.64      0.66        55
weighted avg       0.76      0.78      0.76        55



## Attempt 6: logistic regression with data scaling. Data added: boolean value for if movie won best picture at the Golden Globes. Data removed: BoxOffice column.

Accuracy Score : 0.78

Balanced Accuracy Score : 0.64

In [74]:
df_6 = df_3.drop("BoxOffice", axis=1)
df_6.head()

Unnamed: 0,OscarsWinner,Runtime,Metascore,imdbRating,imdbVotes,GlobesWinner,Fantasy,Action,Adventure,Animation,...,year_ceremony_2011,year_ceremony_2012,year_ceremony_2013,year_ceremony_2014,year_ceremony_2015,year_ceremony_2016,year_ceremony_2017,year_ceremony_2018,year_ceremony_2019,year_ceremony_2020
0,1,126,90.0,7.0,12703,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,101,0.0,7.9,39085,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,118,0.0,7.2,17293,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,102,0.0,7.7,5107,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,103,0.0,7.5,11992,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [75]:
# Scaling the numeric columns
df_6_scaled = StandardScaler().fit_transform(df_6[["Runtime", "Metascore", "imdbRating", "imdbVotes"]])

# Review the scaled data
df_6_scaled

array([[-2.55022482e-01,  6.65980487e-01, -1.44161403e+00,
        -7.68564954e-01],
       [-1.25049704e+00, -3.33111367e+00,  4.63857118e-01,
        -7.08600143e-01],
       [-5.73574341e-01, -3.33111367e+00, -1.01817600e+00,
        -7.58132141e-01],
       [-1.21067806e+00, -3.33111367e+00,  4.04190850e-02,
        -7.85830237e-01],
       [-1.17085908e+00, -3.33111367e+00, -3.83018948e-01,
        -7.70181017e-01],
       [-1.88760076e+00,  6.21568329e-01,  6.75576134e-01,
        -5.52282960e-01],
       [-9.71764165e-01,  7.10392644e-01,  8.87295151e-01,
        -4.33469354e-01],
       [ 1.69610765e+00, -2.66674817e-01, -2.07677108e+00,
        -7.31727349e-01],
       [ 2.73140119e+00,  3.99507543e-01, -1.71299932e-01,
        -7.04156538e-01],
       [ 5.81176147e-01, -3.33111367e+00, -1.44161403e+00,
        -7.80904768e-01],
       [-6.53212306e-01,  4.42102839e-02,  1.52245220e+00,
        -4.98134613e-01],
       [ 4.21900218e-01, -3.33111367e+00,  4.63857118e-01,
      

In [76]:
# Create a DataFrame of the scaled data
df_6_scaled = pd.DataFrame(df_6_scaled, columns=["Runtime", "Metascore", "imdbRating", "imdbVotes"])

# Replace the original data with the columns of information from the scaled Data
df_6["Runtime"] = df_6_scaled["Runtime"]
df_6["Metascore"] = df_6_scaled["Metascore"]
df_6["imdbRating"] = df_6_scaled["imdbRating"]
df_6["imdbVotes"] = df_6_scaled["imdbVotes"]


# Review the DataFrame
df_6.head()

Unnamed: 0,OscarsWinner,Runtime,Metascore,imdbRating,imdbVotes,GlobesWinner,Fantasy,Action,Adventure,Animation,...,year_ceremony_2011,year_ceremony_2012,year_ceremony_2013,year_ceremony_2014,year_ceremony_2015,year_ceremony_2016,year_ceremony_2017,year_ceremony_2018,year_ceremony_2019,year_ceremony_2020
0,1,-0.255022,0.66598,-1.441614,-0.768565,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,-1.250497,-3.331114,0.463857,-0.7086,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,-0.573574,-3.331114,-1.018176,-0.758132,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,-1.210678,-3.331114,0.040419,-0.78583,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,-1.170859,-3.331114,-0.383019,-0.770181,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [77]:
# define model results as y and features as X
y = df_6["OscarsWinner"]
X = df_6.drop(columns = "OscarsWinner")

In [78]:
# Look at how results are split between winners (1) and losers (0)
y.value_counts()

0    170
1     50
Name: OscarsWinner, dtype: int64

In [79]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=29)

In [80]:
# Create a Logistic Regression Model and fit (train) or model using the training data
classifier = LogisticRegression(random_state=29)
classifier.fit(X_train, y_train)

In [81]:
# Run model on testing data
predictions = classifier.predict(X_test)
pred_6_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).tail(10)
pred_6_df

Unnamed: 0,Prediction,Actual
180,0,0
25,0,1
141,0,0
7,0,1
80,1,1
48,1,0
76,0,0
61,0,0
166,0,0
124,0,1


In [82]:
pred_6_df["Prediction"].value_counts()

0    8
1    2
Name: Prediction, dtype: int64

In [83]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

Confusion Matrix


Unnamed: 0,Predicted loser,Predicted winner
Actual loser,38,3
Actual winner,9,5


Accuracy Score : 0.7818181818181819
Balanced Accuracy Score : 0.64198606271777

Classification Report
              precision    recall  f1-score   support

       loser       0.81      0.93      0.86        41
      winner       0.62      0.36      0.45        14

    accuracy                           0.78        55
   macro avg       0.72      0.64      0.66        55
weighted avg       0.76      0.78      0.76        55



## Attempt 7: logistic regression with data scaling. Data added: boolean value for if movie won best picture at the Golden Globes. Data removed: BoxOffice column limiting to past 50 years of data

Accuracy Score : 0.77

Balanced Accuracy Score : 0.53

In [84]:
df_7 = pd.read_csv("Resources/combined_with_globes_clean.csv")
df_7 = df_7.loc[df_7["year_ceremony"]>=1973]
df_7.head()

Unnamed: 0,year_film,year_ceremony,category,name,film,OscarsWinner,Title/Year,Rated,Released,Runtime,...,Awards,Metascore,imdbRating,imdbVotes,BoxOffice,Production,Genre1,Genre2,Genre3,GlobesWinner
48,1972,1973,BEST PICTURE,"Cy Feuer, Producer",Cabaret,False,Cabaret-1972,PG,13 Feb 1972,124,...,Won 8 Oscars. 35 wins & 17 nominations total,80.0,7.8,57679,0,,Drama,Music,Musical,True
49,1972,1973,BEST PICTURE,"John Boorman, Producer",Deliverance,False,Deliverance-1972,R,18 Aug 1972,109,...,Nominated for 3 Oscars. 2 wins & 14 nomination...,80.0,7.7,115756,0,,Adventure,Drama,Thriller,False
50,1973,1974,BEST PICTURE,"Francis Ford Coppola, Producer; Gary Kurtz, C...",American Graffiti,False,American Graffiti-1973,PG,11 Aug 1973,110,...,Nominated for 5 Oscars. 9 wins & 13 nomination...,97.0,7.4,95294,115000000,,Comedy,Drama,,True
51,1974,1975,BEST PICTURE,"Robert Evans, Producer",Chinatown,False,Chinatown-1974,R,20 Jun 1974,130,...,Won 1 Oscar. 21 wins & 24 nominations total,92.0,8.2,339335,29200000,,Drama,Mystery,Thriller,True
52,1975,1976,BEST PICTURE,"Stanley Kubrick, Producer",Barry Lyndon,False,Barry Lyndon-1975,PG,18 Dec 1975,185,...,Won 4 Oscars. 17 wins & 14 nominations total,89.0,8.1,175953,0,,Adventure,Drama,War,False


In [85]:
df_7 = df_7.drop(["Title/Year","year_film","category","name","film","Released", "Director", "Writer",
               "Actors", "Language", "Country", "Awards", "Production", "BoxOffice"],
              axis=1)

In [86]:
df_7.columns

Index(['year_ceremony', 'OscarsWinner', 'Rated', 'Runtime', 'Metascore',
       'imdbRating', 'imdbVotes', 'Genre1', 'Genre2', 'Genre3',
       'GlobesWinner'],
      dtype='object')

In [87]:
# Run previously defined function to one-hot encode the genres
genre_list(df_7)

['Romance', 'Sport', 'Western', 'Musical', 'Music', 'Comedy', 'War', 'Thriller', 'Drama', 'Crime', 'Animation', 'Horror', 'Adventure', 'Biography', nan, 'Mystery', 'Action', 'History', 'Fantasy', 'Sci-Fi', 'Family']


In [88]:
# Run previously defined function to one-hot encode the genres
genre_encoding(df_7)
df_7.drop(columns=["Genre1", "Genre2", "Genre3"], inplace=True)
list(df_7.columns)

['year_ceremony',
 'OscarsWinner',
 'Rated',
 'Runtime',
 'Metascore',
 'imdbRating',
 'imdbVotes',
 'GlobesWinner',
 'Romance',
 'Sport',
 'Western',
 'Musical',
 'Music',
 'Comedy',
 'War',
 'Thriller',
 'Drama',
 'Crime',
 'Animation',
 'Horror',
 'Adventure',
 'Biography',
 nan,
 'Mystery',
 'Action',
 'History',
 'Fantasy',
 'Sci-Fi',
 'Family']

In [89]:
# Drop the nan column
df_7 = df_7[['year_ceremony',
 'OscarsWinner',
 'Rated',
 'Runtime',
 'Metascore',
 'imdbRating',
 'imdbVotes',
 'GlobesWinner',
 'Family',
 'Animation',
 'War',
 'Musical',
 'Horror',
 'Romance',
 'Fantasy',
 'Comedy',
 'Thriller',
 'Crime',
 'Adventure',
 'Music',
 'Sci-Fi',
 'Western',
 'Drama',
 'Mystery',
 'Sport',
 'Action',
 'Biography',
 'History']]

In [90]:
df_7.dtypes

year_ceremony      int64
OscarsWinner        bool
Rated             object
Runtime            int64
Metascore        float64
imdbRating       float64
imdbVotes          int64
GlobesWinner        bool
Family             int32
Animation          int32
War                int32
Musical            int32
Horror             int32
Romance            int32
Fantasy            int32
Comedy             int32
Thriller           int32
Crime              int32
Adventure          int32
Music              int32
Sci-Fi             int32
Western            int32
Drama              int32
Mystery            int32
Sport              int32
Action             int32
Biography          int32
History            int32
dtype: object

In [91]:
# Adjust datatypes and get_dummies on necessary columns
df_7 = df_7.astype({"OscarsWinner": "int", "GlobesWinner":"int",
                                                   "year_ceremony":"str"})
df_7 = pd.get_dummies(df_7, columns=["Rated", "year_ceremony"])
df_7.dtypes

OscarsWinner            int32
Runtime                 int64
Metascore             float64
imdbRating            float64
imdbVotes               int64
                       ...   
year_ceremony_2016      uint8
year_ceremony_2017      uint8
year_ceremony_2018      uint8
year_ceremony_2019      uint8
year_ceremony_2020      uint8
Length: 78, dtype: object

In [92]:
# Scaling the numeric columns
df_7_scaled = StandardScaler().fit_transform(df_7[["Runtime", "Metascore", "imdbRating", "imdbVotes"]])

# Review the scaled data
df_7_scaled

array([[-2.93162982e-01, -4.94781057e-02,  1.43765917e-01,
        -8.11779514e-01],
       [-9.58387690e-01, -4.94781057e-02, -8.10316989e-02,
        -6.86533249e-01],
       [-9.14039377e-01,  1.35512365e+00, -7.55424548e-01,
        -7.30660685e-01],
       [-2.70730986e-02,  9.42005488e-01,  1.04295638e+00,
        -2.04372763e-01],
       [ 2.41208417e+00,  6.94134590e-01,  8.18158766e-01,
        -5.56715087e-01],
       [-2.48814668e-01,  4.46263691e-01,  5.93361150e-01,
        -3.61514481e-01],
       [-2.93162982e-01,  5.28887324e-01,  8.18158766e-01,
         4.39811883e-01],
       [ 1.30337632e+00,  1.27250002e+00, -3.05829315e-01,
        -8.76320902e-01],
       [ 7.26848238e-01, -8.75714434e-01, -9.80222164e-01,
        -9.24455217e-01],
       [-4.26207924e-01,  1.98392793e-01,  8.18158766e-01,
        -5.80159000e-01],
       [-4.70556238e-01, -8.75714434e-01,  8.18158766e-01,
         3.74824088e-01],
       [-1.66796071e+00,  9.42005488e-01,  5.93361150e-01,
      

In [93]:
df_7.shape

(172, 78)

In [94]:
# define model results as y and features as X
y = df_7["OscarsWinner"]
X = df_7.drop(columns = "OscarsWinner")


In [95]:
# Look at how results are split between winners (1) and losers (0)
y.value_counts()

0    136
1     36
Name: OscarsWinner, dtype: int64

In [96]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=29)

In [97]:
# Create a Logistic Regression Model and fit (train) or model using the training data
classifier = LogisticRegression(random_state=29)
classifier.fit(X_train, y_train)

In [98]:
# Run model on testing data
predictions = classifier.predict(X_test)
pred_7_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).tail(10)
pred_7_df

Unnamed: 0,Prediction,Actual
209,0,0
79,0,1
96,0,0
120,0,0
108,0,1
101,0,0
204,0,0
157,0,0
78,0,0
122,0,0


In [99]:
pred_7_df["Prediction"].value_counts()

0    10
Name: Prediction, dtype: int64

In [100]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

Confusion Matrix


Unnamed: 0,Predicted loser,Predicted winner
Actual loser,32,1
Actual winner,9,1


Accuracy Score : 0.7674418604651163
Balanced Accuracy Score : 0.5348484848484849

Classification Report
              precision    recall  f1-score   support

       loser       0.78      0.97      0.86        33
      winner       0.50      0.10      0.17        10

    accuracy                           0.77        43
   macro avg       0.64      0.53      0.52        43
weighted avg       0.72      0.77      0.70        43



## Attempt 8: random forest with data scaling

Accuracy Score : 0.86

Balanced Accuracy Score : 0.59

In [101]:
df_8 = df_1.copy()

In [102]:
# Define features set
X = df_8.drop("OscarsWinner", axis=1)
y = df_8["OscarsWinner"].ravel()

In [103]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=29)

In [104]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [105]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [106]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [107]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=50, random_state=29)

In [108]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [109]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [110]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

Confusion Matrix


Unnamed: 0,Predicted loser,Predicted winner
Actual loser,110,1
Actual winner,21,0


Accuracy Score : 0.8333333333333334
Balanced Accuracy Score : 0.4954954954954955

Classification Report
              precision    recall  f1-score   support

       loser       0.84      0.99      0.91       111
      winner       0.00      0.00      0.00        21

    accuracy                           0.83       132
   macro avg       0.42      0.50      0.45       132
weighted avg       0.71      0.83      0.76       132



In [111]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_
# We can sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.1166600475922927, 'imdbVotes'),
 (0.09414758596199196, 'Metascore'),
 (0.09199878796212892, 'Runtime'),
 (0.0909659103505914, 'imdbRating'),
 (0.0860520635011237, 'BoxOffice'),
 (0.018010648079632187, 'Comedy'),
 (0.0176390460025469, 'Biography'),
 (0.014535732315484775, 'Rated_Approved'),
 (0.014328385824776722, 'Drama'),
 (0.014186688430786711, 'Musical'),
 (0.013619599286350113, 'Rated_G'),
 (0.013552154416853854, 'Rated_PG'),
 (0.012270513538959069, 'Adventure'),
 (0.011825761373293013, 'Romance'),
 (0.01174217975726186, 'Rated_PG-13'),
 (0.011723562719219151, 'History'),
 (0.01098329746531927, 'Rated_R'),
 (0.010976503255180412, 'War'),
 (0.009668190749806049, 'Sport'),
 (0.009555811573668976, 'Family'),
 (0.009251543697116055, 'Action'),
 (0.009009730935325606, 'Crime'),
 (0.008078103260834827, 'Thriller'),
 (0.007513504879908022, 'year_ceremony_1948'),
 (0.007266468079340025, 'year_ceremony_1978'),
 (0.007209277230501458, 'year_ceremony_1942'),
 (0.006611215369001602, 'year_

## Attempt 9: logistic regression with data scaling
Accuracy Score : 0.86

Balanced Accuracy Score : 0.59

In [112]:
df_9 = df_1.copy()
df_9.head()

Unnamed: 0,OscarsWinner,Runtime,Metascore,imdbRating,imdbVotes,BoxOffice,Fantasy,Family,Thriller,Crime,...,year_ceremony_2014,year_ceremony_2015,year_ceremony_2016,year_ceremony_2017,year_ceremony_2018,year_ceremony_2019,year_ceremony_2020,year_ceremony_2021,year_ceremony_2022,year_ceremony_2023
0,0,90,0.0,6.1,1338,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,107,0.0,7.0,2500,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,108,0.0,6.2,1862,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,90,0.0,6.5,1504,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,86,0.0,7.3,3416,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [113]:
# Scaling the numeric columns
df_9_scaled = StandardScaler().fit_transform(df_9[["Runtime", "Metascore", "imdbRating", "imdbVotes", "BoxOffice"]])

# Review the scaled data
df_9_scaled

array([[-1.34341974, -1.7936508 , -2.32858706, -0.62060012, -0.5755966 ],
       [-0.71607038, -1.7936508 , -0.89125124, -0.61761478, -0.5755966 ],
       [-0.67916748, -1.7936508 , -2.16888308, -0.61925389, -0.5755966 ],
       ...,
       [ 0.1326964 ,  0.44962695,  1.1849005 ,  0.98708486,  7.08477533],
       [ 0.76004576,  0.01822738, -0.4121393 , -0.23706559, -0.52648276],
       [-0.82677909,  0.47838692, -1.05095522, -0.53799189, -0.51744   ]])

In [114]:
# Create a DataFrame of the scaled data
df_9_scaled = pd.DataFrame(df_9_scaled, columns=["Runtime", "Metascore", "imdbRating", "imdbVotes", "BoxOffice"])

# Replace the original data with the columns of information from the scaled Data
df_9["Runtime"] = df_9_scaled["Runtime"]
df_9["Metascore"] = df_9_scaled["Metascore"]
df_9["imdbRating"] = df_9_scaled["imdbRating"]
df_9["imdbVotes"] = df_9_scaled["imdbVotes"]
df_9["BoxOffice"] = df_9_scaled["BoxOffice"]

# Review the DataFrame
df_9.head()

Unnamed: 0,OscarsWinner,Runtime,Metascore,imdbRating,imdbVotes,BoxOffice,Fantasy,Family,Thriller,Crime,...,year_ceremony_2014,year_ceremony_2015,year_ceremony_2016,year_ceremony_2017,year_ceremony_2018,year_ceremony_2019,year_ceremony_2020,year_ceremony_2021,year_ceremony_2022,year_ceremony_2023
0,0,-1.34342,-1.793651,-2.328587,-0.6206,-0.575597,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,-0.71607,-1.793651,-0.891251,-0.617615,-0.575597,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,-0.679167,-1.793651,-2.168883,-0.619254,-0.575597,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,-1.34342,-1.793651,-1.689771,-0.620174,-0.575597,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,-1.491031,-1.793651,-0.412139,-0.615261,-0.575597,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [115]:
# define model results as y and features as X
y = df_9["OscarsWinner"]
X = df_9.drop(columns = "OscarsWinner")

In [116]:
# Look at how results are split between winners (1) and losers (0)
y.value_counts()

0    443
1     84
Name: OscarsWinner, dtype: int64

In [117]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=29)

In [118]:
# Create a Logistic Regression Model and fit (train) or model using the training data
classifier = LogisticRegression(random_state=29)
classifier.fit(X_train, y_train)

In [119]:
# Run model on testing data
predictions = classifier.predict(X_test)
pred_9_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).tail(10)
pred_9_df

Unnamed: 0,Prediction,Actual
377,1,1
450,0,1
226,0,1
434,0,0
122,0,0
332,0,0
31,0,0
445,0,0
129,0,1
427,0,0


In [120]:
pred_9_df["Prediction"].value_counts()

0    9
1    1
Name: Prediction, dtype: int64

In [121]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

Confusion Matrix


Unnamed: 0,Predicted loser,Predicted winner
Actual loser,109,2
Actual winner,17,4


Accuracy Score : 0.8560606060606061
Balanced Accuracy Score : 0.5862290862290862

Classification Report
              precision    recall  f1-score   support

       loser       0.87      0.98      0.92       111
      winner       0.67      0.19      0.30        21

    accuracy                           0.86       132
   macro avg       0.77      0.59      0.61       132
weighted avg       0.83      0.86      0.82       132



## Attempt 10: random forest with data scaling, Data added: boolean value for if movie won best picture at the Golden Globes.

Accuracy Score : 0.75

Balanced Accuracy Score : 0.52

In [122]:
df_10 = df_3.copy()

In [123]:
# Define features set
X = df_10.drop("OscarsWinner", axis=1)
y = df_10["OscarsWinner"].ravel()

In [124]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=29)

In [125]:
# Creating StandardScaler instance
scaler = StandardScaler()

In [126]:
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)

In [127]:
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [128]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=50, random_state=29)

In [129]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [130]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

In [131]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

Confusion Matrix


Unnamed: 0,Predicted loser,Predicted winner
Actual loser,40,1
Actual winner,13,1


Accuracy Score : 0.7454545454545455
Balanced Accuracy Score : 0.5235191637630662

Classification Report
              precision    recall  f1-score   support

       loser       0.75      0.98      0.85        41
      winner       0.50      0.07      0.12        14

    accuracy                           0.75        55
   macro avg       0.63      0.52      0.49        55
weighted avg       0.69      0.75      0.67        55



In [132]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_
# We can sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.09789160943241212, 'BoxOffice'),
 (0.09620831434338725, 'imdbRating'),
 (0.09123237686058365, 'imdbVotes'),
 (0.08320322060109761, 'Metascore'),
 (0.0806223724319694, 'Runtime'),
 (0.05805978884917216, 'GlobesWinner'),
 (0.03699768624718095, 'Musical'),
 (0.022667660824041744, 'Rated_R'),
 (0.01493477290253348, 'Rated_Passed'),
 (0.014152218324631016, 'Rated_X'),
 (0.01410006639763255, 'Sport'),
 (0.013726156208656064, 'Comedy'),
 (0.01273310354392418, 'Rated_PG-13'),
 (0.011866831558936124, 'Romance'),
 (0.01143529854433123, 'year_ceremony_1981'),
 (0.011382223857506021, 'Crime'),
 (0.01107536140625937, 'Rated_Approved'),
 (0.010019266594785938, 'Rated_PG'),
 (0.010001159521846993, 'Drama'),
 (0.009708637216055676, 'year_ceremony_1977'),
 (0.009516810567165572, 'War'),
 (0.009017090059497407, 'Thriller'),
 (0.008946488858305802, 'year_ceremony_1990'),
 (0.008762076568775944, 'Adventure'),
 (0.00821321453151342, 'Biography'),
 (0.007916601105805074, 'year_ceremony_2017'),
 (0.00784

## Attempt 10: random forest with data scaling, Data added: boolean value for if movie won best picture at the Golden Globes. Data removed: BoxOffice column.

Accuracy Score : 0.75

Balanced Accuracy Score : 0.5

In [133]:
df_11 = df_6.copy()

In [134]:
# Define features set
X = df_11.drop("OscarsWinner", axis=1)
y = df_11["OscarsWinner"].ravel()

In [135]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=29)

In [136]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=50, random_state=29)

In [137]:
# Fitting the model
rf_model = rf_model.fit(X_train, y_train)

In [138]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test)

In [139]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

Confusion Matrix


Unnamed: 0,Predicted loser,Predicted winner
Actual loser,41,0
Actual winner,14,0


Accuracy Score : 0.7454545454545455
Balanced Accuracy Score : 0.5

Classification Report
              precision    recall  f1-score   support

       loser       0.75      1.00      0.85        41
      winner       0.00      0.00      0.00        14

    accuracy                           0.75        55
   macro avg       0.37      0.50      0.43        55
weighted avg       0.56      0.75      0.64        55



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [140]:
# Random Forests in sklearn will automatically calculate feature importance
importances = rf_model.feature_importances_
# We can sort the features by their importance
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.11472829165014922, 'Runtime'),
 (0.1142280817774619, 'imdbVotes'),
 (0.0884368471136361, 'Metascore'),
 (0.07643021431141918, 'imdbRating'),
 (0.07452171350081482, 'GlobesWinner'),
 (0.02587131452723942, 'Musical'),
 (0.019179891057973037, 'Rated_Passed'),
 (0.018390100132183514, 'Rated_PG'),
 (0.016094017220126974, 'Romance'),
 (0.016079961069618218, 'Rated_R'),
 (0.014936625320243648, 'Crime'),
 (0.014817093215248125, 'Sport'),
 (0.01337217567378619, 'Biography'),
 (0.012673948492389241, 'Rated_PG-13'),
 (0.012308440468716701, 'Fantasy'),
 (0.012307083209015711, 'Rated_G'),
 (0.012069755849322426, 'Comedy'),
 (0.011887628670700444, 'War'),
 (0.011381687418212647, 'year_ceremony_1971'),
 (0.011342209719894247, 'Rated_X'),
 (0.010801763369282968, 'History'),
 (0.010344376449215948, 'Drama'),
 (0.009835154984824542, 'year_ceremony_1990'),
 (0.009755531437765282, 'year_ceremony_1980'),
 (0.00928229021049107, 'year_ceremony_2018'),
 (0.008725036780774286, 'year_ceremony_1978'),
 (0.00

## Attempt 12: logistic regression, Data Added: boolean value for if movie won best picture at the Golden Globes, director, country, and producer

Accuracy Score : 0.78

Balanced Accuracy Score : 0.64

In [141]:
df_12 = df_5.copy()
df_12.head()

Unnamed: 0,OscarsWinner,Runtime,Metascore,imdbRating,imdbVotes,BoxOffice,GlobesWinner,Fantasy,Action,Adventure,...,year_ceremony_2011,year_ceremony_2012,year_ceremony_2013,year_ceremony_2014,year_ceremony_2015,year_ceremony_2016,year_ceremony_2017,year_ceremony_2018,year_ceremony_2019,year_ceremony_2020
0,1,-0.255022,0.66598,-1.441614,-0.768565,-0.804997,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,-1.250497,-3.331114,0.463857,-0.7086,-0.804997,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,-0.573574,-3.331114,-1.018176,-0.758132,-0.804997,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,-1.210678,-3.331114,0.040419,-0.78583,-0.804997,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,-1.170859,-3.331114,-0.383019,-0.770181,-0.804997,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [142]:
# Add in director, country, and productioncompany columns
df_12["Director"] = df["Director"]
df_12["Producer"] = df["name"]
df_12["Country"] = df["Country"]
df_12.head()

Unnamed: 0,OscarsWinner,Runtime,Metascore,imdbRating,imdbVotes,BoxOffice,GlobesWinner,Fantasy,Action,Adventure,...,year_ceremony_2014,year_ceremony_2015,year_ceremony_2016,year_ceremony_2017,year_ceremony_2018,year_ceremony_2019,year_ceremony_2020,Director,Producer,Country
0,1,-0.255022,0.66598,-1.441614,-0.768565,-0.804997,1,0,0,0,...,0,0,0,0,0,0,0,Alfred E. Green,Warner Bros.,United States
1,1,-1.250497,-3.331114,0.463857,-0.7086,-0.804997,1,0,0,0,...,0,0,0,0,0,0,0,Ernst Lubitsch,Paramount Famous Lasky,United States
2,1,-0.573574,-3.331114,-1.018176,-0.758132,-0.804997,1,0,0,0,...,0,0,0,0,0,0,0,John Ford,Samuel Goldwyn Productions,United States
3,0,-1.210678,-3.331114,0.040419,-0.78583,-0.804997,1,0,0,0,...,0,0,0,0,0,0,0,Frank Borzage,Fox,United States
4,0,-1.170859,-3.331114,-0.383019,-0.770181,-0.804997,0,0,0,0,...,0,0,0,0,0,0,0,King Vidor,Metro-Goldwyn-Mayer,United States


In [143]:
# Split director column into 3 columns since up to 3 directors can be featured
df_12[["Director1", "Director2", "Director3"]] = df_12["Director"].str.split(', ', expand=True)
df_12.drop(columns=["Director"], axis=1, inplace=True)
list(df_12.columns)

['OscarsWinner',
 'Runtime',
 'Metascore',
 'imdbRating',
 'imdbVotes',
 'BoxOffice',
 'GlobesWinner',
 'Fantasy',
 'Action',
 'Adventure',
 'Animation',
 'Romance',
 'Musical',
 'Biography',
 'Family',
 'Comedy',
 'Sci-Fi',
 'Film-Noir',
 'War',
 'Sport',
 'Music',
 'Drama',
 'Mystery',
 'Thriller',
 'History',
 'Western',
 'Crime',
 'Horror',
 'Rated_Approved',
 'Rated_G',
 'Rated_Not Rated',
 'Rated_PG',
 'Rated_PG-13',
 'Rated_Passed',
 'Rated_R',
 'Rated_Unrated',
 'Rated_X',
 'year_ceremony_1945',
 'year_ceremony_1946',
 'year_ceremony_1948',
 'year_ceremony_1949',
 'year_ceremony_1951',
 'year_ceremony_1953',
 'year_ceremony_1955',
 'year_ceremony_1957',
 'year_ceremony_1958',
 'year_ceremony_1959',
 'year_ceremony_1960',
 'year_ceremony_1961',
 'year_ceremony_1962',
 'year_ceremony_1963',
 'year_ceremony_1964',
 'year_ceremony_1965',
 'year_ceremony_1966',
 'year_ceremony_1967',
 'year_ceremony_1968',
 'year_ceremony_1969',
 'year_ceremony_1970',
 'year_ceremony_1971',
 'year_c

In [144]:
# Check what unique values we have for our three directors columns
directors = []
directors.extend(df_12["Director1"].values)
directors.extend(df_12["Director2"].values)
directors.extend(df_12["Director3"].values)
directors_list = list(set(directors))
print(directors_list)


['George Sidney', 'John Huston', 'Fred Zinnemann', 'Cecil B. DeMille', 'Peter Glenville', 'Michael Anderson', 'Edmund Goulding', 'John Schlesinger', 'Tay Garnett', 'Henry Hathaway', 'Jack Conway', 'Edward Dmytryk', 'Stanley Kramer', 'David Lean', 'William Keighley', 'Henry King', 'Jack Cardiff', 'Morton DaCosta', 'Alfred E. Green', 'Leslie Howard', 'Michael Powell', 'Frank Lloyd', 'Sam Wood', 'George Seaton', 'Lloyd Bacon', 'Michael Curtiz', 'Mark Robson', 'Frank Capra', 'Otto Lang', 'Anthony Harvey', 'George Cukor', 'Richard Brooks', 'Leo McCarey', 'W.S. Van Dyke', 'Gerd Oswald', 'Walter Lang', 'Orson Welles', 'Irving Rapper', 'Mervyn LeRoy', 'Joshua Logan', 'Delbert Mann', 'Robert Rossen', 'Otto Preminger', 'Ernst Lubitsch', 'Andrew Marton', 'Robert Stevenson', 'Sidney Lumet', 'Robert Z. Leonard', 'Irving Cummings', 'William Wyler', 'Edward F. Cline', 'Robert Mulligan', 'Richard Thorpe', 'Anthony Asquith', 'William Dieterle', 'Lewis Gilbert', 'Mike Nichols', 'Mark Sandrich', 'Michael

In [145]:
# Perform one-hot encoding on directors
for d in directors_list:
    # Create a column for each genre
    df_12[d] = 0
    # Columns will have a 0 or 1 if the movie is of the column's genre
    df_12[d] = ((df_12["Director1"] == d) | (df_12["Director2"] == d) | (df_12["Director3"] == d)).astype(int)
df_12.drop(columns=["Director1", "Director2", "Director3"], inplace=True)
list(df_12.columns)

  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0
  df_12[d] = 0


['OscarsWinner',
 'Runtime',
 'Metascore',
 'imdbRating',
 'imdbVotes',
 'BoxOffice',
 'GlobesWinner',
 'Fantasy',
 'Action',
 'Adventure',
 'Animation',
 'Romance',
 'Musical',
 'Biography',
 'Family',
 'Comedy',
 'Sci-Fi',
 'Film-Noir',
 'War',
 'Sport',
 'Music',
 'Drama',
 'Mystery',
 'Thriller',
 'History',
 'Western',
 'Crime',
 'Horror',
 'Rated_Approved',
 'Rated_G',
 'Rated_Not Rated',
 'Rated_PG',
 'Rated_PG-13',
 'Rated_Passed',
 'Rated_R',
 'Rated_Unrated',
 'Rated_X',
 'year_ceremony_1945',
 'year_ceremony_1946',
 'year_ceremony_1948',
 'year_ceremony_1949',
 'year_ceremony_1951',
 'year_ceremony_1953',
 'year_ceremony_1955',
 'year_ceremony_1957',
 'year_ceremony_1958',
 'year_ceremony_1959',
 'year_ceremony_1960',
 'year_ceremony_1961',
 'year_ceremony_1962',
 'year_ceremony_1963',
 'year_ceremony_1964',
 'year_ceremony_1965',
 'year_ceremony_1966',
 'year_ceremony_1967',
 'year_ceremony_1968',
 'year_ceremony_1969',
 'year_ceremony_1970',
 'year_ceremony_1971',
 'year_c

In [146]:
# Remove the 'None' column
df_12 = df_12[['OscarsWinner',
 'Runtime',
 'Metascore',
 'imdbRating',
 'imdbVotes',
 'BoxOffice',
 'GlobesWinner',
 'Fantasy',
 'Action',
 'Adventure',
 'Animation',
 'Romance',
 'Musical',
 'Biography',
 'Family',
 'Comedy',
 'Sci-Fi',
 'Film-Noir',
 'War',
 'Sport',
 'Music',
 'Drama',
 'Mystery',
 'Thriller',
 'History',
 'Western',
 'Crime',
 'Horror',
 'Rated_Approved',
 'Rated_G',
 'Rated_Not Rated',
 'Rated_PG',
 'Rated_PG-13',
 'Rated_Passed',
 'Rated_R',
 'Rated_Unrated',
 'Rated_X',
 'year_ceremony_1945',
 'year_ceremony_1946',
 'year_ceremony_1948',
 'year_ceremony_1949',
 'year_ceremony_1951',
 'year_ceremony_1953',
 'year_ceremony_1955',
 'year_ceremony_1957',
 'year_ceremony_1958',
 'year_ceremony_1959',
 'year_ceremony_1960',
 'year_ceremony_1961',
 'year_ceremony_1962',
 'year_ceremony_1963',
 'year_ceremony_1964',
 'year_ceremony_1965',
 'year_ceremony_1966',
 'year_ceremony_1967',
 'year_ceremony_1968',
 'year_ceremony_1969',
 'year_ceremony_1970',
 'year_ceremony_1971',
 'year_ceremony_1972',
 'year_ceremony_1973',
 'year_ceremony_1974',
 'year_ceremony_1975',
 'year_ceremony_1976',
 'year_ceremony_1977',
 'year_ceremony_1978',
 'year_ceremony_1979',
 'year_ceremony_1980',
 'year_ceremony_1981',
 'year_ceremony_1982',
 'year_ceremony_1983',
 'year_ceremony_1984',
 'year_ceremony_1985',
 'year_ceremony_1986',
 'year_ceremony_1987',
 'year_ceremony_1988',
 'year_ceremony_1989',
 'year_ceremony_1990',
 'year_ceremony_1991',
 'year_ceremony_1992',
 'year_ceremony_1993',
 'year_ceremony_1994',
 'year_ceremony_1995',
 'year_ceremony_1996',
 'year_ceremony_1997',
 'year_ceremony_1998',
 'year_ceremony_1999',
 'year_ceremony_2000',
 'year_ceremony_2001',
 'year_ceremony_2002',
 'year_ceremony_2003',
 'year_ceremony_2004',
 'year_ceremony_2005',
 'year_ceremony_2006',
 'year_ceremony_2007',
 'year_ceremony_2008',
 'year_ceremony_2009',
 'year_ceremony_2010',
 'year_ceremony_2011',
 'year_ceremony_2012',
 'year_ceremony_2013',
 'year_ceremony_2014',
 'year_ceremony_2015',
 'year_ceremony_2016',
 'year_ceremony_2017',
 'year_ceremony_2018',
 'year_ceremony_2019',
 'year_ceremony_2020',
 'Producer',
 'Country',
 'Fred Fleck',
 'Joseph L. Mankiewicz',
 'Sidney Lanfield',
 'Charles Walters',
 'John Schlesinger',
 'Edward Dmytryk',
 'Frank Borzage',
 'Michael Curtiz',
 'Vincente Minnelli',
 'J. Lee Thompson',
 'Ralph Nelson',
 'Victor Schertzinger',
 'Laurence Olivier',
 'Ernst Lubitsch',
 'Clarence Brown',
 'John Huston',
 'Leo McCarey',
 'Mark Robson',
 'Andrew Marton',
 'Arthur Penn',
 'John Ford',
 'Fred Coe',
 'John Farrow',
 'Stanley Kramer',
 'Jerome Robbins',
 'Orson Welles',
 'W.S. Van Dyke',
 'Jack Cardiff',
 'Elia Kazan',
 'Richard Thorpe',
 'George Sidney',
 'Carol Reed',
 'William Wyler',
 'Joseph Barbera',
 'David Lean',
 'Anthony Harvey',
 'Compton Bennett',
 'Billy Wilder',
 'Max Reinhardt',
 'John Wayne',
 'Cecil B. DeMille',
 'Robert Z. Leonard',
 'Alexander Hall',
 'Richard Fleischer',
 'Alfred E. Green',
 'Daniel Mann',
 'Howard Hawks',
 'Walter Lang',
 'Robert Mulligan',
 'George Seaton',
 'Gregory La Cava',
 'Henry Koster',
 'Norman Taurog',
 'Tony Richardson',
 'Emeric Pressburger',
 'Jack Conway',
 'Fred Zinnemann',
 'William Dieterle',
 'Robert Wise',
 'Anthony Asquith',
 'Michael Cacoyannis',
 'Gustav Machatý',
 'Joshua Logan',
 'Robert Stevenson',
 'King Vidor',
 'Gerd Oswald',
 'Tay Garnett',
 'Sidney Franklin',
 'Sam Wood',
 'Mitchell Leisen',
 'Morton DaCosta',
 'Michael Anderson',
 'Edward F. Cline',
 'Henry Hathaway',
 'Delbert Mann',
 'John M. Stahl',
 'Jean Negulesco',
 'William Keighley',
 'William A. Wellman',
 'Stanley Donen',
 'Alfred L. Werker',
 'Victor Fleming',
 'Leslie Howard',
 'Anthony Mann',
 'Michael Powell',
 'Anatole Litvak',
 'Charles Chaplin',
 'John Cromwell',
 'Stanley Kubrick',
 'Irving Cummings',
 'Mervyn LeRoy',
 'Albert Lewin',
 'Hal Mohr',
 'Roy Del Ruth',
 'Edmund Goulding',
 'Lloyd Bacon',
 'Sidney Lumet',
 'Robert Rossen',
 'Peter Glenville',
 'Frank Capra',
 'Otto Lang',
 'Alfred Hitchcock',
 'Frank Lloyd',
 'Otto Preminger',
 'Irving Rapper',
 'Norman Jewison',
 'Henry King',
 'Mike Nichols',
 'Lewis Gilbert',
 'Irving Pichel',
 'Richard Brooks',
 'Mark Sandrich',
 'George Cukor',
 'Herman Shumlin',
 'George Stevens',
 'William Hanna',
 'Lewis Milestone',
 'Ken Annakin']]

In [147]:
# Run get_dummies on our ProductionCompany Column
df_12 = pd.get_dummies(df_12, columns=["Producer", "Country"])


In [148]:
# define model results as y and features as X
y = df_12["OscarsWinner"]
X = df_12.drop(columns = "OscarsWinner")

In [149]:
# Look at how results are split between winners (1) and losers (0)
y.value_counts()

0    170
1     50
Name: OscarsWinner, dtype: int64

In [150]:
# Split into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=29)

In [151]:
# Create a Logistic Regression Model and fit (train) or model using the training data
classifier = LogisticRegression(random_state=29)
classifier.fit(X_train, y_train)

In [152]:
# Run model on testing data
predictions = classifier.predict(X_test)
pred_12_df = pd.DataFrame({"Prediction": predictions, "Actual": y_test}).tail(10)
pred_12_df

Unnamed: 0,Prediction,Actual
180,0,0
25,0,1
141,0,0
7,0,1
80,1,1
48,1,0
76,0,0
61,0,0
166,0,0
124,0,1


In [153]:
pred_12_df["Prediction"].value_counts()

0    8
1    2
Name: Prediction, dtype: int64

In [154]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual loser", "Actual winner"], columns=["Predicted loser", "Predicted winner"]
)

# Calculating the balanced accuracy score and accuracy score
acc_score = accuracy_score(y_test, predictions)
bal_acc_score = balanced_accuracy_score(y_test, predictions)
acc_score = accuracy_score(y_test, predictions)

# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print(f"Balanced Accuracy Score : {bal_acc_score}")
print("")
target_names = ["loser", "winner"]
print("Classification Report")
print(classification_report(y_test, predictions, target_names=target_names))

Confusion Matrix


Unnamed: 0,Predicted loser,Predicted winner
Actual loser,38,3
Actual winner,9,5


Accuracy Score : 0.7818181818181819
Balanced Accuracy Score : 0.64198606271777

Classification Report
              precision    recall  f1-score   support

       loser       0.81      0.93      0.86        41
      winner       0.62      0.36      0.45        14

    accuracy                           0.78        55
   macro avg       0.72      0.64      0.66        55
weighted avg       0.76      0.78      0.76        55



## The best attempt was Attempt 5, which used logistic regression, scaled the data, and included Golden globes results. Attempt 6 and 12 had the exact classification result, confusion matrix, and accuracy scores as Attempt 5. 
## We'll use attempt 5 as our model since, unlinke attempt 6, it still includes the Box Office column, which the random forest models we created seemed to give importance to. Attempt 12 has a lot more input data, with no improved results.

In [155]:
model_training_df = df_5.copy()
model_training_df.to_csv("Resources/model_training_data.csv", index=False)