In [2]:
import pandas as pd
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
import math

In [3]:
# Load the gathered movie data and, in the same step, drop the columns that
# (1) have missing data for some movies, (2) hold string values, or
# (3) are the year column.
theMovies_DF = (
    pd.read_csv('theMovies.csv')
    .drop(columns=['Revenue', 'Budget', 'Prod Company', 'Title', 'Year'])
)
theMovies_DF

Unnamed: 0,Best Picture,imdbRating,Popularity,Rated_APPROVED,Rated_G,Rated_PG,Rated_PG-13,Rated_R,Rated_UNRATED,Release_Date_01,...,Genre_Short,Genre_Sport,Genre_Thriller,Genre_War,Genre_Western,Best Actor,Best Actress,Best Director,Best Supporting Actor,Best Supporting Actress
0,0,8.3,11.853,1,0,0,0,0,0,0,...,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0
1,0,7.8,6.868,0,0,0,0,0,1,0,...,0,0,1,0,0,0.0,1.0,0.0,0.0,0.0
2,1,7.2,3.592,1,0,0,0,0,0,0,...,0,0,0,0,0,1.0,0.0,1.0,1.0,0.0
3,0,7.6,1.400,1,0,0,0,0,0,0,...,0,0,0,1,0,0.0,0.0,0.0,0.0,0.0
4,0,5.8,0.771,0,0,0,0,1,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
5,0,7.2,4.672,1,0,0,0,0,0,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
6,0,7.4,2.620,0,0,0,0,0,1,0,...,0,0,0,0,0,0.0,0.0,0.0,0.0,0.0
7,1,8.0,5.871,0,0,0,0,0,1,0,...,0,0,0,0,0,1.0,0.0,1.0,0.0,0.0
8,0,8.0,6.637,1,0,0,0,0,0,0,...,0,0,0,0,0,0.0,1.0,0.0,0.0,0.0
9,0,7.6,8.192,0,0,0,0,0,1,0,...,0,0,1,0,0,0.0,0.0,0.0,0.0,0.0


In [4]:
# Features: every column except the label.
X = theMovies_DF.drop(columns=['Best Picture']).values
# Label: the 'Best Picture' column.
y = theMovies_DF['Best Picture'].values

# Recursive Feature Elimination (RFE): fit the estimator, discard the `step`
# (= 1) least important feature, and repeat until 10 features remain.
# For a linear model, importance comes from the fitted coefficients' magnitude,
# not from p-values. `max_iter` only caps the solver's iterations per fit; it
# is not the number of elimination rounds.
#
# `n_features_to_select` is passed by keyword: newer scikit-learn releases make
# it keyword-only, so the positional form `RFE(estimator, 10, ...)` fails there.
# `random_state` pins the stochastic 'sag' solver so re-running the cell gives
# the same ranking.
# NOTE(review): the features are not standardised before fitting; both 'sag'
# convergence and coefficient-based ranking are scale-sensitive — consider
# scaling the continuous columns first.
estimator = LogisticRegression(solver='sag', max_iter=10000, random_state=0)
selector = RFE(estimator, n_features_to_select=10, step=1)
selector.fit(X, y)
# Rank 1 marks a selected feature; larger ranks were eliminated earlier.
selector.ranking_

array([32, 34,  6, 24, 36, 13, 14, 31, 18,  1, 19, 35,  1,  5,  1, 17,  1,
       20, 21,  7, 40, 27, 30, 22, 12, 15,  1, 28, 23,  3,  2, 39,  9, 37,
        1,  1, 11, 16,  1, 25, 10, 26, 38,  8,  1, 33,  1, 29,  4])

In [5]:
# Features: every column except the label.
X = theMovies_DF.drop(columns=['Best Picture']).values
# Label: the 'Best Picture' column.
y = theMovies_DF['Best Picture'].values

# Plain logistic regression on all features (no feature elimination).
# 'sag' is a stochastic solver, so `random_state` is fixed to make the fitted
# coefficients reproducible across kernel restarts.
estimator = LogisticRegression(solver='sag', max_iter=10000, random_state=0)
estimator.fit(X, y)
# Binary problem: coef_ has a single row and intercept_ a single entry.
coeffs = estimator.coef_[0]
intercept = estimator.intercept_[0]


In [6]:
# Coefficients of the estimator that RFE refit on the selected features.
# Logistic model: p = e^z / (1 + e^z), with z = intercept + m1*x1 + m2*x2 + ...
# where each m is a coefficient and each x the matching feature value.
top10_coef_matrix = selector.estimator_.coef_
coeffs_top10 = top10_coef_matrix[0]
print('coefficients', top10_coef_matrix)

coefficients [[-0.82289757  1.1136191  -1.01995011 -0.7920987   0.77112844 -0.78837897
   0.82733004 -0.77007405  1.22956734  3.59754769]]


In [7]:
# Map each RFE rank back to its feature name and keep the selected ones
# (rank 1), in column order so they line up with the RFE coefficients.
features = theMovies_DF.drop(columns=['Best Picture']).columns.tolist()
feature_ranking = list(selector.ranking_)
Rank_DF = zip(feature_ranking, features)
parameters = [name for position, name in Rank_DF if position == 1]
    

In [8]:
# Pair each RFE-selected feature with its coefficient and show the table,
# largest coefficient first.
top_10 = pd.DataFrame(
    {
        "Feature": parameters,
        "Coefficient": coeffs_top10,
    }
)
top_10.sort_values(by='Coefficient', ascending=False)

Unnamed: 0,Feature,Coefficient
9,Best Director,3.597548
8,Best Actor,1.229567
1,Release_Date_05,1.113619
6,Genre_Musical,0.82733
4,Genre_Crime,0.771128
7,Genre_Sci-Fi,-0.770074
5,Genre_Music,-0.788379
3,Release_Date_09,-0.792099
0,Release_Date_02,-0.822898
2,Release_Date_07,-1.01995


In [9]:
# Table of every feature and its coefficient from the plain logistic
# regression (not RFE), shown largest coefficient first.
all_feat = pd.DataFrame(
    {
        "Feature": features,
        "Coefficient": coeffs,
    }
)
all_feat.sort_values(by='Coefficient', ascending=False)

Unnamed: 0,Feature,Coefficient
46,Best Director,2.939211
44,Best Actor,1.049609
12,Release_Date_05,0.422584
26,Genre_Crime,0.402428
48,Best Supporting Actress,0.395913
35,Genre_Musical,0.389108
32,Genre_History,0.259904
17,Release_Date_10,0.226638
7,Rated_UNRATED,0.198653
18,Release_Date_11,0.168029


# Predicting 2018 Best Picture

In [10]:
# Load the contenders to score from CSV and display the frame.
# getprediction() looks films up by the 'Title' column of this frame.
contenders=pd.read_csv('theMovies_contenders.csv')
contenders

Unnamed: 0,Title,Best Director,Best Actor,Best Actress,Best Supporting Actor,Best Supporting Actress,Popularity,Rated_PG-13,Rated_R,Release_Date_02,...,Genre_Action,Genre_Adventure,Genre_Biography,Genre_Comedy,Genre_Crime,Genre_Drama,Genre_History,Genre_Music,Genre_Romance,Genre_Sci-Fi
0,Roma,1,0,0,0,0,5.243,0,0,0,...,0,0,0,0,0,1,0,0,0,0
1,The Favourite,1,0,1,0,1,14.494,0,1,0,...,0,0,1,0,0,1,0,0,0,0
2,If Beale Street Could Talk,0,0,1,0,1,10.871,0,0,0,...,0,0,0,0,1,1,0,0,1,0
3,First Man,1,1,0,0,1,99.314,0,0,0,...,0,0,1,0,0,1,1,0,0,0
4,The Front Runner,0,0,0,0,0,4.898,0,0,0,...,0,0,1,0,0,1,0,0,0,0
5,A Star Is Born,1,1,1,1,0,101.953,0,1,0,...,0,0,0,0,0,1,0,1,1,0
6,Black Panther,1,0,0,1,0,52.407,1,0,1,...,1,1,0,0,0,0,0,0,0,1
7,Green Book,0,0,0,1,0,13.276,1,0,0,...,0,0,1,1,0,1,0,0,0,0
8,BlacKkKlansman,0,0,0,0,0,54.589,0,1,0,...,0,0,1,1,1,1,0,0,0,0
9,Can You Ever Forgive Me?,0,0,1,1,0,5.828,0,1,0,...,0,0,1,1,1,1,0,0,0,0


In [11]:
def getprediction(df, title, coef_table=None, bias=None):
    """Return the logistic-regression probability that ``title`` wins.

    The result is ``sigmoid(sum(coefficient_i * value_i) + bias)``, summed over
    every column of ``df`` that also appears in the coefficient table. Columns
    without a coefficient (such as ``Title``) are ignored, and features the
    model knows but ``df`` lacks contribute nothing to the score.

    Parameters
    ----------
    df : pandas.DataFrame
        Candidate films; must contain a ``Title`` column.
    title : str
        Title of the film to score; must match exactly one row of ``df``.
    coef_table : pandas.DataFrame, optional
        Frame with ``Feature`` and ``Coefficient`` columns. Defaults to the
        notebook-level ``all_feat``.
    bias : float, optional
        Model intercept. Defaults to the notebook-level ``intercept``.

    Returns
    -------
    float
        Probability between 0 and 1.

    Raises
    ------
    ValueError
        If ``title`` matches zero rows or more than one row of ``df``.
    """
    # Fall back to the notebook globals so existing two-argument calls work.
    if coef_table is None:
        coef_table = all_feat
    if bias is None:
        bias = intercept

    matches = df[df['Title'] == title]
    if len(matches) != 1:
        raise ValueError(
            "expected exactly one row titled %r, found %d" % (title, len(matches))
        )
    film = matches.iloc[0]

    # Feature name -> coefficient; the first occurrence wins on duplicates.
    weights = {}
    for feature, coefficient in zip(coef_table['Feature'], coef_table['Coefficient']):
        weights.setdefault(feature, coefficient)

    # Explicit membership test instead of a bare ``except: pass`` so that real
    # errors are no longer silently swallowed.
    score = 0.0
    for column in df.columns:
        if column in weights:
            score += float(film[column]) * float(weights[column])
    score += float(bias)

    # Numerically stable sigmoid: never exponentiates a large positive number,
    # so extreme scores saturate to 0.0 / 1.0 instead of raising OverflowError.
    if score >= 0:
        return 1.0 / (1.0 + math.exp(-score))
    odds = math.exp(score)
    return odds / (1.0 + odds)

In [23]:
# Score every contender with the fitted model and collect one record per film.
predictions = [
    {'Title': film_title,
     'Liklihood of Winning': getprediction(contenders, film_title)}
    for film_title in contenders['Title'].tolist()
]

Predictions = pd.DataFrame(predictions)

In [24]:
# Rank the contenders from most to least likely winner and display the table.
Predictions = Predictions.sort_values(by='Liklihood of Winning', ascending=False)
Predictions

Unnamed: 0,Liklihood of Winning,Title
3,0.999982,First Man
5,0.999971,A Star Is Born
6,0.995853,Black Panther
1,0.983005,The Favourite
8,0.963516,BlacKkKlansman
0,0.959886,Roma
2,0.77859,If Beale Street Could Talk
9,0.683615,Can You Ever Forgive Me?
7,0.650271,Green Book
4,0.504349,The Front Runner


# Predicting 2016 Best Picture

In [25]:
# Load the 2016 films to score from CSV and display the frame.
films2016=pd.read_csv('theMovies2016.csv')
films2016

Unnamed: 0,Title,Best Director,Best Actor,Best Actress,Best Supporting Actor,Best Supporting Actress,imdbRating,Popularity,Rated_PG,Rated_PG-13,...,Genre_Drama,Genre_History,Genre_Music,Genre_Musical,Genre_Mystery,Genre_Romance,Genre_Sci-Fi,Genre_Thriller,Genre_War,Genre_Western
0,La La Land,1,0,1,0,0,8.1,20.783,0,1,...,1,0,1,1,0,1,0,0,0,0
1,Arrival,0,0,0,0,0,7.9,19.625,0,1,...,1,0,0,0,1,0,1,1,0,0
2,Lion,0,0,0,0,0,8.1,17.488,0,1,...,1,0,0,0,0,0,0,0,0,0
3,Hell or High Water,0,0,0,0,0,7.6,8.546,0,0,...,1,0,0,0,0,0,0,1,0,1
4,Hidden Figures,0,0,0,0,0,7.8,16.528,1,0,...,1,1,0,0,0,0,0,0,0,0
5,Moonlight,0,0,0,1,0,7.4,12.346,0,0,...,1,0,0,0,0,0,0,0,0,0
6,Hacksaw Ridge,0,0,0,0,0,8.1,14.517,0,0,...,1,1,0,0,0,0,0,0,1,0
7,Manchester by the Sea,0,1,0,0,0,7.8,10.828,0,0,...,1,0,0,0,0,0,0,0,0,0
8,Fences,0,0,0,0,1,7.2,8.971,0,1,...,1,0,0,0,0,0,0,0,0,0


In [28]:
# Score every 2016 film with the fitted model, one record per film, then show
# the results in their original (unsorted) order.
predictions2016 = [
    {'Title': film_title,
     'Liklihood of Winning': getprediction(films2016, film_title)}
    for film_title in films2016['Title'].tolist()
]

Predictions2016 = pd.DataFrame(predictions2016)
Predictions2016

Unnamed: 0,Liklihood of Winning,Title
0,0.689361,La La Land
1,0.049983,Arrival
2,0.062471,Lion
3,0.066019,Hell or High Water
4,0.095323,Hidden Figures
5,0.091938,Moonlight
6,0.084243,Hacksaw Ridge
7,0.174039,Manchester by the Sea
8,0.092367,Fences


In [29]:
# Rank the 2016 films from most to least likely winner and display the table.
Predictions2016 = Predictions2016.sort_values(by='Liklihood of Winning', ascending=False)
Predictions2016

Unnamed: 0,Liklihood of Winning,Title
0,0.689361,La La Land
7,0.174039,Manchester by the Sea
4,0.095323,Hidden Figures
8,0.092367,Fences
5,0.091938,Moonlight
6,0.084243,Hacksaw Ridge
3,0.066019,Hell or High Water
2,0.062471,Lion
1,0.049983,Arrival


# Predicting 2017 Best Picture

In [30]:
# Load the 2017 films to score from CSV and display the frame.
films2017=pd.read_csv('theMovies2017.csv')
films2017

Unnamed: 0,Title,Best Director,Best Actor,Best Actress,Best Supporting Actor,Best Supporting Actress,imdbRating,Popularity,Rated_PG-13,Rated_R,...,Genre_Comedy,Genre_Crime,Genre_Drama,Genre_Fantasy,Genre_History,Genre_Horror,Genre_Mystery,Genre_Romance,Genre_Thriller,Genre_War
0,The Shape of Water,1,0,0,0,0,7.4,21.903,0,1,...,0,0,1,1,0,0,0,1,1,0
1,Darkest Hour,0,1,0,0,0,7.4,13.953,1,0,...,0,0,1,0,1,0,0,0,0,1
2,Dunkirk,0,0,0,0,0,7.9,21.633,1,0,...,0,0,1,0,1,0,0,0,1,1
3,Phantom Thread,0,0,0,0,0,7.5,10.017,0,1,...,0,0,1,0,0,0,0,1,0,0
4,The Post,0,0,0,0,0,7.2,11.181,1,0,...,0,0,1,0,1,0,0,0,1,0
5,Call Me by Your Name,0,0,0,0,0,8.0,16.849,0,1,...,0,0,1,0,0,0,0,1,0,0
6,Lady Bird,0,0,0,0,0,7.5,17.112,0,1,...,1,0,1,0,0,0,0,0,0,0
7,"Three Billboards Outside Ebbing, Missouri",0,0,1,1,0,8.2,15.942,0,1,...,1,1,1,0,0,0,0,0,0,0
8,Get Out,0,0,0,0,0,7.7,16.42,0,1,...,0,0,0,0,0,1,1,0,1,0


In [32]:
# Score every 2017 film with the fitted model, one record per film, then show
# the results in their original (unsorted) order.
predictions2017 = [
    {'Title': film_title,
     'Liklihood of Winning': getprediction(films2017, film_title)}
    for film_title in films2017['Title'].tolist()
]

Predictions2017 = pd.DataFrame(predictions2017)
Predictions2017

Unnamed: 0,Liklihood of Winning,Title
0,0.55912,The Shape of Water
1,0.211013,Darkest Hour
2,0.108949,Dunkirk
3,0.062511,Phantom Thread
4,0.067355,The Post
5,0.078735,Call Me by Your Name
6,0.112013,Lady Bird
7,0.133161,"Three Billboards Outside Ebbing, Missouri"
8,0.080379,Get Out


In [33]:
# Rank the 2017 films from most to least likely winner and display the table.
Predictions2017 = Predictions2017.sort_values(by='Liklihood of Winning', ascending=False)
Predictions2017

Unnamed: 0,Liklihood of Winning,Title
0,0.55912,The Shape of Water
1,0.211013,Darkest Hour
7,0.133161,"Three Billboards Outside Ebbing, Missouri"
6,0.112013,Lady Bird
2,0.108949,Dunkirk
8,0.080379,Get Out
5,0.078735,Call Me by Your Name
4,0.067355,The Post
3,0.062511,Phantom Thread
