# Download Final Data

# Import Libraries

In [9]:
import pandas as pd, numpy as np

from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from scipy import stats
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, f1_score, log_loss

from sklearn.feature_selection import RFE

## Import Data and define global variables

In [10]:
X_train = pd.read_csv('train_75.csv', index_col = 0)

y_train = pd.read_csv('train_target.csv', index_col = 0).iloc[:,0]

X_test = pd.read_csv('test_75.csv', index_col = 0)

# Transform!

In [11]:
def z_norm(x):
    
    res = (x - x.mean()) / x.std()
    
    return res

X_test.iloc[:,60:67].head()

Unnamed: 0,team1_elo,team2_elo,team1_elv_diff,team2_elv_diff,host_elv,team1_elv,team2_elv
0,100.0,57.142857,129.095609,172.104489,10.430255,139.525864,182.534744
1,100.0,0.0,129.095609,970.228803,10.430255,139.525864,980.659058
2,100.0,147.368421,129.095609,405.422925,10.430255,139.525864,415.85318
3,100.0,30.769231,129.095609,277.075727,10.430255,139.525864,287.505981
4,100.0,40.0,129.095609,18.804208,10.430255,139.525864,29.234463


In [12]:
for i in range(60, 68):
    X_test.iloc[:,i] = z_norm(X_test.iloc[:,66])

for i in range(60, 68):
    X_train.iloc[:,i] = z_norm(X_train.iloc[:,66])


X_test.iloc[:,60:67].head()

Unnamed: 0,team1_elo,team2_elo,team1_elv_diff,team2_elv_diff,host_elv,team1_elv,team2_elv
0,-0.271798,-0.271798,-0.271798,-0.271798,-0.271798,-0.271798,-0.271798
1,2.767374,2.767374,2.767374,2.767374,2.767374,2.767374,2.767374
2,0.616653,0.616653,0.616653,0.616653,0.616653,0.616653,0.616653
3,0.127921,0.127921,0.127921,0.127921,0.127921,0.127921,0.127921
4,-0.85555,-0.85555,-0.85555,-0.85555,-0.85555,-0.85555,-0.85555


In [13]:
def top_40_plus(model, X, y):

    rfe = RFE(model, 40)
    fit = rfe.fit(X, y)
    
    keep = []
    for i,a in enumerate(fit.support_):
        if (a == True):
            keep.append(i)
    
    return keep

In [14]:
def lr(X_train, y_train, X_test):

    lr = BaggingClassifier(base_estimator = LogisticRegression())
    lr.fit(X_train, y_train)
    y_pred = lr.predict(X_test)
    
    return y_pred

def lr_prob(X_train, y_train, X_test):

    lr = BaggingClassifier(base_estimator = LogisticRegression())
    lr.fit(X_train, y_train)
    y_pred = lr.predict_proba(X_test)
    
    return y_pred

In [15]:
def nb(X_train, y_train, X_test):
    
    nb = BaggingClassifier(base_estimator = GaussianNB())
    nb.fit(X_train,y_train)
    y_pred = nb.predict(X_test)
    
    return y_pred

def nb_prob(X_train, y_train, X_test):
    
    nb = BaggingClassifier(base_estimator = GaussianNB())
    nb.fit(X_train,y_train)
    y_pred = nb.predict_proba(X_test)
    
    return y_pred

In [16]:
def rf(X_train, y_train, X_test):
    
    rf = BaggingClassifier(base_estimator = RandomForestClassifier())
    rf.fit(X_train,y_train)
    y_pred = rf.predict(X_test)
    
    return y_pred

def rf_prob(X_train, y_train, X_test):
    
    rf = BaggingClassifier(base_estimator = RandomForestClassifier())
    rf.fit(X_train,y_train)
    y_pred = rf.predict_proba(X_test)
    
    return y_pred

In [17]:
def sv(X_train, y_train, X_test):
    
    sv = BaggingClassifier(base_estimator = SVC(probability = True))
    sv.fit(X_train,y_train)
    y_pred = sv.predict(X_test)
    
    return y_pred

def sv_prob(X_train, y_train, X_test):
    
    sv = BaggingClassifier(base_estimator = SVC(probability = True))
    sv.fit(X_train,y_train)
    y_pred = sv.predict_proba(X_test)
    
    return y_pred

In [27]:
def gb(X_train, y_train, X_test):

    gb = BaggingClassifier(base_estimator = GradientBoostingClassifier())
    gb.fit(X_train, y_train)
    y_pred = gb.predict(X_test)
    
    return y_pred

def gb_prob(X_train, y_train, X_test):

    gb = BaggingClassifier(base_estimator = GradientBoostingClassifier())
    gb.fit(X_train, y_train)
    y_pred = gb.predict_proba(X_test)
    
    return y_pred

In [19]:
#X_train, X_test, y_train, y_test = train_test_split(X_train ,y_train, test_size = 0.3, random_state = 42)

In [36]:
def ensemble(X_train, y_train, X_test):
    lr_keep = top_40_plus(LogisticRegression(), X_train, y_train)

    # Declared Globally
    #X_train, X_test, y_train, y_test = train_test_split(X ,y, test_size = 0.3, random_state = 42)
   
    lr_y = lr(X_train.iloc[:,lr_keep], y_train, X_test.iloc[:,lr_keep])
    sv_y = sv(X_train.iloc[:,lr_keep], y_train, X_test.iloc[:,lr_keep])
    rf_y = rf(X_train.iloc[:,lr_keep], y_train, X_test.iloc[:,lr_keep])
    nb_y = nb(X_train.iloc[:,lr_keep], y_train, X_test.iloc[:,lr_keep])
    gb_y = gb(X_train.iloc[:,lr_keep], y_train, X_test.iloc[:,lr_keep])
    
    
    lr_p = lr_prob(X_train.iloc[:,lr_keep], y_train, X_test.iloc[:,lr_keep])
    sv_p = sv_prob(X_train.iloc[:,lr_keep], y_train, X_test.iloc[:,lr_keep])
    rf_p = rf_prob(X_train.iloc[:,lr_keep], y_train, X_test.iloc[:,lr_keep])
    nb_p = nb_prob(X_train.iloc[:,lr_keep], y_train, X_test.iloc[:,lr_keep])
    gb_p = gb_prob(X_train.iloc[:,lr_keep], y_train, X_test.iloc[:,lr_keep])
    
    final_y = []
    
    for i in range(len(X_test)):
        
        if (stats.mode([lr_y[i], nb_y[i], rf_y[i], sv_y[i], gb_y[i]])[0][0] == 0):
            
            final_y.append(min([lr_p[i][1], nb_p[i][1], rf_p[i][1], sv_p[i][1], gb_p[i][1]]))   
                           
        else:
                           
            final_y.append(max([lr_p[i][1], nb_p[i][1], rf_p[i][1], sv_p[i][1], gb_p[i][1]]))                    
                       
        
    return final_y


In [37]:
e100 = ensemble(X_train, y_train, X_test)

In [38]:
e100

[0.9786330078377411,
 0.9932330184222111,
 0.9944484698274858,
 0.9685057286630452,
 0.984534985980041,
 0.9963301557827554,
 0.9990003019756865,
 0.9966909215694265,
 0.9915117915216121,
 0.9976288637201904,
 0.9952919471868658,
 0.9966314231262162,
 0.9993627201754636,
 0.9951259590622431,
 0.9993568550343779,
 0.999754359157985,
 0.9992479415786631,
 0.9767603960576323,
 0.955146257341753,
 0.9666982813742495,
 0.9477991947266251,
 0.9835786314472108,
 0.9962472440856154,
 0.9798281707419114,
 0.9939703107542526,
 0.9880327116710067,
 0.9915579628229482,
 0.9885866511403665,
 0.9669313974349374,
 0.994903673775184,
 0.99468149622311,
 0.9952326559773519,
 0.9979050645174317,
 0.998484024940914,
 0.9278869563715558,
 0.9468801667507034,
 0.9845067054896562,
 0.9840517919021066,
 0.9664091933089475,
 0.9763881461275611,
 0.9912281558705119,
 0.9973552838770322,
 0.9951910950623599,
 0.9815988209395409,
 0.9933467935767059,
 0.995668289519718,
 0.9962414594649317,
 0.9973559368390779,


In [44]:
pd.Series(ebinary).value_counts()/(1724+554)

1    0.756804
0    0.243196
dtype: float64

In [39]:
ebinary = []

for i in range(len(e100)):    
    if (e100[i] > 0.5): 
        ebinary.append(1)
    else:
        ebinary.append(0)

In [74]:
accuracy_score(e100, y_test)
precision_score(e100, y_test)
recall_score(e100, y_test)

0.7339449541284404

In [2]:
max([1,2,3,4])

4

In [42]:
pd.DataFrame({"Prob_Win":e100}).to_csv("Max_Dem_5.csv")