In [1]:
import numpy as np
import pandas as pd

In [2]:
# Read in the files
train_data = pd.read_csv('../assets/train_complete_mg.csv')
test_data = pd.read_csv('../assets/test_complete_mg.csv')

In [3]:
# Continuing to borrow Ritika's EDA function.
# Need to ensure types are workable

def eda(dataframe):
    print "dataframe types \n", dataframe.dtypes, "\n"
    print "dataframe shape \n", dataframe.shape, "\n"

In [4]:
eda(train_data)

dataframe types 
Species                   object
Block                      int64
Trap                      object
Latitude                 float64
Longitude                float64
AddressAccuracy            int64
NumMosquitos               int64
WnvPresent                 int64
YMD                       object
PIPIENS                    int64
RESTUANS                   int64
SALINARIUS                 int64
TERRITANS                  int64
Station                    int64
Tmax                       int64
Tmin                       int64
DewPoint                   int64
WetBulb                  float64
Sunrise                   object
Sunset                    object
PrecipTotal              float64
StnPressure              float64
SeaLevel                 float64
ResultSpeed              float64
ResultDir                  int64
AvgSpeed                 float64
Tavg_int                   int64
Normal_Temp              float64
Depart_calc              float64
RA                        

In [19]:
##### Mega Function to run all of the models of relevance ###
# Mostly derived from 6.08 work and Project 3, with a couple modifications #

### NEED TO MODIFY FUNCTION BEFORE RUNNING MODEL 
### IF WISHING TO USE model_metric (also needs commented back in)

### Scale = "Yes"; skip otherwise
### TTS = "Yes" or "No" on whether or not to run a train/test split
### Boost = "Yes"; skip otherwise
### X = Pre-defined dataframe and its columns of interest
### y = Pre-defined dataframe target column
### model_to_run = Model() wishing to run this test on
### grid_search_dictionary = parameter dictionary to feed 
            ### into grid_search for the model of interest

def evaluate_model(Scale, TTS, Boost, X, y, model_to_run, grid_search_dictionary):
    
    if Scale == "Yes":
        from sklearn.preprocessing import Normalizer
        normalizer = Normalizer()
        X = normalizer.fit_transform(X)
        
    if TTS == "Yes":
        # Perform the train/test split:
        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)   
        
    if TTS == "No":
        # Skip TTS:
        X_train = X
        X_test = X
        y_train = y
        y_test = y
    
    ##### Run the model
    model = model_to_run
    model.fit(X_train, y_train)
    
    # Generate predictions
    predictions = model.predict(X_test)
    probabilities = model.predict_proba(X_test) # Need the second value
    
    probability_list = [] # Becomes the list of probabilities
    for i in probabilities:
        probability_list.append(i[1])
    probabilities = probability_list
        
    
    # Create cross-val score on train
    from sklearn.model_selection import cross_val_score
    
    # Perform 10-fold cross validation
    trn_cv_scores = cross_val_score(model, X_train, y_train, cv=10)
    
    # Create scores on test group
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report
    from sklearn.metrics import roc_auc_score
    
    # Comment in if running a model with this attribute
    # model_metric = model.feature_importances_   
    # print model_metric
    
        # Model specific output attribute
        # Have to reassign each function run
    
    acc_score = accuracy_score(y_test, predictions)
    con_matrix = confusion_matrix(y_test, predictions)
    class_rep = classification_report(y_test, predictions)
    roc_auc = roc_auc_score(y_test, probabilities)

    
    ##### Run Bagging
    from sklearn.ensemble import BaggingClassifier
    
    bagging = BaggingClassifier(base_estimator=model)
    bagging.fit(X_train, y_train)
    bagging_predictions = bagging.predict(X_test)
    bagging_probabilities = bagging.predict_proba(X_test)
    
    bag_probability_list = [] # Becomes the list of probabilities
    for i in bagging_probabilities:
        bag_probability_list.append(i[1])
    bagging_probabilities = bag_probability_list
    
    bg_acc_score = accuracy_score(y_test, bagging_predictions)
    bg_con_matrix = confusion_matrix(y_test, bagging_predictions)
    bg_class_rep = classification_report(y_test, bagging_predictions)
    bg_roc_auc = roc_auc_score(y_test, bagging_probabilities)
    
    
    ##### Run a GridSearch
    from sklearn.model_selection import GridSearchCV
    
    # Run 10-fold cross validation on the bagged model
    grid_search = GridSearchCV(model, grid_search_dictionary, cv=10, n_jobs = -1)
    grid_search.fit(X_train, y_train)
    grid_search_predictions = grid_search.predict(X_test)
    grid_search_probabilities = grid_search.predict_proba(X_test)
    
    gs_probability_list = [] # Becomes the list of probabilities
    for i in grid_search_probabilities:
        gs_probability_list.append(i[1])
    grid_search_probabilities = gs_probability_list
    
    gs_acc_score = accuracy_score(y_test, grid_search_predictions)
    gs_con_matrix = confusion_matrix(y_test, grid_search_predictions)
    gs_class_rep = classification_report(y_test, grid_search_predictions)
    gs_roc_auc = roc_auc_score(y_test, grid_search_probabilities)
    
    
    # See the outputs
    print "Cross Val Scores \n", trn_cv_scores, "\n"
    # print "Model Attribute \n", model_metric, "\n"
            # Add back in if used above
    print "Accuracy Score \n", acc_score, "\n"
    print "Confusion Matrix \n", con_matrix, "\n"
    print "Classification Report \n", class_rep, "\n"
    print "ROC-AUC Score \n", roc_auc, "\n"
    
    print "Bagging Classifiers \n", bagging.base_estimator_, "\n"
    print "Bagging Accuracy Score \n", bg_acc_score, "\n"
    print "Bagging Confusion Matrix \n", bg_con_matrix, "\n"
    print "Bagging Classification Report \n", bg_class_rep
    print "Bagging ROC-AUC Score \n", bg_roc_auc, "\n"
    
    print "Grid Search Best Params \n", grid_search.best_params_, "\n"
    print "Grid Search Best Score \n", grid_search.best_estimator_, "\n"
    print "GS Accuracy Score \n", gs_acc_score, "\n"
    print "GS Confusion Matrix \n", gs_con_matrix, "\n"
    print "GS Classification Report \n", gs_class_rep, "\n"
    print "GS ROC-AUC Score \n", gs_roc_auc, "\n"
    
    
        ##### Run Boosting (change if not supported)
        
    if Boost == "Yes":
        
        # AdaBoost
    
        from sklearn.ensemble import AdaBoostClassifier

        aboosting = AdaBoostClassifier(base_estimator=model)
        aboosting.fit(X_train, y_train)
        aboosting_predictions = aboosting.predict(X_test)
        aboosting_probabilities = aboosting.predict_proba(X_test)

        aboost_probability_list = [] # Becomes the list of probabilities
        for i in aboosting_probabilities:
            aboost_probability_list.append(i[1])
        aboosting_probabilities = aboost_probability_list

        abst_acc_score = accuracy_score(y_test, aboosting_predictions)
        bbst_con_matrix = confusion_matrix(y_test, aboosting_predictions)
        abst_class_rep = classification_report(y_test, aboosting_predictions)
        abst_roc_auc = roc_auc_score(y_test, aboosting_probabilities)
        
        print "AdaBoosting Classifiers \n", aboosting.base_estimator_, "\n"
        print "AdaBoosting Accuracy Score \n", abst_acc_score, "\n"
        print "AdaBoosting Confusion Matrix \n", abst_con_matrix, "\n"
        print "AdaBoosting Classification Report \n", abst_class_rep, "\n"
        print "AdaBoosting ROC-AUC Score \n", abst_roc_auc, "\n"
        
        # GradientBoosting
    
        from sklearn.ensemble import GradientBoostingClassifier

        gboosting = GradientBoostingClassifier(init=model)
        gboosting.fit(X_train, y_train)
        gboosting_predictions = gboosting.predict(X_test)
        gboosting_probabilities = gboosting.predict_proba(X_test)

        gboost_probability_list = [] # Becomes the list of probabilities
        for i in gboosting_probabilities:
            gboost_probability_list.append(i[1])
        gboosting_probabilities = gboost_probability_list

        gbst_acc_score = accuracy_score(y_test, gboosting_predictions)
        gbst_con_matrix = confusion_matrix(y_test, gboosting_predictions)
        gbst_class_rep = classification_report(y_test, gboosting_predictions)
        gbst_roc_auc = roc_auc_score(y_test, gboosting_probabilities)
        
        print "GradientBoosting Classifiers \n", gboosting.base_estimator_, "\n"
        print "GradientBoosting Accuracy Score \n", gbst_acc_score, "\n"
        print "GradientBoosting Confusion Matrix \n", gbst_con_matrix, "\n"
        print "GradientBoosting Classification Report \n", gbst_class_rep, "\n"
        print "GradientBoosting ROC-AUC Score \n", gbst_roc_auc, "\n"
    


In [6]:
train_data.columns

Index([u'Species', u'Block', u'Trap', u'Latitude', u'Longitude',
       u'AddressAccuracy', u'NumMosquitos', u'WnvPresent', u'YMD', u'PIPIENS',
       u'RESTUANS', u'SALINARIUS', u'TERRITANS', u'Station', u'Tmax', u'Tmin',
       u'DewPoint', u'WetBulb', u'Sunrise', u'Sunset', u'PrecipTotal',
       u'StnPressure', u'SeaLevel', u'ResultSpeed', u'ResultDir', u'AvgSpeed',
       u'Tavg_int', u'Normal_Temp', u'Depart_calc', u'RA', u'BR', u'TS', u'HZ',
       u'SN', u'FG', u'FG+', u'FU', u'DZ', u'VC', u'MI', u'BC',
       u'Days_Since_Spray', u'Dist_to_Closest_Spray', u'Week',
       u'DaylightMinutes'],
      dtype='object')

In [7]:
test_data.columns

Index([u'Species', u'Block', u'Trap', u'Latitude', u'Longitude',
       u'AddressAccuracy', u'YMD', u'PIPIENS', u'RESTUANS', u'SALINARIUS',
       u'TERRITANS', u'Station', u'Tmax', u'Tmin', u'DewPoint', u'WetBulb',
       u'Sunrise', u'Sunset', u'PrecipTotal', u'StnPressure', u'SeaLevel',
       u'ResultSpeed', u'ResultDir', u'AvgSpeed', u'Tavg_int', u'Normal_Temp',
       u'Depart_calc', u'RA', u'BR', u'TS', u'HZ', u'SN', u'FG', u'FG+', u'FU',
       u'DZ', u'VC', u'MI', u'BC', u'Days_Since_Spray',
       u'Dist_to_Closest_Spray', u'Week', u'DaylightMinutes'],
      dtype='object')

### From Miranda's investigation:
Highest potential columns:  
PIPIENS  
Tavg  
DewPoint  
WetBulb  
PrecipTotal  
Days_Since_Spray  
Dist_to_Closest_Spray  
FG   
BR    
HZ  
VC  
Week  
DaylightMinutes  

In [247]:
# Look at particular columns and run the functions.  

# Narrowing columns to a few of high interest.
columns_of_interest = ["PIPIENS", "Tavg_int", "PrecipTotal", "Days_Since_Spray", 
                      "Dist_to_Closest_Spray", "Week", "DaylightMinutes"]

In [8]:
# This list has all possible based on Miranda's list
columns_of_interest = ["PIPIENS", "Tavg_int", "DewPoint", "WetBulb", "PrecipTotal", 
                      "Days_Since_Spray", "Dist_to_Closest_Spray", "FG", "BR", 
                      "HZ", "VC", "Week", "DaylightMinutes"]

In [249]:
# This list uses almost everything with a value:
columns_of_interest = ["Block", "Latitude", "Longitude", "AddressAccuracy", 
                      "PIPIENS", "RESTUANS", "SALINARIUS", "TERRITANS", "Tmax", "Tmin",
                      "Tavg_int", "DewPoint", "WetBulb", "PrecipTotal", "StnPressure", 
                      "SeaLevel", "ResultSpeed", "ResultDir", "AvgSpeed", "Normal_Temp", 
                      "RA", "BR", "TS", "HZ", "SN", "FG", "FU", "DZ", "VC", "MI", "BC", 
                      "Days_Since_Spray", "Dist_to_Closest_Spray", "Week", "DaylightMinutes"]

In [9]:
X = train_data[columns_of_interest]
y = train_data["WnvPresent"]

In [10]:
# Import necessary models:

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

In [209]:
# Recall

### Scale = "Yes"; skip otherwise
### TTS = "Yes" or "No" on whether or not to run a train/test split
### X = Pre-defined dataframe and its columns of interest
### y = Pre-defined dataframe target column
### model_to_run = Model() wishing to run this test on
### grid_search_dictionary = parameter dictionary to feed 
            ### into grid_search for the model of interest

# Logistic Regression

In [20]:
# Logistic Regression

# Set up the parameters for GridSearch
log_dict = {
    'penalty':('l1', 'l2'),
    'C':[0.001, 0.01, 0.1, 1.0, 2.0, 5.0, 10.0]
    }

evaluate_model("Yes", "Yes", "Yes", X, y, LogisticRegression(), log_dict)

Cross Val Scores 
[ 0.94558824  0.9455081   0.9455081   0.9455081   0.94690265  0.94690265
  0.94690265  0.94690265  0.94690265  0.94690265] 

Accuracy Score 
0.952200825309 

Confusion Matrix 
[[2769    0]
 [ 139    0]] 

Classification Report 
             precision    recall  f1-score   support

          0       0.95      1.00      0.98      2769
          1       0.00      0.00      0.00       139

avg / total       0.91      0.95      0.93      2908


ROC-AUC Score 
0.673682938806 

Bagging Classifiers 
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False) 

Bagging Accuracy Score 
0.952200825309 

Bagging Confusion Matrix 
[[2769    0]
 [ 139    0]] 

Bagging Classification Report 
             precision    recall  f1-score   support

          0       0.95      1.00     

NameError: global name 'boosting' is not defined

# KNN

In [22]:
# KNN Classifier

# Good to check in both high number of columns, and low.
# This run is low

knn_dict = {
    'n_neighbors': [1, 2, 3, 5, 10, 20, 50],
    'weights': ('uniform', 'distance')
    }


evaluate_model("Yes", "Yes", "No", X, y, KNeighborsClassifier(), knn_dict)

Cross Val Scores 
[ 0.94256259  0.94108984  0.93372607  0.94698085  0.93961708  0.94108984
  0.94403535  0.94837758  0.95125554  0.95125554] 

Accuracy Score 
0.943947730399 

Confusion Matrix 
[[2733   29]
 [ 134   12]] 

Classification Report 
             precision    recall  f1-score   support

          0       0.95      0.99      0.97      2762
          1       0.29      0.08      0.13       146

avg / total       0.92      0.94      0.93      2908


ROC-AUC Score 
0.681436173906 

Bagging Classifiers 
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform') 

Bagging Accuracy Score 
0.945323246217 

Bagging Confusion Matrix 
[[2737   25]
 [ 134   12]] 

Bagging Classification Report 
             precision    recall  f1-score   support

          0       0.95      0.99      0.97      2762
          1       0.32      0.08      0.13       146

avg / total       0.92      0.9

# Decision Tree

In [23]:
# Decision tree:

# With this setup, better to run with more columns, so
# columns_of_interest takes high amount

dtree_dict = {
    'criterion':('gini', 'entropy'),
    'max_features':[2, 3, 5, 8, 10, 0.2, 0.4, 0.6, 0.8, "sqrt", "log2", None],
    'max_depth':[2, 3, 5, 8, 10, None],
    'min_samples_split':[2, 3, 5, 8, 10, 0.2, 0.4, 0.6, 0.8],
    'min_samples_leaf': [2, 3, 5, 8, 10, 0.2, 0.3, 0.4, 0.5]
    }

evaluate_model("No", "Yes", "Yes", X, y, DecisionTreeClassifier(), dtree_dict)

Cross Val Scores 
[ 0.93235294  0.92488954  0.94108984  0.93814433  0.9439528   0.94247788
  0.93362832  0.93362832  0.94100295  0.93362832] 

Accuracy Score 
0.937414030261 

Confusion Matrix 
[[2707   52]
 [ 130   19]] 

Classification Report 
             precision    recall  f1-score   support

          0       0.95      0.98      0.97      2759
          1       0.27      0.13      0.17       149

avg / total       0.92      0.94      0.93      2908


ROC-AUC Score 
0.723140375245 

Bagging Classifiers 
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best') 

Bagging Accuracy Score 
0.945323246217 

Bagging Confusion Matrix 
[[2725   34]
 [ 125   24]] 

Bagging Classification Report 
             precision    recall  f1-score  

NameError: global name 'boosting' is not defined

# Random forest

In [None]:
# Random forest:


rforest_dict = {
    'criterion':('gini', 'entropy'),
    'n_estimators':[2, 4, 7, 10, 20],
    'max_features':[2, 3, 5, 8, 10, 0.2, 0.4, 0.6, 0.8, "sqrt", "log2", None],
    'min_samples_split':[2, 3, 5, 8, 10, 0.2, 0.4, 0.6, 0.8],
    'min_samples_leaf': [2, 3, 5, 8, 10, 0.2, 0.3, 0.4, 0.5]
    }

rforest_model = evaluate_model("No", "Yes", "Yes", X, y, RandomForestClassifier(), rforest_dict)

# SVM

In [18]:
# SVM - Trying without modifying the function

svm_dict = {
    'C':[0.001, 0.01, 0.1, 1.0, 2.0, 5.0, 10.0],
    'kernel':('linear', 'poly', 'rbf', 'sigmoid'),
}

evaluate_model("Yes", "Yes", "Yes", X, y, SVC(probability=True), svm_dict)

# Note: probability=True to actually get probabilities, otherwise
# function will throw an error (.predict_proba will not work)

Cross Val Scores 
[ 0.94845361  0.94845361  0.94845361  0.94845361  0.94845361  0.94845361
  0.94845361  0.94985251  0.94977843  0.94977843] 

Accuracy Score 
0.946354883081 

Confusion Matrix 
[[2752    0]
 [ 156    0]] 

Classification Report 
             precision    recall  f1-score   support

          0       0.95      1.00      0.97      2752
          1       0.00      0.00      0.00       156

avg / total       0.90      0.95      0.92      2908


ROC-AUC Score 
0.481768504025 

Bagging Classifiers 
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=True, random_state=None, shrinking=True,
  tol=0.001, verbose=False) 

Bagging Accuracy Score 
0.905433287483 

Bagging Confusion Matrix 
[[2632  120]
 [ 155    1]] 

Bagging Classification Report 
             precision    recall  f1-score   support

          0       0.94      0.96      0.95      2752
          1       0.01    

# Get an output of probabilities
Re-do the function to take in the test list and output the probability list

In [None]:
# Include the Id column here for the actual output
X_to_test = test_data[columns_of_interest]

In [None]:
##### Mega Function to run all of the models of relevance ###
# Mostly derived from 6.08 work and Project 3, with a couple modifications #

### NEED TO MODIFY FUNCTION BEFORE RUNNING MODEL 
### IF WISHING TO USE model_metric (also needs commented back in)

### Scale = "Yes"; skip otherwise
### TTS = "Yes" or "No" on whether or not to run a train/test split
### X = Pre-defined dataframe and its columns of interest
### y = Pre-defined dataframe target column
### model_to_run = Model() wishing to run this test on
### grid_search_dictionary = parameter dictionary to feed 
            ### into grid_search for the model of interest

def evaluate_model(Scale, TTS, X, y, model_to_run, grid_search_dictionary, X_to_test):
    
    if Scale == "Yes":
        from sklearn.preprocessing import Normalizer
        normalizer = Normalizer()
        X = normalizer.fit_transform(X)
        
    if TTS == "Yes":
        # Perform the train/test split:
        from sklearn.model_selection import train_test_split
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)   
        
    if TTS == "No":
        # Skip TTS:
        X_train = X
        X_test = X
        y_train = y
        y_test = y
    
    ##### Run the model
    model = model_to_run
    model.fit(X_train, y_train)
    
    # Generate predictions
    predictions = model.predict(X_test)
    probabilities = model.predict_proba(X_test) # Need the second value
    
    probability_list = [] # Becomes the list of probabilities
    for i in probabilities:
        probability_list.append(i[1])
    probabilities = probability_list
        
    
    # Create cross-val score on train
    from sklearn.model_selection import cross_val_score
    
    # Perform 10-fold cross validation
    trn_cv_scores = cross_val_score(model, X_train, y_train, cv=10)
    
    # Create scores on test group
    from sklearn.metrics import accuracy_score
    from sklearn.metrics import confusion_matrix
    from sklearn.metrics import classification_report
    from sklearn.metrics import roc_auc_score
    
    # Comment in if running a model with this attribute
    # model_metric = model.feature_importances_   
    # print model_metric
    
        # Model specific output attribute
        # Have to reassign each function run
    
    acc_score = accuracy_score(y_test, predictions)
    con_matrix = confusion_matrix(y_test, predictions)
    class_rep = classification_report(y_test, predictions)
    roc_auc = roc_auc_score(y_test, probabilities)

    
    ##### Run Bagging
    from sklearn.ensemble import BaggingClassifier
    
    bagging = BaggingClassifier(base_estimator=model)
    bagging.fit(X_train, y_train)
    bagging_predictions = bagging.predict(X_test)
    bagging_probabilities = bagging.predict_proba(X_test)
    
    bag_probability_list = [] # Becomes the list of probabilities
    for i in bagging_probabilities:
        bag_probability_list.append(i[1])
    bagging_probabilities = bag_probability_list
    
    bg_acc_score = accuracy_score(y_test, bagging_predictions)
    bg_con_matrix = confusion_matrix(y_test, bagging_predictions)
    bg_class_rep = classification_report(y_test, bagging_predictions)
    bg_roc_auc = roc_auc_score(y_test, bagging_probabilities)
    
    
    ##### Run a GridSearch
    from sklearn.model_selection import GridSearchCV
    
    # Run 10-fold cross validation on the bagged model
    grid_search = GridSearchCV(model, grid_search_dictionary, cv=10, n_jobs = -1)
    grid_search.fit(X_train, y_train)
    grid_search_predictions = grid_search.predict(X_test)
    grid_search_probabilities = grid_search.predict_proba(X_test)
    
    gs_probability_list = [] # Becomes the list of probabilities
    for i in grid_search_probabilities:
        gs_probability_list.append(i[1])
    grid_search_probabilities = gs_probability_list
    
    gs_acc_score = accuracy_score(y_test, grid_search_predictions)
    gs_con_matrix = confusion_matrix(y_test, grid_search_predictions)
    gs_class_rep = classification_report(y_test, grid_search_predictions)
    gs_roc_auc = roc_auc_score(y_test, grid_search_probabilities)
    
    ##### Run Boosting
    
    from sklearn.ensemble import AdaBoostClassifier
    
    boosting = AdaBoostClassifier(base_estimator=model)
    boosting.fit(X_train, y_train)
    boosting_predictions = boosting.predict(X_test)
    boosting_probabilities = boosting.predict_proba(X_test)
    
    boost_probability_list = [] # Becomes the list of probabilities
    for i in boosting_probabilities:
        boost_probability_list.append(i[1])
    boosting_probabilities = boost_probability_list
    
    bst_acc_score = accuracy_score(y_test, boosting_predictions)
    bst_con_matrix = confusion_matrix(y_test, boosting_predictions)
    bst_class_rep = classification_report(y_test, boosting_predictions)
    bst_roc_auc = roc_auc_score(y_test, boosting_probabilities)
    
    # Get final output list
    final_predictions = boosting.predict_proba(X_to_test)
    
    # See the outputs
    print "Cross Val Scores \n", trn_cv_scores, "\n"
    # print "Model Attribute \n", model_metric, "\n"
            # Add back in if used above
    print "Accuracy Score \n", acc_score, "\n"
    print "Confusion Matrix \n", con_matrix, "\n"
    print "Classification Report \n", class_rep, "\n"
    print "ROC-AUC Score \n", roc_auc, "\n"
    
    print "Bagging Classifiers \n", bagging.base_estimator_, "\n"
    print "Bagging Accuracy Score \n", bg_acc_score, "\n"
    print "Bagging Confusion Matrix \n", bg_con_matrix, "\n"
    print "Bagging Classification Report \n", bg_class_rep
    print "Bagging ROC-AUC Score \n", bg_roc_auc, "\n"
    
    print "Boosting Classifiers \n", boosting.base_estimator_, "\n"
    print "Bagging Accuracy Score \n", bst_acc_score, "\n"
    print "Bagging Confusion Matrix \n", bst_con_matrix, "\n"
    print "Bagging Classification Report \n", bst_class_rep, "\n"
    print "Bagging ROC-AUC Score \n", bst_roc_auc, "\n"
    
    print "Grid Search Best Params \n", grid_search.best_params_, "\n"
    print "Grid Search Best Score \n", grid_search.best_estimator_, "\n"
    print "GS Accuracy Score \n", gs_acc_score, "\n"
    print "GS Confusion Matrix \n", gs_con_matrix, "\n"
    print "GS Classification Report \n", gs_class_rep, "\n"
    print "GS ROC-AUC Score \n", gs_roc_auc, "\n"
    
    print "Generating output dataframe: ", "\n"
    return final_predictions


In [None]:
run_final_model = evaluate_model()

In [None]:
submission_df = test_data["Id"]
submission_df["WnvPresent"] = run_final_model