# 1.1 Import Libraries and Data

In [1]:
# General Libraries
import numpy as np
import pandas as pd
import pickle
import category_encoders as ce

# Sklearn Specific
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import cross_validate
from sklearn import preprocessing

# sklearn algos
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB # Naive Bayes
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier # SGD
from sklearn.neural_network import MLPClassifier # neural network (multilayer perceptron)

# Bayesian Hyperparameter optimization
from hyperopt import hp, tpe, fmin, space_eval

# Import Data
with open('../data/temp_data/data_dicts.pickle', 'rb') as handle:
    data_dicts = pickle.load(handle)

  from numpy.core.umath_tests import inner1d


# 1.2 Functions

In [2]:
def split_encode_standardize(df):
     # Split features and labels
    X = df.iloc[:,4:-1]
    X = X.fillna(0)
    y = df['result'].astype('int')

    # Wrangle 
    X['f_stance'] = X.apply(lambda x: str(x['f_stance']), axis = 1)
    X['o_stance'] = X.apply(lambda x: str(x['o_stance']), axis = 1)

    # Encode categorical data
    ce_binary = ce.BinaryEncoder(cols = ['f_stance','o_stance'])
    X = ce_binary.fit_transform(X, y)
    # Scale features with mean = 0 and sd = 1
    X = preprocessing.scale(X)
    
    return(X,y)

def without_keys(d, keys):
    return {x: d[x] for x in d if x not in keys}

# 2. Setup Bayesian Hyperparameter Optimization

In [3]:
def objective_function(params):
    """Objective function to minimize: (1- cross validated test_accuracy_score)"""
    
    h_model = params['model']  # Gets the model name
    del params['model'] # Gets the hyperparameters
    
    # Initialize model with parameters
    if h_model == "RandomForestClassifier":
        model = RandomForestClassifier(**params)
    elif h_model == "KNeighborsClassifier":
        model = KNeighborsClassifier(**params)
    elif h_model == "LogisticRegression":
        model = LogisticRegression(**params)
    
    scoring_stats = {'accuracy': 'accuracy',
               'recall': 'recall',
               'precision': 'precision',
               'roc_auc': 'roc_auc'}
    
    avg_scores = cross_validate(model, X, y, cv=5, scoring = scoring_stats)

    cv_accuracy = np.mean(avg_scores['test_accuracy'])
    return(1 - cv_accuracy)

def get_space(model):
    model_name = type(model).__name__
    
            
    if model_name == "LogisticRegression":
        space_dict = {'model': model_name,
                      'dual': hp.choice('dual', [True,False])}
        
    elif model_name == "RandomForestClassifier":
        space_dict = {'model' : model_name,
                      'max_depth': hp.choice('max_depth', range(1,50)),
                      'max_features': hp.choice('max_features', range(1,50)),
                      'n_estimators': hp.choice('n_estimators', range(1,50)),}
        
    elif model_name == "svm.SVC": # Fix space dict
        space_dict = {'model': model_name,
                      'n_neighbors': hp.choice('n_neighbors', range(1,100))}
        
    elif model_name == "KNeighborsClassifier":
        space_dict = {'model': model_name,
                      'n_neighbors': hp.choice('n_neighbors', range(1,100)),
                      'weights': hp.choice('leaf_size', ['auto', 'ball_tree', 'kd_tree', 'brute']),
                      'leaf_size': hp.choice('leaf_size', range(1,60))}
    
    elif model_name == "DecisionTreeClassifier":
        space_dict = {'model': model_name,
                      'n_neighbors': hp.choice('n_neighbors', range(1,100))}
    
    elif model_name == "GaussianNB":
        space_dict = {'model': model_name,
                      'n_neighbors': hp.choice('n_neighbors', range(1,100))}
    
    elif model_name == "Perceptron":
        space_dict = {'model': model_name,
                      'n_neighbors': hp.choice('n_neighbors', range(1,100))}
    
    elif model_name == "SGDClassifier":
        space_dict = {'model': model_name,
                      'n_neighbors': hp.choice('n_neighbors', range(1,100))}
    
    elif model_name == "MLPClassifier":
        space_dict = {'model': model_name,
                      'n_neighbors': hp.choice('n_neighbors', range(1,100))}
    

    return(space_dict)   

In [4]:
# blr_clf = LogisticRegression()
# rf_clf = RandomForestClassifier()
# svm_clf = svm.SVC()
# knn_clf = KNeighborsClassifier()
# dtree_clf = DecisionTreeClassifier()
# nb_clf = GaussianNB()
# perc_clf = Perceptron()
# sgd_clf = SGDClassifier()
# mlp_clf = MLPClassifier()

# 3. Initialize Models and Generate Dictionary for Results

# Initialize models
models = []

blr_clf = LogisticRegression()
rf_clf = RandomForestClassifier()
svm_clf = svm.SVC()
knn_clf = KNeighborsClassifier()
dtree_clf = DecisionTreeClassifier()
nb_clf = GaussianNB()
perc_clf = Perceptron()
sgd_clf = SGDClassifier()
mlp_clf = MLPClassifier()

# add models to a list
models.extend((blr_clf ,rf_clf, svm_clf, knn_clf, dtree_clf, nb_clf, perc_clf, sgd_clf,mlp_clf))

# Initialize a dictionary for scores, dataset
score_d = {}
score_d['dict_type'] = []
score_d['dataset'] = []
score_d['num_obs'] = []
score_d['model_name'] = []
score_d['accuracy'] = []
score_d['precision'] = []
score_d['recall'] = []
score_d['roc_auc'] = []
score_d['hp_dict'] = []

In [5]:
# Initialize models
models = []

blr_clf = LogisticRegression()
rf_clf = RandomForestClassifier()



# add models to a list
models.extend((rf_clf, blr_clf))

# Initialize a dictionary for scores, dataset
score_d = {}
score_d['dict_type'] = []
score_d['dataset'] = []
score_d['num_obs'] = []
score_d['model_name'] = []
score_d['accuracy'] = []
score_d['precision'] = []
score_d['recall'] = []
score_d['roc_auc'] = []
score_d['hp_dict'] = []

# 5. Train and print scores for each model

In [6]:
cumu_dfs_dict = data_dicts[0]

In [7]:
len(cumu_dfs_dict)

20

In [8]:
df_index = 0
for key in cumu_dfs_dict:
    df_index += 1
    
    if (df_index <= 10):
        continue
    
    dict_type = "cumu_dfs_dict"
    df = cumu_dfs_dict[key].copy()

    dataset = key
    num_obs = df.shape[0]

    X,y = split_encode_standardize(df)

    for model in models:
        model_name = type(model).__name__
        
        print(model_name)
        
        space = get_space(model)
        best_h = fmin(fn=objective_function, space=space, algo=tpe.suggest, max_evals=25)
        opt_hp = space_eval(space, best_h)
        
        print(opt_hp)
        del opt_hp['model']
        
        # create new model with tuned hyperparameters
        if model_name == "RandomForestClassifier":
            ht_model = RandomForestClassifier(**opt_hp)
        
        elif model_name == "KNeighborsClassifier":
            ht_model = KNeighborsClassifier(**opt_hp)
        
        elif model_name == "LogisticRegression":
            ht_model = LogisticRegression(**opt_hp)
        
    
        scoring_stats = {'accuracy': 'accuracy',
               'recall': 'recall',
               'precision': 'precision',
               'roc_auc': 'roc_auc'}    
        
        

        # Calculate scores
        avg_scores = cross_validate(ht_model, X, y, cv=5, scoring = scoring_stats)

        accuracy = np.mean(avg_scores['test_accuracy'])
        precision = np.mean(avg_scores['test_precision'])
        recall = np.mean(avg_scores['test_recall'])
        roc_auc = np.mean(avg_scores['test_roc_auc'])

        # append to dictionary
        score_d['dict_type'].append(dict_type)
        score_d['dataset'].append(dataset)
        score_d['num_obs'].append(num_obs)
        score_d['model_name'].append(model_name)
        score_d['accuracy'].append(accuracy)
        score_d['precision'].append(precision)
        score_d['recall'].append(recall)
        score_d['roc_auc'].append(roc_auc)
        score_d['hp_dict'].append(opt_hp)

RandomForestClassifier
100%|██████████| 25/25 [04:05<00:00,  7.11s/it, best loss: 0.40527550755794983]
{'max_depth': 10, 'max_features': 15, 'model': 'RandomForestClassifier', 'n_estimators': 39}
LogisticRegression
100%|██████████| 25/25 [00:28<00:00,  1.03it/s, best loss: 0.43968571978283966]
{'dual': True, 'model': 'LogisticRegression'}
RandomForestClassifier
100%|██████████| 25/25 [03:30<00:00, 15.80s/it, best loss: 0.42331799777069246]
{'max_depth': 36, 'max_features': 44, 'model': 'RandomForestClassifier', 'n_estimators': 49}
LogisticRegression
100%|██████████| 25/25 [00:15<00:00,  1.50it/s, best loss: 0.4542601079559164]
{'dual': True, 'model': 'LogisticRegression'}
RandomForestClassifier
100%|██████████| 25/25 [02:15<00:00,  8.74s/it, best loss: 0.41616446020262055]
{'max_depth': 41, 'max_features': 22, 'model': 'RandomForestClassifier', 'n_estimators': 49}
LogisticRegression
100%|██████████| 25/25 [00:10<00:00,  2.07it/s, best loss: 0.4547748199490437]
{'dual': True, 'model': '

In [9]:
scores_dict = without_keys(score_d, 'hp_dict')
scores_df = pd.DataFrame(scores_dict)
scores_df

Unnamed: 0,dict_type,dataset,num_obs,model_name,accuracy,precision,recall,roc_auc
0,cumu_dfs_dict,Cumulative Data: 1 Fight Lookback Window,6914,RandomForestClassifier,0.589087,0.585427,0.565246,0.61841
1,cumu_dfs_dict,Cumulative Data: 1 Fight Lookback Window,6914,LogisticRegression,0.552505,0.549287,0.504708,0.573533
2,cumu_dfs_dict,Cumulative Data: 2 Fight Lookback Window,5006,RandomForestClassifier,0.569498,0.566291,0.53504,0.590993
3,cumu_dfs_dict,Cumulative Data: 2 Fight Lookback Window,5006,LogisticRegression,0.54294,0.539227,0.502649,0.560074
4,cumu_dfs_dict,Cumulative Data: 3 Fight Lookback Window,3780,RandomForestClassifier,0.573261,0.567054,0.557795,0.599265
5,cumu_dfs_dict,Cumulative Data: 3 Fight Lookback Window,3780,LogisticRegression,0.542579,0.537529,0.504569,0.559633
6,cumu_dfs_dict,Cumulative Data: 4 Fight Lookback Window,2884,RandomForestClassifier,0.557544,0.551661,0.540845,0.575129
7,cumu_dfs_dict,Cumulative Data: 4 Fight Lookback Window,2884,LogisticRegression,0.557558,0.55193,0.538028,0.58416
8,cumu_dfs_dict,Cumulative Data: 5 Fight Lookback Window,2248,RandomForestClassifier,0.533346,0.526007,0.505882,0.552977
9,cumu_dfs_dict,Cumulative Data: 5 Fight Lookback Window,2248,LogisticRegression,0.527584,0.519745,0.508597,0.558876


In [10]:
scores_df.to_csv("../data/partial_scores_v2.csv")

# Preliminary Analysis
- The hyperparameters from the sklearn classifiers were set to their defaults and will be tuned. 


- Nevertheless, the effectiveness of each model appears to vary based off the length of the look back number. I will soon be transforming the printed text data above into nicer looking graphs.  


- Please scroll around in the above cell to view the accuracy, recall, precision, and ROC-AUC from using a cross validate method. 


- Now, most of the accuracy percentages are hovering near the 50% mark. As noted by previous literature, this data is inherently noisy and will likely make it very difficult to have an accuracy of over 60%. It is even more of an issue when generating the data using a look back window because the number of observations decreases substantially. 

