# 1. Import Libraries and Data

In [1]:
# General Libraries
import numpy as np
import pandas as pd
import pickle
import category_encoders as ce

# Sklearn Specific
from sklearn.model_selection import cross_val_score 
from sklearn.model_selection import cross_validate
from sklearn import preprocessing

# sklearn algos
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier 
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB # Naive Bayes
from sklearn.linear_model import Perceptron
from sklearn.linear_model import SGDClassifier # SGD
from sklearn.neural_network import MLPClassifier # neural network (multilayer perceptron)

# Bayesian Hyperparameter optimization
from hyperopt import hp, tpe, fmin, Trials

# Import Data
# SECURITY: pickle.load can execute arbitrary code while deserializing.
# Only load pickles produced by this project's own pipeline.
# data_dicts is indexed by position later in the notebook (data_dicts[0]);
# each entry is looked up by dataset name and yields a DataFrame.
with open('../data/temp_data/data_dicts.pickle', 'rb') as handle:
    data_dicts = pickle.load(handle)

  from numpy.core.umath_tests import inner1d


# 2. Functions

In [2]:
def split_encode_standardize(df):
    """Turn one dataset into a standardized feature matrix and integer labels.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a 'result' column plus 'f_stance' and 'o_stance' columns
        located between column position 4 and the second-to-last column.
        NOTE(review): slicing with ``4:-1`` presumably drops four leading
        identifier columns and a trailing label column -- confirm against the
        code that builds the datasets.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` is the array returned by
        ``preprocessing.scale`` (each column standardized to mean 0, sd 1)
        and ``y`` is ``df['result']`` cast to int.

    Notes
    -----
    The encoder and the scaler are fitted on every row passed in. If the
    result is later cross-validated, the held-out folds have therefore
    already influenced the scaling.
    """
    # Split features and labels; missing values become 0.
    X = df.iloc[:, 4:-1].fillna(0)
    y = df['result'].astype('int')

    # The encoder expects string categories. Cast column-wise instead of
    # applying a Python lambda to every row. Because of the fillna(0) above,
    # a missing stance ends up as its own category.
    stance_cols = ['f_stance', 'o_stance']
    for col in stance_cols:
        X[col] = X[col].astype(str)

    # Encode categorical data
    ce_binary = ce.BinaryEncoder(cols=stance_cols)
    X = ce_binary.fit_transform(X, y)

    # Scale features with mean = 0 and sd = 1
    X = preprocessing.scale(X)

    return (X, y)

# Try to do hyperopt for one dataset and one model

In [3]:
def objective_function(params):
    """Hyperopt objective: 1 minus the mean 5-fold cross-validated accuracy.

    Parameters
    ----------
    params : dict
        One sample drawn from a search space. The ``'model'`` entry names the
        classifier; every other entry is forwarded to that classifier's
        constructor as a keyword argument. The dict is not modified.

    Returns
    -------
    float
        ``1 - mean(accuracy over the 5 folds)``, so minimising this loss
        maximises accuracy.

    Raises
    ------
    KeyError
        If ``params`` has no ``'model'`` entry.
    ValueError
        If ``params['model']`` is not one of the supported classifier names.

    Notes
    -----
    Reads the notebook-level variables ``X`` and ``y``; they must be assigned
    before this function is evaluated.
    """
    # Work on a copy so the caller's dict keeps its 'model' entry.
    hyperparams = dict(params)
    h_model = hyperparams.pop('model')

    # Initialize model with parameters
    if h_model == "RandomForestClassifier":
        model = RandomForestClassifier(**hyperparams)
    elif h_model == "KNeighborsClassifier":
        model = KNeighborsClassifier(**hyperparams)
    elif h_model == "LogisticRegression":
        model = LogisticRegression(**hyperparams)
    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError("Unsupported model for objective_function: %r" % (h_model,))

    # Only accuracy feeds the loss, so it is the only metric computed.
    fold_accuracy = cross_val_score(model, X, y, cv=5, scoring='accuracy')
    return 1 - np.mean(fold_accuracy)

In [4]:
def get_space(model):
    """Return the hyperopt search space for a classifier instance.

    Parameters
    ----------
    model : object
        A classifier instance. Only its class name is inspected.

    Returns
    -------
    dict
        A search space whose ``'model'`` entry holds the class name; the
        remaining entries are the hyperparameters to explore (hyperopt
        expressions or fixed values).

    Raises
    ------
    ValueError
        If no search space is defined for the model's class.
    """
    model_name = type(model).__name__

    if model_name == "RandomForestClassifier":
        space_dict = {'model': model_name,
                      'max_depth': hp.choice('max_depth', range(1, 20)),
                      'max_features': hp.choice('max_features', range(1, 5)),
                      'n_estimators': hp.choice('n_estimators', range(1, 20))}

    elif model_name == "KNeighborsClassifier":
        space_dict = {'model': model_name,
                      'n_neighbors': hp.choice('n_neighbors', range(1, 100))}

    elif model_name == "LogisticRegression":
        # penalty is a regularisation norm, not an integer. The solver is
        # pinned to liblinear because it accepts both 'l1' and 'l2'.
        space_dict = {'model': model_name,
                      'solver': 'liblinear',
                      'penalty': hp.choice('penalty', ['l1', 'l2'])}

    else:
        # Fail with a clear message instead of an unbound-variable error.
        raise ValueError("No search space defined for model: %s" % model_name)

    return space_dict

# Iterate over the candidate models

In [5]:
# Classifiers to tune. KNN is switched off for now.
rf_clf = RandomForestClassifier()
# knn_clf = KNeighborsClassifier()

models = [rf_clf]

In [6]:
# Pick one dataset to tune on: first entry of data_dicts, 1-fight lookback window.
test_dict = data_dicts[0]
test_df = test_dict['Cumulative Data: 1 Fight Lookback Window']
# X and y are notebook-level variables; objective_function reads them directly.
X,y = split_encode_standardize(test_df)

In [7]:
# Best hyperparameters found for each model, keyed by class name.
hype_dict = {}

for model in models:
    model_name = type(model).__name__
    print(model_name)

    space = get_space(model)
    best_h = fmin(fn=objective_function, space=space, algo=tpe.suggest, max_evals=10)

    # Store the result per model; otherwise only the last best_h survives the loop.
    # NOTE: for hp.choice dimensions fmin reports the index of the chosen
    # option, not the option itself (see the hyperopt documentation).
    hype_dict[model_name] = best_h
    print(best_h)

RandomForestClassifier
100%|██████████| 10/10 [00:05<00:00,  1.64it/s, best loss: 0.42364149301924825]
{'max_depth': 6, 'max_features': 2, 'n_estimators': 11}


# Sample with 1 df and 1 clf

In [13]:
# Default random forest; get_space only inspects its class name.
rf_clf = RandomForestClassifier()

In [11]:
# Show the dimensions of the standardized feature matrix.
X.shape

(6914, 58)

In [9]:
# Same dataset selection as the earlier tuning cell: first entry of data_dicts, 1-fight lookback.
test_dict = data_dicts[0]
test_df = test_dict['Cumulative Data: 1 Fight Lookback Window']

In [17]:
# Rebuild the notebook-level X and y that objective_function reads.
X,y = split_encode_standardize(test_df)

In [18]:
# Search space for the random forest (chosen from rf_clf's class name).
rf_space = get_space(rf_clf)

In [19]:
# A Trials object is passed so the individual evaluations are kept for later inspection.
# NOTE(review): the saved output under this cell shows a ValueError that looks
# like it came from an earlier version of objective_function -- re-run to refresh.
trials = Trials()
best_rf = fmin(fn=objective_function, space=rf_space, algo=tpe.suggest, max_evals=10, trials = trials)

{'max_depth': 9, 'max_features': 4, 'model': 'RandomForestClassifier', 'n_estimators': 12}
{'max_depth': 9, 'max_features': 4, 'n_estimators': 12}
  0%|          | 0/10 [00:00<?, ?it/s, best loss: ?]


ValueError: n_estimators must be an integer, got <class 'dict'>.

In [15]:
# Builds a search space for every model but discards the result.
# NOTE(review): presumably a smoke check that get_space handles each model -- confirm or remove.
for model in models:
    get_space(model)

In [None]:
# Run 2000 evals with the tpe algorithm
# The original call used names that are never defined in this notebook
# (objective, tpe_algo, tpe_trials); it now uses the objective, search space
# and algorithm defined above, plus a fresh Trials object.
# NOTE(review): newer hyperopt releases expect rstate to be a
# numpy.random.Generator (np.random.default_rng(50)) -- confirm against the
# installed version.
tpe_trials = Trials()
rf_tpe_best = fmin(fn=objective_function, space=rf_space, algo=tpe.suggest, trials=tpe_trials,
                   max_evals=2000, rstate=np.random.RandomState(50))

# 3. Initialize Models and Generate Dictionary for Results

In [49]:
# Initialize models
# One default-configured instance of every classifier under comparison
blr_clf = LogisticRegression()
rf_clf = RandomForestClassifier()
svm_clf = svm.SVC()
knn_clf = KNeighborsClassifier()
dtree_clf = DecisionTreeClassifier()
nb_clf = GaussianNB()
perc_clf = Perceptron()
sgd_clf = SGDClassifier()
mlp_clf = MLPClassifier()

models = [blr_clf, rf_clf, svm_clf, knn_clf, dtree_clf,
          nb_clf, perc_clf, sgd_clf, mlp_clf]

# Column-oriented accumulator: one empty list per output column,
# each receiving one value per (dataset, model) pair
score_columns = ('dict_type', 'dataset', 'num_obs', 'model_name',
                 'accuracy', 'precision', 'recall', 'roc_auc')
score_d = {column: [] for column in score_columns}

# 5. Train and print scores for each model

In [18]:
# Select the first entry of data_dicts; the scoring loop iterates over its key -> DataFrame items.
cumu_dfs_dict = data_dicts[0]

In [None]:
# Cross-validate every model on every dataset and append the mean scores to score_d.

# Metrics collected for each (dataset, model) pair. Defined once because it
# never changes between iterations.
scoring_stats = {'accuracy': 'accuracy',
                 'recall': 'recall',
                 'precision': 'precision',
                 'roc_auc': 'roc_auc'}
dict_type = "cumu_dfs_dict"

for key in cumu_dfs_dict:
    # Copy so the stored DataFrame is never touched by preprocessing.
    df = cumu_dfs_dict[key].copy()

    dataset = key
    num_obs = df.shape[0]

    # Same split / encode / scale steps as the tuning section, via the shared helper.
    X, y = split_encode_standardize(df)

    for model in models:
        model_name = type(model).__name__

        # Calculate scores (5-fold cross-validation)
        avg_scores = cross_validate(model, X, y, cv=5, scoring=scoring_stats)

        # Append one row's worth of values to the accumulator.
        score_d['dict_type'].append(dict_type)
        score_d['dataset'].append(dataset)
        score_d['num_obs'].append(num_obs)
        score_d['model_name'].append(model_name)
        for metric in ('accuracy', 'precision', 'recall', 'roc_auc'):
            # cross_validate reports each metric under the key 'test_<name>'.
            score_d[metric].append(np.mean(avg_scores['test_' + metric]))























In [None]:
# Turn the column-oriented accumulator into a table: one row per (dataset, model) pair.
scores_df = pd.DataFrame(score_d)
scores_df

In [34]:
# Persist the scores. With to_csv's defaults the row index is written as an
# unnamed first column, so readers need index_col=0 to round-trip it.
scores_df.to_csv("../data/scores_v2.csv")

# Preliminary Analysis
- The hyperparameters of the sklearn classifiers were left at their defaults and will be tuned later. 


- Nevertheless, the effectiveness of each model appears to vary with the length of the lookback window. I will soon be transforming the score table above into nicer-looking graphs.  


- Please scroll through the table in the cell above to view the accuracy, recall, precision, and ROC-AUC of each model, each computed with 5-fold cross-validation. 


- Currently, most of the accuracy scores hover near the 50% mark. As noted in previous literature, this data is inherently noisy, which will likely make it very difficult to achieve an accuracy above 60%. The problem is compounded when the data is generated with a lookback window, because the number of observations decreases substantially. 

