# Import packages

In [17]:
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import KFold, cross_validate, LeaveOneOut, train_test_split, GridSearchCV
import time

# Train Test Function

In [5]:
def train_test_models(file_name, models='ALL', crossval_method='kfold',
                        zero_column_delete=True, absolute_value=True, notes=None):
    """[summary]

    Args:
        file_name ([type]): file to use for data
        models (str, optional): Model to be used for training and testing. Defaults to 'ALL'.
                                If all is selected, many different types of classifiers will
                                be used for training and testing.
        crossval_method (str, optional): Cross validation method used for model training.
                                         'kfold' and 'loo' are acceptable inputs.
                                         Defaults to 'kfold'.
        zero_column_delete (bool, optional): Gets rid of columns with all values=0. Defaults to True.
        absolute_value (bool, optional): Takes the absolute value of every datapoint. Defaults to True.
        notes ([type], optional): Description of the run. Defaults to None.
    """

    print("############################")
    print("Begining train_test function")
    print("############################")
    print(f'File the data is being pulled from: {file_name}')

    # Load in the data using pandas into a dataframe
    data_df = pd.read_csv(file_name, header=None)
    print(f'Data is of shape {data_df.shape}')

    # Delete the zero value columns if argument is provided
    if zero_column_delete:
        print('Columns with all Zeros are being removed')
        data_df = data_df.loc[:, (data_df != 0).any(axis=0)]
    
    # Get values from dataframe 
    X = data_df.iloc[:, 1:].values
    y = data_df.iloc[:, 0].values

    # Set absolute value of X if argument is provided
    if absolute_value:
        X = abs(X)
    
    # Initialize models if 'ALL' is selected
    if models == 'ALL':
        models = [AdaBoostClassifier(),
                GradientBoostingClassifier(),
                RandomForestClassifier(), 
                DecisionTreeClassifier(),
                ExtraTreesClassifier(),
                svm.SVC(kernel='rbf'),
                SGDClassifier(random_state=2021)]

    ## Check to see if model is in list or not
    elif type(models) is not list:
        print(f'{models} is not given in list format')
        raise

    # Initialize cross validation
    if crossval_method == 'kfold':
        cross_val = KFold(n_splits=10,
                          shuffle=True,
                          random_state = 42)
    elif crossval_method == 'loo':
        cross_val = LeaveOneOut()
    else:
        print('crossval_method neither kfold or loo')

    # Initiailze dataframe to store results
    results_df = pd.DataFrame([], columns = ['filename',
                                                'model',
                                                'train_scores',
                                                'test_scores',
                                                'min_test',
                                                'avg_test',
                                                'max_test',
                                                'notes'])
    
    # Model training testing loop
    for model in models:
        print('-----------------')
        print(f'{model} being used')
        results = cross_validate(model, X, y, cv=cross_val, return_train_score=True)
        train_score = results['train_score']
        test_score = results['test_score']
        test_min = min(test_score)
        test_avg = np.mean(test_score)
        test_max = max(test_score)
        print(f'train scores: {train_score}')
        print(f'test scores: {test_score}')
        ## Create a dict to store results (makes dataframes easier)
        results_dict = {'filename': file_name,
                        'model': str(model),
                        'train_scores': [train_score.round(3)],
                        'test_scores': [test_score.round(3)],
                        'min_test': test_min,
                        'avg_test': test_avg,
                        'max_test': test_max,
                        'notes': notes,
                        }
        temp_df = pd.DataFrame(results_dict, index=[0])
        results_df = results_df.append(temp_df, ignore_index=True)
        del results
        del model
        time.sleep(2)
        print('---------------------------------')

    return(results_df)

# Initial runs

In [6]:
sgd_results_1 = train_test_models(file_name='./data/60x61.csv',models=[SGDClassifier()])
sgd_results_1.head()

############################
Begining train_test function
############################
File the data is being pulled from: ./data/60x61.csv
Data is of shape (60, 61)
Columns with all Zeros are being removed
-----------------
SGDClassifier() being used
train scores: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
test scores: [0.83333333 0.83333333 1.         0.83333333 1.         1.
 1.         0.83333333 1.         1.        ]
---------------------------------


Unnamed: 0,filename,model,train_scores,test_scores,min_test,avg_test,max_test,notes
0,./data/60x61.csv,SGDClassifier(),"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[0.833, 0.833, 1.0, 0.833, 1.0, 1.0, 1.0, 0.83...",0.833333,0.933333,1.0,


In [7]:
sgd_results_2 = train_test_models(file_name='./data/60x61.csv',models=[SGDClassifier()])
sgd_results_2.head()

############################
Begining train_test function
############################
File the data is being pulled from: ./data/60x61.csv
Data is of shape (60, 61)
Columns with all Zeros are being removed
-----------------
SGDClassifier() being used
train scores: [1.         1.         1.         1.         0.51851852 1.
 1.         1.         1.         0.48148148]
test scores: [1.         1.         1.         0.83333333 0.16666667 1.
 1.         0.83333333 1.         0.5       ]
---------------------------------


Unnamed: 0,filename,model,train_scores,test_scores,min_test,avg_test,max_test,notes
0,./data/60x61.csv,SGDClassifier(),"[1.0, 1.0, 1.0, 1.0, 0.519, 1.0, 1.0, 1.0, 1.0...","[1.0, 1.0, 1.0, 0.833, 0.167, 1.0, 1.0, 0.833,...",0.166667,0.833333,1.0,


In [8]:
sgd_results_3 = train_test_models(file_name='./data/60x61.csv',models=[SGDClassifier()])
sgd_results_3.head()

############################
Begining train_test function
############################
File the data is being pulled from: ./data/60x61.csv
Data is of shape (60, 61)
Columns with all Zeros are being removed
-----------------
SGDClassifier() being used
train scores: [1.         1.         1.         1.         1.         1.
 1.         1.         1.         0.51851852]
test scores: [1.         1.         1.         0.83333333 1.         0.83333333
 0.83333333 0.83333333 1.         0.5       ]
---------------------------------


Unnamed: 0,filename,model,train_scores,test_scores,min_test,avg_test,max_test,notes
0,./data/60x61.csv,SGDClassifier(),"[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 0.833, 1.0, 0.833, 0.833, 0.83...",0.5,0.883333,1.0,


# Grid Search for parameters

## Import data

In [106]:
data_df = pd.read_csv('./data/60x61.csv', header=None)
data_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,51,52,53,54,55,56,57,58,59,60
0,1,0.062832,0.097064,0.041652,0.008974,0.093586,0.011168,0.010246,0.097714,0.093843,...,0.003632,0.028356,0.066206,0.062532,0.10037,0.069072,0.025882,0.054651,0.11276,0.10607
1,1,0.025996,0.056383,0.052404,0.079622,0.07071,0.046787,0.004721,0.034333,0.059902,...,0.068505,0.030921,0.004277,0.059342,0.059611,0.025914,0.018308,0.10288,0.078364,0.049836
2,1,0.015806,0.099749,0.082649,0.076439,0.045748,0.061168,0.037715,0.10508,0.11137,...,0.07326,0.008919,0.01206,0.03868,0.013767,0.01477,0.028393,0.08889,0.021125,0.01447
3,1,0.096576,0.021273,0.071937,0.023003,0.051179,0.026095,0.014747,0.027601,0.011309,...,0.075601,0.028319,0.004177,0.026253,0.025739,0.018103,0.075973,0.057072,0.043127,0.024063
4,1,0.0245,0.0381,0.019063,0.004089,0.02301,0.10454,0.075521,0.032552,0.018125,...,0.061589,0.030194,0.004605,0.041527,0.050398,0.008049,0.1259,0.10101,0.026404,0.004397


## Train test split

In [107]:
y = data_df.iloc[:,0]
X = abs(data_df.iloc[:,1:])

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, train_size=0.8)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)

(48, 60)
(12, 60)
(48,)


## Parameter selection

In [108]:
params = {
    "loss": ["hinge", "log", "squared_hinge", "modified_huber", "perceptron"],
    "alpha": [0.0001, 0.001, 0.01, 0.1],
    "penalty": ["l2","l1","elasticnet", "none"],
    # "learning_rate": ['constant', 'optimal', 'adaptive'],
    "max_iter":[1000,2000,3000]
}

In [113]:
# give cv method
loo = LeaveOneOut()
# initialize model
clf = SGDClassifier()
grid = GridSearchCV(clf, param_grid=params, cv=loo)

In [114]:
# fit to grid
grid.fit(X, y)



GridSearchCV(cv=LeaveOneOut(), estimator=SGDClassifier(),
             param_grid={'alpha': [0.0001, 0.001, 0.01, 0.1],
                         'loss': ['hinge', 'log', 'squared_hinge',
                                  'modified_huber', 'perceptron'],
                         'max_iter': [1000, 2000, 3000],
                         'penalty': ['l2', 'l1', 'elasticnet', 'none']})

In [117]:
pd.DataFrame(grid.cv_results_).sort_values('mean_test_score', ascending=False)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_alpha,param_loss,param_max_iter,param_penalty,params,split0_test_score,...,split53_test_score,split54_test_score,split55_test_score,split56_test_score,split57_test_score,split58_test_score,split59_test_score,mean_test_score,std_test_score,rank_test_score
67,0.000763,0.000043,0.000472,0.000018,0.001,hinge,2000,none,"{'alpha': 0.001, 'loss': 'hinge', 'max_iter': ...",1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.966667,0.179505,1
35,0.000699,0.000024,0.000472,0.000016,0.0001,squared_hinge,3000,none,"{'alpha': 0.0001, 'loss': 'squared_hinge', 'ma...",1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.966667,0.179505,1
7,0.000690,0.000026,0.000475,0.000023,0.0001,hinge,2000,none,"{'alpha': 0.0001, 'loss': 'hinge', 'max_iter':...",1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.966667,0.179505,1
19,0.000902,0.000070,0.000474,0.000018,0.0001,log,2000,none,"{'alpha': 0.0001, 'loss': 'log', 'max_iter': 2...",1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.933333,0.249444,4
15,0.000891,0.000056,0.000472,0.000013,0.0001,log,1000,none,"{'alpha': 0.0001, 'loss': 'log', 'max_iter': 1...",1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.933333,0.249444,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
225,0.000760,0.000050,0.000464,0.000026,0.1,modified_huber,3000,l1,"{'alpha': 0.1, 'loss': 'modified_huber', 'max_...",1.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.366667,0.481894,234
137,0.000802,0.000071,0.000475,0.000025,0.01,log,2000,l1,"{'alpha': 0.01, 'loss': 'log', 'max_iter': 200...",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.350000,0.476970,237
218,0.000769,0.000063,0.000465,0.000005,0.1,modified_huber,1000,elasticnet,"{'alpha': 0.1, 'loss': 'modified_huber', 'max_...",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.333333,0.471405,238
209,0.000807,0.000154,0.000469,0.000010,0.1,squared_hinge,2000,l1,"{'alpha': 0.1, 'loss': 'squared_hinge', 'max_i...",0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.333333,0.471405,238


In [131]:
print(grid.best_estimator_)
print(grid.best_params_)
print(grid.best_index_)
print(grid.best_score_)

SGDClassifier(max_iter=2000, penalty='none')
{'alpha': 0.0001, 'loss': 'hinge', 'max_iter': 2000, 'penalty': 'none'}
7
0.9666666666666667


In [136]:
## Test to see if results are reliable
SGD_clf = SGDClassifier(max_iter=2000, alpha=0.0001, loss='hinge', penalty='none')
sgd_results = train_test_models(file_name='./data/60x61.csv',models=[SGD_clf], crossval_method='loo')

############################
Begining train_test function
############################
File the data is being pulled from: ./data/60x61.csv
Data is of shape (60, 61)
Columns with all Zeros are being removed
-----------------
SGDClassifier(max_iter=2000, penalty='none') being used
train scores: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
test scores: [1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0. 1. 1. 0. 1. 1. 1. 1. 1. 1. 1. 1. 1. 0.
 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]
---------------------------------


In [137]:
sgd_results

Unnamed: 0,filename,model,train_scores,test_scores,min_test,avg_test,max_test,notes
0,./data/60x61.csv,"SGDClassifier(max_iter=2000, penalty='none')","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...","[1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, ...",0.0,0.95,1.0,


In [None]:
# Repeated Stratified k-fold cross valiation on best model