In [94]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
#from sklearn import neighbors
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

import warnings
warnings.filterwarnings('ignore')

### CoverType: cover_type 2 (+1) vs Others (-1)

In [78]:
covtype = pd.read_csv('covtype.data', header=None)
encoded_covtype = covtype.copy()

print(covtype[54].value_counts()[0:1]) #Find which value occurs the most

#Set value 2 in column 54 as +1, others as -1
encoded_covtype[54] = encoded_covtype[54].apply(lambda x: -1 if x != 2 else 1)

covtype_array = encoded_covtype.to_numpy().astype(np.float)  #Convert df to np array
X_covtype = covtype_array[:, :-1]                               
Y_covtype = covtype_array[:, -1].reshape(-1,1)
covtype_array = np.hstack((X_covtype, Y_covtype))  #Stack

print(X_covtype.shape)  # (581012, 54)
print(Y_covtype.shape)  # (581012, 1)

2    283301
Name: 54, dtype: int64
(581012, 54)
(581012, 1)


### Adult:  > 50k income (+1) vs  <= 50k income (-1)

In [79]:
adult = pd.read_csv('adult.data', header=None)
encoded_adult = adult.copy()

#Clean data
encoded_adult = encoded_adult.replace(' ?', np.nan) #change all missing data to np.nan
encoded_adult[14] = encoded_adult[14].apply(lambda x: -1 if x == ' <=50K' else 1)  #Encode >50k as +1, otherwise -1

Y_adult = encoded_adult[14].to_numpy().astype(np.float).reshape(-1, 1)

encoded_adult = pd.get_dummies(encoded_adult[encoded_adult.columns[:-1]])   #One-hot encoding all categorical data

X_adult = encoded_adult.to_numpy().astype(np.float)

adult_array = np.hstack((X_adult, Y_adult)) # stack

print(X_adult.shape) # (32561, 105)
print(Y_adult.shape) # (32561, 1)

(32561, 105)
(32561, 1)


### Letter Recognition: A-M (+1) vs N-Z (-1)

In [80]:
#with open('covtype.info', 'r') as covtype_info:
    #print(covtype_info.read())

In [81]:
letter = pd.read_csv('letter-recognition.data', header=None) # Load dataset letter-recognition
encoded_letter = letter.copy()  # Copy original data

encoded_letter[17] = letter[0] < 'N'  #Create a column where 'A-M' are True, others are False
encoded_letter = encoded_letter.replace(False, -1) #Replace False to -1, True to +1  
encoded_letter = encoded_letter.drop(0, axis=1)  #Drop first column containing letters

letter_array = encoded_letter.to_numpy().astype(np.float) #Convert dataframe to numpy array

X_letter = encoded_letter.to_numpy().astype(np.float)[:, :-1]  
Y_letter = encoded_letter.to_numpy().astype(np.float)[:, -1].reshape(-1,1)

print(X_letter.shape)  #(20000,16)
print(Y_letter.shape)  #(20000, 1)

(20000, 16)
(20000, 1)


### 3 Algorithms and their parameters

In [82]:
# parameters for Random Forest CV
max_depth = [1,2,3,4,5]
max_feature = [1,2,4,6,8,12,16,20]
n_estimators = 1024

# parameters for Logistic Regression 
C_list = [0.00000001, 0.0000001, 0.000001, 0.00001, 0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]

# paramters for KNN
n_neighbors = np.logspace(1, 500, 25)

models = []
models.append(('RF', RandomForestClassifier(n_estimators= n_estimators)))
models.append(('LOGREG', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier(metric='euclidean')))

In [83]:
# Computation for Adult dataset

# Random 5000 samples 
X_train, X_test, Y_train, Y_test = train_test_split(X_adult, Y_adult,
                                                   train_size=5000)

# Create a pipeline 
pipe = Pipeline([('std', StandardScaler()),
                 ('classifier', RandomForestClassifier(n_estimators=n_estimators))])

# Create search space of learning algorithms 
search_space = [{'classifier': [LogisticRegression(solver='saga')],
                'classifier__penalty': ['l1', 'l2'],
                'classifier__C': n_neighbors},
                {'classifier': [RandomForestClassifier()],
                 'classifier__max_features': max_feature},
                {'classifier': [KNeighborsClassifier(metric='euclidean')],
                 'classifier__n_neighbors': n_neighbors}]

# Create grid search
clf = GridSearchCV(pipe, search_space, cv=5, verbose=0)

# Fit grid search
best_model = clf.fit(X_train, Y_train)

In [100]:
best_model.best_estimator_.get_params()['classifier']

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=20,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=1024,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [101]:

# Random 5000 samples 
X_train, X_test, Y_train, Y_test = train_test_split(X_letter, Y_letter,
                                                   train_size=5000)

# Create a pipeline 
pipe = Pipeline([('std', StandardScaler()),
                 ('classifier', RandomForestClassifier(n_estimators=n_estimators))])

# Create search space of learning algorithms 
search_space = [{'classifier': [LogisticRegression(solver='saga')],
                'classifier__penalty': ['l1', 'l2'],
                'classifier__C': n_neighbors},
                {'classifier': [RandomForestClassifier()],
                 'classifier__max_features': max_feature},
                {'classifier': [KNeighborsClassifier(metric='euclidean')],
                 'classifier__n_neighbors': n_neighbors}]

# Create grid search
clf = GridSearchCV(pipe, search_space, cv=5, verbose=0)

# Fit grid search
best_model = clf.fit(X_train, Y_train)
best_model.best_estimator_.get_params()['classifier']

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=4,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

### Algorithms for Adult dataset

In [104]:
# Computation for Adult dataset

# Random 5000 samples 
X_train, X_test, y_train, y_test = train_test_split(X_adult, Y_adult,
                                                   train_size=5000)

# Initiating Classifiers
clf1 = LogisticRegression()

clf2 = KNeighborsClassifier()

clf3 = RandomForestClassifier(n_estimators=n_estimators)

# Building the pipeliens
pipe1 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf1)])

pipe2 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf2)])

pipe3 = Pipeline([('std', StandardScaler()),
                  ('classifier', clf3)])

# Setting up the parameter grids
param_grid1 = [{'classifier__penalty': ['l2'],
                'classifier__C': np.power(10., np.arange(-8, 4))}]

param_grid2 = [{'classifier__n_neighbors': np.linspace(1, 500, 25).astype(int)}]

param_grid3 = [{'classifier__max_features': max_feature}]

# Settign up multiple GridSearch 
gridcvs = {}

for pgrid, est, name in zip((param_grid1, param_grid2, param_grid3), 
                            (pipe1, pipe2, pipe3),
                            ('Logistic', 'KNN', 'RandomForest')):
    gcv = GridSearchCV(estimator=est,
                      param_grid=pgrid,
                      scoring='accuracy',
                      n_jobs=1,
                      cv=5,
                      verbose=0,
                      refit=True)
    gridcvs[name] = gcv

cv_scores = {name: [] for name, gs_est in gridcvs.items()}


In [92]:
best_model = gcv.fit(X_train, y_train)

In [103]:
best_model.best_estimator_.get_params()['classifier']

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=4,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [105]:
for name, gs_est in sorted(gridcvs.items()):
    gs_est.fit(X_train, y_train)
    y_pred = gs_est.predict(X_train)
    acc = accuracy_score(y_true=y_train, y_pred=y_pred)
    cv_scores[name].append(acc)

In [106]:
cv_scores

{'Logistic': [0.8626], 'KNN': [0.8356], 'RandomForest': [1.0]}

In [110]:
for name in cv_scores:
    print('%-8s | outer CV acc. %.2f%% +\- %.3f' % (
           name, 100*np.mean(cv_scores[name]), 100 * np.std(cv_scores[name])))
    
print()
for name in cv_scores:
    print('{} best parameters'.format(name), gridcvs[name].best_params_)

Logistic | outer CV acc. 86.26% +\- 0.000
KNN      | outer CV acc. 83.56% +\- 0.000
RandomForest | outer CV acc. 100.00% +\- 0.000

Logistic best parameters {'classifier__C': 0.1, 'classifier__penalty': 'l2'}
KNN best parameters {'classifier__n_neighbors': 104}
RandomForest best parameters {'classifier__max_features': 12}


In [111]:
best_algo = gridcvs['RandomForest']

best_algo.fit(X_train, y_train)
train_acc = accuracy_score(y_true=y_train, y_pred=best_algo.predict(X_train))
test_acc = accuracy_score(y_true=y_test, y_pred=best_algo.predict(X_test))


# want to calculate scores

print('Accuracy %.2f%% (average over CV test folds)' %
      (100 * best_algo.best_score_))
print('Best parameters: %s' % gridcvs['RandomForest'].best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))

Accuracy 86.18% (average over CV test folds)
Best parameters: {'classifier__max_features': 16}
Training Accuracy: 100.00%
Test Accuracy: 84.66%


In [163]:
# Random 5000 samples 
X_train, X_test, Y_train, Y_test = train_test_split(X_adult, Y_adult,
                                                   train_size=5000)

p_grid = {'n_neighbors': np.linspace(1, 500, 25).astype(int)}

knn = KNeighborsClassifier()

clf = GridSearchCV(estimator=knn, param_grid=p_grid, cv=5)
clf.fit(X_train, Y_train)

print(clf.best_score_)
print(clf.best_estimator_.get_params())

train_acc = accuracy_score(y_true=Y_train, y_pred=clf.predict(X_train))
test_acc = accuracy_score(y_true=Y_test, y_pred=clf.predict(X_test))

print('Best parameters: %s' % clf.best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))

0.7822
{'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 21, 'p': 2, 'weights': 'uniform'}
Best parameters: {'n_neighbors': 21}
Training Accuracy: 78.88%
Test Accuracy: 78.30%


In [114]:
clf.best_score_

0.8421999999999998

In [127]:
clf.best_estimator_.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 16,
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 1024,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [124]:
clf.fit(X_train, Y_train)

train_acc = accuracy_score(y_true=Y_train, y_pred=clf.predict(X_train))
test_acc = accuracy_score(y_true=Y_test, y_pred=clf.predict(X_test))


print('Best parameters: %s' % clf.best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))

Best parameters: {'max_features': 16}
Training Accuracy: 99.98%
Test Accuracy: 85.23%


In [134]:
datasets = [adult_array, letter_array, covtype_array]

IndexError: list index out of range

In [None]:
# one trial 
X_train, X_test, Y_train, Y_test = train_test_split(X_adult, Y_adult,
                                                   train_size=5000)

p_grid = {'max_features': max_feature}

rf = RandomForestClassifier(n_estimators=n_estimators)

clf = GridSearchCV(estimator=rf, param_grid=p_grid, cv=5)
clf.fit(X_train, Y_train)

print(clf.bets_score)
print(clf.best_estimator_.get_params())

train_acc = accuracy_score(y_true=Y_train, y_pred=clf.predict(X_train))
test_acc = accuracy_score(y_true=Y_test, y_pred=clf.predict(X_test))

print('Best parameters: %s' % clf.best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))

In [150]:
covtype_array[:, -1].reshape(-1,1)

array([[-1.],
       [-1.],
       [ 1.],
       ...,
       [-1.],
       [-1.],
       [-1.]])

In [149]:
adult_array[:, :-1]

array([[3.90000e+01, 7.75160e+04, 1.30000e+01, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       [5.00000e+01, 8.33110e+04, 1.30000e+01, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       [3.80000e+01, 2.15646e+05, 9.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       ...,
       [5.80000e+01, 1.51910e+05, 9.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       [2.20000e+01, 2.01490e+05, 9.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00],
       [5.20000e+01, 2.87927e+05, 9.00000e+00, ..., 1.00000e+00,
        0.00000e+00, 0.00000e+00]])

In [152]:
# Random 5000 samples 
X_train, X_test, Y_train, Y_test = train_test_split(X_adult, Y_adult,
                                                   train_size=5000)

p_grid = {'n_neighbors': np.linspace(1, 500, 25).astype(int)}

knn = KNeighborsClassifier()

clf = GridSearchCV(estimator=knn, param_grid=p_grid, cv=5)
clf.fit(X_train, Y_train)

print(clf.best_score_)
print(clf.best_estimator_.get_params())

train_acc = accuracy_score(y_true=Y_train, y_pred=clf.predict(X_train))
test_acc = accuracy_score(y_true=Y_test, y_pred=clf.predict(X_test))

print('Best parameters: %s' % clf.best_params_)
print('Training Accuracy: %.2f%%' % (100 * train_acc))
print('Test Accuracy: %.2f%%' % (100 * test_acc))

array([[-1.],
       [-1.],
       [-1.],
       ...,
       [-1.],
       [-1.],
       [ 1.]])

In [None]:
for dataset in datasets: # for loop for each of the three datasets
    features = dataset[:, :-1]  # get X (features) from each of the datasets
    labels = dataset[:, -1].reshape(-1,1) # get Y (labels) from each of the datasets
    
    for trial in range(3): # Loop through 3 trials for each dataset
        # 5000 random samples 
        X_train, X_test, Y_train, Y_test = train_test_split(features, labels,
                                                       train_size=5000)
        
        # Algorithm 1/3 LOGREG
        logreg_p_grid = {'C': np.power(10., np.arange(-8, 4))}
        
        logreg = LogisticRegression()

        logreg_clf = GridSearchCV(estimator=logreg, param_grid=logreg_p_grid, cv=5)
        logreg_clf.fit(X_train, Y_train)

        print('Mean cross-validated score of the best_estimator: %s' % logreg_clf.best_score_)
        print('Parameters: %s' % logreg_clf.best_estimator_.get_params())

        logreg_train_acc = accuracy_score(y_true=Y_train, y_pred=logreg_clf.predict(X_train))
        logreg_test_acc = accuracy_score(y_true=Y_test, y_pred=logreg_clf.predict(X_test))

        print('Best parameters: %s' % logreg_clf.best_params_)
        print('Training Accuracy: %.2f%%' % (100 * logreg_train_acc))
        print('Test Accuracy: %.2f%%' % (100 * logreg_test_acc))
        print('\n')

        
        
        # Algorithm 2/3 KNN
        knn_p_grid = {'n_neighbors': np.linspace(1, 500, 25).astype(int)}
        
        knn = KNeighborsClassifier()
        
        knn_clf = GridSearchCV(estimator=knn, param_grid=knn_p_grid, cv=5)
        knn_clf.fit(X_train, Y_train)
        
        print('Mean cross-validated score of the best_estimator: %s' % knn_clf.best_score_)
        print('Parameters: %s' % knn_clf.best_estimator_.get_params())

        knn_train_acc = accuracy_score(y_true=Y_train, y_pred=knn_clf.predict(X_train))
        knn_test_acc = accuracy_score(y_true=Y_test, y_pred=knn_clf.predict(X_test))

        print('Best parameters: %s' % knn_clf.best_params_)
        print('Training Accuracy: %.2f%%' % (100 * knn_train_acc))
        print('Test Accuracy: %.2f%%' % (100 * knn_test_acc))
        print('\n')
        
        
        
        # Algorithm 3/3 RandomForest
        p_grid_rf = {'max_features': max_feature} #hyperparameter for RandomForest
        rf = RandomForestClassifier(n_estimators=n_estimators) # RandomForest classifier

        rf_clf = GridSearchCV(estimator=rf, param_grid=p_grid_rf, cv=5) #5 fold GridSearch 
        
        rf_clf.fit(X_train, Y_train)

        print('Mean cross-validated score of the best_estimator: %s' % rf_clf.best_score_)
        print('Parameters: %s' % rf_clf.best_estimator_.get_params())

        rf_train_acc = accuracy_score(y_true=Y_train, y_pred=rf_clf.predict(X_train))
        rf_test_acc = accuracy_score(y_true=Y_test, y_pred=rf_clf.predict(X_test))

        print('Best parameters: %s' % rf_clf.best_params_)
        print('Training Accuracy: %.2f%%' % (100 * rf_train_acc))
        print('Test Accuracy: %.2f%%' % (100 * rf_test_acc))
        print('\n')


Mean cross-validated score of the best_estimator: 0.7924
Parameters: {'C': 0.001, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Best parameters: {'C': 0.001}
Training Accuracy: 79.26%
Test Accuracy: 79.74%


Mean cross-validated score of the best_estimator: 0.7708
Parameters: {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 21, 'p': 2, 'weights': 'uniform'}
Best parameters: {'n_neighbors': 21}
Training Accuracy: 77.78%
Test Accuracy: 78.48%


Mean cross-validated score of the best_estimator: 0.8468
Parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 12, 'max_leaf_nodes': None, 'max_samples': None, 'min_impurity_decrease': 0.0, 'm