In [10]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import datasets
#from sklearn import neighbors
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
import os

# Silence every warning category for the rest of the session so the long
# grid-search output below stays readable.
# NOTE(review): this is a blanket filter, so deprecation and convergence
# warnings raised by the libraries used below would also be hidden — consider
# narrowing it to specific categories once the notebook is stable.
import warnings
warnings.filterwarnings('ignore')

### CoverType: cover_type 2 (+1) vs Others (-1)

In [30]:
# Load the CoverType data and turn it into a binary task:
# cover type 2 -> +1, every other cover type -> -1.
from google.colab import files
import io

uploaded = files.upload()  # interactive: pick covtype.data.gz

covtype = pd.read_csv(io.BytesIO(uploaded['covtype.data.gz']), header=None, compression='gzip')
encoded_covtype = covtype.copy()  # keep the raw frame untouched

# Column 54 holds the cover type; show the most frequent class and its count.
print(covtype[54].value_counts().head(1))

# Vectorised label encoding (2 -> +1, everything else -> -1).
encoded_covtype[54] = np.where(covtype[54] == 2, 1, -1)

# `np.float` was only an alias of the builtin `float`; it is deprecated since
# NumPy 1.20 and removed in 1.24, so the builtin is used directly.
covtype_array = encoded_covtype.to_numpy().astype(float)  # features + label in the last column
X_covtype = covtype_array[:, :-1]
Y_covtype = covtype_array[:, -1].reshape(-1, 1)

print(X_covtype.shape)  # (581012, 54)
print(Y_covtype.shape)  # (581012, 1)

Saving covtype.data.gz to covtype.data.gz
2    283301
Name: 54, dtype: int64
(581012, 54)
(581012, 1)


### Adult:  > 50k income (+1) vs  <= 50k income (-1)

In [24]:
# Load the Adult data and turn it into a binary task:
# income > 50K -> +1, income <= 50K -> -1.
#
# The upload is requested here (as in the other data-loading cells) so this
# cell does not depend on `uploaded` / `io` left behind by a different cell.
from google.colab import files
import io

uploaded = files.upload()  # interactive: pick adult.data

adult = pd.read_csv(io.BytesIO(uploaded['adult.data']), header=None)

# Clean data: the file marks missing values with ' ?' (note the leading space).
encoded_adult = adult.replace(' ?', np.nan)

# Column 14 is the income label; anything that is not ' <=50K' is encoded as +1.
Y_adult = np.where(encoded_adult[14] == ' <=50K', -1, 1).astype(float).reshape(-1, 1)

# One-hot encode every categorical feature column (label column excluded).
# Missing values get no indicator column, i.e. an all-zero group.
encoded_adult = pd.get_dummies(encoded_adult[encoded_adult.columns[:-1]])

# Builtin `float` instead of the removed `np.float` alias (NumPy >= 1.24).
X_adult = encoded_adult.to_numpy().astype(float)

adult_array = np.hstack((X_adult, Y_adult))  # features + label in the last column

print(X_adult.shape) # (32561, 105)
print(Y_adult.shape) # (32561, 1)

Saving adult.data to adult (5).data
(32561, 105)
(32561, 1)


### Letter Recognition: A-M (+1) vs N-Z (-1)

In [26]:
# Load the Letter Recognition data and turn it into a binary task:
# letters A-M -> +1, letters N-Z -> -1.
from google.colab import files
import io

uploaded = files.upload()  # interactive: pick letter-recognition.data

letter = pd.read_csv(io.BytesIO(uploaded['letter-recognition.data']), header=None)

# Drop the letter column (0) and append the binary label as column 17.
# The label is built directly from the comparison instead of a frame-wide
# `replace(False, -1)`, so the numeric feature columns (which contain real
# zeros) can never be touched by the substitution.
encoded_letter = letter.drop(columns=0)
encoded_letter[17] = np.where(letter[0] < 'N', 1, -1)

# Convert once and slice; builtin `float` replaces the removed `np.float` alias.
letter_array = encoded_letter.to_numpy().astype(float)  # features + label in the last column

X_letter = letter_array[:, :-1]
Y_letter = letter_array[:, -1].reshape(-1, 1)

print(X_letter.shape)  #(20000,16)
print(Y_letter.shape)  #(20000, 1)

Saving letter-recognition.data to letter-recognition.data
(20000, 16)
(20000, 1)


### 3 Algorithms and their parameters

In [46]:
# Parameters for Random Forest CV
max_depth = [1, 2, 3, 4, 5]
max_feature = [1, 2, 4, 6, 8, 12, 16, 20]   # candidate values for `max_features`
n_estimators = 1024                          # number of trees per forest

# Parameters for Logistic Regression: inverse regularisation strength C,
# one value per decade from 1e-8 to 1e4.
C_list = [1e-8, 1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100, 1000, 10000]

# Parameters for KNN: 25 integer neighbour counts evenly spaced from 1 to 500.
# `np.logspace(1, 500, 25)` would span 10**1 .. 10**500 (non-integer values
# that overflow to inf), which is not a usable neighbour count; the evenly
# spaced integer grid matches the one used by the experiment loop.
n_neighbors = np.linspace(1, 500, 25).astype(int)


In [38]:
SEED = 0  # base seed so the random splits and forests are reproducible on re-run

# Named `dataset_arrays` rather than `datasets` so that the `sklearn.datasets`
# module imported at the top of the notebook is not shadowed.
dataset_arrays = [adult_array, letter_array, covtype_array]

# Each list collects [algorithm_name, value] pairs in evaluation order:
# dataset (adult, letter, covtype) -> trial (0..2) -> algorithm (logreg, knn, rf).
test_accuracy_list = []
best_score_list = []
train_accuracy_list = []

# Hyperparameter grids that do not depend on the dataset (hoisted out of the loops).
logreg_p_grid = {'C': np.power(10., np.arange(-8, 4))}
knn_p_grid = {'n_neighbors': np.linspace(1, 500, 25).astype(int)}


def fit_and_score(name, estimator, p_grid, X_train, Y_train, X_test, Y_test):
    """Grid-search `estimator` with 5-fold CV, then report and record its scores.

    Parameters
    ----------
    name : str
        Short algorithm tag stored alongside every recorded value.
    estimator : sklearn estimator
        Unfitted classifier to tune.
    p_grid : dict
        Parameter grid handed to `GridSearchCV`.
    X_train, Y_train : array-like
        Training split used for the grid search and the final refit.
    X_test, Y_test : array-like
        Held-out split used only for the test accuracy.

    Returns
    -------
    GridSearchCV
        The fitted search object.

    Side effects: prints a short report and appends one [name, value] pair to
    each of `best_score_list`, `train_accuracy_list` and `test_accuracy_list`.
    """
    clf = GridSearchCV(estimator=estimator, param_grid=p_grid, cv=5)
    clf.fit(X_train, Y_train)

    print('Mean cross-validated score of the best_estimator: %s' % clf.best_score_)
    print('Parameters: %s' % clf.best_estimator_.get_params())

    train_acc = accuracy_score(y_true=Y_train, y_pred=clf.predict(X_train))
    test_acc = accuracy_score(y_true=Y_test, y_pred=clf.predict(X_test))

    print('Best parameters: %s' % clf.best_params_)
    print('Training Accuracy: %.2f%%' % (100 * train_acc))
    print('Test Accuracy: %.2f%%' % (100 * test_acc))
    print('\n')

    best_score_list.append([name, clf.best_score_])
    train_accuracy_list.append([name, train_acc])
    test_accuracy_list.append([name, test_acc])
    return clf


for dataset in dataset_arrays:  # one pass per dataset
    features = dataset[:, :-1]  # X: every column but the last
    labels = dataset[:, -1]     # y: last column, kept 1-D as the estimators expect

    # A tree cannot consider more features than the data has, so candidates
    # above the column count (e.g. 20 for the 16-feature letter data) are
    # dropped instead of being sent through the grid search.
    p_grid_rf = {'max_features': [m for m in max_feature if m <= features.shape[1]]}

    for trial in range(3):  # 3 independent trials per dataset
        # 5000 random training samples; everything else is the test set.
        X_train, X_test, Y_train, Y_test = train_test_split(
            features, labels, train_size=5000, random_state=SEED + trial)
        split = (X_train, Y_train, X_test, Y_test)

        # Algorithm 1/3 LOGREG
        logreg_clf = fit_and_score('logreg', LogisticRegression(), logreg_p_grid, *split)

        # Algorithm 2/3 KNN
        knn_clf = fit_and_score('knn', KNeighborsClassifier(), knn_p_grid, *split)

        # Algorithm 3/3 RandomForest
        rf = RandomForestClassifier(n_estimators=n_estimators, random_state=SEED + trial)
        rf_clf = fit_and_score('rf', rf, p_grid_rf, *split)

Mean cross-validated score of the best_estimator: 0.7982000000000001
Parameters: {'C': 1e-06, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'auto', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'lbfgs', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}
Best parameters: {'C': 1e-06}
Training Accuracy: 79.88%
Test Accuracy: 79.78%


Mean cross-validated score of the best_estimator: 0.7824
Parameters: {'algorithm': 'auto', 'leaf_size': 30, 'metric': 'minkowski', 'metric_params': None, 'n_jobs': None, 'n_neighbors': 21, 'p': 2, 'weights': 'uniform'}
Best parameters: {'n_neighbors': 21}
Training Accuracy: 78.82%
Test Accuracy: 78.39%


Mean cross-validated score of the best_estimator: 0.8555999999999999
Parameters: {'bootstrap': True, 'ccp_alpha': 0.0, 'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': 20, 'max_leaf_nodes': None, 'max_samples': None, 'min_imp

In [39]:
# Best mean 5-fold CV accuracy of each grid search, as [algorithm, score]
# pairs in append order: datasets (adult, letter, covtype) x 3 trials x
# algorithms (logreg, knn, rf).
best_score_list

[['logreg', 0.7982000000000001],
 ['knn', 0.7824],
 ['rf', 0.8555999999999999],
 ['logreg', 0.7986],
 ['knn', 0.7756000000000001],
 ['rf', 0.8497999999999999],
 ['logreg', 0.8042],
 ['knn', 0.7817999999999999],
 ['rf', 0.8515999999999998],
 ['logreg', 0.734],
 ['knn', 0.9495999999999999],
 ['rf', 0.9396000000000001],
 ['logreg', 0.7258],
 ['knn', 0.9545999999999999],
 ['rf', 0.9390000000000001],
 ['logreg', 0.7233999999999999],
 ['knn', 0.9480000000000001],
 ['rf', 0.9366],
 ['logreg', 0.6162],
 ['knn', 0.7699999999999999],
 ['rf', 0.8141999999999999],
 ['logreg', 0.6128],
 ['knn', 0.7642],
 ['rf', 0.8219999999999998],
 ['logreg', 0.623],
 ['knn', 0.7708000000000002],
 ['rf', 0.8108000000000001]]

In [40]:
# Accuracy of each tuned model on its own 5000-sample training split, as
# [algorithm, accuracy] pairs in append order: datasets (adult, letter,
# covtype) x 3 trials x algorithms (logreg, knn, rf).
train_accuracy_list

[['logreg', 0.7988],
 ['knn', 0.7882],
 ['rf', 1.0],
 ['logreg', 0.7946],
 ['knn', 0.7812],
 ['rf', 1.0],
 ['logreg', 0.803],
 ['knn', 0.7914],
 ['rf', 1.0],
 ['logreg', 0.7362],
 ['knn', 1.0],
 ['rf', 1.0],
 ['logreg', 0.7266],
 ['knn', 1.0],
 ['rf', 1.0],
 ['logreg', 0.721],
 ['knn', 1.0],
 ['rf', 1.0],
 ['logreg', 0.619],
 ['knn', 1.0],
 ['rf', 1.0],
 ['logreg', 0.6108],
 ['knn', 1.0],
 ['rf', 1.0],
 ['logreg', 0.6204],
 ['knn', 1.0],
 ['rf', 1.0]]

In [41]:
# Accuracy of each tuned model on the held-out remainder of its dataset, as
# [algorithm, accuracy] pairs in append order: datasets (adult, letter,
# covtype) x 3 trials x algorithms (logreg, knn, rf).
test_accuracy_list

[['logreg', 0.797830267406843],
 ['knn', 0.7838975363738616],
 ['rf', 0.8545771198432568],
 ['logreg', 0.7953267297993541],
 ['knn', 0.7858205435216429],
 ['rf', 0.8517107506984507],
 ['logreg', 0.7978665505605748],
 ['knn', 0.7846957657559596],
 ['rf', 0.847356772250644],
 ['logreg', 0.7268],
 ['knn', 0.9568],
 ['rf', 0.9477333333333333],
 ['logreg', 0.7272666666666666],
 ['knn', 0.9569333333333333],
 ['rf', 0.9456666666666667],
 ['logreg', 0.7231333333333333],
 ['knn', 0.9606666666666667],
 ['rf', 0.9549333333333333],
 ['logreg', 0.6133639576953258],
 ['knn', 0.785563495204961],
 ['rf', 0.8217137837406165],
 ['logreg', 0.6138795719533621],
 ['knn', 0.7852510017152421],
 ['rf', 0.8169517301722881],
 ['logreg', 0.6187266931938918],
 ['knn', 0.7826850829496608],
 ['rf', 0.8236807566509031]]

In [45]:
# Accuracy value (index 1) of the first recorded [algorithm, accuracy] pair.
# NOTE(review): the output saved with this cell is a whole ['logreg', acc]
# pair, which is what `test_accuracy_list[0]` would display — the saved
# output looks stale relative to this expression; re-run to confirm.
test_accuracy_list[0][1]

['logreg', 0.797830267406843]