In [2]:
import pandas as pd
pd.options.display.max_columns = 500
import numpy as np

import datetime
import warnings
from tqdm import tqdm_notebook as tqdm

from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC


from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

# Load remote data

Skip this section if the data is already saved on the microSD card.

In [3]:
# Column labels for the mushroom table, supplied to the CSV reader via `names=`.
# The first entry is the class label; the remaining 22 are the categorical
# attributes, grouped below by the part of the mushroom they describe.
MUSH_headers = [
    'edibility',
    'cap-shape', 'cap-surface', 'cap-color',
    'bruises', 'odor',
    'gill-attachment', 'gill-spacing', 'gill-size', 'gill-color',
    'stalk-shape', 'stalk-root',
    'stalk-surface-above-ring', 'stalk-surface-below-ring',
    'stalk-color-above-ring', 'stalk-color-below-ring',
    'veil-type', 'veil-color',
    'ring-number', 'ring-type',
    'spore-print-color', 'population', 'habitat',
]

In [4]:
# Download the raw mushroom table from the UCI repository. Column names come
# from MUSH_headers, and index_col=False keeps pandas from using the first
# column as the row index.
MUSH = pd.read_csv(
    'http://archive.ics.uci.edu/ml/machine-learning-databases/mushroom/agaricus-lepiota.data',
    names=MUSH_headers,
    index_col=False,
)

In [5]:
# Confirm the table loaded completely: the recorded output is (8124, 23).
MUSH.shape

(8124, 23)

In [6]:
# Integer-encode every column: each distinct value is replaced by its
# order-of-first-appearance index (0, 1, 2, ...).
#
# pd.factorize produces those codes directly as an integer array, and assigning
# the array to the column by name swaps the whole column, so the result is a
# true integer dtype. Writing replaced values back through `.iloc[:, col] = ...`
# can leave the column as `object` on current pandas, which the numeric
# estimators used later do not accept.
#
# NOTE: factorize codes missing values (NaN) as -1; the columns here are
# expected to hold only string category codes.
for column_name in MUSH.columns:
    codes, _ = pd.factorize(MUSH[column_name])
    MUSH[column_name] = codes

In [7]:
# Peek at three random rows to check that every column is now integer-coded.
MUSH.sample(3)

Unnamed: 0,edibility,cap-shape,cap-surface,cap-color,bruises,odor,gill-attachment,gill-spacing,gill-size,gill-color,stalk-shape,stalk-root,stalk-surface-above-ring,stalk-surface-below-ring,stalk-color-above-ring,stalk-color-below-ring,veil-type,veil-color,ring-number,ring-type,spore-print-color,population,habitat
3572,0,0,2,1,1,4,0,0,1,2,0,2,2,3,2,4,0,0,0,2,3,4,4
3877,1,3,2,4,0,3,0,0,1,1,1,2,0,0,1,1,0,0,0,0,1,3,3
3608,0,3,2,3,1,4,0,0,1,3,0,2,2,3,4,3,0,0,0,2,3,4,4


In [8]:
def log(message):
    """Append one timestamped line to the run log.

    The line has the form ``[<current datetime>] <message>`` and is appended
    to ``MUSH_cv.log`` in the current working directory; the file is created
    if it does not exist yet.

    Parameters
    ----------
    message : object
        Text (or anything formattable in an f-string) to record.
    """
    stamp = datetime.datetime.now()
    line = f'[{stamp}] {message}\n'
    with open('MUSH_cv.log', 'a') as handle:
        handle.write(line)

def log_scores(partition, classifier, training, validation, testing):
    """Record one summary line of scores for a classifier at a training fraction.

    The logged line is ``<classifier> <partition> <training> <validation>
    <testing>`` with the three scores formatted to four decimal places.

    Parameters
    ----------
    partition : float
        Training fraction the scores belong to.
    classifier : str
        Label of the classifier being reported.
    training, validation, testing : float
        Scores to record.
    """
    summary = f'{classifier} {partition} {training:.4f} {validation:.4f} {testing:.4f}'
    log(summary)

def log_parameters(params):
    """Record a hyper-parameter mapping as a single ``BEST_PARAMS`` log line.

    Each entry is rendered as ``key: value`` and the entries are joined with
    ``", "`` in the mapping's iteration order.

    Parameters
    ----------
    params : dict
        Hyper-parameter names mapped to their chosen values.
    """
    rendered = ", ".join(f'{key}: {value}' for key, value in params.items())
    log(f'BEST_PARAMS: {rendered}')

# Model Performance Testing

### Set up KFold

In [9]:
# Number of cross-validation folds used by every grid search below.
folds = 5

# Shuffled K-fold splitter. No random_state is passed, so the fold assignment
# is not fixed from one run to the next.
kf = KFold(shuffle=True, n_splits=folds)

### Initialize 4 classifiers to test

In [10]:
# Random forest: 1024 entropy-criterion trees using all cores. The search
# varies how many features each split may consider.
rfc = RandomForestClassifier(n_estimators=1024, n_jobs=-1, criterion='entropy')
rfc_params = {'max_features': [1, 2, 4, 6, 8, 12, 16, 20]}

# Support vector classifier: search over kernel, polynomial degree, gamma and C.
# C is a log-spaced grid from 1e-7 up to 1e2. The small values are written as
# 1e-k on purpose: a literal such as 10e-2 equals 0.1 (not 0.01), which would
# duplicate the explicit 0.1 entry and shift the whole low end up by a decade.
svc = SVC()
svc_params = {'kernel': ['linear', 'poly', 'rbf'],
              'degree': [2, 3],
              'gamma': [0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 2],
              'C': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 0.1, 1.0, 10.0, 100.0]}

# k-nearest neighbours on all cores. Only the weighting scheme is listed here;
# the 'n_neighbors' grid is added inside the evaluation loop because it is
# derived from the training-set size.
knn = KNeighborsClassifier(n_jobs=-1)
knn_params = {'weights': ['uniform', 'distance']}

# Gradient boosting (XGBoost): search the number of boosting rounds in powers of two.
boost = xgb.XGBClassifier()
boost_params = {'n_estimators': [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048]}

## Test performance of all classifiers

In [11]:
# One row per model under test: (log label, estimator, hyper-parameter grid).
# Keeping the three pieces side by side guarantees the parallel lists below
# stay aligned.
_model_table = [
    ('boost', boost, boost_params),
    ('knn', knn, knn_params),
    ('random forest', rfc, rfc_params),
    ('svc', svc, svc_params),
]
classifier_names, classifiers, classifier_params = map(list, zip(*_model_table))

# Fractions of the data set used for training + validation.
partitions = [0.2, 0.5, 0.8]
# Number of repeated grid searches per (partition, classifier) pair.
trials = 3
# Total number of rows available.
total = len(MUSH.index)

def grid_search(classifier, params):
    """Build an unfitted grid search for ``classifier`` over ``params``.

    The search uses the shared ``kf`` splitter for cross-validation, scores
    candidates with ``'f1'`` and keeps training-fold scores in its results.

    Parameters
    ----------
    classifier : estimator
        Model whose hyper-parameters are searched.
    params : dict
        Parameter grid passed to ``GridSearchCV``.

    Returns
    -------
    GridSearchCV
        The configured, not yet fitted, search object.
    """
    search_options = dict(cv=kf, scoring='f1', return_train_score=True)
    return GridSearchCV(classifier, params, **search_options)

def train_val(classifier, params, X_train, y_train):
    """Run a cross-validated grid search on the training data.

    Parameters
    ----------
    classifier : estimator
        Model whose hyper-parameters are searched.
    params : dict
        Parameter grid to explore.
    X_train, y_train
        Features and labels used for fitting and validation.

    Returns
    -------
    The fitted search object produced by ``grid_search``.
    """
    search = grid_search(classifier, params)
    search.fit(X_train, y_train)
    return search

def test(estimator, X_test, y_test):
    return estimator.score(X_test, y_test)

# One progress tick per (partition, classifier, trial) grid search.
pbar = tqdm(total=len(partitions)*trials*len(classifiers))

log('==== MUSH CV ====')

for partition in partitions:

    # Fresh random row order for each training fraction.
    shuffled = MUSH.sample(frac=1)

    # Rows used for training + validation: the requested fraction, capped at 5000.
    split = min(int(total * partition), 5000)

    log(f'TRAINING SIZE: {split}, TESTING SIZE: {total - split}')

    # One-hot encoded copy of the feature columns, used by SVC and kNN below.
    # Casting to object makes get_dummies treat the integer codes as categories.
    obj_cols = shuffled.iloc[:, 1:].astype(object)
    shuffled_ohe = pd.get_dummies(obj_cols, prefix=MUSH_headers[1:])

    # Column 0 is the label; the remaining columns are features.
    X_train_val = shuffled.iloc[:split, 1:]
    X_train_val_ohe = shuffled_ohe.iloc[:split, :]
    y_train_val = shuffled.iloc[:split, 0]

    X_test = shuffled.iloc[split:, 1:]
    X_test_ohe = shuffled_ohe.iloc[split:, :]
    y_test = shuffled.iloc[split:, 0]

    for classifier, params, name in zip(classifiers, classifier_params, classifier_names):

        # Per-trial results for this (partition, classifier) pair.
        training = []
        validation = []
        testing = []
        best_params = []

        if name == 'knn':
            # 26 evenly spaced neighbour counts from 1 up to the size of one
            # cross-validation training fold. This updates the shared kNN grid
            # in place; it is recomputed for every partition.
            params['n_neighbors'] = np.linspace(1, int(split * (folds-1)/folds), 26).astype(int)

        for trial in range(trials):
            with warnings.catch_warnings():
                warnings.simplefilter("ignore")
                if name in ['svc', 'knn']:
                    # These models are trained on the one-hot features.
                    gs = train_val(classifier, params, X_train_val_ohe, y_train_val)
                    testing.append(test(gs.best_estimator_, X_test_ohe, y_test))
                elif name in ['random forest', 'boost']:
                    # Tree ensembles are trained on the integer-coded features.
                    gs = train_val(classifier, params, X_train_val, y_train_val)
                    testing.append(test(gs.best_estimator_, X_test, y_test))

            log(f'TRAIN: {name} {partition} {trial + 1}')

            training.append(gs.cv_results_['mean_train_score'])
            validation.append(gs.cv_results_['mean_test_score'])
            best_params.append(gs.best_params_)
            pbar.update()

        # Pick the trial with the highest held-out score BEFORE `testing` is
        # collapsed to its mean; argmax of the scalar mean would always be 0
        # and the first trial's parameters would be logged regardless.
        best_trial = int(np.argmax(testing))

        # Average each grid point across trials, then report the best grid point.
        training = max(np.array(training).mean(axis=0))
        validation = max(np.array(validation).mean(axis=0))
        testing = np.mean(testing)

        log_scores(partition, name, training, validation, testing)
        log_parameters(best_params[best_trial])

pbar.close()

A Jupyter Widget


