# Regularization vs. Transformation

In [73]:
import warnings
import scipy, pandas as pd
import numpy as np
from pandas import DataFrame as df
import sklearn
import re
import matplotlib.pyplot as plt
from pandas.tools.plotting import table
%matplotlib inline
warnings.filterwarnings('ignore')

# Datasets

In [74]:
# Registry of loaded datasets, keyed by dataset name. Each cell below stores
# its split as a 4-item sequence: [X_train, X_test, y_train, y_test].
datasets = {}

## Leukemia

In [75]:
def load_data(X_filename, y_filename):
    """Load an expression table and its class labels from two text files.

    Parameters
    ----------
    X_filename : str
        Path to a tab-separated table with a header row. Every column whose
        name matches the regex ``call\\.*`` is discarded, as are the
        'Gene Description' and 'Gene Accession Number' columns.
    y_filename : str
        Path to a label file. Labels are parsed from its SECOND line as
        space-separated numbers; the first line is ignored.

    Returns
    -------
    X : numpy.ndarray
        Values of the remaining table, transposed (one row per remaining
        column of the input file).
    y : numpy.ndarray
        Labels parsed from the second line of ``y_filename``.

    Raises
    ------
    IndexError
        If ``y_filename`` has fewer than two lines.
    KeyError
        If either gene-annotation column is missing from the table.
    """
    df_X = pd.read_csv(X_filename, sep="\t")
    # DataFrame.select() was deprecated and later removed from pandas; an
    # explicit column filter behaves the same on old and new versions.
    # The raw string avoids the invalid "\." escape of the original literal.
    kept_columns = [col for col in df_X.columns
                    if not re.search(r'call\.*', col)]
    df_X = df_X.loc[:, kept_columns]
    df_X = df_X.drop(['Gene Description',
                      'Gene Accession Number'], axis=1)
    # Transpose so that each remaining input column becomes one row of X.
    X = df_X.T.values

    with open(y_filename, "r") as fin:
        data = fin.read().splitlines(True)
    # Only the second line carries the labels.
    data = data[1].rstrip()

    y = np.fromstring(data, sep=" ")

    return X, y
# Strip trailing tab/newline characters from every row of the raw training
# table and save the result as a cleaned copy that load_data can parse.
clean_lines = []
with open("data_set_ALL_AML_train.txt", "r") as f:
    lines = f.readlines()
    clean_lines = [raw_line.rstrip('\t\n') for raw_line in lines]

with open("data_set_ALL_AML_train_cleaned.txt", "w") as f:
    f.write('\n'.join(clean_lines))

X_train, y_train = load_data(
    "data_set_ALL_AML_train_cleaned.txt",
    "ALL_vs_AML_train_set_38_sorted.cls",
)
X_test, y_test = load_data(
    "data_set_ALL_AML_independent.txt",
    "Leuk_ALL_AML.test.cls",
)
# dataset has one additional 0 at beginning, remove
y_test = y_test[1:]

datasets['leukemia'] = [X_train, X_test, y_train, y_test]

## MNIST

In [76]:
# MNIST dataset
# NOTE(review): fetch_mldata and pd.SparseDataFrame only exist in older
# scikit-learn / pandas releases; porting to newer releases needs a different
# loader and is left for a separate change since it may alter the data.
from sklearn.datasets import fetch_mldata
from sklearn.model_selection import train_test_split
mnist = fetch_mldata('MNIST original')
X = df(pd.SparseDataFrame(mnist.data))
y = df(mnist.target)
X['label'] = y
# Extract only 0 and 8 digits
X = X[(X['label']==0) | (X['label']==8)]
# Pass the axis by keyword: the positional form drop('label', 1) is rejected
# by newer pandas, while axis=1 works on every version.
datasets['mnist'] = train_test_split(X.drop('label', axis=1), X['label'], test_size=0.4, random_state=42)
print("MNIST Loaded")
print("Train: {} samples".format(datasets['mnist'][0].shape[0]))
print("Test: {} samples".format(datasets['mnist'][1].shape[0]))

MNIST Loaded
Train: 8236 samples
Test: 5492 samples


## Breast Cancer

In [77]:
# Breast Cancer dataset
from sklearn.datasets import load_breast_cancer
# return_X_y is passed by keyword: newer scikit-learn releases reject the
# positional form load_breast_cancer(True). With return_X_y=True the loader
# returns (data, target), which is unpacked straight into train_test_split.
datasets['breast cancer'] = train_test_split(*load_breast_cancer(return_X_y=True), test_size=0.4, random_state=42)
print("Breast Cancer Loaded")
print("Train: {} samples".format(datasets['breast cancer'][0].shape[0]))
print("Test: {} samples".format(datasets['breast cancer'][1].shape[0]))

Breast Cancer Loaded
Train: 341 samples
Test: 228 samples


## Wilt

In [78]:
# Wilt dataset
# NOTE(review): WiltDataSet comes from the project-local wilt_dataset module;
# its parsing logic is not visible here, so statement order is left untouched.
from wilt_dataset import WiltDataSet
# One reader object per file (training and testing are shipped separately).
wds_train = WiltDataSet()
wds_test = WiltDataSet()
wds_train.read_in_dataset("../WiltDataSet/training.txt")
wds_test.read_in_dataset("../WiltDataSet/testing.txt")
tr_data, tr_labels = wds_train.convert_data_to_numpy_array()
te_data, te_labels = wds_test.convert_data_to_numpy_array()
# Stored as [X_train, X_test, y_train, y_test]; labels are flattened with ravel().
datasets["wilt"] = [tr_data, te_data, tr_labels.ravel(), te_labels.ravel()]
print("Wilt Loaded")
print("Train: {} samples".format(datasets['wilt'][0].shape[0]))
print("Test: {} samples".format(datasets['wilt'][1].shape[0]))

Wilt Loaded
Train: 4339 samples
Test: 500 samples


## Letters

In [79]:
# Letters dataset
# NOTE(review): LetterRecognitionDataSet is a project-local reader; what it
# keeps from the raw file is not visible in this notebook.
from letters import LetterRecognitionDataSet

letter = LetterRecognitionDataSet()
letter.read_in_dataset("letter-recognition.data")
letter_data, letter_labels = letter.convert_data_to_numpy_array()

# 60/40 train/test split with a fixed seed; labels are flattened first.
datasets["letters"] = train_test_split(
    letter_data,
    letter_labels.ravel(),
    test_size=0.4,
    random_state=42,
)

print("Letters Loaded")
print("Train: {} samples".format(datasets['letters'][0].shape[0]))
print("Test: {} samples".format(datasets['letters'][1].shape[0]))

Letters Loaded
Train: 925 samples
Test: 618 samples


## Dataset Summary

In [13]:
# Display dataset splits and sizes
index = ['Dataset', '# of Features', 'Train [classA, classB]', 'Test [classA, classB]']
# Collect one row per dataset and build the frame once; growing a DataFrame
# with .append() inside a loop is quadratic and unavailable in newer pandas.
summary_rows = []
for m in datasets:
    # Each entry is [X_train, X_test, y_train, y_test].
    features = datasets[m][0].shape[1]
    # Per-class sample counts, ordered by sorted class label.
    train = np.unique(datasets[m][2], return_counts=True)[1]
    test = np.unique(datasets[m][3], return_counts=True)[1]
    summary_rows.append([m.capitalize(), features, train, test])
data_stats = df(summary_rows, columns=index)
data_stats

Unnamed: 0,Dataset,# of Features,"Train [classA, classB]","Test [classA, classB]"
0,Mnist,784,"[4132, 4104]","[2771, 2721]"
1,Letters,16,"[455, 470]","[313, 305]"
2,Breast cancer,30,"[132, 209]","[80, 148]"
3,Leukemia,7129,"[27, 11]","[20, 14]"
4,Wilt,5,"[74, 4265]","[187, 313]"


# Experiment

## Logistic Regression

In [84]:
import sklearn.linear_model
import sklearn.kernel_ridge
import sklearn.metrics.pairwise
from sklearn.metrics import confusion_matrix
from sklearn.metrics import zero_one_loss
from scipy.spatial.distance import cdist
import statistics

def estimate_log_regression(X_train, X_test, y_train, y_test, 
                            kernelize=False, penalty='l2'):
    """Train a logistic regression model and score it on the test split.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test labels.
    kernelize : bool, default False
        When true, both feature matrices are replaced by their RBF kernel
        values against the training samples before fitting, and a large C
        (10000) is used. Otherwise the raw features are used with C=1.
    penalty : str, default 'l2'
        Regularization penalty passed to LogisticRegression ('l1' or 'l2').

    Returns
    -------
    dict
        ``{'error': <zero-one loss on the test split>,
        'confusion': <confusion matrix on the test split>}``.
    """
    if kernelize:
        print('Estimating logisitic regression with RBF kernel...')
        # Calculate gamma as in Gretton et al.: 1 / (2 * median_distance^2).
        # np.median returns the same value as statistics.median here but
        # avoids sorting an n^2-element array as Python objects.
        median_dist = np.median(cdist(X_train, X_train))
        gamma = 1/(2 * pow(median_dist, 2))
        # Transform data via RBF kernel (test rows are compared to train rows)
        K_train = sklearn.metrics.pairwise.rbf_kernel(X_train, X_train, gamma=gamma)
        X_test = sklearn.metrics.pairwise.rbf_kernel(X_test, X_train, gamma=gamma)
        X_train = K_train
        c = 10000
    else:
        print('Estimating logisitic regression with {} regularization...'.format(penalty))
        c = 1
    # Fit logistic regression. The solver is pinned to liblinear, which
    # supports both 'l1' and 'l2'; the default solver of newer scikit-learn
    # releases rejects penalty='l1'.
    clf = sklearn.linear_model.LogisticRegression(penalty=penalty, C=c,
                                                  solver='liblinear')
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    err = zero_one_loss(y_test, y_pred)
    conf_mat = confusion_matrix(y_test, y_pred)
    return {'error' : err, 'confusion' : conf_mat}


## SVM



In [151]:
from sklearn.svm import SVC
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
# This code is a modification of code at
# http://ogrisel.github.io/scikit-learn.org/sklearn-tutorial/auto_examples/svm/plot_svm_parameters_selection.html
def find_svm_best_params(X_train, y_train, kernel_type):
    """Find the best C value for an SVM using 5-fold cross validation.

    Parameters
    ----------
    X_train : array-like
        Training feature matrix.
    y_train : array-like
        Training labels.
    kernel_type : str
        Kernel name passed to SVC (the notebook uses 'rbf' and 'linear').

    Returns
    -------
    dict
        ``best_params_`` of the grid search, i.e. ``{'C': <best value>}``.
    """
    # Calculate gamma as in Gretton et al.: 1 / (2 * median_distance^2).
    # np.median gives the same value as statistics.median without sorting
    # an n^2-element array as Python objects.
    median_dist = np.median(cdist(X_train, X_train))
    gamma = 1/(2 * pow(median_dist, 2))
    # Candidate C values: 0.01, 1, 100.
    C_range = 10. ** np.arange(-2, 4, 2)
    param_grid = dict(C=C_range)
    grid_search = GridSearchCV(SVC(kernel=kernel_type, gamma=gamma), param_grid, cv=5)
    grid_search.fit(X_train, y_train)
    return grid_search.best_params_
def estimate_svm(X_train, X_test, y_train, y_test, kernel_type):
    """Train an SVM model and score it on the test split.

    C is chosen by ``find_svm_best_params`` on the training split; gamma is
    set by the median heuristic on the training split.

    Parameters
    ----------
    X_train, X_test : array-like
        Training and test feature matrices.
    y_train, y_test : array-like
        Training and test labels.
    kernel_type : str
        Kernel name passed to SVC (the notebook uses 'rbf' and 'linear').

    Returns
    -------
    dict
        ``{'error': <zero-one loss on the test split>,
        'confusion': <confusion matrix on the test split>}``.
    """
    print("Estimating SVM with {} kernel...".format(kernel_type))
    # Determine best C with cross-validation
    best_params = find_svm_best_params(X_train, y_train, kernel_type)
    c = best_params['C']
    print("Best C value: {}".format(c))
    # Calculate gamma as in Gretton et al.: 1 / (2 * median_distance^2).
    # np.median gives the same value as statistics.median without sorting
    # an n^2-element array as Python objects.
    median_dist = np.median(cdist(X_train, X_train))
    gamma = 1/(2 * pow(median_dist, 2))
    our_svm = SVC(kernel=kernel_type, C=c, gamma=gamma)
    our_svm.fit(X_train, y_train)
    y_pred = our_svm.predict(X_test)
    err = zero_one_loss(y_test, y_pred)
    conf_mat = confusion_matrix(y_test, y_pred)
    return {'error' : err, 'confusion' : conf_mat}

## Evaluation

In [115]:
def evaluate_models(X_train, X_test, y_train, y_test):
    """Run every experimental model on one train/test split.

    Returns a dict keyed by model name; each value is the result dict
    produced by the corresponding estimate_* function. Models are fitted
    in the order the keys are inserted below.
    """
    split = (X_train, X_test, y_train, y_test)
    outcomes = {}
    outcomes['Logistic Regression L1'] = estimate_log_regression(*split, penalty='l1')
    outcomes['Logistic Regression L2'] = estimate_log_regression(*split)
    outcomes['Logistic Regression RBF'] = estimate_log_regression(*split, kernelize=True)
    outcomes['SVM RBF'] = estimate_svm(*split, kernel_type='rbf')
    outcomes['SVM Linear'] = estimate_svm(*split, kernel_type='linear')
    return outcomes
def display_results(results):
    """Display the test error of every model on every dataset as a table.

    Parameters
    ----------
    results : dict
        Maps dataset name -> {model name -> result dict with an 'error'
        entry}. The mapping is only read, never modified, so this function
        can be called repeatedly on the same object.

    The table has one row per dataset (name upper-cased in the 'Dataset'
    column) and one column per model holding its 'error' value.
    """
    series_index = ['Dataset', 
                    'Logistic Regression L1', 
                    'Logistic Regression L2', 
                    'Logistic Regression RBF',
                    'SVM RBF',
                    'SVM Linear']
    # Build fresh row dicts instead of overwriting entries of `results`:
    # mutating the input made a second call fail on the already-flattened
    # values.
    rows = []
    for dataset_name, model_results in results.items():
        row = {'Dataset': dataset_name.upper()}
        for model, outcome in model_results.items():
            row[model] = outcome['error']
        rows.append(row)
    # Keep the canonical column order, then any model not listed above so
    # that no result is silently dropped.
    columns = list(series_index)
    for row in rows:
        for key in row:
            if key not in columns:
                columns.append(key)
    # Construct the frame once (row-by-row DataFrame.append is unavailable
    # in newer pandas); object dtype keeps the original cell rendering.
    results_df = df(rows, columns=columns, dtype=object)
    display(results_df)

In [152]:
# Evaluate every model on every registered dataset; results are keyed by
# dataset name.
results = {}
for d, split in datasets.items():
    print("{} dataset:".format(d.upper()))
    results[d] = evaluate_models(*split)

WILT dataset:
Estimating logisitic regression with l1 regularization...
Estimating logisitic regression with l2 regularization...
Estimating logisitic regression with RBF kernel...
Estimating SVM with rbf kernel...
Best C value: 100.0
Estimating SVM with linear kernel...
Best C value: 0.01
MNIST dataset:
Estimating logisitic regression with l1 regularization...
Estimating logisitic regression with l2 regularization...
Estimating logisitic regression with RBF kernel...
Estimating SVM with rbf kernel...
Best C value: 100.0
Estimating SVM with linear kernel...
Best C value: 0.01
LEUKEMIA dataset:
Estimating logisitic regression with l1 regularization...
Estimating logisitic regression with l2 regularization...
Estimating logisitic regression with RBF kernel...
Estimating SVM with rbf kernel...
Best C value: 100.0
Estimating SVM with linear kernel...
Best C value: 0.01
LETTERS dataset:
Estimating logisitic regression with l1 regularization...
Estimating logisitic regression with l2 regular

In [153]:
# Show the test error of each model on each dataset as a single table.
display_results(results)

Unnamed: 0,Dataset,Logistic Regression L1,Logistic Regression L2,Logistic Regression RBF,SVM RBF,SVM Linear
0,BREAST CANCER,0.035088,0.035088,0.035088,0.026316,0.04386
1,MNIST,0.013292,0.012746,0.005645,0.003824,0.014385
2,LEUKEMIA,0.0,0.029412,0.029412,0.029412,0.029412
3,LETTERS,0.014563,0.017799,0.011327,0.004854,0.024272
4,WILT,0.308,0.35,0.134,0.144,0.296



# References (move to separate file later)

