
# Overview of dataset



The data was obtained from [the Broad Institute](http://portals.broadinstitute.org/cgi-bin/cancer/publications/view/43) and is stored as follows:

<table border="2" cellspacing="0" cellpadding="6" rules="groups" frame="hsides">


<colgroup>
<col  class="org-left" />

<col  class="org-left" />
</colgroup>
<thead>
<tr>
<th scope="col" class="org-left">Type of data</th>
<th scope="col" class="org-left">File name</th>
</tr>
</thead>

<tbody>
<tr>
<td class="org-left">Training data</td>
<td class="org-left">`data_set_ALL_AML_train.txt`</td>
</tr>


<tr>
<td class="org-left">Training data class labels</td>
<td class="org-left">`ALL_vs_AML_train_set_38_sorted.cls`</td>
</tr>


<tr>
<td class="org-left">Testing data</td>
<td class="org-left">`data_set_ALL_AML_independent.txt`</td>
</tr>


<tr>
<td class="org-left">Testing data class labels</td>
<td class="org-left">`Leuk_ALL_AML.test.cls`</td>
</tr>
</tbody>
</table>




# Cleaning the data



In [3]:
def clean_training_data(src_filename="data_set_ALL_AML_train.txt",
                        dst_filename="data_set_ALL_AML_train_cleaned.txt"):
    """Strip trailing tabs and newlines from every line of a text file.

    Every line of ``src_filename`` has its run of trailing tab and newline
    characters removed. The cleaned lines are written to ``dst_filename``
    joined by single newlines, so the output has no newline after its last
    line.

    Parameters
    ----------
    src_filename : str
        Path of the file to read. Defaults to the raw training data file.
    dst_filename : str
        Path of the file to write; it is overwritten if it exists.
        Defaults to the cleaned training data file.
    """
    # NOTE(review): presumably the trailing tabs would otherwise be parsed
    # as an extra empty column by the tab-separated reader -- confirm.
    with open(src_filename, "r") as f:
        clean_lines = [line.rstrip('\t\n') for line in f]

    with open(dst_filename, "w") as f:
        # The joined text is a single string, so write() is the right call;
        # writelines() would iterate over it one character at a time.
        f.write('\n'.join(clean_lines))


# Produces data_set_ALL_AML_train_cleaned.txt, the file the training data
# is later loaded from.
clean_training_data()


# Loading the data



In [4]:
import numpy, scipy, pandas
import sklearn
import re

def load_data(x_filename, y_filename):
    """Load a feature matrix and its class labels from two text files.

    Parameters
    ----------
    x_filename : str
        Tab-separated file with a header row. Every column whose name
        contains ``call`` is dropped, as are the ``Gene Description`` and
        ``Gene Accession Number`` columns. The remaining table is
        transposed, so each remaining column of the file becomes one row
        of the result.
    y_filename : str
        Class-label file. Only its second line is used; it is parsed as
        whitespace-separated numbers.

    Returns
    -------
    x : numpy.ndarray
        Transposed values of the remaining columns of ``x_filename``.
    y : numpy.ndarray
        1-D float array parsed from the second line of ``y_filename``.

    Raises
    ------
    IndexError
        If ``y_filename`` has fewer than two lines.
    KeyError
        If one of the two gene annotation columns is missing.
    """
    df_x = pandas.read_csv(x_filename, sep="\t")

    # DataFrame.select() was removed in pandas 1.0, so filter the columns
    # with an explicit boolean mask instead. pandas renames repeated
    # header names to "call.1", "call.2", ...; the pattern matches them all.
    keep = [re.search(r'call\.*', str(col)) is None for col in df_x.columns]
    df_x = df_x.loc[:, keep]
    df_x = df_x.drop(columns=['Gene Description',
                              'Gene Accession Number'])
    x = df_x.T.values

    with open(y_filename, "r") as fin:
        lines = fin.read().splitlines()
    # The first line of the file is skipped; the labels are on the second.
    labels_line = lines[1].rstrip()

    y = numpy.fromstring(labels_line, sep=" ")

    return x, y


# Training split: the cleaned expression file and its class labels.
x_train, y_train = load_data(
    "data_set_ALL_AML_train_cleaned.txt",
    "ALL_vs_AML_train_set_38_sorted.cls",
)

# Independent test split.
x_test, y_test = load_data(
    "data_set_ALL_AML_independent.txt",
    "Leuk_ALL_AML.test.cls",
)

# The test label line carries one additional 0 at the beginning,
# so drop the first entry.
y_test = y_test[1:]


# Run models



To choose the $\gamma$ parameter of the RBF kernel (where $\gamma = 1/(2\sigma^2)$), we follow the heuristic mentioned in Gretton et al. (2012, p. 748) of setting $\sigma$ equal to the median distance between points of the training data.



In [5]:
import sklearn.linear_model
import sklearn.kernel_ridge
import sklearn.metrics.pairwise
from sklearn.metrics import confusion_matrix
from scipy.spatial.distance import cdist
import statistics

import tabulate

# Calculate gamma as in Gretton et al.: sigma is the median pairwise
# Euclidean distance between training points, gamma = 1 / (2 * sigma^2).
# NOTE(review): cdist(x, x) is the full square matrix, so the median also
# covers the zero self-distances on the diagonal and counts each pair
# twice; scipy.spatial.distance.pdist would restrict it to distinct pairs.
b = cdist(x_train, x_train).ravel()
gamma = 1/(2 * pow(statistics.median(b), 2))

# Calculate RBF kernel
# K is train-vs-train; K_test is test-vs-train, so each test point is
# described by its kernel similarity to every training point.
K      = sklearn.metrics.pairwise.rbf_kernel(x_train, x_train, gamma=gamma)
K_test = sklearn.metrics.pairwise.rbf_kernel(x_test, x_train, gamma=gamma)

# Fit kernelized logistic regression
# (note that l2 regularization is applied by default)
clf = sklearn.linear_model.LogisticRegression(solver='lbfgs')
clf.fit(K, y_train)
kernelized_l2_preds = clf.predict(K_test)

# Fit kernelized logistic regression with l1 regularization
# The solver is named explicitly: liblinear supports the l1 penalty, while
# the default solver of recent scikit-learn releases (lbfgs) rejects it.
# random_state pins liblinear's data shuffling so reruns are reproducible.
clf = sklearn.linear_model.LogisticRegression(penalty='l1',
                                              solver='liblinear',
                                              random_state=0)
clf.fit(K, y_train)
kernelized_l1_preds = clf.predict(K_test)

# Fit non-kernelized logistic regression with l1 regularization
clf = sklearn.linear_model.LogisticRegression(penalty='l1',
                                              solver='liblinear',
                                              random_state=0)
clf.fit(x_train, y_train)
l1_preds = clf.predict(x_test)


## Evaluation of results



In [6]:
from sklearn.metrics import zero_one_loss

# For each model: the zero-one loss on the test set, and the confusion
# matrix of true versus predicted labels.

# Kernelized logistic regression, l2 penalty
kernelized_l2_er = zero_one_loss(y_test, kernelized_l2_preds)
kernelized_l2_cm = confusion_matrix(y_test, kernelized_l2_preds)

# Kernelized logistic regression, l1 penalty
kernelized_l1_er = zero_one_loss(y_test, kernelized_l1_preds)
kernelized_l1_cm = confusion_matrix(y_test, kernelized_l1_preds)

# Non-kernelized logistic regression, l1 penalty
l1_er = zero_one_loss(y_test, l1_preds)
l1_cm = confusion_matrix(y_test, l1_preds)

In [7]:
from IPython.display import HTML, display
import tabulate

# One row per model: display name followed by its test error rate.
table = [
    ["Kernelized L2", kernelized_l2_er],
    ["Kernelized L1", kernelized_l1_er],
    ["L1", l1_er],
]

# Render the rows as an HTML table and show it as rich output.
table_html = tabulate.tabulate(table, tablefmt='html')
display(HTML(table_html))

0,1
Kernelized L2,0.264706
Kernelized L1,0.411765
L1,0.0


(TODO: add grid search for SVM parameters)




## SVM parameters



In [8]:
import numpy
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.svm import SVC

# Candidate values on a base-2 exponential grid.
# NOTE(review): arange excludes its stop value, so the largest exponents
# are 13 for C and 1 for gamma -- confirm these are the intended ranges.
C_range = 2. ** numpy.arange(-5, 15, 2)
gamma_range = 2. ** numpy.arange(-5, 3, 2)

param_grid = dict(gamma=gamma_range, C=C_range)

# StratifiedKFold takes only the number of splits here; the labels are
# supplied later, when the search is fitted with grid.fit(X, y).
grid = GridSearchCV(SVC(),
                    param_grid=param_grid,
                    cv=StratifiedKFold(n_splits=5))


# References (move to separate file later)



Gretton, Arthur et al. 2012. "A Kernel Two-Sample Test." *Journal of Machine Learning Research*. Vol. 13, pp. 723-773.

Hsu, Chih-Wei et al. 2016. "A Practical Guide to Support Vector Classification." Department of Computer Science, National Taiwan University.

