##### 1. Download and load data

In [55]:
import os
import urllib.request


url = "https://jundongl.github.io/scikit-feature/files/datasets/leukemia.mat"
data_path = "leukemia.mat"

# check if the file exists
if os.path.exists(data_path):
    print("The file exists, no need to download it again.")
else:
    # download the file
    print("The file does not exist, downloading it now...")
    # Fetch into a temporary name first: an interrupted transfer must never
    # leave a truncated file that the existence check above accepts next run.
    partial_path = data_path + ".part"
    try:
        urllib.request.urlretrieve(url, partial_path)
        os.replace(partial_path, data_path)
    finally:
        # Only still present when the download or the rename failed.
        if os.path.exists(partial_path):
            os.remove(partial_path)


The file exists, no need to download it again.


In [56]:
from scipy.io import loadmat

# Read the downloaded MATLAB file; the result is indexed by variable name below.
mat = loadmat(file_name='leukemia.mat')

##### 2. Load and clean data

In [57]:
# Pull the 'X' and 'Y' entries out of the loaded .mat contents.
X, y = mat['X'], mat['Y']
for arr in (X, y):
    print(arr.shape)

# Flatten y to a 1D array, then show both shapes again.
y = y.reshape(-1)
for arr in (X, y):
    print(arr.shape)

(72, 7070)
(72, 1)
(72, 7070)
(72,)


In [58]:
# Drop the features whose variance is below the threshold (0.5).
from sklearn.feature_selection import VarianceThreshold

sel = VarianceThreshold(threshold=0.5)

# Fit on the full matrix, then apply the fitted filter to the same matrix.
sel.fit(X)
X_sel = sel.transform(X)
print(X_sel.shape)

(72, 6990)


In [64]:
# Hold out 20% of the variance-filtered samples for testing; the fixed
# random_state keeps the split identical between runs.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X_sel,
    y,
    test_size=0.2,
    random_state=2,
)

##### 3. Find best features

In [65]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

# Recursive feature elimination on the training split: keep 1000 features,
# with step=10 controlling how many are eliminated per iteration.
clf_lr = LogisticRegression()
rfe_lr = RFE(estimator=clf_lr, n_features_to_select=1000, step=10)
rfe_lr = rfe_lr.fit(X_train, y_train)

# Seed the forest so its results can be reproduced between runs.
clf_rf = RandomForestClassifier(random_state=2)
rfe_rf = RFE(estimator=clf_rf, n_features_to_select=1000, step=10)
# Bind the fitted forest selector to rfe_rf. Assigning it to rfe_lr would
# throw away the fitted logistic-regression selector created just before.
rfe_rf = rfe_rf.fit(X_train, y_train)

##### 4. Investigate the influence of two metrics 

In [66]:
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score

# 6-fold cross-validation of each classifier, once per metric.
# NOTE(review): scoring uses X_test / y_test only, with all of its columns;
# rfe_lr and rfe_rf are not referenced in this cell - confirm this is intended.
scores_lr_acc, scores_lr_auc = (
    cross_val_score(clf_lr, X_test, y_test, cv=6, scoring=metric)
    for metric in ('accuracy', 'roc_auc')
)
scores_rf_acc, scores_rf_auc = (
    cross_val_score(clf_rf, X_test, y_test, cv=6, scoring=metric)
    for metric in ('accuracy', 'roc_auc')
)

# Report mean and two standard deviations of the fold scores per model.
report = (
    ("Logistic Regression", scores_lr_acc, scores_lr_auc),
    ("Random Forest", scores_rf_acc, scores_rf_auc),
)
for position, (title, acc, auc) in enumerate(report):
    if position:
        # Blank line between the two model summaries.
        print()
    print(title)
    print("Accuracy: %0.2f (+/- %0.2f)" % (acc.mean(), acc.std() * 2))
    print("AUC: %0.2f (+/- %0.2f)" % (auc.mean(), auc.std() * 2))




Logistic Regression
Accuracy: 0.92 (+/- 0.37)
AUC: 1.00 (+/- 0.00)

Random Forest
Accuracy: 0.92 (+/- 0.37)
AUC: 0.83 (+/- 0.75)


In [67]:
from sklearn.feature_selection import SelectFromModel

# L1-penalised logistic regression, fitted on the training split.
clf_lr = LogisticRegression(penalty='l1', solver='liblinear').fit(X_train, y_train)

# Random forest fitted on the training split; it backs the selector below.
clf_rf = RandomForestClassifier().fit(X_train, y_train)

# The forest is already fitted (prefit=True), so the selector is only used to
# reduce both splits to the columns it keeps.
sfm_rf = SelectFromModel(clf_rf, prefit=True)
X_train_rf, X_test_rf = (sfm_rf.transform(part) for part in (X_train, X_test))

# 6-fold cross-validation on the held-out split: logistic regression sees all
# columns of X_test, the forest sees only the selected ones.
scores_lr_acc, scores_lr_auc = (
    cross_val_score(clf_lr, X_test, y_test, cv=6, scoring=metric)
    for metric in ('accuracy', 'roc_auc')
)
scores_rf_acc, scores_rf_auc = (
    cross_val_score(clf_rf, X_test_rf, y_test, cv=6, scoring=metric)
    for metric in ('accuracy', 'roc_auc')
)

# Report mean and two standard deviations of the fold scores per model.
report = (
    ("Logistic Regression", scores_lr_acc, scores_lr_auc),
    ("Random Forest", scores_rf_acc, scores_rf_auc),
)
for position, (title, acc, auc) in enumerate(report):
    if position:
        # Blank line between the two model summaries.
        print()
    print(title)
    print("Accuracy: %0.2f (+/- %0.2f)" % (acc.mean(), acc.std() * 2))
    print("AUC: %0.2f (+/- %0.2f)" % (auc.mean(), auc.std() * 2))

Logistic Regression
Accuracy: 0.92 (+/- 0.37)
AUC: 1.00 (+/- 0.00)

Random Forest
Accuracy: 0.86 (+/- 0.40)
AUC: 1.00 (+/- 0.00)


##### 5. The same without removing low-variance features

In [69]:
# Split the raw matrix X (no variance filtering) with the same seed and test
# fraction. This rebinds X_train / X_test / y_train / y_test, replacing the
# split made from X_sel earlier in the notebook.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=2)

# Recursive feature elimination on the training split: keep 1000 features,
# with step=10 controlling how many are eliminated per iteration.
clf_lr = LogisticRegression()
rfe_lr = RFE(estimator=clf_lr, n_features_to_select=1000, step=10)
rfe_lr = rfe_lr.fit(X_train, y_train)

# Seed the forest so its results can be reproduced between runs.
clf_rf = RandomForestClassifier(random_state=2)
rfe_rf = RFE(estimator=clf_rf, n_features_to_select=1000, step=10)
# Bind the fitted forest selector to rfe_rf. Assigning it to rfe_lr would
# throw away the fitted logistic-regression selector created just before.
rfe_rf = rfe_rf.fit(X_train, y_train)


# 6-fold cross-validation of each classifier on the held-out split.
# NOTE(review): rfe_lr and rfe_rf are not referenced by the scoring below -
# confirm that evaluating on all columns of X_test is intended.
scores_lr_acc = cross_val_score(clf_lr, X_test, y_test, cv=6, scoring='accuracy')
scores_lr_auc = cross_val_score(clf_lr, X_test, y_test, cv=6, scoring='roc_auc')

scores_rf_acc = cross_val_score(clf_rf, X_test, y_test, cv=6, scoring='accuracy')
scores_rf_auc = cross_val_score(clf_rf, X_test, y_test, cv=6, scoring='roc_auc')


# Report mean and two standard deviations of the fold scores per model.
print("Logistic Regression")
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_lr_acc.mean(), scores_lr_acc.std() * 2))
print("AUC: %0.2f (+/- %0.2f)" % (scores_lr_auc.mean(), scores_lr_auc.std() * 2))

print()

print("Random Forest")
print("Accuracy: %0.2f (+/- %0.2f)" % (scores_rf_acc.mean(), scores_rf_acc.std() * 2))
print("AUC: %0.2f (+/- %0.2f)" % (scores_rf_auc.mean(), scores_rf_auc.std() * 2))



Logistic Regression
Accuracy: 0.92 (+/- 0.37)
AUC: 1.00 (+/- 0.00)

Random Forest
Accuracy: 0.94 (+/- 0.25)
AUC: 1.00 (+/- 0.00)
