In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# load the PCA model
sarcoma_df = pd.read_csv('../Data/sarcoma-gene-exp-FPKM-zscore-no-label-nomfs.csv')
sarcoma_df.shape

(189, 20605)

In [12]:
# convert df to array
sarcoma_data = sarcoma_df.to_numpy()
print(sarcoma_data)

[[-0.51154087  0.09480786 -0.35176093 ...  0.51624723  0.02346097
   2.48359169]
 [-0.19129576  0.15347405  0.20261954 ... -0.41028013  2.86123234
   0.16777757]
 [ 0.26472818  2.327348   -0.56049386 ... -0.21651268  0.0192731
   0.76332633]
 ...
 [-0.2652622  -0.48026337 -1.16285933 ... -0.36779173  0.24949394
  -0.36985907]
 [ 0.24931652 -0.03363532 -0.97441342 ... -0.50605902 -1.23871739
   0.28153212]
 [-0.07085013 -0.29522455 -0.59015045 ... -0.50644652 -0.01137879
  -0.39862195]]


In [13]:
# Run PCA based on preserving variance
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
pca.fit(sarcoma_data)
print("PCA fit is complete")

PCA fit is complete


In [14]:
X = pca.transform(sarcoma_data)
print("PCA transformation is complete")

PCA transformation is complete


In [15]:
print("reduced shape: {}".format(str(X.shape)))

reduced shape: (189, 150)


In [16]:
# read in labels
sarcoma_labels_df = pd.read_csv('../Data/sarcoma-gene-exp-FPKM-labels-nomfs.csv')
sarcoma_labels_df.shape

(189, 1)

In [17]:
# Convert label df to np array
y_df = sarcoma_labels_df['label']
y = y_df.to_numpy()
print(y)

[4 2 2 5 3 2 4 4 3 4 4 4 3 0 3 4 2 4 4 0 2 4 3 3 3 2 0 4 4 5 3 3 2 4 4 5 4
 3 4 4 5 4 2 4 4 4 4 4 4 2 4 3 3 2 2 4 4 4 4 4 4 4 2 2 3 4 2 3 4 4 3 4 3 2
 4 3 2 3 4 4 3 4 3 4 3 4 4 3 4 4 4 0 4 3 4 3 3 3 5 2 0 3 3 4 2 3 0 3 4 2 4
 2 0 3 4 4 3 2 2 3 4 4 4 4 4 3 2 4 2 4 0 2 2 3 4 4 2 3 4 4 3 3 4 3 4 2 2 3
 2 2 0 4 2 4 2 4 3 3 4 2 4 2 4 3 4 2 3 4 2 4 4 2 2 3 4 4 4 3 2 4 2 3 0 3 3
 2 2 3 2]


In [18]:
# Get a count of the unique values in each categories to make sure there are enough to support cross-validation
unique_elements, counts_elements = np.unique(y, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[ 0  2  3  4  5]
 [10 44 50 80  5]]


In [19]:
# Use Stratified Kfold split
from sklearn.model_selection import StratifiedKFold,train_test_split
k_fold = StratifiedKFold(n_splits=4,shuffle=True,random_state=2019)
# Split training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2018, stratify=y)
print(len(X_train))
# Create k folds with training data
for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train[train_index,:], X_train[cv_index,:]
    y_train_fold, y_cv_fold = y_train[train_index], y_train[cv_index]

141


In [20]:
# KNN Classifier
from sklearn.neighbors import KNeighborsClassifier

# instantiate the model
neigh = KNeighborsClassifier(n_neighbors=7)
for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train[train_index,:], X_train[cv_index,:]
    y_train_fold, y_cv_fold = y_train[train_index], y_train[cv_index]
    neigh.fit(X_train_fold, y_train_fold) 
    # model evaluation for training set
    y_train_pred = neigh.predict(X_train_fold)
    y_test_pred = neigh.predict(X_cv_fold)
    # record training set accuracy
    print(neigh.score(X_train_fold, y_train_pred))
    # record generalization accuracy
    print(neigh.score(X_cv_fold, y_cv_fold))
print(neigh.score(X_test, y_test))


1.0
0.7297297297297297
1.0
0.7428571428571429
1.0
0.6
1.0
0.8235294117647058
0.7291666666666666


In [21]:
# SVM Classifier
from sklearn.svm import SVC

svc = SVC(kernel='rbf', gamma='scale', C=1.0)
for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train[train_index,:], X_train[cv_index,:]
    y_train_fold, y_cv_fold = y_train[train_index], y_train[cv_index]
    svc.fit(X_train, y_train) 
    # model evaluation for training set
    y_train_pred = svc.predict(X_train_fold)
    y_test_pred = svc.predict(X_cv_fold)
    # record training set accuracy
    print(svc.score(X_train_fold, y_train_fold))
    # record generalization accuracy
    print(svc.score(X_cv_fold, y_cv_fold))
print(svc.score(X_test, y_test))


0.9711538461538461
0.9459459459459459
0.9622641509433962
0.9714285714285714
0.9716981132075472
0.9428571428571428
0.9532710280373832
1.0
0.875


In [22]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(max_depth=4, random_state=0, n_estimators=100) 
for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train[train_index,:], X_train[cv_index,:]
    y_train_fold, y_cv_fold = y_train[train_index], y_train[cv_index]
    rfc.fit(X_train_fold, y_train_fold) 
    # model evaluation for training set
    y_train_pred = rfc.predict(X_train)
    y_test_pred = rfc.predict(X_cv_fold)
    # record training set accuracy
    print(rfc.score(X_train_fold, y_train_fold))
    # record generalization accuracy
    print(rfc.score(X_cv_fold, y_cv_fold))
print(rfc.score(X_test, y_test))

0.9615384615384616
0.4864864864864865
0.9811320754716981
0.6285714285714286
1.0
0.6
0.9719626168224299
0.5588235294117647
0.6458333333333334


In [23]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression

lrc = LogisticRegression(multi_class='multinomial', solver='sag', max_iter=1000, C=1.0, random_state=0, penalty='l2') 
for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train[train_index,:], X_train[cv_index,:]
    y_train_fold, y_cv_fold = y_train[train_index], y_train[cv_index]
    lrc.fit(X_train_fold, y_train_fold) 
    # model evaluation for training set
    y_train_pred = lrc.predict(X_train)
    y_test_pred = lrc.predict(X_cv_fold)
    # record training set accuracy
    print(lrc.score(X_train_fold, y_train_fold))
    # record generalization accuracy
    print(lrc.score(X_cv_fold, y_cv_fold))
print(lrc.score(X_test, y_test))

1.0
0.8108108108108109
1.0
0.9142857142857143
1.0
0.7714285714285715
1.0
0.8529411764705882
0.8541666666666666


In [24]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [25]:
# Grid search for KNN
from sklearn.neighbors import KNeighborsClassifier

# Specify number of neighbors as hyperparameter
neighbors = range(1, 12)

# Create hyperparameter options
hyperparameters = dict(n_neighbors=neighbors)
gs_knn = GridSearchCV(KNeighborsClassifier(), hyperparameters, cv=4, scoring="accuracy", verbose=0)
# Fit search
best_model_knn = gs_knn.fit(X_train, y_train)
    
# View best hyperparameters
print('Best Number of Neighbors:', best_model_knn.best_estimator_.get_params()['n_neighbors'])

# Test the model and report Accuracy score
print('Accuracy Score:', accuracy_score(y_test, best_model_knn.predict(X_test)))


Best Number of Neighbors: 11
Accuracy Score: 0.7708333333333334




In [26]:
# Grid search for SVM
from sklearn.svm import SVC

# Create hyperparameter space
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
# Create hyperparameter options
hyperparameters = dict(C=C_range, gamma=gamma_range)
gs_svm = GridSearchCV(SVC(kernel='rbf'), hyperparameters, cv=4, scoring="accuracy", verbose=0)
# Fit search
best_model_svm = gs_svm.fit(X_train, y_train)

# View best hyperparameters
print('Best value of C', best_model_svm.best_estimator_.get_params()['C'])
print('Best value of gamma', best_model_svm.best_estimator_.get_params()['gamma'])

# Test the model and report Accuracy score
print('Accuracy Score:', accuracy_score(y_test, best_model_svm.predict(X_test)))

Best value of C 100.0
Best value of gamma 1e-07
Accuracy Score: 0.8541666666666666




In [27]:
# Grid search for RF
from sklearn.ensemble import RandomForestClassifier

# Create hyperparameter space
maxdepth=range(1, 12)
# Create hyperparameter options
hyperparameters = dict(max_depth=maxdepth)
gs_rfc = GridSearchCV(RandomForestClassifier(random_state=0, n_estimators=100), hyperparameters, cv=4, scoring="accuracy", verbose=0)
# Fit search
best_model_rfc = gs_rfc.fit(X_train, y_train)
    
# View best hyperparameters
print('Best value of Max depth', best_model_rfc.best_estimator_.get_params()['max_depth'])

# Test the model and report Accuracy score
print('Accuracy Score:', accuracy_score(y_test, best_model_rfc.predict(X_test)))

Best value of Max depth 7
Accuracy Score: 0.6458333333333334




In [28]:
# Grid search for LR
from sklearn.linear_model import LogisticRegression

# Create hyperparameter space
C = np.logspace(0, 4, 10)
# Create hyperparameter options
hyperparameters = dict(C=C)
gs_lrc = GridSearchCV(LogisticRegression(multi_class='multinomial', solver='sag', max_iter=5000, random_state=0, penalty='l2') , hyperparameters, cv=4, scoring="accuracy", verbose=0)
# Fit search
best_model_lrc = gs_lrc.fit(X_train, y_train)
    
# View best hyperparameters
print('Best value of C', best_model_lrc.best_estimator_.get_params()['C'])

# Test the model and report Accuracy score
print('Accuracy Score:', accuracy_score(y_test, best_model_lrc.predict(X_test)))

Best value of C 1.0
Accuracy Score: 0.875


In [29]:
# Best model is LR with C = 1
lrc = LogisticRegression(multi_class='multinomial', solver='sag', max_iter=1000, C=1.0, random_state=0, penalty='l2') 
lrc.fit(X_train, y_train) 

y_pred = lrc.predict(X_test)
from sklearn.metrics import classification_report
class_names = ["Class {}".format(i) for i in range(5)]
print(classification_report(y_test, y_pred, target_names=class_names))


              precision    recall  f1-score   support

     Class 0       1.00      1.00      1.00         3
     Class 1       0.80      0.73      0.76        11
     Class 2       0.92      0.92      0.92        13
     Class 3       0.90      0.90      0.90        20
     Class 4       0.50      1.00      0.67         1

    accuracy                           0.88        48
   macro avg       0.82      0.91      0.85        48
weighted avg       0.88      0.88      0.88        48



In [30]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 3,  0,  0,  0,  0],
       [ 0,  8,  1,  2,  0],
       [ 0,  1, 12,  0,  0],
       [ 0,  1,  0, 18,  1],
       [ 0,  0,  0,  0,  1]])

In [31]:
# Voting classifier
def votingClassifiers():
    # This example is taken from https://github.com/ageron/handson-ml/blob/master/07_ensemble_learning_and_random_forests.ipynb 


    from sklearn.tree import DecisionTreeClassifier
    from sklearn.ensemble import VotingClassifier
    from sklearn.linear_model import LogisticRegression
    from sklearn.svm import SVC
    from sklearn.metrics import accuracy_score
    
    log_clf = LogisticRegression(multi_class='multinomial', solver='sag', max_iter=1000, C=1.0, random_state=0, penalty='l2') 

    svm_clf = SVC(kernel='rbf', gamma=1e-05, C=100)
  
    svm2_clf = SVC(kernel='rbf', gamma='scale', C=1)
  
    voting_clf = VotingClassifier(
            estimators=[('lr', log_clf), ('svc', svm_clf), ('svc2', svm2_clf)],
            voting='hard')
     
    for clf in (log_clf, svm_clf, svm2_clf, voting_clf):
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)
        print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

In [32]:
votingClassifiers()

LogisticRegression 0.875
SVC 0.8541666666666666
SVC 0.875
VotingClassifier 0.875


In [33]:
def baggingClassifiers():
    # This example is taken from https://github.com/ageron/handson-ml/blob/master/07_ensemble_learning_and_random_forests.ipynb 


    from sklearn.ensemble import BaggingClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.linear_model import LogisticRegression

    bag_clf = BaggingClassifier(
        LogisticRegression(multi_class='multinomial', solver='sag', max_iter=2000, C=1.0, penalty='l2'), n_estimators=100,
            max_samples=100, bootstrap=True, n_jobs=-1, random_state=42)
    bag_clf.fit(X_train, y_train)
    y_pred = bag_clf.predict(X_test)
    print('Bagging LR',accuracy_score(y_test, y_pred))

In [34]:
baggingClassifiers()

Bagging LR 0.9166666666666666


In [35]:
def adaBoostClassifiers():
    # This example is taken from https://github.com/ageron/handson-ml/blob/master/07_ensemble_learning_and_random_forests.ipynb 

    from sklearn.ensemble import AdaBoostClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.linear_model import LogisticRegression

    ada_clf = AdaBoostClassifier(
        LogisticRegression(multi_class='multinomial', solver='sag', max_iter=3000, C=1.0, penalty='l2'), n_estimators=100,
             random_state=42)
    ada_clf.fit(X_train, y_train)
    y_pred = ada_clf.predict(X_test)
    print('AdaBoost LR', accuracy_score(y_test, y_pred))

In [36]:
adaBoostClassifiers()

AdaBoost LR 0.8958333333333334
