In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# load the TSNE model
sarcoma_chi2_df = pd.read_csv('../Data/sarcoma-gene-exp-FPKM-chi2-features.csv')
sarcoma_chi2_df.shape

(206, 200)

In [2]:
# convert df to array
X = sarcoma_chi2_df.to_numpy()

In [3]:
# standardize the data
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
print(X_scaled)

[[-0.04049528 -0.14765765 -0.08006068 ... -0.42762714  0.65763444
   0.77814244]
 [-0.44593423 -0.15499682 -0.54609253 ...  0.47296793 -0.57749408
  -0.49602358]
 [ 2.69945632 -0.07799443 -0.58733672 ...  0.84589856  0.50133046
   0.74859567]
 ...
 [ 0.72297918 -0.16884767 -0.3663509  ...  0.32596292 -0.18169448
   0.89638345]
 [-0.93592839 -0.16757154 -0.38408007 ...  0.26720675 -0.46918761
  -0.13611344]
 [-0.03612279 -0.16653641 -0.50626782 ...  4.13862186  0.02642636
  -0.1552729 ]]


In [2]:
# read in labels
sarcoma_labels_df = pd.read_csv('../Data/sarcoma-gene-exp-FPKM-labels-only.csv')
sarcoma_labels_df.shape

(206, 1)

In [5]:
# Convert label df to np array
y_df = sarcoma_labels_df['label']
y = y_df.to_numpy()
print(y)

[4 2 2 1 5 3 2 4 4 3 4 4 4 3 0 3 1 4 2 4 4 0 2 4 3 3 3 2 0 4 4 5 3 3 2 4 4
 5 4 3 4 4 5 4 2 4 4 4 1 4 1 4 4 2 4 3 3 2 2 4 4 4 4 4 4 4 2 2 3 4 2 3 4 4
 3 4 3 2 4 3 2 3 4 4 3 4 3 4 3 4 4 1 3 4 4 4 0 4 3 4 3 3 3 5 2 0 3 3 1 1 4
 2 3 0 3 4 2 4 2 0 3 4 4 3 2 2 1 3 4 4 4 4 4 3 2 4 2 4 1 0 2 2 3 4 4 2 3 4
 4 3 1 3 4 3 4 2 2 1 3 2 2 0 4 2 1 4 2 1 4 3 3 4 2 4 2 4 3 4 2 1 3 4 2 4 4
 2 2 3 4 4 4 3 2 4 1 1 2 3 0 3 3 2 2 3 1 2]


In [6]:
# Get a count of the unique values in each categories to make sure there are enough to support cross-validation
unique_elements, counts_elements = np.unique(y, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[ 0  1  2  3  4  5]
 [10 17 44 50 80  5]]


In [7]:
# Use Stratified Kfold split
from sklearn.model_selection import StratifiedKFold,train_test_split
k_fold = StratifiedKFold(n_splits=4,shuffle=True,random_state=2019)
# Split training and test data
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.25, random_state=2018, stratify=y)
# Create k folds with trainng data
for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train[train_index,:], X_train[cv_index,:]
    y_train_fold, y_cv_fold = y_train[train_index], y_train[cv_index]

In [8]:
# KNN Classifier
from sklearn.neighbors import KNeighborsClassifier

# instantiate the model
neigh = KNeighborsClassifier(n_neighbors=7)
for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train[train_index,:], X_train[cv_index,:]
    y_train_fold, y_cv_fold = y_train[train_index], y_train[cv_index]
    print(y_train_fold)
    neigh.fit(X_train_fold, y_train_fold) 
    # model evaluation for training set
    y_train_pred = neigh.predict(X_train_fold)
    y_test_pred = neigh.predict(X_cv_fold)
    # record training set accuracy
    print(neigh.score(X_train_fold, y_train_pred))
    # record generalization accuracy
    print(neigh.score(X_cv_fold, y_cv_fold))
print(neigh.score(X_test, y_test))

[4 4 4 4 1 4 4 4 4 4 3 2 4 4 5 2 3 2 2 3 0 4 2 4 2 3 4 4 4 4 3 4 2 4 2 4 2
 4 1 5 0 4 2 3 0 4 4 4 4 3 3 1 2 3 4 2 3 2 3 2 3 4 4 1 3 4 3 3 4 2 3 4 4 0
 2 1 4 3 2 4 4 3 3 4 2 0 3 4 4 5 2 3 4 3 4 3 1 4 2 1 1 2 3 1 3 2 3 4 4 2 3
 4 2]
1.0
0.7317073170731707
[3 4 4 4 1 4 4 4 4 4 1 2 4 2 3 3 0 3 0 4 3 4 2 2 4 4 2 4 4 4 3 4 4 3 2 4 2
 4 1 4 2 5 3 0 2 3 4 2 4 4 4 3 3 4 3 2 3 2 5 3 2 2 4 3 2 4 3 3 4 4 4 3 1 4
 4 4 0 2 3 4 4 2 1 4 3 2 3 3 2 0 4 3 4 4 5 2 3 4 3 1 1 4 2 1 1 1 2 3 2 3 4
 4 2 4 2 3]
1.0
0.6842105263157895
[4 3 4 1 4 4 4 1 3 4 4 5 2 2 3 0 3 0 4 3 2 2 4 2 2 4 3 4 3 4 4 4 4 3 4 2 2
 4 4 1 4 4 1 3 4 3 0 4 2 4 4 3 3 4 3 1 3 4 2 5 2 2 4 3 2 4 2 3 3 4 1 3 4 4
 1 3 4 4 0 2 4 4 0 4 4 2 3 4 4 4 2 4 3 5 2 3 3 4 3 1 4 1 2 3 1 2 2 3 4 2 2
 3 4 2 3 2]
1.0
0.7894736842105263
[4 3 4 4 4 4 1 3 2 4 5 2 3 2 2 3 0 4 3 4 2 4 4 2 2 4 3 4 3 4 4 4 4 4 2 2 2
 4 1 2 4 1 5 3 0 4 2 0 2 4 3 3 4 3 3 1 2 4 5 3 2 4 2 4 2 3 4 4 1 3 4 3 1 3
 4 4 4 0 3 4 4 0 4 4 1 4 2 4 4 3 3 4 0 4 4 4 3 3 4 3 4 3 1 1 2 1 2 3 2 3 4
 

In [9]:
# SVM Classifier
from sklearn.svm import SVC

svc = SVC(kernel='rbf', gamma='scale', C=1.0)
for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train[train_index,:], X_train[cv_index,:]
    y_train_fold, y_cv_fold = y_train[train_index], y_train[cv_index]
    svc.fit(X_train, y_train) 
    # model evaluation for training set
    y_train_pred = svc.predict(X_train_fold)
    y_test_pred = svc.predict(X_cv_fold)
    # record training set accuracy
    print(svc.score(X_train_fold, y_train_fold))
    # record generalization accuracy
    print(svc.score(X_cv_fold, y_cv_fold))
print(svc.score(X_test, y_test))

0.9203539823008849
0.9024390243902439
0.9224137931034483
0.8947368421052632
0.9137931034482759
0.9210526315789473
0.905982905982906
0.9459459459459459
0.7884615384615384


In [10]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(max_depth=4, random_state=0, n_estimators=100) 
for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train[train_index,:], X_train[cv_index,:]
    y_train_fold, y_cv_fold = y_train[train_index], y_train[cv_index]
    rfc.fit(X_train_fold, y_train_fold) 
    # model evaluation for training set
    y_train_pred = rfc.predict(X_train)
    y_test_pred = rfc.predict(X_cv_fold)
    # record training set accuracy
    print(rfc.score(X_train_fold, y_train_fold))
    # record generalization accuracy
    print(rfc.score(X_cv_fold, y_cv_fold))
print(rfc.score(X_test, y_test))

0.9646017699115044
0.7560975609756098
0.9741379310344828
0.8157894736842105
0.9482758620689655
0.8157894736842105
0.9145299145299145
0.8108108108108109
0.7884615384615384


In [11]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression

lrc = LogisticRegression(multi_class='multinomial', solver='sag', max_iter=1000, C=1.0, random_state=0, penalty='l2') 
for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train[train_index,:], X_train[cv_index,:]
    y_train_fold, y_cv_fold = y_train[train_index], y_train[cv_index]
    lrc.fit(X_train_fold, y_train_fold) 
    # model evaluation for training set
    y_train_pred = lrc.predict(X_train)
    y_test_pred = lrc.predict(X_cv_fold)
    # record training set accuracy
    print(lrc.score(X_train_fold, y_train_fold))
    # record generalization accuracy
    print(lrc.score(X_cv_fold, y_cv_fold))
print(lrc.score(X_test, y_test))

1.0
0.7804878048780488
1.0
0.7631578947368421




1.0
0.868421052631579
1.0
0.7837837837837838
0.7692307692307693


In [12]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [13]:
# Grid search for KNN
from sklearn.neighbors import KNeighborsClassifier

# Specify number of neighbors as hyperparameter
neighbors = range(1, 12)

# Create hyperparameter options
hyperparameters = dict(n_neighbors=neighbors)
gs_knn = GridSearchCV(KNeighborsClassifier(), hyperparameters, cv=4, scoring="accuracy", verbose=0)
# Fit search
best_model_knn = gs_knn.fit(X_train, y_train)
    
# View best hyperparameters
print('Best Number of Neighbors:', best_model_knn.best_estimator_.get_params()['n_neighbors'])

# Test the model and report Accuracy score
print('Accuracy Score:', accuracy_score(y_test, best_model_knn.predict(X_test)))


Best Number of Neighbors: 5
Accuracy Score: 0.7307692307692307




In [14]:
# Grid search for SVM
from sklearn.svm import SVC

# Create hyperparameter space
C_range = np.logspace(-2, 10, 13)
gamma_range = np.logspace(-9, 3, 13)
kernel = ['linear','rbf']
# Create hyperparameter options
hyperparameters = dict(kernel=kernel, C=C_range, gamma=gamma_range)

gs_svm = GridSearchCV(SVC(), hyperparameters, cv=4, scoring="accuracy", verbose=0)
# Fit search
best_model_svm = gs_svm.fit(X_train, y_train)

# View best hyperparameters
print('Best value of C', best_model_svm.best_estimator_.get_params()['C'])
print('Best value of gamma', best_model_svm.best_estimator_.get_params()['gamma'])
print('Best value of kernel', best_model_svm.best_estimator_.get_params()['kernel'])

# Test the model and report Accuracy score
print('Accuracy Score:', accuracy_score(y_test, best_model_svm.predict(X_test)))

Best value of C 1.0
Best value of gamma 0.001
Best value of kernel rbf
Accuracy Score: 0.7884615384615384




In [15]:
# Grid search for RF
from sklearn.ensemble import RandomForestClassifier

# Create hyperparameter space
maxdepth=range(1, 12)
# Create hyperparameter options
hyperparameters = dict(max_depth=maxdepth)
gs_rfc = GridSearchCV(RandomForestClassifier(random_state=0, n_estimators=100), hyperparameters, cv=4, scoring="accuracy", verbose=0)
# Fit search
best_model_rfc = gs_rfc.fit(X_train, y_train)
    
# View best hyperparameters
print('Best value of Max depth', best_model_rfc.best_estimator_.get_params()['max_depth'])

# Test the model and report Accuracy score
print('Accuracy Score:', accuracy_score(y_test, best_model_rfc.predict(X_test)))

Best value of Max depth 4
Accuracy Score: 0.7692307692307693




In [16]:
# Grid search for LR
from sklearn.linear_model import LogisticRegression

# Create hyperparameter space
C = np.logspace(0, 4, 10)
# print(C)
# Create hyperparameter options
hyperparameters = dict(C=C)
gs_lrc = GridSearchCV(LogisticRegression(multi_class='multinomial', solver='sag', max_iter=5000, random_state=0, penalty='l2') , hyperparameters, cv=4, scoring="accuracy", verbose=0)
# Fit search
best_model_lrc = gs_lrc.fit(X_train, y_train)
    
# View best hyperparameters
print('Best value of C', best_model_lrc.best_estimator_.get_params()['C'])

# Test the model and report Accuracy score
print('Accuracy Score:', accuracy_score(y_test, best_model_lrc.predict(X_test)))



Best value of C 2.7825594022071245
Accuracy Score: 0.7884615384615384


In [17]:
# Best model - SVM with kernel = rbf, C = 1, and gamma = .001
from sklearn.svm import SVC

svc = SVC(kernel='rbf', gamma=0.001, C=1.0)
for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    svc.fit(X_train, y_train) 
    # model evaluation for training set
    y_train_pred = svc.predict(X_train_fold)
    y_test_pred = svc.predict(X_cv_fold)
    # record training set accuracy
    print(svc.score(X_train_fold, y_train_fold))
    # record generalization accuracy
    print(svc.score(X_cv_fold, y_cv_fold))
print(svc.score(X_test, y_test))

0.8376068376068376
0.8378378378378378
0.8376068376068376
0.8378378378378378
0.8376068376068376
0.8378378378378378
0.8376068376068376
0.8378378378378378
0.7884615384615384


In [18]:
y_pred = svc.predict(X_test)
from sklearn.metrics import classification_report
class_names = ["Class {}".format(i) for i in range(6)]
print(classification_report(y_test, y_pred, target_names=class_names))


              precision    recall  f1-score   support

     Class 0       1.00      0.67      0.80         3
     Class 1       0.00      0.00      0.00         4
     Class 2       0.55      1.00      0.71        11
     Class 3       0.85      0.85      0.85        13
     Class 4       1.00      0.85      0.92        20
     Class 5       0.00      0.00      0.00         1

    accuracy                           0.79        52
   macro avg       0.57      0.56      0.55        52
weighted avg       0.77      0.79      0.76        52



  'precision', 'predicted', average, warn_for)


In [19]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 2,  0,  0,  1,  0,  0],
       [ 0,  0,  4,  0,  0,  0],
       [ 0,  0, 11,  0,  0,  0],
       [ 0,  0,  2, 11,  0,  0],
       [ 0,  0,  3,  0, 17,  0],
       [ 0,  0,  0,  1,  0,  0]])