In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# load the TSNE model
sarcoma_tsne_df = pd.read_csv('../Data/sarcoma-gene-exp-FPKM-tsne2.csv')
sarcoma_tsne_df.shape

(206, 3)

In [2]:
# convert df to array
X_df = sarcoma_tsne_df.iloc[:, 0:2]
X = X_df.to_numpy()
print(X)

[[ 1.5101881   4.495065  ]
 [-0.3933316  -2.2554176 ]
 [ 0.52394176 -2.9038317 ]
 [ 2.4056387   0.77142346]
 [-2.5902762  -1.8893213 ]
 [-0.74589753  1.5878414 ]
 [ 0.8318154  -0.4103697 ]
 [ 0.34786895 -1.8117441 ]
 [ 2.3835487   4.318263  ]
 [-0.3787896  -4.135516  ]
 [ 3.2163546   5.402535  ]
 [ 0.5341209   5.0063424 ]
 [ 0.89254034  4.400904  ]
 [-1.656523    2.1570492 ]
 [-3.2748675   2.4115992 ]
 [ 2.8672967  -3.676628  ]
 [-1.3153446  -2.1818364 ]
 [ 1.0054144   4.390404  ]
 [ 1.8168291   1.1770047 ]
 [ 0.76507306  6.1616173 ]
 [-0.49324757  6.2095857 ]
 [-3.162166    0.24230962]
 [ 1.5632206  -4.763572  ]
 [ 1.8031838   6.2502847 ]
 [ 3.5639775   1.767928  ]
 [ 2.3249142  -0.15714957]
 [ 1.7055893  -0.13778462]
 [ 0.54223424 -3.003455  ]
 [-1.6626078   0.01909894]
 [ 3.7204676   1.27804   ]
 [ 1.3868074   3.1736996 ]
 [ 4.626502    2.9895117 ]
 [ 4.696998    2.8944385 ]
 [ 1.0404147  -1.9515623 ]
 [ 1.3718221  -2.971115  ]
 [ 0.6894906   0.0091331 ]
 [-0.1307336   3.0051074 ]
 

In [4]:
# read in labels
sarcoma_labels_df = pd.read_csv('../Data/sarcoma-gene-exp-FPKM-labels-only.csv')
sarcoma_labels_df.shape

(206, 1)

In [5]:
y_df = sarcoma_labels_df['label']
y = y_df.to_numpy()
print(y)

[4 2 2 1 5 3 2 4 4 3 4 4 4 3 0 3 1 4 2 4 4 0 2 4 3 3 3 2 0 4 4 5 3 3 2 4 4
 5 4 3 4 4 5 4 2 4 4 4 1 4 1 4 4 2 4 3 3 2 2 4 4 4 4 4 4 4 2 2 3 4 2 3 4 4
 3 4 3 2 4 3 2 3 4 4 3 4 3 4 3 4 4 1 3 4 4 4 0 4 3 4 3 3 3 5 2 0 3 3 1 1 4
 2 3 0 3 4 2 4 2 0 3 4 4 3 2 2 1 3 4 4 4 4 4 3 2 4 2 4 1 0 2 2 3 4 4 2 3 4
 4 3 1 3 4 3 4 2 2 1 3 2 2 0 4 2 1 4 2 1 4 3 3 4 2 4 2 4 3 4 2 1 3 4 2 4 4
 2 2 3 4 4 4 3 2 4 1 1 2 3 0 3 3 2 2 3 1 2]


In [6]:
# Get a count of the unique values in each categories to make sure there are enought to support cross-validation
unique_elements, counts_elements = np.unique(y, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[ 0  1  2  3  4  5]
 [10 17 44 50 80  5]]


In [7]:
# Use Stratified Kfold split
from sklearn.model_selection import StratifiedKFold,train_test_split
k_fold = StratifiedKFold(n_splits=4,shuffle=True,random_state=2019)
# Split training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=2018, stratify=y)
# Create k folds with trainng data
for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train[train_index,:], X_train[cv_index,:]
    y_train_fold, y_cv_fold = y_train[train_index], y_train[cv_index]

In [8]:
# KNN Classifier
from sklearn.neighbors import KNeighborsClassifier

# instantiate the model
neigh = KNeighborsClassifier(n_neighbors=7)
for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train[train_index,:], X_train[cv_index,:]
    y_train_fold, y_cv_fold = y_train[train_index], y_train[cv_index]
    #print(y_train_fold)
    neigh.fit(X_train_fold, y_train_fold) 
    # model evaluation for training set
    y_train_pred = neigh.predict(X_train_fold)
    y_test_pred = neigh.predict(X_cv_fold)
    # record training set accuracy
    print(neigh.score(X_train_fold, y_train_pred))
    # record generalization accuracy
    print(neigh.score(X_cv_fold, y_cv_fold))
print(neigh.score(X_test, y_test))

1.0
0.6585365853658537
1.0
0.6052631578947368
1.0
0.7105263157894737
1.0
0.7027027027027027
0.6923076923076923


In [9]:
# SVM Classifier
from sklearn.svm import SVC

svc = SVC(kernel='rbf', gamma='scale', C=1.0)
for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train[train_index,:], X_train[cv_index,:]
    y_train_fold, y_cv_fold = y_train[train_index], y_train[cv_index]
    svc.fit(X_train, y_train) 
    # model evaluation for training set
    y_train_pred = svc.predict(X_train_fold)
    y_test_pred = svc.predict(X_cv_fold)
    # record training set accuracy
    print(svc.score(X_train_fold, y_train_fold))
    # record generalization accuracy
    print(svc.score(X_cv_fold, y_cv_fold))
print(svc.score(X_test, y_test))

0.7079646017699115
0.6097560975609756
0.6724137931034483
0.7105263157894737
0.6724137931034483
0.7105263157894737
0.6752136752136753
0.7027027027027027
0.6538461538461539


In [10]:
# Random Forest Classifier
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(max_depth=4, random_state=0, n_estimators=100) 
for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train[train_index,:], X_train[cv_index,:]
    y_train_fold, y_cv_fold = y_train[train_index], y_train[cv_index]
    rfc.fit(X_train_fold, y_train_fold) 
    # model evaluation for training set
    y_train_pred = rfc.predict(X_train)
    y_test_pred = rfc.predict(X_cv_fold)
    # record training set accuracy
    print(rfc.score(X_train_fold, y_train_fold))
    # record generalization accuracy
    print(rfc.score(X_cv_fold, y_cv_fold))
print(rfc.score(X_test, y_test))

0.7876106194690266
0.6341463414634146
0.8448275862068966
0.5526315789473685
0.8103448275862069
0.7368421052631579
0.811965811965812
0.7027027027027027
0.6538461538461539


In [11]:
# Logistic Regression Classifier
from sklearn.linear_model import LogisticRegression

lrc = LogisticRegression(multi_class='multinomial', solver='sag', max_iter=1000, C=1.0, random_state=0, penalty='l2') 
for train_index, cv_index in k_fold.split(np.zeros(len(X_train)), y_train.ravel()):
    X_train_fold, X_cv_fold = X_train[train_index,:], X_train[cv_index,:]
    y_train_fold, y_cv_fold = y_train[train_index], y_train[cv_index]
    lrc.fit(X_train_fold, y_train_fold) 
    # model evaluation for training set
    y_train_pred = lrc.predict(X_train)
    y_test_pred = lrc.predict(X_cv_fold)
    # record training set accuracy
    print(lrc.score(X_train_fold, y_train_fold))
    # record generalization accuracy
    print(lrc.score(X_cv_fold, y_cv_fold))
print(lrc.score(X_test, y_test))

0.6460176991150443
0.4634146341463415
0.5862068965517241
0.5526315789473685
0.5517241379310345
0.5
0.5555555555555556
0.5675675675675675
0.5769230769230769


In [12]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [13]:
# Grid search for KNN
from sklearn.neighbors import KNeighborsClassifier

# Specify number of neighbors as hyperparameter
neighbors = range(1, 12)

# Create hyperparameter options
hyperparameters = dict(n_neighbors=neighbors)
gs_knn = GridSearchCV(KNeighborsClassifier(), hyperparameters, cv=4, scoring="accuracy", verbose=0)
# Fit search
best_model_knn = gs_knn.fit(X_train, y_train)
    
# View best hyperparameters
print('Best Number of Neighbors:', best_model_knn.best_estimator_.get_params()['n_neighbors'])

# Test the model and report Accuracy score
print('Accuracy Score:', accuracy_score(y_test, best_model_knn.predict(X_test)))


Best Number of Neighbors: 4
Accuracy Score: 0.6923076923076923




In [14]:
# Grid search for SVM
from sklearn.svm import SVC

# Create hyperparameter space
C_range = np.logspace(-2, 10, 13)
# print(C_range)
gamma_range = np.logspace(-9, 3, 13)
# Create hyperparameter options
hyperparameters = dict(C=C_range, gamma=gamma_range)
gs_svm = GridSearchCV(SVC(kernel='rbf'), hyperparameters, cv=4, scoring="accuracy", verbose=0)
# Fit search
best_model_svm = gs_svm.fit(X_train, y_train)

# View best hyperparameters
print('Best value of C', best_model_svm.best_estimator_.get_params()['C'])
print('Best value of gamma', best_model_svm.best_estimator_.get_params()['gamma'])

# Test the model and report Accuracy score
print('Accuracy Score:', accuracy_score(y_test, best_model_svm.predict(X_test)))

Best value of C 100.0
Best value of gamma 0.1
Accuracy Score: 0.7115384615384616




In [15]:
# Grid search for RF
from sklearn.ensemble import RandomForestClassifier

# Create hyperparameter space
maxdepth=range(1, 12)
# Create hyperparameter options
hyperparameters = dict(max_depth=maxdepth)
gs_rfc = GridSearchCV(RandomForestClassifier(random_state=0, n_estimators=100), hyperparameters, cv=4, scoring="accuracy", verbose=0)
# Fit search
best_model_rfc = gs_rfc.fit(X_train, y_train)
    
# View best hyperparameters
print('Best value of Max depth', best_model_rfc.best_estimator_.get_params()['max_depth'])

# Test the model and report Accuracy score
print('Accuracy Score:', accuracy_score(y_test, best_model_rfc.predict(X_test)))

Best value of Max depth 7
Accuracy Score: 0.6730769230769231




In [16]:
# Grid search for LR
from sklearn.linear_model import LogisticRegression

# Create hyperparameter space
C = np.logspace(0, 4, 10)
# print(C)
# Create hyperparameter options
hyperparameters = dict(C=C)
gs_lrc = GridSearchCV(LogisticRegression(multi_class='multinomial', solver='sag', max_iter=5000, random_state=0, penalty='l2') , hyperparameters, cv=4, scoring="accuracy", verbose=0)
# Fit search
best_model_lrc = gs_lrc.fit(X_train, y_train)
    
# View best hyperparameters
print('Best value of C', best_model_lrc.best_estimator_.get_params()['C'])

# Test the model and report Accuracy score
print('Accuracy Score:', accuracy_score(y_test, best_model_lrc.predict(X_test)))

Best value of C 1.0
Accuracy Score: 0.5961538461538461


