In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# load the PCA model
sarcoma_df = pd.read_csv('Data/sarcoma-gene-exp-FPKM-zscore-no-label-nomfs.csv')
sarcoma_df.shape

(189, 20605)

In [2]:
# convert df to array
sarcoma_data = sarcoma_df.to_numpy()
print(sarcoma_data)

[[-0.51154087  0.09480786 -0.35176093 ...  0.51624723  0.02346097
   2.48359169]
 [-0.19129576  0.15347405  0.20261954 ... -0.41028013  2.86123234
   0.16777757]
 [ 0.26472818  2.327348   -0.56049386 ... -0.21651268  0.0192731
   0.76332633]
 ...
 [-0.2652622  -0.48026337 -1.16285933 ... -0.36779173  0.24949394
  -0.36985907]
 [ 0.24931652 -0.03363532 -0.97441342 ... -0.50605902 -1.23871739
   0.28153212]
 [-0.07085013 -0.29522455 -0.59015045 ... -0.50644652 -0.01137879
  -0.39862195]]


In [3]:
# Run PCA based on preserving variance
from sklearn.decomposition import PCA

pca = PCA(n_components=0.95)
pca.fit(sarcoma_data)
print("PCA fit is complete")

PCA fit is complete


In [4]:
X = pca.transform(sarcoma_data)
print("PCA transformation is complete")

PCA transformation is complete


In [5]:
print("reduced shape: {}".format(str(X.shape)))

reduced shape: (189, 150)


In [6]:
# read in labels
sarcoma_labels_df = pd.read_csv('Data/sarcoma-gene-exp-FPKM-labels-nomfs.csv')
sarcoma_labels_df.shape

(189, 1)

In [7]:
# Convert label df to np array
y_df = sarcoma_labels_df['label']
y = y_df.to_numpy()
print(y)

[4 2 2 5 3 2 4 4 3 4 4 4 3 0 3 4 2 4 4 0 2 4 3 3 3 2 0 4 4 5 3 3 2 4 4 5 4
 3 4 4 5 4 2 4 4 4 4 4 4 2 4 3 3 2 2 4 4 4 4 4 4 4 2 2 3 4 2 3 4 4 3 4 3 2
 4 3 2 3 4 4 3 4 3 4 3 4 4 3 4 4 4 0 4 3 4 3 3 3 5 2 0 3 3 4 2 3 0 3 4 2 4
 2 0 3 4 4 3 2 2 3 4 4 4 4 4 3 2 4 2 4 0 2 2 3 4 4 2 3 4 4 3 3 4 3 4 2 2 3
 2 2 0 4 2 4 2 4 3 3 4 2 4 2 4 3 4 2 3 4 2 4 4 2 2 3 4 4 4 3 2 4 2 3 0 3 3
 2 2 3 2]


In [8]:
# Get a count of the unique values in each categories to make sure there are enough to support cross-validation
unique_elements, counts_elements = np.unique(y, return_counts=True)
print("Frequency of unique values of the said array:")
print(np.asarray((unique_elements, counts_elements)))

Frequency of unique values of the said array:
[[ 0  2  3  4  5]
 [10 44 50 80  5]]


In [10]:
# split into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2,
    stratify=y, random_state=0
)

In [11]:
# split into development and validation sets
X_dev, X_val, y_dev, y_val = train_test_split(
    X_train, y_train, test_size=0.25,
    stratify=y_train, random_state=0
)

In [12]:
from collections import Counter
from imblearn.over_sampling import SMOTE # doctest: +NORMALIZE_WHITESPACE
from imblearn.over_sampling import BorderlineSMOTE
from imblearn.combine import SMOTETomek
from imblearn.combine import SMOTEENN

#smote_enn = SMOTEENN(random_state=0)
#smote_tomek = SMOTETomek(random_state=0)
# sm = BorderlineSMOTE(sampling_strategy='not majority', k_neighbors=4, m_neighbors=4, random_state=42)
sm = SMOTE(sampling_strategy='not majority', k_neighbors=2, random_state=42)
#sm = SMOTE(sampling_strategy={0: 40, 1: 40, 2: 40, 3: 40, 4: 40}, random_state=42)
#sm = SMOTE(sampling_strategy=.3, random_state=42)

X_res, y_res = sm.fit_resample(X_dev, y_dev)
print('Resampled dataset shape %s' % Counter(y_res))

Resampled dataset shape Counter({3: 48, 4: 48, 2: 48, 0: 48, 5: 48})


Using TensorFlow backend.


In [13]:
from sklearn.metrics import accuracy_score

In [22]:
def baggingClassifiers():
    # This example is taken from https://github.com/ageron/handson-ml/blob/master/07_ensemble_learning_and_random_forests.ipynb 

    from sklearn.ensemble import BaggingClassifier
    from sklearn.metrics import accuracy_score
    from sklearn.linear_model import LogisticRegression

    bag_clf = BaggingClassifier(
        LogisticRegression(multi_class='multinomial', solver='sag', max_iter=2000, C=1.0, penalty='l2'), n_estimators=100,
            max_samples=100, bootstrap=True, n_jobs=-1, random_state=42)
    bag_clf.fit(X_res, y_res)
    y_pred = bag_clf.predict(X_test)
    print('Bagging LR',accuracy_score(y_test, y_pred))
    return y_pred

In [23]:
y_pred = baggingClassifiers()

Bagging LR 0.9736842105263158


In [25]:
from sklearn.metrics import classification_report
class_names = ["Class {}".format(i) for i in range(5)]
print(classification_report(y_test, y_pred, target_names=class_names))


              precision    recall  f1-score   support

     Class 0       1.00      1.00      1.00         2
     Class 1       1.00      1.00      1.00         9
     Class 2       1.00      1.00      1.00        10
     Class 3       1.00      0.94      0.97        16
     Class 4       0.50      1.00      0.67         1

    accuracy                           0.97        38
   macro avg       0.90      0.99      0.93        38
weighted avg       0.99      0.97      0.98        38



In [26]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

array([[ 2,  0,  0,  0,  0],
       [ 0,  9,  0,  0,  0],
       [ 0,  0, 10,  0,  0],
       [ 0,  0,  0, 15,  1],
       [ 0,  0,  0,  0,  1]])