# QBIO 490: Final Project
### Classifier Training and Evaluation for TCGA Ovarian Subtypes
Daven Pan, Aman Sharma, Joseph Kim, and Justin Wan

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import label_binarize

# Load the two intermediary tables produced by the upstream R analysis; the
# first CSV column of each file becomes the index.
consensus_labels = pd.read_csv("intermediary_data/consensus_labels.csv", index_col = 0)

# The counts table is flipped right after loading, so its original columns
# become the rows handed to the classifiers.
counts = pd.read_csv("intermediary_data/counts.csv", index_col = 0).T

# Turn the consensus tumor subtypes into numeric codes stored in "encoded".
# NOTE(review): the encoder is fit on the whole frame, so this presumably
# relies on consensus_labels holding a single subtype column -- confirm.
encoder = OrdinalEncoder()
consensus_labels["encoded"] = encoder.fit_transform(consensus_labels)

# Candidate models, declared once as (display name, estimator) pairs so that a
# name can never drift out of step with its estimator and adding a model is a
# one-line change.
# NOTE(review): recent scikit-learn releases deprecate AdaBoostClassifier's
# `algorithm` argument -- confirm against the installed version.
_named_classifiers = [
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    ('Decision Tree', DecisionTreeClassifier()),
    ('Neural Net', MLPClassifier(max_iter=500)),
    ('Naive Bayes', GaussianNB()),
    ("AdaBoost", AdaBoostClassifier(algorithm="SAMME")),
    ("Random Forest", RandomForestClassifier()),
]

# Initialize classifiers (in evaluation order).
classifiers = [estimator for _, estimator in _named_classifiers]

# Specify classifier names, index-aligned with `classifiers`.
classifiers_names = [name for name, _ in _named_classifiers]

# Hold accuracy scores across iterations, keyed by position in `classifiers`.
# Derived from the list length instead of hand-written keys 0..5.
classifiers_acc = {i: [] for i in range(len(classifiers))}

# Hold F1 scores across iterations, same layout as `classifiers_acc`.
classifiers_f1 = {i: [] for i in range(len(classifiers))}

# Number of random train/test splits each classifier is scored on. Defined
# once so the loop and the report messages below cannot disagree.
N_TRIALS = 10

# Test each classifier: refit on a fresh 70/30 split per trial and record the
# accuracy and macro-averaged F1 score of its predictions on the held-out part.
# NOTE(review): the split pairs rows of `counts` with `consensus_labels` by
# position -- confirm both tables list the samples in the same order.
for i, classifier in enumerate(classifiers):
    for trial in range(N_TRIALS):
        # Seeding the split with the trial index makes the partitions
        # repeatable and gives every classifier the same N_TRIALS splits, so
        # score differences reflect the models rather than split luck.
        X_train, X_test, y_train, y_test = train_test_split(
            counts,
            consensus_labels["encoded"],
            train_size=0.70,
            random_state=trial,
        )
        classifier.fit(X_train, y_train)
        y_pred = classifier.predict(X_test)
        classifiers_acc[i].append(metrics.accuracy_score(y_test, y_pred))
        classifiers_f1[i].append(metrics.f1_score(y_test, y_pred, average="macro"))

# Print results: mean accuracy (as a percentage) per classifier.
print(f'\nAfter {N_TRIALS} simulations, the average accuracy for each classifier is as follows:')
for i in classifiers_acc:
    print(f'\t{classifiers_names[i]} : {round(np.mean(classifiers_acc[i]) * 100, ndigits = 2)}%')

# Mean macro F1 per classifier; iterate the F1 dict itself rather than
# borrowing the keys of the accuracy dict.
print(f'\nAfter {N_TRIALS} simulations, the average F1 score for each classifier is as follows:')
for i in classifiers_f1:
    print(f'\t{classifiers_names[i]} : {round(np.mean(classifiers_f1[i]), ndigits = 2)}')


After 10 simulations, the average accuracy for each classifier is as follows:
	K-Nearest Neighbors : 82.55%
	Decision Tree : 79.62%
	Neural Net : 90.47%
	Naive Bayes : 82.36%
	AdaBoost : 79.72%
	Random Forest : 88.49%

After 10 simulations, the average F1 score for each classifier is as follows:
	K-Nearest Neighbors : 0.83
	Decision Tree : 0.8
	Neural Net : 0.91
	Naive Bayes : 0.83
	AdaBoost : 0.79
	Random Forest : 0.88
