In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from k_fold import k_fold

# Load the processed mRNA z-score table (per the file name) into a DataFrame.
# The path is relative, so the notebook must be run from the directory that contains `data/`.
df = pd.read_csv('./data/processed_mrna_zscore.csv')

In [2]:
# Feature matrix: every column except the last two.
# NOTE(review): the second-to-last column is excluded from X and not used as the
# target either — presumably another outcome column that would leak the label; confirm.
X = df.iloc[:, :-2]
# Target: the last column of the table (its Series name prints as `OS_RANGE` further down).
y = df.iloc[:, -1]

In [3]:
# Encode the string class labels as integer class codes.
label_mapping = {'short': 0, 'medium': 1, 'long': 2}
y_numerical = y.map(label_mapping)

# Series.map yields NaN for every value that is not a key of the mapping. That
# also happens when this cell is re-run after `y` has already been replaced by
# its encoded form, so fail loudly instead of passing NaN targets downstream.
unmapped_mask = y_numerical.isna()
if unmapped_mask.any():
    raise ValueError(
        f"Labels not covered by label_mapping: {y[unmapped_mask].unique().tolist()}"
    )

# Now, 'y_numerical' contains the encoded numerical values
print(y_numerical)

0       2
1       1
2       2
3       2
4       2
       ..
1959    2
1960    2
1961    2
1962    2
1963    2
Name: OS_RANGE, Length: 1964, dtype: int64


In [4]:
# From here on `y` is the integer-encoded target; the string labels are no longer bound to `y`.
# Re-running the earlier `y.map(label_mapping)` cell after this one would map the
# encoded integers, which are not keys of `label_mapping` — restart and run all instead.
y = y_numerical

In [5]:
# Single seed shared by every model so the comparison is repeatable run to run.
randomState = 42

# The forest is seeded like the other models; unseeded, its bootstrap samples and
# per-split feature subsets differ on every run, so its scores would not be reproducible.
randomForest = RandomForestClassifier(n_estimators=1000, random_state=randomState, n_jobs=-1)
xgBoost = xgb.XGBClassifier(random_state=randomState, n_estimators=10, n_jobs=-1)
decisionTree = DecisionTreeClassifier(random_state=randomState)
svm = SVC(random_state=randomState)
# One hidden layer of 100 units, with up to 1000 training iterations.
neuralNetwork = MLPClassifier(hidden_layer_sizes=(100, ), random_state=randomState, max_iter=1000)

# Evaluation order of the models in the comparison loop.
classifiers = [randomForest, xgBoost, decisionTree, svm, neuralNetwork]

In [6]:
# Feature-set name -> text file listing one feature (column) name per line.
featureListPaths = {
    'genetic': './genetic_feats.txt',
    'top15': './top_features_rf_15.txt',
    'top100': './top_features_rf_100.txt',
    'top500': './top_features_rf_500.txt',
}

# Read the lists in declaration order; the dict keeps that order for the sweep below.
featuresDic = {}
for featureSetName, featureListPath in featureListPaths.items():
    with open(featureListPath, 'r') as file:
        featuresDic[featureSetName] = file.read().splitlines()

# Keep the individual lists available under their own names as well.
genetic = featuresDic['genetic']
top15 = featuresDic['top15']
top100 = featuresDic['top100']
top500 = featuresDic['top500']

In [7]:
# Best-so-far records, each laid out as
# [accuracy, f1, feature-set name, classifier class name].
maxF1Score = [0, 0, "", ""]
maxAcc = [0, 0, "", ""]

# Evaluate every classifier on every feature subset via k_fold and remember
# the run with the highest F1 and the run with the highest accuracy.
for key, features in featuresDic.items():
    print(f"\nFeature: {key}")
    for classifier in classifiers:
        acc, f1, _, _ = k_fold(classifier=classifier, X=X[features], y=y, verbose=False)
        currentRun = [acc, f1, key, classifier.__class__.__name__]
        # Strict comparisons: on a tie the earlier run is kept.
        if f1 > maxF1Score[1]:
            maxF1Score[:] = currentRun
        if acc > maxAcc[0]:
            maxAcc[:] = currentRun

print(f"Max f1 score: {maxF1Score[1]}, acc: {maxF1Score[0]},  feature: {maxF1Score[2]}, classifier: {maxF1Score[3]}")
print(f"Max Accuracy: {maxAcc[0]}, f1: {maxAcc[1]}, feature: {maxAcc[2]}, classifier: {maxAcc[3]}")


Feature: genetic
Classifier: RandomForestClassifier
Avg Weighted Accuracy: 0.8528488552781519
Avg F1 Score: 0.8113625357070271
Avg Precision: 0.7905538834666859
Avg Recall: 0.8528488552781519

Classifier: XGBClassifier
Avg Weighted Accuracy: 0.7031311509375323
Avg F1 Score: 0.7372668530237299
Avg Precision: 0.7815209333033544
Avg Recall: 0.7031311509375323

Classifier: DecisionTreeClassifier
Avg Weighted Accuracy: 0.6557987154252564
Avg F1 Score: 0.7069628373282872
Avg Precision: 0.779599156665113
Avg Recall: 0.6557987154252564

Classifier: SVC
Avg Weighted Accuracy: 0.8263752201388168
Avg F1 Score: 0.7941720915415422
Avg Precision: 0.7671536882652953
Avg Recall: 0.8263752201388168

Classifier: MLPClassifier
Avg Weighted Accuracy: 0.7963431057702268
Avg F1 Score: 0.7832198874482634
Avg Precision: 0.7722683743261977
Avg Recall: 0.7963431057702268


Feature: top15
Classifier: RandomForestClassifier
Avg Weighted Accuracy: 0.8085595151766289
Avg F1 Score: 0.8028167213671613
Avg Precision: