In [1]:
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from k_fold import k_fold

# Load the processed mRNA z-score table (per the file name) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='./data/processed_mrna_zscore.csv')

In [2]:
# Positional split of the loaded frame:
#   X -> every column except the trailing two (feature matrix)
#   y -> the final column only (target labels)
# NOTE(review): the second-to-last column is dropped from X and not used as
# the target; presumably it is another outcome column -- confirm.
X, y = df.iloc[:, :-2], df.iloc[:, -1]

In [3]:
# Ordinal encoding for the three string class labels.
label_mapping = {'short': 0, 'medium': 1, 'long': 2}

# Encode straight from the raw label column rather than from `y`: a later
# cell rebinds `y` to the encoded series, so mapping `y` again when this cell
# is re-run would turn every value into NaN.
raw_labels = df.iloc[:, -1]
y_numerical = raw_labels.map(label_mapping)

# Series.map yields NaN for any value absent from the mapping; fail loudly
# instead of passing NaN targets on to the classifiers.
unmapped_labels = raw_labels[y_numerical.isna()].unique()
if len(unmapped_labels) > 0:
    raise ValueError(f"Labels missing from label_mapping: {list(unmapped_labels)}")

# 'y_numerical' now contains the encoded numerical values
print(y_numerical)

0       2
1       1
2       2
3       2
4       2
       ..
1975    2
1976    2
1977    2
1978    2
1979    2
Name: OS_RANGE, Length: 1980, dtype: int64


In [4]:
# Rebind `y` to the integer-encoded labels so the cells below train on them.
# NOTE: after this cell `y` no longer holds the original string labels, so any
# earlier cell that maps `y` through `label_mapping` is not safe to re-run
# without first re-creating `y` from `df`.
y = y_numerical

In [5]:
# Shared seed passed to every stochastic estimator so results are reproducible
# across kernel restarts.
randomState = 42

# The random forest previously had no random_state, making it the only model
# whose scores changed from run to run; it now uses the shared seed as well.
randomForest = RandomForestClassifier(n_estimators=1000, random_state=randomState, n_jobs=-1)
xgBoost = xgb.XGBClassifier(random_state=randomState, n_estimators=10, n_jobs=-1)
decisionTree = DecisionTreeClassifier(random_state=randomState)
svm = SVC(random_state=randomState)
neuralNetwork = MLPClassifier(hidden_layer_sizes=(100, ), random_state=randomState, max_iter=1000)

# Evaluation order used by the comparison loops below.
classifiers = [randomForest, xgBoost, decisionTree, svm, neuralNetwork]

In [6]:
# Feature-set name -> text file holding one feature (column) name per line.
# NOTE(review): 'top15' is read from './top_features_rf_50.txt'; either the
# key or the file name looks wrong -- confirm which one is intended.
rf_feature_files = {
    'top15': './top_features_rf_50.txt',
    'top100': './top_features_rf_100.txt',
    'top500': './top_features_rf_500.txt',
}

# Read every list, keeping the insertion order top15 -> top100 -> top500.
featuresDic = {}
for set_name, set_path in rf_feature_files.items():
    with open(set_path, 'r') as file:
        featuresDic[set_name] = file.read().splitlines()

# Keep the individual lists available under their original names.
top15, top100, top500 = featuresDic.values()

In [7]:
# Best runs seen so far, one record for the highest F1 and one for the highest
# accuracy. Record layout: [accuracy, f1, feature-set name, classifier name].
maxF1Score = [0, 0, "", ""]
maxAcc = [0, 0, "", ""]

# Score every classifier on every feature set via the project's k_fold helper.
for key, features in featuresDic.items():
    print(f"\nFeature: {key}")
    for classifier in classifiers:
        acc, f1, _, _ = k_fold(classifier=classifier, X=X[features], y=y, verbose=False)
        record = [acc, f1, key, type(classifier).__name__]
        # Strict '>' keeps the first run that reached a given score on ties.
        if f1 > maxF1Score[1]:
            maxF1Score[:] = record
        if acc > maxAcc[0]:
            maxAcc[:] = record

print(f"Max f1 score: {maxF1Score[1]}, acc: {maxF1Score[0]},  feature: {maxF1Score[2]}, classifier: {maxF1Score[3]}")
print(f"Max Accuracy: {maxAcc[0]}, f1: {maxAcc[1]}, feature: {maxAcc[2]}, classifier: {maxAcc[3]}")


Feature: top15
Classifier: RandomForestClassifier
Avg Weighted Accuracy: 0.8368686868686869
Avg F1 Score: 0.8109922811525566
Avg Precision: 0.7916931747026946
Avg Recall: 0.8368686868686869

Classifier: XGBClassifier
Avg Weighted Accuracy: 0.7398989898989898
Avg F1 Score: 0.7660785930983437
Avg Precision: 0.8013135475574208
Avg Recall: 0.7398989898989898

Classifier: DecisionTreeClassifier
Avg Weighted Accuracy: 0.6515151515151516
Avg F1 Score: 0.7022634474953423
Avg Precision: 0.7708254848529765
Avg Recall: 0.6515151515151516

Classifier: SVC
Avg Weighted Accuracy: 0.808080808080808
Avg F1 Score: 0.7945374357437812
Avg Precision: 0.7823749108414894
Avg Recall: 0.808080808080808

Classifier: MLPClassifier
Avg Weighted Accuracy: 0.8015151515151515
Avg F1 Score: 0.7950563377775619
Avg Precision: 0.78977161593549
Avg Recall: 0.8015151515151515


Feature: top100
Classifier: RandomForestClassifier
Avg Weighted Accuracy: 0.8429292929292929
Avg F1 Score: 0.8113838836378312
Avg Precision: 0.7

In [9]:
# Feature-set name -> text file holding one feature (column) name per line.
gen_feature_files = {
    'genetic_dt': './top_features_gen_dt.txt',
    'genetic_rf': './top_features_gen_rf.txt',
    'genetic_nn': './top_features_gen_nn.txt',
    'genetic_svm': './top_features_gen_svm.txt',
    'genetic_xgb': './top_features_gen_xgb.txt',
}

# Read every list, keeping the insertion order dt -> rf -> nn -> svm -> xgb.
features_gen = {}
for set_name, set_path in gen_feature_files.items():
    with open(set_path, 'r') as file:
        features_gen[set_name] = file.read().splitlines()

# Keep the individual lists available under their original names.
genetic_dt, genetic_rf, genetic_nn, genetic_svm, genetic_xgb = features_gen.values()

In [10]:
# Best runs seen so far, one record for the highest F1 and one for the highest
# accuracy. Record layout: [accuracy, f1, feature-set name, classifier name].
maxF1Score = [0, 0, "", ""]
maxAcc = [0, 0, "", ""]

for key, features in features_gen.items():
    print(f"\nFeature: {key}")
    # Drop duplicate feature names once per feature set (not once per
    # classifier). dict.fromkeys keeps first-seen order, whereas list(set(...))
    # yields an order that depends on string hash randomization and therefore
    # changes between kernel sessions, which changes the column order handed
    # to the seeded models and makes their scores non-reproducible.
    features = list(dict.fromkeys(features))
    for classifier in classifiers:
        acc, f1, _, _ = k_fold(classifier=classifier, X=X[features], y=y, verbose=False)
        # Strict '>' keeps the first run that reached a given score on ties.
        if f1 > maxF1Score[1]:
            maxF1Score[0] = acc
            maxF1Score[1] = f1
            maxF1Score[2] = key
            maxF1Score[3] = classifier.__class__.__name__
        if acc > maxAcc[0]:
            maxAcc[0] = acc
            maxAcc[1] = f1
            maxAcc[2] = key
            maxAcc[3] = classifier.__class__.__name__

print(f"Max f1 score: {maxF1Score[1]}, acc: {maxF1Score[0]},  feature: {maxF1Score[2]}, classifier: {maxF1Score[3]}")
print(f"Max Accuracy: {maxAcc[0]}, f1: {maxAcc[1]}, feature: {maxAcc[2]}, classifier: {maxAcc[3]}")


Feature: genetic_dt
Classifier: RandomForestClassifier
Avg Weighted Accuracy: 0.8414141414141415
Avg F1 Score: 0.8125396882438073
Avg Precision: 0.7914850991456395
Avg Recall: 0.8414141414141415

Classifier: XGBClassifier
Avg Weighted Accuracy: 0.743939393939394
Avg F1 Score: 0.767891428699887
Avg Precision: 0.7980416966357123
Avg Recall: 0.743939393939394

Classifier: DecisionTreeClassifier
Avg Weighted Accuracy: 0.6737373737373737
Avg F1 Score: 0.7206999698259813
Avg Precision: 0.786368901931722
Avg Recall: 0.6737373737373737

Classifier: SVC
Avg Weighted Accuracy: 0.8287878787878789
Avg F1 Score: 0.8004692194382947
Avg Precision: 0.7788412372359994
Avg Recall: 0.8287878787878789

Classifier: MLPClassifier
Avg Weighted Accuracy: 0.8111111111111111
Avg F1 Score: 0.798562662539648
Avg Precision: 0.7878801513860799
Avg Recall: 0.8111111111111111


Feature: genetic_rf
Classifier: RandomForestClassifier
Avg Weighted Accuracy: 0.8474747474747474
Avg F1 Score: 0.8142146298772263
Avg Precis