## Import Packages

In [1]:
# import packages
# Importing all necessary packages

import numpy as np
import pandas as pd
#from sklearn.decomposition import PCA
import collections
from sklearn.model_selection import train_test_split

# importing os module
import os
import time
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import mutual_info_classif, SelectKBest

from sklearn.feature_selection import RFECV
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from sklearn import svm
from sklearn.svm import SVC

from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import ShuffleSplit
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import roc_curve
from sklearn.metrics import RocCurveDisplay

from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, roc_auc_score

from sklearn.impute import SimpleImputer
from sklearn.impute import KNNImputer
#import shap
import random

import warnings
warnings.filterwarnings("ignore")

## Functions

#### 1. Split Function: Splits the data into train and test

x: input

y:output

In [2]:
def split(x, y, random_state): # x and y are raw
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, shuffle=True, random_state=random_state)
    # data without imputation and target values
    return x_train, x_test, y_train, y_test # _0 retaines sample type information _1 are without sample type info

#### 2. Random Selection: randomly selects a set of n features

In [3]:
def random_selection(num, all_features, random_state):
    # num is the feature set size to be selected
    # all_features = total pool of features
    # random selection of columns
    random.seed(random_state)
    features = []
    features = random.sample(all_features, num)
    return features

3. Min-Max Scaling


In [4]:
def minmax_scaling(x_train, x_test):

    scaler = MinMaxScaler()
    scaler.fit(x_train)
    x_train = pd.DataFrame(scaler.transform(x_train), columns=x_train.columns, index=x_train.index)
    x_test = pd.DataFrame(scaler.transform(x_test), columns=x_test.columns, index=x_test.index)
    return x_train, x_test


Evaluation

In [5]:
def eval(y_true, y_pred):
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, pos_label = "Tumor")
    precision = precision_score(y_true,y_pred, pos_label = "Tumor")
    recall = recall_score(y_true, y_pred, pos_label = "Tumor")
    auc = roc_auc_score(y_true, y_pred)
    return acc, f1, precision, recall

#### Classification task

In [6]:
def classification_task(x_train, x_test, y_train, y_test, models, model_names, random_state):
        
        # accuracy dictionary
        accuracy_cv = {} # iteration over seeds
        accuracy_pred = {} # iteration over seeds
      
        i = 0
        for est in models: 
                model_name = model_names[i]
                i = i + 1
                est = est.fit(x_train, y_train)

                #  Stratified Cross validation
                cv = StratifiedKFold(n_splits=5, shuffle=True, random_state = random_state)
                y_pred_CV = cross_val_predict(est, x_train, y_train, cv=cv)               
                report_CV = classification_report(y_train, y_pred_CV, target_names=["Normal", "Tumor"], output_dict=True)
                acc = report_CV["accuracy"]
                accuracy_cv[model_name] = acc


                # prediction
                y_pred = est.predict(x_test)

                report = classification_report(y_test, y_pred, target_names=["Normal", "Tumor"], output_dict=True)
                acc = report["accuracy"]
                accuracy_pred[model_name] = acc
                # returns directory for models
        return accuracy_cv, accuracy_pred

In [7]:
def get_miscores(X, y, random_state):
    mi_scores = mutual_info_classif(X, y, random_state=random_state)
    return mi_scores
def get_migenes(x, y, k, random_state):
    mi_df  = pd.DataFrame(columns=["gene", "MI_score"])
    mi_df["gene"] = x.columns
    random.seed(random_state)
    seeds = random.sample(range(0, 1000), 5)
    for seed in seeds: # you can increase the list of seeds
        mi_scores= get_miscores(x, y, random_state=seed)
        mi_df[seed] = mi_scores
        # Mi_score column is mean of all seeds
    mi_df["MI_score"] = mi_df.iloc[:, 1:6].mean(axis=1)

    mi_df = mi_df.sort_values(by="MI_score", ascending=False)
        
    top_1000_df = mi_df.head(k)
    top_genes_mi = mi_df["gene"].tolist()[:k]
    
    # returns the list and dataframe
    return top_genes_mi, top_1000_df 


#### SVMRFE

In [10]:
def svm_rfe(x_train, y_train, features, random_state): # returns dataframe of features 
    # min = min numbers of features to be selected
    
    # cross validation to be used in RFECV
    cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = random_state)
    
    # using SVC estimator to check Features importance using RFE 
    svc = svm.SVC(kernel = "linear", random_state=random_state)
    # RFECV optimizes the minimum set of features needed for cassification task
    rfecv_SVM = RFECV(svc, min_features_to_select = 10, cv=cv, scoring='f1_weighted',  n_jobs=-1)
    fit_rfecv_SVM = rfecv_SVM.fit(x_train, y_train)
    
    RFECV_SVM_Top_Feat_df = pd.DataFrame({'Features': features, 'Selected': fit_rfecv_SVM.support_,
                                  'Rank': fit_rfecv_SVM.ranking_})
    RFECV_SVM_Top_Feat_df = RFECV_SVM_Top_Feat_df.sort_values(by = 'Rank')
 
    return RFECV_SVM_Top_Feat_df
    
    
    
    

#### Ploting ROC

In [11]:
# function to plot ROC curve
def roc(model_list, x_test, y_test):
    disp = RocCurveDisplay.from_estimator(model_list[0], x_test, y_test, pos_label='Tumor', name ="LogR", lw = 5)
    RocCurveDisplay.from_estimator(model_list[1], x_test, y_test, pos_label='Tumor', ax=disp.ax_, name ="RF", lw = 5);
    RocCurveDisplay.from_estimator(model_list[2], x_test, y_test, pos_label='Tumor', ax=disp.ax_, name ="MLP", lw = 5);
    RocCurveDisplay.from_estimator(model_list[3], x_test, y_test, pos_label='Tumor', ax=disp.ax_, name ="SVC", lw = 5); 
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.rc("font", size = 20)
    plt.legend(fontsize = "16", frameon = False)
    plt.title("100 genes (ANOVA)")
    plt.savefig("00-1_ROC_Top_100_Cancer.png", dpi=400, bbox_inches='tight')
    plt.show()


## MAIN

### Getting the data ready for the main process

In [12]:
data_df = pd.read_csv("data/processed/RNAseq_processed.csv", index_col=0)
print(data_df.shape)
x = data_df.drop(columns=["Sample_Type"])
y = data_df["Sample_Type"]


(162, 32298)


In [13]:
data_df.head(5)

Unnamed: 0,A1BG,A1BG-AS1,A1CF,A2M,A2M-AS1,A2ML1,A2MP1,A3GALT2,A4GALT,A4GNT,...,ZWINT,ZXDA,ZXDB,ZXDC,ZYG11A,ZYG11B,ZYX,ZZEF1,ZZZ3,Sample_Type
C3L-00977,7.58,6.45,4.19,13.68,6.22,6.52,3.56,1.85,9.81,1.85,...,8.28,8.94,9.48,10.83,6.56,11.05,9.36,12.09,11.7,Normal
C3L-00994,5.41,5.23,3.2,13.65,6.74,16.31,2.35,2.03,9.51,2.35,...,9.29,8.72,8.78,10.96,7.38,12.69,9.39,12.18,11.72,Normal
C3L-00995,5.02,4.91,4.77,15.13,7.74,12.66,4.23,0.0,9.71,3.03,...,7.69,9.04,8.57,11.04,6.53,12.57,10.83,12.05,11.11,Normal
C3L-00997,5.83,5.91,3.2,14.01,6.67,16.36,3.03,0.0,10.21,2.35,...,8.87,8.43,8.84,10.95,7.07,11.5,10.16,12.11,11.52,Normal
C3L-00999,4.12,4.27,5.01,14.47,7.02,14.86,1.28,0.0,9.82,3.2,...,8.78,8.76,8.99,11.14,6.71,12.48,10.11,12.19,11.63,Normal


## MAIN LOOP -- Loops for 10 random sates

In [14]:
# iteration of random seeds
random.seed(42)

# dictionaries to store results
# dict to store selected genes from MI and SVM-RFE
# dtaaframe to store results
dict_mi = {}
dict_svmrfe = {}
dict_acc_cv = {"LogR":[], "RF":[], "MLP":[], "SVC":[]}
dict_acc_pred = {"LogR":[], "RF":[], "MLP":[], "SVC":[]}
dict_acc_cv_random = {"LogR":[], "RF":[], "MLP":[], "SVC":[]}
dict_acc_pred_random = {"LogR":[], "RF":[], "MLP":[], "SVC":[]}
for i in range(100):
    random_state = random.randint(0,1000)
    print("------------------------------------------------------------------")
    print("Iteration: ", i+1)
    print ("Random state: ", random_state)
    # train-test split
    x_train, x_test, y_train, y_test = split(x,y,random_state)

    # Min-Max Scaling
    x_train, x_test = minmax_scaling(x_train, x_test)

    # Mutual Information based feature selection -- gives top 1000 genes
    top_genes_mi, top_1000_df = get_migenes(x_train, y_train, 1000, random_state)
    dict_mi[random_state] = top_genes_mi

    print("Top 1000 genes selected using MI")
   
    # updated x_train and x_test
    x_train_mi = x_train[top_genes_mi]
    x_test_mi = x_test[top_genes_mi]
    
    # SVM-RFE based feature selection -- gives top genes selected by SVM-RFE (not a fixed range)
    svm_rfe_df = svm_rfe(x_train_mi, y_train, top_genes_mi, random_state) # returns dataframe of features (all features with selected True/False)

# selected features from SVM-RFE
    features_svmrfe = svm_rfe_df [svm_rfe_df['Selected'] == True]['Features'].tolist()
    dict_svmrfe[random_state] = features_svmrfe

    print("Top genes selected using SVM-RFE: ", len(features_svmrfe))

    # updated x_train and x_test
    x_train_svmrfe = x_train_mi[features_svmrfe]
    x_test_svmrfe = x_test_mi[features_svmrfe]

    # running 4 classifiers for evaltuion
    logr = LogisticRegression(random_state=random_state, max_iter=800, solver='liblinear')
    rf = RandomForestClassifier(random_state=random_state, n_estimators=500)
    mlp = MLPClassifier(random_state=random_state, max_iter=800, activation="relu", solver='lbfgs', alpha=1e-5)
    svc = SVC(random_state=random_state, kernel = "linear", C = 0.1)


    models = [logr, rf, mlp, svc]
    model_names = ["LogR", "RF", "MLP", "SVC"]

    # returns dictionary for models
    acc_cv, acc_pred = classification_task(x_train_svmrfe, x_test_svmrfe, y_train, y_test, models, model_names, random_state)    
    for key in acc_cv.keys():
        dict_acc_cv[key].append(acc_cv[key])
    for key in acc_pred.keys():
        dict_acc_pred[key].append(acc_pred[key])

    print("Classification task using SVM-RFE selected genes done")

# random features for comparison

    num_features = len(features_svmrfe)
    all_features = x.columns.tolist()
    random_features = random_selection(num_features, all_features, random_state)
    x_train_random = x_train[random_features]
    x_test_random = x_test[random_features]

    # classification task
    acc_cv, acc_pred = classification_task(x_train_random, x_test_random, y_train, y_test, models, model_names, random_state)    
    for key in acc_cv.keys():
        dict_acc_cv_random[key].append(acc_cv[key])
    for key in acc_pred.keys():
        dict_acc_pred_random[key].append(acc_pred[key])

    print("Classification task using Randomly selected genes done")





------------------------------------------------------------------
Iteration:  1
Random state:  654
Top 1000 genes selected using MI
Top genes selected using SVM-RFE:  10
Classification task using SVM-RFE selected genes done
Classification task using Randomly selected genes done
------------------------------------------------------------------
Iteration:  2
Random state:  620
Top 1000 genes selected using MI
Top genes selected using SVM-RFE:  70
Classification task using SVM-RFE selected genes done
Classification task using Randomly selected genes done
------------------------------------------------------------------
Iteration:  3
Random state:  517
Top 1000 genes selected using MI
Top genes selected using SVM-RFE:  10
Classification task using SVM-RFE selected genes done
Classification task using Randomly selected genes done
------------------------------------------------------------------
Iteration:  4
Random state:  815
Top 1000 genes selected using MI
Top genes selected using SV

In [15]:
svm_rfe_df.head(20)

Unnamed: 0,Features,Selected,Rank
3,CAB39L,True,1
992,FLAD1,True,1
987,CTNNA2,True,1
605,STAT1,True,1
598,CD80,True,1
47,BMX,True,1
40,GREM2,True,1
110,PLIN1,True,1
111,IBSP,True,1
658,LINC01416,True,1


In [None]:
# saving results
df_mi_df = pd.DataFrame()
for key in dict_mi.keys():
    df_mi_df[key] = dict_mi[key]
df_mi_df.to_csv("results/SO/R/00-1_Top_1000_genes_MI.csv")
df_svmrfe = pd.DataFrame()
for key in dict_svmrfe.keys():
    df_svmrfe[key] = [dict_svmrfe[key]]
df_svmrfe.to_csv("results/SO/R/00-1_Top_genes_SVMRFE.csv") 

df_acc_cv = pd.DataFrame(dict_acc_cv)
df_acc_cv.to_csv("results/SO/R/00-1_Accuracy_CV_SVMRFE.csv")
df_acc_pred = pd.DataFrame(dict_acc_pred)
df_acc_pred.to_csv("results/SO/R/00-1_Accuracy_Pred_SVMRFE.csv")
df_acc_cv_random = pd.DataFrame(dict_acc_cv_random)
df_acc_cv_random.to_csv("results/SO/R/00-1_Accuracy_CV_Random.csv")
df_acc_pred_random = pd.DataFrame(dict_acc_pred_random)
df_acc_pred_random.to_csv("results/SO/R/00-1_Accuracy_Pred_Random.csv")

In [34]:
len((list(df_svmrfe[654]))[0])

403

## FINAL results over selected gene set

In [None]:
# Stratified CV - RF

cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)

y_pred_CV_RF = cross_val_predict(rf,  x_train, y_train, cv = cv)
cm_CV_RF = confusion_matrix(y_train, y_pred_CV_RF)
print(f"Classification matrix:\n {cm_CV_RF}")

report_CV_RF = classification_report(y_train, y_pred_CV_RF)
print(report_CV_RF)
report_CV_RF = classification_report(y_train, y_pred_CV_RF, output_dict=True)
report_CV_RF_df = pd.DataFrame(report_CV_RF).transpose()

results_RF = cross_validate(rf,  x_train, y_train, cv = cv)
test_score_RF = results_RF["test_score"]
print(f"\nAverage accuracy: "
      f"{test_score_RF.mean():.3f} ± {test_score_RF.std():.3f}")


In [None]:
results_RF

In [None]:
# prediction
y_pred_RF = rf.predict(x_test_mi)
cm_RF = confusion_matrix(y_test, y_pred_RF)

In [None]:
# Printing the Classification matrix & report of classifier - RF

cm_RF = confusion_matrix(y_test, y_pred_RF)
print(f"Classification matrix:\n {cm_RF}\n")

report_RF = classification_report(y_test, y_pred_RF)
print(report_RF)

report_RF = classification_report(y_test, y_pred_RF, output_dict=True)
report_RF_df = pd.DataFrame(report_RF).transpose()



In [None]:
importances = rf.feature_importances_

# Put into DataFrame for ranking
gene_importance = pd.DataFrame({
    'Gene': x_train_mi.columns,
    'Importance': importances
})

# Sort descending
gene_importance = gene_importance.sort_values(by='Importance', ascending=False).reset_index(drop=True)

print(gene_importance.head(20))  # Top 20 important genes

In [None]:
top_n = 20
plt.figure(figsize=(8,6))
plt.barh(gene_importance['Gene'][:top_n][::-1], 
         gene_importance['Importance'][:top_n][::-1])
plt.xlabel("Feature Importance (Gini importance)")
plt.title(f"Top {top_n} Important Genes (Random Forest)")
plt.show()

In [None]:
features_svmrfe_df = svm_rfe(x_train_mi, y_train, x_train_mi.columns)

In [None]:
features_svmrfe = features_svmrfe_df [features_svmrfe_df['Selected'] == True]['Features'].tolist()

In [None]:
len(features_svmrfe)

Testing selected genes using classification task

In [None]:
# models to be used
logr = LogisticRegression(random_state=1, max_iter=800, solver='liblinear')

In [None]:
x_train_rfe = x_train_mi[features_svmrfe]
x_test_rfe = x_test_mi[features_svmrfe]

In [None]:
logr.fit(x,y)


In [None]:
logr.transforme

LogR

In [None]:
cv = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 42)
y_pred_CV = cross_val_predict(logr, x, y, cv=cv)               

cv_result = cross_validate(logr, x, y, cv = cv)
report_CV = classification_report(y, y, target_names=["Normal", "Tumor"], output_dict = True)

In [None]:
report_CV

In [None]:
y_pred_rfe = logr.predict(x_test_rfe)


# confusion matrix for prediction
# cm = confusion_matrix(y_test, y_pred)

# evaluation --> prediction
#         scores = eval(y_test, y_pred)
report_test= classification_report(y_test, y_pred_rfe)
report_df = pd.DataFrame(report_CV).transpose()
scores = list(report_df.iloc[4,:])
roc_auc_tumor = roc_auc_score(y_test, logr.predict_proba(x_test_rfe)[:,1])

In [None]:
logr.classes_[1]

In [None]:
logr.predict_proba(x_test_rfe)