# **Configuração Inicial**

In [None]:
import sys

# Mount Google Drive so the project files are reachable from the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Project root on Drive (a relative path, no trailing slash here); it is also
# appended to sys.path so that Python modules stored there become importable.
PATH = "drive/MyDrive/ML4AML"
sys.path.append(PATH)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Installs the `interpret` package, imported later for ExplainableBoostingClassifier.
# TODO(review): the version is not pinned, so a fresh runtime may pull a different release.
%pip install interpret


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
import pandas as pd

# Raw input tables, loaded once and passed to every preprocess() call below.
# PATH has no trailing slash at this point, hence the leading '/' in each suffix.
df_clinical_global = pd.read_csv(PATH+'/clinical_data/clinicaldata_merged_final.csv', sep=',', index_col=None)
df_expressions_global = pd.read_csv(PATH+'/genetic_data/expressions/expressions_merged_intersection.csv', index_col=None)

df_mutation_global = pd.read_csv(PATH+'/genetic_data/mutations/mutations_merged_intersection_final.csv', index_col=None)

# **Funções Auxiliares**

In [None]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from scipy.stats import zscore

# Project root on Drive. NOTE(review): this rebinds PATH *with* a trailing slash,
# while the setup cell defines it without one; the helpers below concatenate
# relative paths directly (PATH+'merged_data/...').
PATH = 'drive/MyDrive/ML4AML/'
KEY_MERGE = 'SAMPLE_ID'  # column used as the join key between tables
CLASS_VARIABLE = "Overall Survival Status"  # target column
TREATMENT =  'Treatment Intensity'  # clinical column kept by the "TREAT" option of preprocess()


def mergeDataSetsByKey(dataset1, dataset2, key):
    """Join two tables on a shared column.

    Thin wrapper around ``pandas.merge`` using its default join type, so only
    rows whose ``key`` value appears in both inputs are kept.

    Args:
        dataset1: left table.
        dataset2: right table.
        key: name of the column (or list of columns) to join on.

    Returns:
        The merged DataFrame.
    """
    return pd.merge(left=dataset1, right=dataset2, on=key)

def convertClassToNumber(df_dataset, class_label):
    """Return a copy of ``df_dataset`` with ``class_label`` encoded as integers.

    The encoding is produced by scikit-learn's ``LabelEncoder``. The input
    frame is left untouched: previously the column was overwritten in place,
    which silently modified the caller's (global) DataFrame on every call.

    Args:
        df_dataset: DataFrame containing the class column.
        class_label: name of the column to encode.

    Returns:
        A new DataFrame whose ``class_label`` column holds the integer codes.
    """
    encoded = df_dataset.copy()
    encoded[class_label] = LabelEncoder().fit_transform(encoded[class_label])
    return encoded

def normalizeData(df_dataset, columns=None):
    """Return a copy of ``df_dataset`` with the given numeric columns z-scored.

    Args:
        df_dataset: DataFrame holding the columns to standardise.
        columns: names of the columns to z-score. Defaults to the clinical
            numeric columns used by this project.

    Returns:
        A new DataFrame; the input is not modified.

    Raises:
        KeyError: if one of ``columns`` is missing from ``df_dataset``.
    """
    if columns is None:
        columns = ["Diagnosis Age", "Bone Marrow Blast Percentage", "Mutation Count", "PB Blast Percentage", "WBC"]

    # Work on a copy so the caller's frame is not standardised behind its back.
    normalized = df_dataset.copy()
    for column in columns:
        # NOTE(review): scipy's zscore uses nan_policy='propagate' by default, so a
        # single missing value turns the whole column into NaN — confirm inputs are complete.
        normalized[column] = zscore(normalized[column])
    return normalized

def preprocessingDataModel(df_dataset, to_drop=['Study ID', KEY_MERGE, 'Overall Survival (Months)']):
    """Drop the listed columns and one-hot encode what remains.

    Args:
        df_dataset: input DataFrame.
        to_drop: column names removed before encoding.

    Returns:
        A new DataFrame produced by ``pandas.get_dummies`` on the remaining columns.
    """
    remaining = df_dataset.drop(columns=to_drop)
    encoded = pd.get_dummies(remaining)
    return encoded


def preprocess_validation(data):
    """Build train, validation and train+validation splits of ``data``.

    The sample ids of each split are read from the CSV files under
    ``merged_data/train_test_split/`` and joined with ``data`` on KEY_MERGE.
    ``data`` must therefore contain both KEY_MERGE and CLASS_VARIABLE.

    Returns:
        ``(X_train, y_train, X_valid, y_valid, X_full, y_full)`` where every
        ``X_*`` has CLASS_VARIABLE and KEY_MERGE removed and ``*_full`` is the
        concatenation of the train and validation ids.
    """
    train_ids = pd.read_csv(PATH+'merged_data/train_test_split/X_train_id.csv', sep=',', index_col=None)
    valid_ids = pd.read_csv(PATH+'merged_data/train_test_split/X_valid_id.csv', sep=',', index_col=None)
    id_frames = (train_ids, valid_ids, pd.concat([train_ids, valid_ids]))

    # Emit (features, target) pairs in the order train, valid, full.
    outputs = []
    for ids in id_frames:
        merged = mergeDataSetsByKey(ids, data, KEY_MERGE)
        outputs.append(merged.drop(columns=[CLASS_VARIABLE, KEY_MERGE]))
        outputs.append(merged[CLASS_VARIABLE])

    return tuple(outputs)

def preprocess_finaltest(data):
  """Select the final-test samples of ``data``.

  Sample ids are read from ``X_final_test_id.csv`` and joined with ``data`` on
  KEY_MERGE; ``data`` must contain KEY_MERGE and CLASS_VARIABLE.

  Returns:
    ``(X, y)``: the features without CLASS_VARIABLE / KEY_MERGE, and the target column.
  """
  final_ids = pd.read_csv(PATH+'merged_data/train_test_split/X_final_test_id.csv', sep=',', index_col=None)
  merged = mergeDataSetsByKey(final_ids, data, KEY_MERGE)

  target = merged[CLASS_VARIABLE]
  features = merged.drop(columns=[CLASS_VARIABLE, KEY_MERGE])
  return features, target



def _append_selected_features(data, source, features):
  """Join the selected ``features`` of ``source`` onto ``data`` via KEY_MERGE."""
  wanted = list(features) + [KEY_MERGE]
  candidate = source[wanted]
  # Skip columns that ``data`` already has; the key itself is always kept.
  new_columns = candidate.columns.difference(data.drop(columns=[KEY_MERGE]).columns)
  return mergeDataSetsByKey(data, candidate[new_columns], KEY_MERGE)


def preprocess(data_clin, data_mut=None, data_exp=None, features_mut=None, features_exp=None, to_merge="CLIN", drop_key=False, final=False):
  """Assemble the modelling table and split it into train / test sets.

  Args:
    data_clin: clinical DataFrame; must contain KEY_MERGE and CLASS_VARIABLE.
      It is copied first, so the caller's frame is no longer modified.
    data_mut: mutation DataFrame, required when "MUT" is in ``to_merge``.
    data_exp: expression DataFrame, required when "EXP" is in ``to_merge``.
    features_mut: mutation columns to keep (KEY_MERGE is added automatically).
    features_exp: expression columns to keep (KEY_MERGE is added automatically).
    to_merge: string or list naming the data sources, tested with ``in``:
      "CLIN" uses all clinical columns (one-hot encoded and z-scored);
      otherwise "TREAT" keeps only the TREATMENT column; otherwise only the
      key and the class are kept. "MUT" / "EXP" append the selected features.
    drop_key: when true, CLASS_VARIABLE and KEY_MERGE are removed from
      ``X_train``. ``X_test`` always has them removed.
    final: when true, the former test ids are folded into the training set
      and the ids from ``X_final_test_id.csv`` become the test set.

  Returns:
    ``(X_train, y_train, X_test, y_test, [X, y])`` where ``[X, y]`` covers all
    ids used (train + test) without CLASS_VARIABLE / KEY_MERGE in ``X``.

  Raises:
    ValueError: if "MUT" / "EXP" is requested without its data and feature list.
  """
  # Encode on a private copy: the label encoding used to overwrite the
  # caller's clinical DataFrame in place.
  data = convertClassToNumber(data_clin.copy(), CLASS_VARIABLE)
  id_table = data[KEY_MERGE]

  if "CLIN" in to_merge:
    # preprocessingDataModel drops the key, so it is re-inserted afterwards.
    # NOTE(review): z-scoring happens before the train/test split, i.e. test
    # samples contribute to the mean/std — confirm this is acceptable.
    data = preprocessingDataModel(data)
    data = normalizeData(data)
    data.insert(0, KEY_MERGE, id_table)
  elif "TREAT" in to_merge:
    data = data[[KEY_MERGE, TREATMENT, CLASS_VARIABLE]]
    data = preprocessingDataModel(data, to_drop=[KEY_MERGE])
    data.insert(0, KEY_MERGE, id_table)
  else:
    data = data[[KEY_MERGE, CLASS_VARIABLE]]

  if "MUT" in to_merge:
    if data_mut is None or features_mut is None:
      raise ValueError('"MUT" in to_merge requires both data_mut and features_mut')
    data = _append_selected_features(data, data_mut, features_mut)

  if "EXP" in to_merge:
    if data_exp is None or features_exp is None:
      raise ValueError('"EXP" in to_merge requires both data_exp and features_exp')
    data = _append_selected_features(data, data_exp, features_exp)

  X_train_id = pd.read_csv(PATH+'merged_data/train_test_split/X_train_id.csv', sep=',', index_col=None)
  X_valid_id = pd.read_csv(PATH+'merged_data/train_test_split/X_valid_id.csv', sep=',', index_col=None)
  X_test_id = pd.read_csv(PATH+'merged_data/train_test_split/X_test_id.csv', sep=',', index_col=None)

  # Training ids are always train + validation.
  X_train_id = pd.concat([X_train_id, X_valid_id])
  X_id = pd.concat([X_train_id, X_test_id])

  if final:
    # Final evaluation: train on train + valid + test, test on the held-out final set.
    X_train_id = pd.concat([X_train_id, X_test_id])
    X_test_id = pd.read_csv(PATH+'merged_data/train_test_split/X_final_test_id.csv', sep=',', index_col=None)
    X_id = pd.concat([X_id, X_test_id])

  X = mergeDataSetsByKey(X_id, data, KEY_MERGE)
  y = X[CLASS_VARIABLE]
  X = X.drop(columns=[CLASS_VARIABLE, KEY_MERGE])

  X_train = mergeDataSetsByKey(X_train_id, data, KEY_MERGE)
  X_test = mergeDataSetsByKey(X_test_id, data, KEY_MERGE)
  y_train = X_train[CLASS_VARIABLE]
  y_test = X_test[CLASS_VARIABLE]

  # With drop_key=False the training frame keeps the key and the class column;
  # explainableModel() relies on that to rebuild the train/validation split.
  if drop_key:
    X_train = X_train.drop(columns=[CLASS_VARIABLE, KEY_MERGE])

  X_test = X_test.drop(columns=[CLASS_VARIABLE, KEY_MERGE])

  return X_train, y_train, X_test, y_test, [X,y]


# **Seleção de Atributos de Mutação**

In [None]:
# preprocess() is defined earlier in this notebook; the module import stays disabled.
#from data_preprocessing import preprocess

# Candidate mutation features: every column of the mutation table but the first.
features_mut = df_mutation_global.columns
features_mut = features_mut.to_list()
features_mut.pop(0) # drop SAMPLE ID (assumes it is the first column — TODO confirm)

selected_mut_features = features_mut

# drop_key=True so the training frame holds feature columns only (needed by chi2 below).
data_mut_train, y_train, data_mut_test, y_test, [X,y] = preprocess(df_clinical_global, data_mut=df_mutation_global, data_exp=None, features_mut=features_mut, features_exp=None, to_merge="MUT", drop_key=True)

In [None]:
from sklearn.feature_selection import chi2

# chi2 returns (statistics, p-values); only the p-values are used for selection.
chi_scores = chi2(data_mut_train, y_train)
p_values = pd.Series(chi_scores[1], index=data_mut_train.columns).sort_values(ascending=False)

# Keep the mutation features whose p-value falls below the significance level.
alpha = 0.05
selected_mut_features = p_values[p_values < alpha].index.tolist()
print(selected_mut_features)


['PHF6', 'TP53']



# **Seleção de Atributos de Expressão**


## **Seleção por Lasso**

In [None]:

# Candidate expression features: every column of the expression table but the first.
features_exp = df_expressions_global.columns
features_exp = features_exp.to_list()
features_exp.pop(0) # drop SAMPLE ID (assumes it is the first column — TODO confirm)


data_exp_train, y_train, data_exp_test, y_test, [X,y] = preprocess(df_clinical_global, data_mut=None, data_exp=df_expressions_global, features_mut=None, features_exp=features_exp, to_merge="EXP", drop_key=True)
# Rebind to the training frame's column Index so the boolean masks produced by
# the selectors in the following cells line up with the actual column order.
features_exp = data_exp_train.columns



In [None]:
from sklearn.svm import LinearSVC
from sklearn.feature_selection import SelectFromModel

# Feature selection with an L1-penalised linear SVM: fit once on the full
# training frame, then let SelectFromModel pick features from the fitted model.
lsvc = LinearSVC(C=0.01, penalty="l1", dual=False, random_state=0)
lsvc.fit(data_exp_train, y_train)
selector = SelectFromModel(lsvc, prefit=True)

selected_exp_features = list(features_exp[selector.get_support()])

# An earlier, disabled experiment (removed here) refit the model on growing
# prefixes of the training set (99 rows, +10 per step, 13 steps) and counted
# how often each feature was selected.

print(selected_exp_features)

['CCDC144A', 'CPNE8', 'CYP2E1', 'CYTL1', 'HAS1', 'KIAA0141', 'KIAA1549', 'LAMA2', 'LTK', 'MICALL2', 'MX1', 'PPM1H', 'PTH2R', 'PTP4A3', 'RAD21', 'RGS9BP', 'SLC29A2', 'TMED4', 'TNFSF11', 'TNK1', 'TSKS', 'XIST']


## **Seleção por ANOVA**

In [None]:
from sklearn.feature_selection import SelectKBest, f_classif

# k='all' keeps every feature: the selector is fitted only for its per-feature p-values.
anova_selection = SelectKBest(score_func=f_classif, k='all').fit(data_exp_train, y_train)

# Expression features whose ANOVA F-test p-value is below 0.001.
anova_mask = anova_selection.pvalues_ < 0.001
anova_features = features_exp[anova_mask].tolist()
print(len(anova_features))

125


# **Features Utilizadas**

In [None]:
# all expression features
#selected_exp_features = list(data_exp_train.columns)

# expression features selected by ANOVA
#selected_exp_features = anova_features


# lasso + literature
literature_features = ['FLT3', 'NPM1', 'DNMT3A', 'IDH1', 'IDH2', 'TP53', 'TET2', 'ASXL1', 'RUNX1', 'CEBPA', 'NRAS', 'KRAS', 'SF3B1', 'U2AF1', 'SRSF2']
#selected_exp_features = literature_features
#selected_exp_features = list(set(selected_exp_features + literature_features))

# Union of the chi2-selected mutation genes and the literature genes.
# dict.fromkeys de-duplicates while keeping first-seen order, so the list is
# the same on every run (iterating a set of strings is not stable across
# interpreter sessions). Re-running the cell leaves the list unchanged.
selected_mut_features = list(dict.fromkeys(selected_mut_features + literature_features))
#selected_mut_features = literature_features

# Features actually used by the models below.
print(selected_mut_features)
print(len(selected_mut_features))

print(selected_exp_features)
print(len(selected_exp_features))

['U2AF1', 'KRAS', 'NRAS', 'ASXL1', 'DNMT3A', 'PHF6', 'TP53', 'IDH1', 'SRSF2', 'NPM1', 'FLT3', 'SF3B1', 'IDH2', 'CEBPA', 'TET2', 'RUNX1']
16
['CCDC144A', 'CPNE8', 'CYP2E1', 'CYTL1', 'HAS1', 'KIAA0141', 'KIAA1549', 'LAMA2', 'LTK', 'MICALL2', 'MX1', 'PPM1H', 'PTH2R', 'PTP4A3', 'RAD21', 'RGS9BP', 'SLC29A2', 'TMED4', 'TNFSF11', 'TNK1', 'TSKS', 'XIST']
22



# **Gerando Modelos Individuais**

## **Funções Auxiliares**

In [None]:
import seaborn as sns
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV
from interpret.glassbox import ExplainableBoostingClassifier


def conf_matrix(y_test, y_pred):
    """Plot the confusion matrix as a heatmap annotated with counts and percentages.

    Args:
        y_test: true labels.
        y_pred: predicted labels.

    Each cell shows the raw count and, below it, the share of all samples.
    The annotation grid follows the shape of the confusion matrix, so the
    function also works when the matrix is not 2x2 (a single class present,
    or more than two classes).
    """
    total_cm = metrics.confusion_matrix(y_test, y_pred)

    group_counts = ["{0:0.0f}".format(value) for value in total_cm.flatten()]
    group_percentages = ["{0:.2%}".format(value) for value in total_cm.flatten()/np.sum(total_cm)]
    labels = [f"{v1}\n{v2}" for v1,v2 in zip(group_counts,group_percentages)]
    # Was reshape(2,2), which raised for any non-binary confusion matrix.
    labels = np.asarray(labels).reshape(total_cm.shape)
    ax = sns.heatmap(total_cm, annot=labels, fmt="", annot_kws={"size":20})

    ax.set_xlabel('\nPredicted Values')
    ax.set_ylabel('Actual Values ')
    plt.show()


def checkPerformance(y_test, y_pred, verbose=True):
    """Compute weighted F1 / precision / recall, accuracy and ROC AUC.

    Args:
        y_test: true labels.
        y_pred: predicted labels.
        verbose: when truthy, print every metric as a percentage.

    Returns:
        ``(f1, precision, recall, accuracy, roc_auc)`` as fractions (not
        scaled to percent). ``roc_auc`` is 0 when scikit-learn raises
        ``ValueError`` while computing it.
    """
    f1_result = metrics.f1_score(y_test, y_pred, average="weighted", zero_division=1)
    pres = metrics.precision_score(y_test, y_pred, average="weighted", zero_division=1)
    recall = metrics.recall_score(y_test, y_pred, average='weighted', zero_division=1)
    acc = metrics.accuracy_score(y_test, y_pred)

    # NOTE(review): the AUC is computed from hard class predictions, not from
    # predicted probabilities/scores — confirm that this is the intended metric.
    try:
        roc = metrics.roc_auc_score(y_test, y_pred)
    except ValueError:
        # Kept as 0 for backward compatibility with existing result tables.
        roc = 0

    if verbose:
        # The metrics are fractions; scale by 100 so the printed '%' is truthful
        # (the old output read e.g. "0.6324%" for a score of 63.24%).
        print("F1: %.2f%%" % (100 * f1_result))
        print("Precision: %.2f%%" % (100 * pres))
        print("Recall: %.2f%%" % (100 * recall))
        print("Accuracy: %.2f%%" % (100 * acc))
        print("\nAUC: %.2f%%" % (100 * roc))

    return f1_result, pres, recall, acc, roc
    

def explainableModel(X_train, y_train, X_test, y_test, name):
    """Tune, fit and evaluate an Explainable Boosting Machine.

    Args:
        X_train: training frame that still contains CLASS_VARIABLE and
            KEY_MERGE (both are needed to rebuild the validation split and are
            dropped here before fitting).
        y_train: training target.
        X_test: test features, already without CLASS_VARIABLE / KEY_MERGE.
        y_test: test target.
        name: label stored in the "model" column of the results row.

    Returns:
        ``(y_pred, y_probs, results, ebm)``: test predictions, test
        probabilities of the second class, a one-row results DataFrame and the
        fitted classifier.
    """
    from interpret import show

    # Hyper-parameters are searched on the train+validation portion only.
    _, _, _, _, search_X, search_y = preprocess_validation(X_train)

    train_features = X_train.drop(columns=[CLASS_VARIABLE, KEY_MERGE])

    param_grid = {
        'random_state' : [1],
        "min_samples_leaf":[2, 3, 4],
        'learning_rate':[0.001, 0.01]
    }
    search = GridSearchCV(estimator=ExplainableBoostingClassifier(), param_grid=param_grid, cv=5)
    search.fit(search_X, search_y)

    # Refit on the complete training frame with the best parameters found.
    ebm = ExplainableBoostingClassifier(**search.best_params_)
    ebm.fit(train_features, y_train)

    show(ebm.explain_global())

    y_pred = ebm.predict(X_test)
    y_probs = ebm.predict_proba(X_test)[:,1]

    print("Test Performance")
    f1_result, pres, recall, acc, roc = checkPerformance(y_test, y_pred)

    results = pd.DataFrame(data=[{
        "model":name, "F1-Score":f1_result, "AUC": roc,
        "Accuracy": acc,
        "Precision":pres,
        "Recall":recall,
    }])

    return y_pred, y_probs, results, ebm


# Accumulator for the per-model result rows (reset again at the top of the clinical-data cell).
ebm_df = pd.DataFrame()


## **Dados Clínicos**

In [None]:

# Start a fresh results table so re-running from this cell does not duplicate rows.
ebm_df = pd.DataFrame()

# clinical data model
print("\nClinical Data")

# drop_key=False: explainableModel() needs the key and class columns in the training frame.
# final=True: train on train+valid+test ids and evaluate on the final test ids.
data_clin_train, y_train, data_clin_test, y_test, [data_clin, y_clin] =  preprocess(df_clinical_global, data_mut=None, data_exp=None, features_mut=None, features_exp=None, to_merge="CLIN", drop_key=False, final=True)

print("\n\nEBM Model")
clin_pred, prob, results, clin_ebm = explainableModel(data_clin_train, y_train, data_clin_test, y_test, "CLIN")
ebm_df = pd.concat([ebm_df, results], ignore_index=True)





Clinical Data


EBM Model


Test Performance
F1: 0.6324%
Precision: 0.6286%
Recall: 0.6429%
Accuracy: 0.6429%

AUC: 0.5889%


## **Dados de Mutação**

In [None]:

# Mutation model: selected mutation features plus the treatment column.
print("\nMutation Data")
data_mut_train, y_train, data_mut_test, y_test, [data_mut,y] = preprocess(df_clinical_global, data_mut=df_mutation_global, data_exp=None, features_mut=selected_mut_features, features_exp=None, to_merge=["MUT", "TREAT"], drop_key=False,  final=True)

# Fixed the invalid escape "\E", which printed a literal backslash ("\EBM Model").
print("\n\nEBM Model")
mut_pred, prob, results, mut_ebm = explainableModel(data_mut_train, y_train, data_mut_test, y_test, "MUT")
ebm_df = pd.concat([ebm_df, results], ignore_index=True)



Mutation Data

\EBM Model


Test Performance
F1: 0.6149%
Precision: 0.6363%
Recall: 0.6071%
Accuracy: 0.6071%

AUC: 0.6056%


## **Dados de Expressão**

In [None]:

# Expression model: selected expression features plus the treatment column.
print("\nExpression Data")
data_exp_train, y_train, data_exp_test, y_test, [data_exp,y] = preprocess(df_clinical_global, data_mut=None, data_exp=df_expressions_global, features_mut=None, features_exp=selected_exp_features, to_merge=["EXP", "TREAT"], drop_key=False, final=True)

print("\n\nEBM Model")
# NOTE: `prob` is a shared name that every model cell overwrites.
exp_pred, prob, results, exp_ebm = explainableModel(data_exp_train, y_train, data_exp_test, y_test, "EXP")
ebm_df = pd.concat([ebm_df, results], ignore_index=True)



Expression Data


EBM Model


Test Performance
F1: 0.8530%
Precision: 0.8589%
Recall: 0.8571%
Accuracy: 0.8571%

AUC: 0.8222%


## **Dados Clínicos + Mutação**

In [None]:

# Clinical columns combined with the selected mutation features.
print("\nClinical + Mutation Data")
data_clinmut_train, y_train, data_clinmut_test, y_test, [data_clinmut,y] =  preprocess(df_clinical_global, data_mut=df_mutation_global, data_exp=None, features_mut=selected_mut_features, features_exp=None, to_merge=["MUT", "CLIN"], drop_key=False, final=True)

print("\n\nEBM Model")
clinmut_pred, prob, results, clinmut_ebm = explainableModel(data_clinmut_train, y_train, data_clinmut_test, y_test, "CLIN+MUT")
ebm_df = pd.concat([ebm_df, results], ignore_index=True)



Clinical + Mutation Data


EBM Model


Test Performance
F1: 0.5589%
Precision: 0.5518%
Recall: 0.5714%
Accuracy: 0.5714%

AUC: 0.5111%


## **Dados Clínicos + Expressão**

In [None]:
# Clinical columns combined with the selected expression features.
print("\nClinical + Expression Data")
data_clinexp_train, y_train, data_clinexp_test, y_test, [data_clinexp,y] = preprocess(df_clinical_global, data_mut=None, data_exp=df_expressions_global, features_mut=None, features_exp=selected_exp_features, to_merge=["EXP", "CLIN"], drop_key=False, final=True)

print("\n\nEBM Model")
clinexp_pred, prob, results, clinexp_ebm = explainableModel(data_clinexp_train, y_train, data_clinexp_test, y_test, "CLIN+EXP")
ebm_df = pd.concat([ebm_df, results], ignore_index=True)



Clinical + Expression Data


EBM Model


Test Performance
F1: 0.7794%
Precision: 0.7821%
Recall: 0.7857%
Accuracy: 0.7857%

AUC: 0.7444%


## **Dados de Mutação + Expressão**

In [None]:
# Selected mutation and expression features plus the treatment column.
print("\nMutation + Expression Data")
data_mutexp_train, y_train, data_mutexp_test, y_test, [data_mutexp,y] = preprocess(df_clinical_global, data_mut=df_mutation_global, data_exp=df_expressions_global, features_mut=selected_mut_features, features_exp=selected_exp_features, to_merge=["MUT", "EXP","TREAT"], drop_key=False, final=True)

print("\n\nEBM Model")
# NOTE(review): the recorded output of this cell reports 1 of 30 grid-search fits failing.
mutexp_pred, prob, results, mutexp_ebm = explainableModel(data_mutexp_train, y_train, data_mutexp_test, y_test, "MUT+EXP")
ebm_df = pd.concat([ebm_df, results], ignore_index=True)



Mutation + Expression Data


EBM Model




1 fits failed out of a total of 30.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
1 fits failed with the following error:
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/usr/local/lib/python3.10/dist-packages/interpret/glassbox/ebm/ebm.py", line 719, in fit
    bagged_ranked_interaction = provider.parallel(
  File "/usr/local/lib/python3.10/dist-packages/interpret/provider/compute.py", line 19, in parallel
    results = Parallel(n_jobs=self.n_jobs)(
  File "/usr/local/lib/python3.10/dist-packages/joblib/parallel.py", line 1088, in __call__
    while self.dispatch_one_batch(

Test Performance
F1: 0.8191%
Precision: 0.8191%
Recall: 0.8214%
Accuracy: 0.8214%

AUC: 0.7944%


## **Dados Clínicos + Mutação + Expressão**

In [None]:

# Clinical columns combined with the selected mutation and expression features.
print("\nClinical + Mutation + Expression Data")
data_all_train, y_train, data_all_test, y_test, [data_all,y] =  preprocess(df_clinical_global, data_mut=df_mutation_global, data_exp=df_expressions_global, features_mut=selected_mut_features, features_exp=selected_exp_features, to_merge=["MUT", "EXP", "CLIN"], drop_key=False, final=True)

# The banner used to read "Random Forest", but the model fitted here is an EBM.
print("\n\nEBM Model")
all_pred, prob, results, all_ebm = explainableModel(data_all_train, y_train, data_all_test, y_test, "CLIN+MUT+EXP")
ebm_df = pd.concat([ebm_df, results], ignore_index=True)



Clinical + Mutation + Expression Data


Random Forest


Test Performance
F1: 0.7468%
Precision: 0.7456%
Recall: 0.7500%
Accuracy: 0.7500%

AUC: 0.7167%


## **Salvando Resultados...**

In [None]:
# Persist the metrics table; PATH ends with '/' here, so the file lands in the project root.
ebm_df.to_csv(PATH+"ebm_results.csv", index=False)

## **Gerando Código Latex**

In [None]:
# Emit the results table as LaTeX. NOTE(review): the recorded output carries a pandas
# FutureWarning recommending DataFrame.style.to_latex instead.
print(ebm_df.to_latex())

\begin{tabular}{llrrrrr}
\toprule
{} &         model &  F1-Score &       AUC &  Accuracy &  Precision &    Recall \\
\midrule
0 &          CLIN &  0.632414 &  0.588889 &  0.642857 &   0.628571 &  0.642857 \\
1 &           MUT &  0.614907 &  0.605556 &  0.607143 &   0.636264 &  0.607143 \\
2 &           EXP &  0.852966 &  0.822222 &  0.857143 &   0.858929 &  0.857143 \\
3 &      CLIN+MUT &  0.558897 &  0.511111 &  0.571429 &   0.551786 &  0.571429 \\
4 &      CLIN+EXP &  0.779449 &  0.744444 &  0.785714 &   0.782143 &  0.785714 \\
5 &       MUT+EXP &  0.819142 &  0.794444 &  0.821429 &   0.819131 &  0.821429 \\
6 &  CLIN+MUT+EXP &  0.746799 &  0.716667 &  0.750000 &   0.745614 &  0.750000 \\
\bottomrule
\end{tabular}




In future versions `DataFrame.to_latex` is expected to utilise the base implementation of `Styler.to_latex` for formatting and rendering. The arguments signature may therefore change. It is recommended instead to use `DataFrame.style.to_latex` which also contains additional functionality.



## **Interpretabilidade de predições**



In [None]:
# Isolate the expression model's predictions for the test samples.

# Work on a copy: data_exp_test keeps only feature columns, so this cell can be
# re-run and exp_ebm can still score the frame.
exp_test_results = data_exp_test.copy()
# NOTE(review): y_test comes from the most recent preprocess() call; this assumes
# every model cell produced the same test rows in the same order — confirm.
exp_test_results['label'] = y_test
exp_test_results['prediction'] = exp_pred
# Take the probabilities from the expression model itself: the global `prob`
# was overwritten by each later model cell and now belongs to the last model.
exp_test_results['probability'] = exp_ebm.predict_proba(data_exp_test)[:, 1]

# Split into correct / incorrect predictions, most probable first.
is_correct = exp_test_results['prediction'] == exp_test_results['label']
true_predictions = exp_test_results.loc[is_correct].sort_values(['probability'], ascending=False)
false_predictions = exp_test_results.loc[~is_correct].sort_values(['probability'], ascending=False)

result_columns = ['label', 'prediction', 'probability']

true_sample = true_predictions.drop(columns=result_columns)
y_true_sample = true_predictions['label']

# Was `false_prediction.drop(...)`, an undefined name that raised NameError.
false_sample = false_predictions.drop(columns=result_columns)
y_false_sample = false_predictions['label']

In [None]:
from interpret import show

# Local explanations for the test samples the expression model classified correctly.
show(exp_ebm.explain_local(true_sample, y_true_sample))

In [None]:
from interpret import show

# Local explanations for the test samples the expression model misclassified.
show(exp_ebm.explain_local(false_sample, y_false_sample))