In [None]:
import pandas as pd
import numpy as np
from numpy import mean
from numpy import std
import matplotlib.pyplot as plt
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import confusion_matrix, accuracy_score, matthews_corrcoef, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import StackingClassifier

from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D

In [None]:
# Base learners shared by every stacking ensemble in this notebook, as (name, estimator) pairs.
estimators = [
    ('RF', RandomForestClassifier()),
    ('XGB', XGBClassifier()),
    ('DT', DecisionTreeClassifier()),
    ('SVM', SVC()),
    ('KNN', KNeighborsClassifier()),
    ('LR', LogisticRegression()),
    ('LGBM', LGBMClassifier()),
]
# Stacked ensemble: a logistic regression combines the outputs of the base learners.
stack = StackingClassifier(final_estimator=LogisticRegression(), estimators=estimators)


#**AAC_IL13**

In [None]:
AAC_ILTrain  = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/AAC_Train.csv')


In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Inspect the column dtypes of the raw AAC table before any coercion.
AAC_ILTrain.dtypes

In [None]:
# Coerce every column to a numeric dtype; cells that cannot be parsed become NaN.
# (An earlier categorical cast of 'Target' before this step was overwritten by the
# coercion straight away, so it has been dropped.)
AAC_ILTrain = AAC_ILTrain.apply(pd.to_numeric, errors='coerce')
# Fill the gaps while 'Target' is still numeric: filling a categorical column with a
# value that is not one of its categories raises.
AAC_ILTrain = AAC_ILTrain.fillna(0)
# Store the label column as a categorical.
AAC_ILTrain['Target'] = AAC_ILTrain['Target'].astype('category')

In [None]:
# 10-fold cross-validated benchmark of each classifier on the AAC table.
X = AAC_ILTrain.drop(['Target'], axis=1)
y = AAC_ILTrain.Target

# Fixed seed, so every cross_val_predict call below sees the same folds.
cv = KFold(n_splits=10, random_state=1, shuffle=True)

# create model

models = [LogisticRegression(penalty='l2',tol=0.0006,
                           C=3, fit_intercept=True, intercept_scaling=1.0,
                           class_weight=None, random_state=10),
          RandomForestClassifier(n_estimators=100,criterion='entropy',max_features='sqrt',random_state=10),
          SVC(probability=True),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]

metric_columns = ['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity']
metric_rows = []
for model in models:
  # Out-of-fold class labels and class probabilities for every row.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  y_proba = cross_val_predict(model, X, y, cv=cv, method='predict_proba', n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # AUC is a ranking metric, so it is computed from the probability of the larger
  # label (column 1) instead of from the hard predictions.
  auc = roc_auc_score(y, y_proba[:, 1])
  # confusion_matrix orders rows/columns by sorted label: row 0 is the smaller label,
  # row 1 the larger one. Assumes the larger label (e.g. 1) is the positive class,
  # which is the convention roc_auc_score also uses -- confirm against the data.
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
  # Record the class name (as the hold-out cells do) instead of the estimator object.
  metric_rows.append([model.__class__.__name__, Accuracy, mcc, auc, sensitivity, specificity])

AAC_ILTrain_Metrics = pd.DataFrame(metric_rows, columns=metric_columns)
print(AAC_ILTrain_Metrics)
AAC_ILTrain_Metrics.to_csv("AAC_ILTrain_Metrics.csv")
# Fit a fresh stacking ensemble on all rows and save what its transform step
# returns for those same rows.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("AAC_ILTrain_Probability.csv")

In [None]:
# Display the AAC cross-validation metrics table.
AAC_ILTrain_Metrics

In [None]:
# Build a new stacking ensemble over the shared base learners and keep the
# transform output it produces for the rows it was fitted on.
meta_learner = LogisticRegression()
clf = StackingClassifier(final_estimator=meta_learner, estimators=estimators)
prob = clf.fit_transform(X, y)

In [None]:
# Write the stacking transform output held in `prob` to disk.
pd.DataFrame(prob).to_csv("AAC_Probability.csv")

#**APAAC_IL13**

In [None]:
# Load the APAAC training table from Drive.
APAAC_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/APAAC_Train.csv')

# Coerce all columns to numeric in one pass; unparseable cells become NaN.
APAAC_ILTrain = APAAC_ILTrain.apply(pd.to_numeric, errors='coerce')
# The label column, when present, is kept as a categorical.
if 'Target' in APAAC_ILTrain.columns:
  APAAC_ILTrain['Target'] = APAAC_ILTrain['Target'].astype('category')

# Replace the remaining NaNs with 0 and show the resulting dtypes.
APAAC_ILTrain = APAAC_ILTrain.fillna(0)
APAAC_ILTrain.dtypes

In [None]:
# 10-fold cross-validated benchmark of each classifier on the APAAC table.
X = APAAC_ILTrain.drop(['Target'], axis=1)
y = APAAC_ILTrain.Target
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
metric_columns = ['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity']
metric_rows = []
for model in models:
  # Out-of-fold class labels for every row.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): AUC is computed from hard labels here, which for two classes is the
  # mean of sensitivity and specificity; a score-based AUC needs predict_proba or
  # decision_function outputs.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix orders rows/columns by sorted label: row 0 is the smaller label,
  # row 1 the larger one. Assumes the larger label (e.g. 1) is the positive class,
  # which is the convention roc_auc_score also uses -- confirm against the data.
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
  # Record the class name (as the hold-out cells do) instead of the estimator object.
  metric_rows.append([model.__class__.__name__, Accuracy, mcc, auc, sensitivity, specificity])

APAAC_ILTrain_Metrics = pd.DataFrame(metric_rows, columns=metric_columns)
print(APAAC_ILTrain_Metrics)
APAAC_ILTrain_Metrics.to_csv("APAAC_ILTrain_Metrics.csv")
# Fit a fresh stacking ensemble on all rows and save what its transform step
# returns for those same rows.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("APAAC_ILTrain_Probability.csv")
print(prob)

In [None]:
# Display the APAAC cross-validation metrics table.
APAAC_ILTrain_Metrics

#**CKSAAP_IL13**

In [None]:
# Load the CKSAAP training table from Drive.
CKSAAP_Train = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CKSAAP_Train.csv')

# Coerce all columns to numeric in one pass; unparseable cells become NaN.
CKSAAP_Train = CKSAAP_Train.apply(pd.to_numeric, errors='coerce')
# The label column, when present, is kept as a categorical.
if 'Target' in CKSAAP_Train.columns:
  CKSAAP_Train['Target'] = CKSAAP_Train['Target'].astype('category')

# Replace the remaining NaNs with 0 and show the resulting dtypes.
CKSAAP_Train = CKSAAP_Train.fillna(0)
CKSAAP_Train.dtypes

In [None]:
# Load the CKSAAP test table from Drive.
CKSAAP_ILTest = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CKSAAP_Test.csv')

# Coerce all columns to numeric in one pass; unparseable cells become NaN.
CKSAAP_ILTest = CKSAAP_ILTest.apply(pd.to_numeric, errors='coerce')
# The label column, when present, is kept as a categorical.
if 'Target' in CKSAAP_ILTest.columns:
  CKSAAP_ILTest['Target'] = CKSAAP_ILTest['Target'].astype('category')

# Replace the remaining NaNs with 0 and show the resulting dtypes.
CKSAAP_ILTest = CKSAAP_ILTest.fillna(0)
CKSAAP_ILTest.dtypes

In [None]:
# Hold-out evaluation: fit each classifier on the CKSAAP training table and score it
# on the CKSAAP test table.
X_train = CKSAAP_Train.drop(['Target'], axis=1)
y_train = CKSAAP_Train.Target

X_test = CKSAAP_ILTest.drop(['Target'], axis=1)
y_test = CKSAAP_ILTest.Target

models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]

metric_columns = ['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity']
metric_rows = []
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm1 = confusion_matrix(y_test, y_pred)
    Accuracy = accuracy_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    # NOTE(review): AUC is computed from hard labels here, which for two classes is the
    # mean of sensitivity and specificity; a score-based AUC needs predict_proba or
    # decision_function outputs.
    auc = roc_auc_score(y_test, y_pred)
    # confusion_matrix orders rows/columns by sorted label: row 0 is the smaller label,
    # row 1 the larger one. Assumes the larger label (e.g. 1) is the positive class,
    # which is the convention roc_auc_score also uses -- confirm against the data.
    sensitivity = cm1[1, 1] / (cm1[1, 0] + cm1[1, 1])
    specificity = cm1[0, 0] / (cm1[0, 0] + cm1[0, 1])
    metric_rows.append([model.__class__.__name__, Accuracy, mcc, auc, sensitivity, specificity])

CKSAAP_ILTest_Metrics = pd.DataFrame(metric_rows, columns=metric_columns)
print(CKSAAP_ILTest_Metrics)
CKSAAP_ILTest_Metrics.to_csv("CKSAAP_ILTest_Metrics.csv")

# NOTE(review): the file name says "ILTest", but the saved matrix is the stacking
# transform output for the training rows (X_train) -- confirm which set is wanted.
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X_train, y_train)
pd.DataFrame(prob).to_csv("CKSAAP_ILTest_Probability.csv")
print(prob)


In [None]:
# Display the CKSAAP hold-out metrics table.
CKSAAP_ILTest_Metrics

#**TPC_IL13**

In [None]:
# Load the TPC training table from Drive.
TPC_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/TPC_Train.csv')

# Coerce all columns to numeric in one pass; unparseable cells become NaN.
TPC_ILTrain = TPC_ILTrain.apply(pd.to_numeric, errors='coerce')
# The label column, when present, is kept as a categorical.
if 'Target' in TPC_ILTrain.columns:
  TPC_ILTrain['Target'] = TPC_ILTrain['Target'].astype('category')

# Replace the remaining NaNs with 0 and show the resulting dtypes.
TPC_ILTrain = TPC_ILTrain.fillna(0)
TPC_ILTrain.dtypes

In [None]:
# 10-fold cross-validated benchmark of each classifier on the TPC table.
X = TPC_ILTrain.drop(['Target'], axis=1)
y = TPC_ILTrain.Target

cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
metric_columns = ['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity']
metric_rows = []
for model in models:
  # Out-of-fold class labels for every row.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): AUC is computed from hard labels here, which for two classes is the
  # mean of sensitivity and specificity; a score-based AUC needs predict_proba or
  # decision_function outputs.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix orders rows/columns by sorted label: row 0 is the smaller label,
  # row 1 the larger one. Assumes the larger label (e.g. 1) is the positive class,
  # which is the convention roc_auc_score also uses -- confirm against the data.
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
  # Record the class name (as the hold-out cells do) instead of the estimator object.
  metric_rows.append([model.__class__.__name__, Accuracy, mcc, auc, sensitivity, specificity])

TPC_ILTrain_Metrics = pd.DataFrame(metric_rows, columns=metric_columns)
print(TPC_ILTrain_Metrics)
TPC_ILTrain_Metrics.to_csv("TPC_ILTrain_Metrics.csv")
# Fit a fresh stacking ensemble on all rows and save what its transform step
# returns for those same rows.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("TPC_ILTrain_Probability.csv")
print(prob)

In [None]:
# Display the TPC cross-validation metrics table.
TPC_ILTrain_Metrics

#**CTDC_IL13**

In [None]:
# Load the CTDC training table from Drive.
CTDC_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTDC_Train.csv')

# Coerce all columns to numeric in one pass; unparseable cells become NaN.
CTDC_ILTrain = CTDC_ILTrain.apply(pd.to_numeric, errors='coerce')
# The label column, when present, is kept as a categorical.
if 'Target' in CTDC_ILTrain.columns:
  CTDC_ILTrain['Target'] = CTDC_ILTrain['Target'].astype('category')

# Replace the remaining NaNs with 0 and show the resulting dtypes.
CTDC_ILTrain = CTDC_ILTrain.fillna(0)
CTDC_ILTrain.dtypes

In [None]:
# Load the CTDC test table from Drive.
CTDC_Test = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTDC_Test.csv')

# Coerce all columns to numeric in one pass; unparseable cells become NaN.
CTDC_Test = CTDC_Test.apply(pd.to_numeric, errors='coerce')
# The label column, when present, is kept as a categorical.
if 'Target' in CTDC_Test.columns:
  CTDC_Test['Target'] = CTDC_Test['Target'].astype('category')

# Replace the remaining NaNs with 0 and show the resulting dtypes.
CTDC_Test = CTDC_Test.fillna(0)
CTDC_Test.dtypes

In [None]:
# Hold-out evaluation: fit each classifier on the CTDC training table and score it
# on the CTDC test table.
X_train = CTDC_ILTrain.drop(['Target'], axis=1)
y_train = CTDC_ILTrain.Target

X_test = CTDC_Test.drop(['Target'], axis=1)
y_test = CTDC_Test.Target

models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]

metric_columns = ['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity']
metric_rows = []
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm1 = confusion_matrix(y_test, y_pred)
    Accuracy = accuracy_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    # NOTE(review): AUC is computed from hard labels here, which for two classes is the
    # mean of sensitivity and specificity; a score-based AUC needs predict_proba or
    # decision_function outputs.
    auc = roc_auc_score(y_test, y_pred)
    # confusion_matrix orders rows/columns by sorted label: row 0 is the smaller label,
    # row 1 the larger one. Assumes the larger label (e.g. 1) is the positive class,
    # which is the convention roc_auc_score also uses -- confirm against the data.
    sensitivity = cm1[1, 1] / (cm1[1, 0] + cm1[1, 1])
    specificity = cm1[0, 0] / (cm1[0, 0] + cm1[0, 1])
    metric_rows.append([model.__class__.__name__, Accuracy, mcc, auc, sensitivity, specificity])

CTDC_ILTest_Metrics = pd.DataFrame(metric_rows, columns=metric_columns)
print(CTDC_ILTest_Metrics)
CTDC_ILTest_Metrics.to_csv("CTDC_ILTest_Metrics.csv")

# NOTE(review): the file name says "ILTest", but the saved matrix is the stacking
# transform output for the training rows (X_train) -- confirm which set is wanted.
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X_train, y_train)
pd.DataFrame(prob).to_csv("CTDC_ILTest_Probability.csv")
print(prob)


In [None]:
# Display the CTDC hold-out metrics table.
CTDC_ILTest_Metrics

#**CTriad_IL13**

In [None]:
# Load the CTriad training table from Drive.
CTriad_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTraid_Train.csv')

# Coerce all columns to numeric in one pass; unparseable cells become NaN.
CTriad_ILTrain = CTriad_ILTrain.apply(pd.to_numeric, errors='coerce')
# The label column, when present, is kept as a categorical.
if 'Target' in CTriad_ILTrain.columns:
  CTriad_ILTrain['Target'] = CTriad_ILTrain['Target'].astype('category')

# Replace the remaining NaNs with 0 and show the resulting dtypes.
CTriad_ILTrain = CTriad_ILTrain.fillna(0)
CTriad_ILTrain.dtypes

In [None]:
# Load the CTriad test table from Drive.
CTraid_Test = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTraid_Test.csv')

# Coerce all columns to numeric in one pass; unparseable cells become NaN.
CTraid_Test = CTraid_Test.apply(pd.to_numeric, errors='coerce')
# The label column, when present, is kept as a categorical.
if 'Target' in CTraid_Test.columns:
  CTraid_Test['Target'] = CTraid_Test['Target'].astype('category')

# Replace the remaining NaNs with 0 and show the resulting dtypes.
CTraid_Test = CTraid_Test.fillna(0)
CTraid_Test.dtypes

In [None]:
# Hold-out evaluation: fit each classifier on the CTriad training table and score it
# on the CTriad test table.
X_train = CTriad_ILTrain.drop(['Target'], axis=1)
y_train = CTriad_ILTrain.Target

X_test = CTraid_Test.drop(['Target'], axis=1)
y_test = CTraid_Test.Target

models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]

metric_columns = ['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity']
metric_rows = []
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm1 = confusion_matrix(y_test, y_pred)
    Accuracy = accuracy_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    # NOTE(review): AUC is computed from hard labels here, which for two classes is the
    # mean of sensitivity and specificity; a score-based AUC needs predict_proba or
    # decision_function outputs.
    auc = roc_auc_score(y_test, y_pred)
    # confusion_matrix orders rows/columns by sorted label: row 0 is the smaller label,
    # row 1 the larger one. Assumes the larger label (e.g. 1) is the positive class,
    # which is the convention roc_auc_score also uses -- confirm against the data.
    sensitivity = cm1[1, 1] / (cm1[1, 0] + cm1[1, 1])
    specificity = cm1[0, 0] / (cm1[0, 0] + cm1[0, 1])
    metric_rows.append([model.__class__.__name__, Accuracy, mcc, auc, sensitivity, specificity])

CTraid_ILTest_Metrics = pd.DataFrame(metric_rows, columns=metric_columns)
print(CTraid_ILTest_Metrics)
CTraid_ILTest_Metrics.to_csv("CTraid_ILTest_Metrics.csv")

# NOTE(review): the file name says "ILTest", but the saved matrix is the stacking
# transform output for the training rows (X_train) -- confirm which set is wanted.
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X_train, y_train)
pd.DataFrame(prob).to_csv("CTraid_ILTest_Probability.csv")
print(prob)


In [None]:
# Display the CTriad hold-out metrics table.
CTraid_ILTest_Metrics

#**DPC_IL13**

In [None]:
# Load the DPC training table from Drive.
DPC_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/DPC_Train.csv')

# Coerce all columns to numeric in one pass; unparseable cells become NaN.
DPC_ILTrain = DPC_ILTrain.apply(pd.to_numeric, errors='coerce')
# The label column, when present, is kept as a categorical.
if 'Target' in DPC_ILTrain.columns:
  DPC_ILTrain['Target'] = DPC_ILTrain['Target'].astype('category')

# Replace the remaining NaNs with 0 and show the resulting dtypes.
DPC_ILTrain = DPC_ILTrain.fillna(0)
DPC_ILTrain.dtypes

In [None]:
# Load the DPC test table from Drive.
DPC_Test = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/DPC_Test.csv')

# Coerce all columns to numeric in one pass; unparseable cells become NaN.
DPC_Test = DPC_Test.apply(pd.to_numeric, errors='coerce')
# The label column, when present, is kept as a categorical.
if 'Target' in DPC_Test.columns:
  DPC_Test['Target'] = DPC_Test['Target'].astype('category')

# Replace the remaining NaNs with 0 and show the resulting dtypes.
DPC_Test = DPC_Test.fillna(0)
DPC_Test.dtypes

In [None]:
# Hold-out evaluation: fit each classifier on the DPC training table and score it
# on the DPC test table.
X_train = DPC_ILTrain.drop(['Target'], axis=1)
y_train = DPC_ILTrain.Target

X_test = DPC_Test.drop(['Target'], axis=1)
y_test = DPC_Test.Target

models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]

metric_columns = ['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity']
metric_rows = []
for model in models:
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    cm1 = confusion_matrix(y_test, y_pred)
    Accuracy = accuracy_score(y_test, y_pred)
    mcc = matthews_corrcoef(y_test, y_pred)
    # NOTE(review): AUC is computed from hard labels here, which for two classes is the
    # mean of sensitivity and specificity; a score-based AUC needs predict_proba or
    # decision_function outputs.
    auc = roc_auc_score(y_test, y_pred)
    # confusion_matrix orders rows/columns by sorted label: row 0 is the smaller label,
    # row 1 the larger one. Assumes the larger label (e.g. 1) is the positive class,
    # which is the convention roc_auc_score also uses -- confirm against the data.
    sensitivity = cm1[1, 1] / (cm1[1, 0] + cm1[1, 1])
    specificity = cm1[0, 0] / (cm1[0, 0] + cm1[0, 1])
    metric_rows.append([model.__class__.__name__, Accuracy, mcc, auc, sensitivity, specificity])

DPC_ILTest_Metrics = pd.DataFrame(metric_rows, columns=metric_columns)
print(DPC_ILTest_Metrics)
DPC_ILTest_Metrics.to_csv("DPC_ILTest_Metrics.csv")

# NOTE(review): the file name says "ILTest", but the saved matrix is the stacking
# transform output for the training rows (X_train) -- confirm which set is wanted.
clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X_train, y_train)
pd.DataFrame(prob).to_csv("DPC_ILTest_Probability.csv")
print(prob)

In [None]:
# Display the DPC hold-out metrics table.
DPC_ILTest_Metrics

#**Moran_IL13**

In [None]:
# Load the Moran training table from Drive.
Moran_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/Moran_Train.csv')

# Coerce all columns to numeric in one pass; unparseable cells become NaN.
Moran_ILTrain = Moran_ILTrain.apply(pd.to_numeric, errors='coerce')
# The label column, when present, is kept as a categorical.
if 'Target' in Moran_ILTrain.columns:
  Moran_ILTrain['Target'] = Moran_ILTrain['Target'].astype('category')

# Replace the remaining NaNs with 0 and show the resulting dtypes.
Moran_ILTrain = Moran_ILTrain.fillna(0)
Moran_ILTrain.dtypes

In [None]:
# 10-fold cross-validated benchmark of each classifier on the Moran table.
X = Moran_ILTrain.drop(['Target'], axis=1)
y = Moran_ILTrain.Target

cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
metric_columns = ['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity']
metric_rows = []
for model in models:
  # Out-of-fold class labels for every row.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): AUC is computed from hard labels here, which for two classes is the
  # mean of sensitivity and specificity; a score-based AUC needs predict_proba or
  # decision_function outputs.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix orders rows/columns by sorted label: row 0 is the smaller label,
  # row 1 the larger one. Assumes the larger label (e.g. 1) is the positive class,
  # which is the convention roc_auc_score also uses -- confirm against the data.
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
  # Record the class name (as the hold-out cells do) instead of the estimator object.
  metric_rows.append([model.__class__.__name__, Accuracy, mcc, auc, sensitivity, specificity])

Moran_ILTrain_Metrics = pd.DataFrame(metric_rows, columns=metric_columns)
print(Moran_ILTrain_Metrics)
Moran_ILTrain_Metrics.to_csv("Moran_ILTrain_Metrics.csv")
# Fit a fresh stacking ensemble on all rows and save what its transform step
# returns for those same rows.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("Moran_ILTrain_Probability.csv")
print(prob)

In [None]:
# Display the Moran cross-validation metrics table.
Moran_ILTrain_Metrics

#**PAAC_IL13**

In [None]:
# Load the PAAC training table from Drive.
PAAC_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/PAAC_Train.csv')

# Coerce all columns to numeric in one pass; unparseable cells become NaN.
PAAC_ILTrain = PAAC_ILTrain.apply(pd.to_numeric, errors='coerce')
# The label column, when present, is kept as a categorical.
if 'Target' in PAAC_ILTrain.columns:
  PAAC_ILTrain['Target'] = PAAC_ILTrain['Target'].astype('category')

# Replace the remaining NaNs with 0 and show the resulting dtypes.
PAAC_ILTrain = PAAC_ILTrain.fillna(0)
PAAC_ILTrain.dtypes

In [None]:
# 10-fold cross-validated benchmark of each classifier on the PAAC table.
X = PAAC_ILTrain.drop(['Target'], axis=1)
y = PAAC_ILTrain.Target

cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
metric_columns = ['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity']
metric_rows = []
for model in models:
  # Out-of-fold class labels for every row.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): AUC is computed from hard labels here, which for two classes is the
  # mean of sensitivity and specificity; a score-based AUC needs predict_proba or
  # decision_function outputs.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix orders rows/columns by sorted label: row 0 is the smaller label,
  # row 1 the larger one. Assumes the larger label (e.g. 1) is the positive class,
  # which is the convention roc_auc_score also uses -- confirm against the data.
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
  # Record the class name (as the hold-out cells do) instead of the estimator object.
  metric_rows.append([model.__class__.__name__, Accuracy, mcc, auc, sensitivity, specificity])

PAAC_ILTrain_Metrics = pd.DataFrame(metric_rows, columns=metric_columns)
print(PAAC_ILTrain_Metrics)
PAAC_ILTrain_Metrics.to_csv("PAAC_ILTrain_Metrics.csv")
# Fit a fresh stacking ensemble on all rows and save what its transform step
# returns for those same rows.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("PAAC_ILTrain_Probability.csv")
print(prob)

In [None]:
# Display the PAAC cross-validation metrics table.
PAAC_ILTrain_Metrics

#**PseKRAAC_IL13**

In [None]:
# Load the PseKRAAC training table from Drive.
PseKRAAC_ILTrain  = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/PsekRAAC_Train.csv')

# Coerce every column to numeric; cells that cannot be parsed become NaN.
for column in PseKRAAC_ILTrain.columns:
  PseKRAAC_ILTrain[column] = pd.to_numeric(PseKRAAC_ILTrain[column], errors='coerce')

# Fill the gaps while 'Target' is still numeric: filling a categorical column with a
# value that is not one of its categories raises.
PseKRAAC_ILTrain.fillna(0, inplace=True)
# The column is named 'Target' (the next cell selects it by that name); the previous
# check against 'Target ' with a trailing space never matched, so the cast was skipped.
if 'Target' in PseKRAAC_ILTrain.columns:
  PseKRAAC_ILTrain['Target'] = PseKRAAC_ILTrain['Target'].astype('category')
PseKRAAC_ILTrain.dtypes

In [None]:
# 10-fold cross-validated benchmark of each classifier on the PseKRAAC table.
X = PseKRAAC_ILTrain.drop(['Target'], axis=1)
y = PseKRAAC_ILTrain['Target']

cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
metric_columns = ['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity']
metric_rows = []
for model in models:
  # Out-of-fold class labels for every row.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): AUC is computed from hard labels here, which for two classes is the
  # mean of sensitivity and specificity; a score-based AUC needs predict_proba or
  # decision_function outputs.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix orders rows/columns by sorted label: row 0 is the smaller label,
  # row 1 the larger one. Assumes the larger label (e.g. 1) is the positive class,
  # which is the convention roc_auc_score also uses -- confirm against the data.
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
  # Record the class name (as the hold-out cells do) instead of the estimator object.
  metric_rows.append([model.__class__.__name__, Accuracy, mcc, auc, sensitivity, specificity])

PseKRAAC_ILTrain_Metrics = pd.DataFrame(metric_rows, columns=metric_columns)
print(PseKRAAC_ILTrain_Metrics)
PseKRAAC_ILTrain_Metrics.to_csv("PseKRAAC_ILTrain_Metrics.csv")
# Fit a fresh stacking ensemble on all rows and save what its transform step
# returns for those same rows.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("PseKRAAC_ILTrain_Probability.csv")
print(prob)

In [None]:
# Display the PseKRAAC cross-validation metrics table.
PseKRAAC_ILTrain_Metrics

#**Combined (DATASET) Recursive Shapley Value (Feature Selection)**

In [None]:
!pip install SHAP
!pip install probatus
!pip install lightgbm

In [None]:
from probatus.feature_elimination import ShapRFECV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import lightgbm
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV


In [None]:
# Load the combined training table from Drive.
Combined_ILTrain  = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/Combined_Train.csv')
# Coerce every column to numeric; cells that cannot be parsed become NaN.
for column in Combined_ILTrain.columns:
  Combined_ILTrain[column] = pd.to_numeric(Combined_ILTrain[column], errors='coerce')

# Fill the gaps while 'Target' is still numeric: filling a categorical column with a
# value that is not one of its categories raises.
Combined_ILTrain.fillna(0, inplace=True)
# The previous check compared against 'Target ' (trailing space), unlike every other
# table in this notebook, so the categorical cast never ran.
if 'Target' in Combined_ILTrain.columns:
  Combined_ILTrain['Target'] = Combined_ILTrain['Target'].astype('category')
Combined_ILTrain.dtypes

In [None]:
import shap as SHAP
# Combined_ILTrain is deliberately not read from the CSV again here: a second read
# replaced the cleaned frame built in the previous cell with the raw, uncoerced table.


In [None]:
feature_names = list(Combined_ILTrain.columns.values.tolist())


In [None]:
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=2075, class_sep=0.05, n_informative=6, n_features=2045,
                           random_state=0, n_redundant=10, n_clusters_per_class=1)
X = pd.DataFrame(X, columns=feature_names)

In [None]:
feature_names = Combined_ILTrain.columns

In [None]:
X[feature_names[:5]].head()


# **LGBMC**

In [None]:
# LightGBM base model with a small randomized hyper-parameter search wrapped around it.
clf = lightgbm.LGBMClassifier(class_weight='balanced', max_depth=5)

# Candidate values sampled by the search.
param_grid = dict(
    n_estimators=[5, 7, 10],
    num_leaves=[3, 5, 7, 10],
)
search = RandomizedSearchCV(estimator=clf, param_distributions=param_grid)

In [None]:
# Recursive SHAP-based feature elimination driven by the randomized LightGBM search;
# step=0.2 with 10-fold CV, scored by ROC AUC.
rfe_options = {'step': 0.2, 'cv': 10, 'scoring': 'roc_auc', 'n_jobs': 3}
shap_elimination = ShapRFECV(clf=search, **rfe_options)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Elimination summary: feature count, the feature set kept, and the mean validation
# metric, for every row of the report.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
# Save the elimination summary for the LightGBM run.
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('LGBMClassifier.csv')

In [None]:
# Plot the performance curve of the elimination fitted above.
# The fitted object is reused: running fit_compute a second time only repeated the
# expensive elimination and, since no random seed is fixed, could produce a report
# that no longer matches the CSV already written.
performance_plot = shap_elimination.plot()

In [None]:
# Names of the features retained at the 10-feature stage of the elimination.
# NOTE(review): presumably 10 has to be one of the num_features values in `report` -- confirm.
shap_elimination.get_reduced_features_set(num_features=10)


# **RF CLASSIFIER**

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Shallow, class-balanced random forest used for the SHAP elimination below.
clf = RandomForestClassifier(max_depth=3, class_weight='balanced')

# RandomForestClassifier has no `num_leaves` parameter (that is a LightGBM name);
# `max_leaf_nodes` is its leaf-count limit, so the search grid is now valid.
# Note that `search` is defined here but the elimination below is given `clf` directly.
param_grid = {
    'n_estimators': [ 5, 7, 10],
    'max_leaf_nodes': [3, 5, 7, 10],
}
search = RandomizedSearchCV(clf, param_grid)

In [None]:
# Recursive SHAP-based feature elimination with the random forest itself (not the
# search); step=0.2 with 10-fold CV, scored by ROC AUC.
rfe_options = {'step': 0.2, 'cv': 10, 'scoring': 'roc_auc', 'n_jobs': 2}
shap_elimination = ShapRFECV(clf, **rfe_options)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Elimination summary: feature count, the feature set kept, and the mean validation
# metric, for every row of the report.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
# Save the elimination summary for the random-forest run.
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('RandomForestClassifier.csv')

In [None]:
# Plot the performance curve of the elimination fitted above.
# The fitted object is reused: running fit_compute a second time only repeated the
# expensive elimination and, since the forest has no fixed random seed, could produce
# a report that no longer matches the CSV already written.
performance_plot = shap_elimination.plot()

In [None]:
# Names of the features retained at the 10-feature stage of the elimination.
# NOTE(review): presumably 10 has to be one of the num_features values in `report` -- confirm.
shap_elimination.get_reduced_features_set(num_features=10)


# **LR**

In [None]:
from sklearn.linear_model import LogisticRegression


In [None]:
# Logistic regression used for the SHAP elimination below.
clf = LogisticRegression(tol=0.0001, C=1.0)

# LogisticRegression has neither `n_estimators` nor `num_leaves`, so fitting a search
# over them would fail with an invalid-parameter error. The grid now varies the
# regularisation strength around the configured C instead.
# NOTE(review): the candidate values are placeholders -- choose the range to explore.
# Note that `search` is defined here but the elimination below is given `clf` directly.
param_grid = {
    'C': [0.1, 1.0, 10.0],
}
search = RandomizedSearchCV(clf, param_grid)

In [None]:
# Recursive SHAP-based feature elimination with the logistic regression itself;
# step=0.2 with 10-fold CV, scored by ROC AUC.
rfe_options = {'step': 0.2, 'cv': 10, 'scoring': 'roc_auc', 'n_jobs': 3}
shap_elimination = ShapRFECV(clf, **rfe_options)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Elimination summary: feature count, the feature set kept, and the mean validation
# metric, for every row of the report.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
# Save the elimination summary for the logistic-regression run.
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('LogisticRegression.csv')

In [None]:
# Plot the performance curve of the elimination fitted above.
# The fitted object is reused: running fit_compute a second time only repeated the
# expensive elimination whose report has already been written to CSV.
performance_plot = shap_elimination.plot()

In [None]:
# Names of the features retained at the 10-feature stage of the elimination.
# NOTE(review): presumably 10 has to be one of the num_features values in `report` -- confirm.
shap_elimination.get_reduced_features_set(num_features=10)


# **SVM**

In [None]:
from sklearn import svm

In [None]:
# Linear-kernel SVM used for the SHAP elimination below.
clf = svm.SVC(C=0.2, kernel='linear', degree=3)

# SVC has neither `n_estimators` nor `num_leaves`, so fitting a search over them would
# fail with an invalid-parameter error. The grid now varies C around the configured value.
# NOTE(review): the candidate values are placeholders -- choose the range to explore.
# Note that `search` is defined here but the elimination below is given `clf` directly.
param_grid = {
    'C': [0.1, 0.2, 0.5, 1.0],
}
search = RandomizedSearchCV(clf, param_grid)

In [None]:
# Recursive SHAP-based feature elimination with the SVM itself;
# step=0.2 with 10-fold CV, scored by ROC AUC.
rfe_options = {'step': 0.2, 'cv': 10, 'scoring': 'roc_auc', 'n_jobs': 3}
shap_elimination = ShapRFECV(clf, **rfe_options)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Elimination summary: feature count, the feature set kept, and the mean validation
# metric, for every row of the report.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
# Save the elimination summary for the SVM run.
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('SVCFfile.csv')

In [None]:
# Plot the performance curve of the elimination fitted above.
# The fitted object is reused: running fit_compute a second time only repeated the
# expensive elimination whose report has already been written to CSV.
performance_plot = shap_elimination.plot()

In [None]:
# Names of the features retained at the 10-feature stage of the elimination.
# NOTE(review): presumably 10 has to be one of the num_features values in `report` -- confirm.
shap_elimination.get_reduced_features_set(num_features=10)


# **XGB CLASSIFIER**

In [None]:
from xgboost import XGBClassifier


In [None]:
# XGBoost classifier used for the SHAP elimination below.
# The previous call passed C, kernel and degree, which are SVC arguments copied from the
# SVM section and are not XGBoost parameters; the default configuration is used instead.
clf = XGBClassifier()
# `num_leaves` is a LightGBM name; XGBoost's leaf-count limit is `max_leaves`.
# Note that `search` is defined here but the elimination below is given `clf` directly.
param_grid = {
    'n_estimators': [ 5, 7, 10],
    'max_leaves': [3, 5, 7, 10],
}
search = RandomizedSearchCV(clf, param_grid)

In [None]:
# Recursive SHAP-based feature elimination with the XGBoost model itself;
# step=0.2 with 10-fold CV, scored by ROC AUC.
rfe_options = {'step': 0.2, 'cv': 10, 'scoring': 'roc_auc', 'n_jobs': 3}
shap_elimination = ShapRFECV(clf, **rfe_options)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Elimination summary: feature count, the feature set kept, and the mean validation
# metric, for every row of the report.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
# Save the elimination summary for the XGBoost run.
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('XGBClassifier.csv')

In [None]:
# Plot the performance curve of the elimination fitted above.
# The fitted object is reused: running fit_compute a second time only repeated the
# expensive elimination whose report has already been written to CSV.
performance_plot = shap_elimination.plot()

# **DT CLASSIFIER**

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Decision tree used for the SHAP elimination below.
clf = DecisionTreeClassifier(criterion='gini', splitter='best')

# A single decision tree has no `n_estimators`, and `num_leaves` is a LightGBM name;
# `max_leaf_nodes` is the tree's leaf-count limit, so the search grid is now valid.
# Note that `search` is defined here but the elimination below is given `clf` directly.
param_grid = {
    'max_leaf_nodes': [3, 5, 7, 10],
}
search = RandomizedSearchCV(clf, param_grid)

In [None]:
# Recursive SHAP-based feature elimination with the decision tree itself;
# step=0.2 with 10-fold CV, scored by ROC AUC.
rfe_options = {'step': 0.2, 'cv': 10, 'scoring': 'roc_auc', 'n_jobs': 3}
shap_elimination = ShapRFECV(clf, **rfe_options)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Elimination summary: feature count, the feature set kept, and the mean validation
# metric, for every row of the report.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
# Save the elimination summary for the decision-tree run.
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('DecisionTreeClassifier.csv')

In [None]:
# Plot the performance curve of the elimination fitted above.
# The fitted object is reused: running fit_compute a second time only repeated the
# expensive elimination and, since the tree has no fixed random seed, could produce
# a report that no longer matches the CSV already written.
performance_plot = shap_elimination.plot()

# **RECURSIVE SHAPLEY VALUE FEATURE SELECTION DONE**

# **COMBINED Selected Features ML Model FROM ALL DATA SETS BY RECURSIVE SHAPLEY VALUE**

In [None]:
# Load the table of finally selected combined features from Drive.
SF_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/Combined Final Selected Features.csv')

# Coerce all columns to numeric in one pass; unparseable cells become NaN.
SF_ILTrain = SF_ILTrain.apply(pd.to_numeric, errors='coerce')
# The label column, when present, is kept as a categorical.
if 'Target' in SF_ILTrain.columns:
  SF_ILTrain['Target'] = SF_ILTrain['Target'].astype('category')

# Replace the remaining NaNs with 0 and show the resulting dtypes.
SF_ILTrain = SF_ILTrain.fillna(0)
SF_ILTrain.dtypes

In [None]:
# 10-fold cross-validated benchmark of each classifier on the selected-feature table.
X = SF_ILTrain.drop(['Target'], axis=1)
y = SF_ILTrain.Target

cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
metric_columns = ['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity']
metric_rows = []
for model in models:
  # Out-of-fold class labels for every row.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): AUC is computed from hard labels here, which for two classes is the
  # mean of sensitivity and specificity; a score-based AUC needs predict_proba or
  # decision_function outputs.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix orders rows/columns by sorted label: row 0 is the smaller label,
  # row 1 the larger one. Assumes the larger label (e.g. 1) is the positive class,
  # which is the convention roc_auc_score also uses -- confirm against the data.
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
  # Record the class name (as the hold-out cells do) instead of the estimator object.
  metric_rows.append([model.__class__.__name__, Accuracy, mcc, auc, sensitivity, specificity])

SF_ILTrain_Metrics = pd.DataFrame(metric_rows, columns=metric_columns)
print(SF_ILTrain_Metrics)
SF_ILTrain_Metrics.to_csv("SF_ILTrain_Metrics.csv")
# Fit a fresh stacking ensemble on all rows and save what its transform step
# returns for those same rows.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("SF_ILTrain_Probability.csv")
print(prob)

In [None]:
# Display the cross-validation metrics table for the selected features.
SF_ILTrain_Metrics

# **INDIVIDUAL DATA SET FEATURE SELECTION START**

# **CKSAAP FEATURE SELECTION**

In [None]:
!pip install SHAP
!pip install probatus
!pip install lightgbm

In [None]:
from probatus.feature_elimination import ShapRFECV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import lightgbm
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# **ALL Model FOR CKSAAP**

In [None]:
# Load the CKSAAP training table from Drive for feature selection.
SF_CKSAAP_ILTrain  = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CKSAAP_Train.csv')

# Coerce every column to numeric; cells that cannot be parsed become NaN.
for column in SF_CKSAAP_ILTrain.columns:
  SF_CKSAAP_ILTrain[column] = pd.to_numeric(SF_CKSAAP_ILTrain[column], errors='coerce')

# Fill the gaps while 'Target' is still numeric: filling a categorical column with a
# value that is not one of its categories raises.
SF_CKSAAP_ILTrain.fillna(0, inplace=True)
# The same CSV is handled with the column name 'Target' earlier in the notebook; the
# previous check against 'Target ' (trailing space) never matched, so the cast was skipped.
if 'Target' in SF_CKSAAP_ILTrain.columns:
  SF_CKSAAP_ILTrain['Target'] = SF_CKSAAP_ILTrain['Target'].astype('category')
SF_CKSAAP_ILTrain.dtypes

In [None]:
# NOTE(review): despite the "AAC" in its name, this variable receives the CKSAAP
# training CSV (same file as SF_CKSAAP_ILTrain). It is not referenced again in the
# visible part of the notebook - confirm whether the read is needed at all.
import shap as SHAP
SF_AAC_ILTrain  = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CKSAAP_Train.csv')

In [None]:
# Column labels of the CKSAAP table as a plain Python list.
feature_names = SF_CKSAAP_ILTrain.columns.tolist()


In [None]:
SF_CKSAAP_ILTrain.info()

In [None]:
# Build the feature matrix and labels for SHAP feature elimination from the real
# CKSAAP training table. The earlier version generated random data with
# make_classification and only relabelled its columns with the CKSAAP names, so the
# elimination never saw the actual measurements.
# The label column is located by stripped name so a header such as 'Target ' also works.
target_col = next(
    (col for col in SF_CKSAAP_ILTrain.columns if str(col).strip() == 'Target'), None)
if target_col is None:
  raise KeyError("No 'Target' column found in SF_CKSAAP_ILTrain")

# Features: everything except the label, forced numeric with gaps filled by 0
# (same cleaning convention as the loading cells).
X = (SF_CKSAAP_ILTrain
     .drop(columns=[target_col])
     .apply(pd.to_numeric, errors='coerce')
     .fillna(0))
# Labels as plain integers (assumes numeric class codes such as 0/1 - confirm).
y = SF_CKSAAP_ILTrain[target_col].astype(int)

In [None]:
feature_names = SF_CKSAAP_ILTrain.columns

In [None]:
# Preview the first rows of the first five feature columns (selected by position).
X.iloc[:, :5].head()


# **LGBM CLASSIFIER**

In [None]:
# Base LightGBM model for SHAP feature elimination; seeded so repeated runs
# produce the same trees and therefore the same feature ranking.
clf = lightgbm.LGBMClassifier(max_depth=5, class_weight='balanced', random_state=0)

# Hyper-parameter candidates sampled by the randomized search.
param_grid = {
    'n_estimators': [5, 7, 10],
    'num_leaves': [3, 5, 7, 10],
}
# Seeded as well, otherwise the sampled candidates differ between runs.
search = RandomizedSearchCV(clf, param_grid, random_state=0)

In [None]:
# SHAP-based recursive feature elimination with 10-fold CV, scored by ROC AUC.
# The randomized search object (not the bare classifier) is handed over as the model;
# presumably probatus re-tunes the hyper-parameters during elimination - confirm in its docs.
shap_elimination = ShapRFECV(
    clf=search, step=0.2, cv=10, scoring='roc_auc', n_jobs=3)
# `report` is the elimination summary table inspected in the following cells.
report = shap_elimination.fit_compute(X, y)

In [None]:
# Elimination summary: number of features kept, their names and the mean validation score.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('SF_LGBMClassifier.csv')

# **ML Model For Best Features of LGBM Classifier (CKSAAP)**

# **LGBM FIRST SET**

In [None]:
# Load the first LGBM-selected CKSAAP subset, coerce every column to numeric
# (unparseable cells become NaN), mark the label as categorical, then zero-fill NaNs.
CKSAAP_LGBM_First_Set_ILTrain = pd.read_csv(
    '/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CKSAAP/First_SF_LGBMClassifier_CKSAAP_Train.csv'
).apply(pd.to_numeric, errors='coerce')
if 'Target' in CKSAAP_LGBM_First_Set_ILTrain.columns:
  CKSAAP_LGBM_First_Set_ILTrain['Target'] = CKSAAP_LGBM_First_Set_ILTrain['Target'].astype('category')

CKSAAP_LGBM_First_Set_ILTrain.fillna(0, inplace=True)
CKSAAP_LGBM_First_Set_ILTrain.dtypes


In [None]:
# Features / labels for the first LGBM-selected CKSAAP subset.
X = CKSAAP_LGBM_First_Set_ILTrain.drop(columns=['Target'])
y = CKSAAP_LGBM_First_Set_ILTrain['Target']

# Empty results table; one row is appended per evaluated classifier.
CKSAAP_LGBM_First_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): AUC is computed from hard class labels, not scores/probabilities,
  # so it reflects a single operating point rather than the full ROC curve.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix is indexed [true, predicted] with labels sorted ascending, so
  # row 0 is the negative class and row 1 the positive class
  # (assumes the larger label is the positive class - confirm).
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])   # TP / (TP + FN)
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])   # TN / (TN + FP)
  CKSAAP_LGBM_First_Set_ILTrain_Metrics.loc[len(CKSAAP_LGBM_First_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(CKSAAP_LGBM_First_Set_ILTrain_Metrics)
CKSAAP_LGBM_First_Set_ILTrain_Metrics.to_csv("CKSAAP_LGBM_First_Set_ILTrain_Metrics.csv")
# Fit a fresh stacking ensemble on all data and export the base-estimator outputs
# that feed its final estimator (computed on the training rows themselves).
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
# Written to its own file; it used to reuse the metrics file name and overwrite that CSV.
pd.DataFrame(prob).to_csv("CKSAAP_LGBM_First_Set_ILTrain_Probability.csv")
print(prob)


In [None]:
CKSAAP_LGBM_First_Set_ILTrain_Metrics

In [None]:
CKSAAP_LGBM_First_Set_ILTrain_Metrics.to_csv('CKSAAP_LGBM_First_Set.csv')

# **LGBM SECOND SET**

In [None]:
# Load the second LGBM-selected CKSAAP subset, coerce every column to numeric
# (unparseable cells become NaN), mark the label as categorical, then zero-fill NaNs.
CKSAAP_LGBM_Second_Set_ILTrain = pd.read_csv(
    '/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CKSAAP/Second_SF_LGBMClassifier_CKSAAP_Train.csv'
).apply(pd.to_numeric, errors='coerce')
if 'Target' in CKSAAP_LGBM_Second_Set_ILTrain.columns:
  CKSAAP_LGBM_Second_Set_ILTrain['Target'] = CKSAAP_LGBM_Second_Set_ILTrain['Target'].astype('category')

CKSAAP_LGBM_Second_Set_ILTrain.fillna(0, inplace=True)
CKSAAP_LGBM_Second_Set_ILTrain.dtypes

In [None]:
# Features / labels for the second LGBM-selected CKSAAP subset.
X = CKSAAP_LGBM_Second_Set_ILTrain.drop(columns=['Target'])
y = CKSAAP_LGBM_Second_Set_ILTrain['Target']

# Empty results table; one row is appended per evaluated classifier.
CKSAAP_LGBM_Second_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): AUC is computed from hard class labels, not scores/probabilities,
  # so it reflects a single operating point rather than the full ROC curve.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix is indexed [true, predicted] with labels sorted ascending, so
  # row 0 is the negative class and row 1 the positive class
  # (assumes the larger label is the positive class - confirm).
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])   # TP / (TP + FN)
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])   # TN / (TN + FP)
  CKSAAP_LGBM_Second_Set_ILTrain_Metrics.loc[len(CKSAAP_LGBM_Second_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(CKSAAP_LGBM_Second_Set_ILTrain_Metrics)
CKSAAP_LGBM_Second_Set_ILTrain_Metrics.to_csv("CKSAAP_LGBM_Second_Set_ILTrain_Metrics.csv")
# Fit a fresh stacking ensemble on all data and export the base-estimator outputs
# that feed its final estimator (computed on the training rows themselves).
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
# Written to its own file; it used to reuse the metrics file name and overwrite that CSV.
pd.DataFrame(prob).to_csv("CKSAAP_LGBM_Second_Set_ILTrain_Probability.csv")
print(prob)


In [None]:
CKSAAP_LGBM_Second_Set_ILTrain_Metrics

In [None]:
CKSAAP_LGBM_Second_Set_ILTrain_Metrics.to_csv('CKSAAP_LGBM_Second_Set.csv')

# **LGBM THIRD SET**

In [None]:
# Load the third LGBM-selected CKSAAP subset, coerce every column to numeric
# (unparseable cells become NaN), mark the label as categorical, then zero-fill NaNs.
CKSAAP_LGBM_Third_Set_ILTrain = pd.read_csv(
    '/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CKSAAP/Third_SF_LGBMClassifier_CKSAAP_Train.csv'
).apply(pd.to_numeric, errors='coerce')
if 'Target' in CKSAAP_LGBM_Third_Set_ILTrain.columns:
  CKSAAP_LGBM_Third_Set_ILTrain['Target'] = CKSAAP_LGBM_Third_Set_ILTrain['Target'].astype('category')

CKSAAP_LGBM_Third_Set_ILTrain.fillna(0, inplace=True)
CKSAAP_LGBM_Third_Set_ILTrain.dtypes

In [None]:
# Features / labels for the third LGBM-selected CKSAAP subset.
X = CKSAAP_LGBM_Third_Set_ILTrain.drop(columns=['Target'])
y = CKSAAP_LGBM_Third_Set_ILTrain['Target']

# Empty results table; one row is appended per evaluated classifier.
CKSAAP_LGBM_Third_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): AUC is computed from hard class labels, not scores/probabilities,
  # so it reflects a single operating point rather than the full ROC curve.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix is indexed [true, predicted] with labels sorted ascending, so
  # row 0 is the negative class and row 1 the positive class
  # (assumes the larger label is the positive class - confirm).
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])   # TP / (TP + FN)
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])   # TN / (TN + FP)
  CKSAAP_LGBM_Third_Set_ILTrain_Metrics.loc[len(CKSAAP_LGBM_Third_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(CKSAAP_LGBM_Third_Set_ILTrain_Metrics)
CKSAAP_LGBM_Third_Set_ILTrain_Metrics.to_csv("CKSAAP_LGBM_Third_Set_ILTrain_Metrics.csv")
# Fit a fresh stacking ensemble on all data and export the base-estimator outputs
# that feed its final estimator (computed on the training rows themselves).
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
# Written to its own file; it used to reuse the metrics file name and overwrite that CSV.
pd.DataFrame(prob).to_csv("CKSAAP_LGBM_Third_Set_ILTrain_Probability.csv")
print(prob)

In [None]:
CKSAAP_LGBM_Third_Set_ILTrain_Metrics

In [None]:
CKSAAP_LGBM_Third_Set_ILTrain_Metrics.to_csv('CKSAAP_LGBM_Third_Set.csv')

# **LGBM FOURTH SET**

In [None]:
# Load the fourth LGBM-selected CKSAAP subset, coerce every column to numeric
# (unparseable cells become NaN), mark the label as categorical, then zero-fill NaNs.
CKSAAP_LGBM_Fourth_Set_ILTrain = pd.read_csv(
    '/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CKSAAP/Fourth_SF_LGBMClassifier_CKSAAP_Train.csv'
).apply(pd.to_numeric, errors='coerce')
if 'Target' in CKSAAP_LGBM_Fourth_Set_ILTrain.columns:
  CKSAAP_LGBM_Fourth_Set_ILTrain['Target'] = CKSAAP_LGBM_Fourth_Set_ILTrain['Target'].astype('category')

CKSAAP_LGBM_Fourth_Set_ILTrain.fillna(0, inplace=True)
CKSAAP_LGBM_Fourth_Set_ILTrain.dtypes

In [None]:
# Features / labels for the fourth LGBM-selected CKSAAP subset.
X = CKSAAP_LGBM_Fourth_Set_ILTrain.drop(columns=['Target'])
y = CKSAAP_LGBM_Fourth_Set_ILTrain['Target']

# Empty results table; one row is appended per evaluated classifier.
CKSAAP_LGBM_Fourth_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): AUC is computed from hard class labels, not scores/probabilities,
  # so it reflects a single operating point rather than the full ROC curve.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix is indexed [true, predicted] with labels sorted ascending, so
  # row 0 is the negative class and row 1 the positive class
  # (assumes the larger label is the positive class - confirm).
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])   # TP / (TP + FN)
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])   # TN / (TN + FP)
  CKSAAP_LGBM_Fourth_Set_ILTrain_Metrics.loc[len(CKSAAP_LGBM_Fourth_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(CKSAAP_LGBM_Fourth_Set_ILTrain_Metrics)
# Export the fourth-set table (the third-set table used to be written under this name).
CKSAAP_LGBM_Fourth_Set_ILTrain_Metrics.to_csv("CKSAAP_LGBM_Fourth_Set_ILTrain_Metrics.csv")
# Fit a fresh stacking ensemble on all data and export the base-estimator outputs
# that feed its final estimator (computed on the training rows themselves).
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
# Written to its own file; it used to reuse the metrics file name and overwrite that CSV.
pd.DataFrame(prob).to_csv("CKSAAP_LGBM_Fourth_Set_ILTrain_Probability.csv")
print(prob)

In [None]:
CKSAAP_LGBM_Fourth_Set_ILTrain_Metrics

In [None]:
CKSAAP_LGBM_Fourth_Set_ILTrain_Metrics.to_csv('CKSAAP_LGBM_Fourth_Set.csv')

# **LogisticRegression Model**

In [None]:
# Logistic-regression model used for SHAP feature elimination.
clf = LogisticRegression(tol=0.0001, C=1.0)

# Search space expressed in LogisticRegression's own parameters. The previous keys
# ('n_estimators', 'num_leaves') are tree/LightGBM settings that this estimator does
# not accept, so fitting the search would have failed with an invalid-parameter error.
param_grid = {
    'C': [0.01, 0.1, 1.0, 10.0],
}
# n_iter matches the four candidates; seeded for repeatable sampling.
search = RandomizedSearchCV(clf, param_grid, n_iter=4, random_state=0)

In [None]:
# SHAP-based recursive feature elimination (10-fold CV, ROC AUC) driven by the plain
# `clf` estimator; the `search` object built in the previous cell is not passed in.
# NOTE(review): X and y are whatever was assigned last. Running the notebook top to
# bottom, that is the LGBM fourth-set table, not the full CKSAAP matrix - confirm intent.
shap_elimination = ShapRFECV(
    clf, step=0.2, cv=10, scoring='roc_auc', n_jobs=3)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Elimination summary: number of features kept, their names and the mean validation score.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('SF_LogisticRegression.csv')

# **ML Model For Best Features of LOGISTIC REGRESSION MODEL (CKSAAP)**

In [None]:
# Load the logistic-regression-selected CKSAAP subset, coerce every column to numeric
# (unparseable cells become NaN), mark the label as categorical, then zero-fill NaNs.
CKSAAP_First_SF_LogisticsRegression_Set_ILTrain = pd.read_csv(
    '/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CKSAAP/First_SF_LogisticsRegression.csv'
).apply(pd.to_numeric, errors='coerce')
if 'Target' in CKSAAP_First_SF_LogisticsRegression_Set_ILTrain.columns:
  CKSAAP_First_SF_LogisticsRegression_Set_ILTrain['Target'] = CKSAAP_First_SF_LogisticsRegression_Set_ILTrain['Target'].astype('category')

CKSAAP_First_SF_LogisticsRegression_Set_ILTrain.fillna(0, inplace=True)
CKSAAP_First_SF_LogisticsRegression_Set_ILTrain.dtypes

In [None]:
# Features / labels for the logistic-regression-selected CKSAAP subset.
X = CKSAAP_First_SF_LogisticsRegression_Set_ILTrain.drop(columns=['Target'])
y = CKSAAP_First_SF_LogisticsRegression_Set_ILTrain['Target']

# Empty results table; one row is appended per evaluated classifier.
CKSAAP_First_SF_LogisticsRegression_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): AUC is computed from hard class labels, not scores/probabilities,
  # so it reflects a single operating point rather than the full ROC curve.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix is indexed [true, predicted] with labels sorted ascending, so
  # row 0 is the negative class and row 1 the positive class
  # (assumes the larger label is the positive class - confirm).
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])   # TP / (TP + FN)
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])   # TN / (TN + FP)
  CKSAAP_First_SF_LogisticsRegression_Set_ILTrain_Metrics.loc[len(CKSAAP_First_SF_LogisticsRegression_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(CKSAAP_First_SF_LogisticsRegression_Set_ILTrain_Metrics)
CKSAAP_First_SF_LogisticsRegression_Set_ILTrain_Metrics.to_csv("CKSAAP_First_SF_LogisticsRegression_Set_ILTrain_Metrics.csv")
# Fit a fresh stacking ensemble on all data and export the base-estimator outputs
# that feed its final estimator (computed on the training rows themselves).
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
# Written to its own file; it used to reuse the metrics file name and overwrite that CSV.
pd.DataFrame(prob).to_csv("CKSAAP_First_SF_LogisticsRegression_Set_ILTrain_Probability.csv")
print(prob)

In [None]:
CKSAAP_First_SF_LogisticsRegression_Set_ILTrain_Metrics

In [None]:
CKSAAP_First_SF_LogisticsRegression_Set_ILTrain_Metrics.to_csv('CKSAAP_First_SF_LogisticsRegression.csv')



```
# This is formatted as code
```

# **Random Forest Classifier Model**

In [None]:
# Shallow, class-balanced random forest used for SHAP feature elimination;
# seeded so the forest (and thus the feature ranking) is reproducible.
clf = RandomForestClassifier(max_depth=3, class_weight='balanced', random_state=0)

# 'num_leaves' is a LightGBM parameter that RandomForestClassifier does not accept;
# the scikit-learn counterpart for capping leaves per tree is 'max_leaf_nodes'.
param_grid = {
    'n_estimators': [ 5, 7, 10],
    'max_leaf_nodes': [3, 5, 7, 10],
}
search = RandomizedSearchCV(clf, param_grid, random_state=0)

In [None]:
# SHAP-based recursive feature elimination (10-fold CV, ROC AUC) driven by the plain
# `clf` estimator; the `search` object built in the previous cell is not passed in.
# NOTE(review): X and y are whatever was assigned last. Running the notebook top to
# bottom, that is the logistic-regression subset, not the full CKSAAP matrix - confirm intent.
shap_elimination = ShapRFECV(
    clf, step=0.2, cv=10, scoring='roc_auc', n_jobs=2)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Elimination summary: number of features kept, their names and the mean validation score.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('SF_RandomForestClassifier.csv')

# **ML Model For Best Features of RF CLASSIFIER MODEL (CKSAAP)**

In [None]:
# Load the random-forest-selected CKSAAP subset, coerce every column to numeric
# (unparseable cells become NaN), mark the label as categorical, then zero-fill NaNs.
First_SF_RandomForestClassifier_Set_ILTrain = pd.read_csv(
    '/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CKSAAP/First_SF_RandomForestClassifier_CKSAAP_Train.csv'
).apply(pd.to_numeric, errors='coerce')
if 'Target' in First_SF_RandomForestClassifier_Set_ILTrain.columns:
  First_SF_RandomForestClassifier_Set_ILTrain['Target'] = First_SF_RandomForestClassifier_Set_ILTrain['Target'].astype('category')

First_SF_RandomForestClassifier_Set_ILTrain.fillna(0, inplace=True)
First_SF_RandomForestClassifier_Set_ILTrain.dtypes

In [None]:
# Features / labels for the random-forest-selected CKSAAP subset.
X = First_SF_RandomForestClassifier_Set_ILTrain.drop(columns=['Target'])
y = First_SF_RandomForestClassifier_Set_ILTrain['Target']

# Empty results table; one row is appended per evaluated classifier.
First_SF_RandomForestClassifier_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): AUC is computed from hard class labels, not scores/probabilities,
  # so it reflects a single operating point rather than the full ROC curve.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix is indexed [true, predicted] with labels sorted ascending, so
  # row 0 is the negative class and row 1 the positive class
  # (assumes the larger label is the positive class - confirm).
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])   # TP / (TP + FN)
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])   # TN / (TN + FP)
  First_SF_RandomForestClassifier_Set_ILTrain_Metrics.loc[len(First_SF_RandomForestClassifier_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(First_SF_RandomForestClassifier_Set_ILTrain_Metrics)
First_SF_RandomForestClassifier_Set_ILTrain_Metrics.to_csv("First_SF_RandomForestClassifier_Set_ILTrain_Metrics.csv")
# Fit a fresh stacking ensemble on all data and export the base-estimator outputs
# that feed its final estimator (computed on the training rows themselves).
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
# Written to its own file; it used to reuse the metrics file name and overwrite that CSV.
pd.DataFrame(prob).to_csv("First_SF_RandomForestClassifier_Set_ILTrain_Probability.csv")
print(prob)

In [None]:
First_SF_RandomForestClassifier_Set_ILTrain_Metrics

In [None]:
First_SF_RandomForestClassifier_Set_ILTrain_Metrics.to_csv('First_SF_RandomForestClassifier_Set.csv')

# **SVM**

In [None]:
from sklearn import svm

In [None]:
# Linear-kernel SVM used for SHAP feature elimination.
clf = svm.SVC(C=0.2, kernel='linear', degree=3)

# Search space expressed in SVC's own parameters. The previous keys
# ('n_estimators', 'num_leaves') are tree/LightGBM settings that SVC does not
# accept, so fitting the search would have failed with an invalid-parameter error.
param_grid = {
    'C': [0.05, 0.2, 1.0, 5.0],
}
# n_iter matches the four candidates; seeded for repeatable sampling.
search = RandomizedSearchCV(clf, param_grid, n_iter=4, random_state=0)

In [None]:
# SHAP-based recursive feature elimination (10-fold CV, ROC AUC) driven by the plain
# `clf` estimator; the `search` object built in the previous cell is not passed in.
# NOTE(review): X and y are whatever was assigned last. Running the notebook top to
# bottom, that is the random-forest subset, not the full CKSAAP matrix - confirm intent.
shap_elimination = ShapRFECV(
    clf, step=0.2, cv=10, scoring='roc_auc', n_jobs=3)
report = shap_elimination.fit_compute(X, y)

In [None]:

# Elimination summary: number of features kept, their names and the mean validation score.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('SF_SVCFfile.csv')

# **ML Model For Best Features of SVM MODEL (CKSAAP)**

In [None]:
# Load the SVC-selected CKSAAP subset, coerce every column to numeric
# (unparseable cells become NaN), mark the label as categorical, then zero-fill NaNs.
First_SF_SVC_CKSAAP_Train_Set_ILTrain = pd.read_csv(
    '/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CKSAAP/First_SF_SVC_CKSAAP_Train.csv'
).apply(pd.to_numeric, errors='coerce')
if 'Target' in First_SF_SVC_CKSAAP_Train_Set_ILTrain.columns:
  First_SF_SVC_CKSAAP_Train_Set_ILTrain['Target'] = First_SF_SVC_CKSAAP_Train_Set_ILTrain['Target'].astype('category')

First_SF_SVC_CKSAAP_Train_Set_ILTrain.fillna(0, inplace=True)
First_SF_SVC_CKSAAP_Train_Set_ILTrain.dtypes

In [None]:
# Features / labels for the SVC-selected CKSAAP subset.
X = First_SF_SVC_CKSAAP_Train_Set_ILTrain.drop(columns=['Target'])
y = First_SF_SVC_CKSAAP_Train_Set_ILTrain['Target']

# Empty results table; one row is appended per evaluated classifier.
First_SF_SVC_CKSAAP_Train_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): AUC is computed from hard class labels, not scores/probabilities,
  # so it reflects a single operating point rather than the full ROC curve.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix is indexed [true, predicted] with labels sorted ascending, so
  # row 0 is the negative class and row 1 the positive class
  # (assumes the larger label is the positive class - confirm).
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])   # TP / (TP + FN)
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])   # TN / (TN + FP)
  First_SF_SVC_CKSAAP_Train_Set_ILTrain_Metrics.loc[len(First_SF_SVC_CKSAAP_Train_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(First_SF_SVC_CKSAAP_Train_Set_ILTrain_Metrics)
First_SF_SVC_CKSAAP_Train_Set_ILTrain_Metrics.to_csv("First_SF_SVC_CKSAAP_Train_Set_ILTrain_Metrics.csv")
# Fit a fresh stacking ensemble on all data and export the base-estimator outputs
# that feed its final estimator (computed on the training rows themselves).
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
# Saved under an SVC-specific name; the old target was an XGB-named file without extension.
pd.DataFrame(prob).to_csv("First_SF_SVC_CKSAAP_Train_Set_ILTrain_Probability.csv")
print(prob)

In [None]:
First_SF_SVC_CKSAAP_Train_Set_ILTrain_Metrics

In [None]:
First_SF_SVC_CKSAAP_Train_Set_ILTrain_Metrics.to_csv('First_SF_SVC_CKSAAP_Train_Set.csv')

# **XGB**

In [None]:
from xgboost import XGBClassifier

In [None]:
# XGBoost model used for SHAP feature elimination. The former C / kernel / degree
# arguments are SVC settings that XGBoost does not recognise, so they are dropped.
clf = XGBClassifier()
# 'num_leaves' is a LightGBM parameter; tree size in XGBoost is explored via 'max_depth'.
param_grid = {
    'n_estimators': [ 5, 7, 10],
    'max_depth': [3, 5, 7, 10],
}
search = RandomizedSearchCV(clf, param_grid, random_state=0)


In [None]:
# SHAP-based recursive feature elimination (10-fold CV, ROC AUC) driven by the plain
# `clf` estimator; the `search` object built in the previous cell is not passed in.
# NOTE(review): X and y are whatever was assigned last. Running the notebook top to
# bottom, that is the SVC-selected subset, not the full CKSAAP matrix - confirm intent.
shap_elimination = ShapRFECV(
    clf, step=0.2, cv=10, scoring='roc_auc', n_jobs=3)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Elimination summary: number of features kept, their names and the mean validation score.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
# Save the elimination summary (feature count, feature names, mean validation score) to CSV.
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('SF_XGBClassifier.csv')

# **ML Model For Best Features of XGB MODEL (CKSAAP)**

In [None]:
# Load the XGBoost-selected CKSAAP subset, coerce every column to numeric
# (unparseable cells become NaN), mark the label as categorical, then zero-fill NaNs.
First_SF_XGBClassifier_CKSAAP_Train_Set_ILTrain = pd.read_csv(
    '/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CKSAAP/First_SF_XGBClassifier_CKSAAP_Train.csv'
).apply(pd.to_numeric, errors='coerce')
if 'Target' in First_SF_XGBClassifier_CKSAAP_Train_Set_ILTrain.columns:
  First_SF_XGBClassifier_CKSAAP_Train_Set_ILTrain['Target'] = First_SF_XGBClassifier_CKSAAP_Train_Set_ILTrain['Target'].astype('category')

First_SF_XGBClassifier_CKSAAP_Train_Set_ILTrain.fillna(0, inplace=True)
First_SF_XGBClassifier_CKSAAP_Train_Set_ILTrain.dtypes

In [None]:
# Features / labels for the XGBoost-selected CKSAAP subset.
X = First_SF_XGBClassifier_CKSAAP_Train_Set_ILTrain.drop(columns=['Target'])
y = First_SF_XGBClassifier_CKSAAP_Train_Set_ILTrain['Target']

# Empty results table; one row is appended per evaluated classifier.
First_SF_XGBClassifier_CKSAAP_Train_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): AUC is computed from hard class labels, not scores/probabilities,
  # so it reflects a single operating point rather than the full ROC curve.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix is indexed [true, predicted] with labels sorted ascending, so
  # row 0 is the negative class and row 1 the positive class
  # (assumes the larger label is the positive class - confirm).
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])   # TP / (TP + FN)
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])   # TN / (TN + FP)
  First_SF_XGBClassifier_CKSAAP_Train_Set_ILTrain_Metrics.loc[len(First_SF_XGBClassifier_CKSAAP_Train_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(First_SF_XGBClassifier_CKSAAP_Train_Set_ILTrain_Metrics)
First_SF_XGBClassifier_CKSAAP_Train_Set_ILTrain_Metrics.to_csv("First_SF_XGBClassifier_CKSAAP_Train_Set_ILTrain_Metrics.csv")
# Fit a fresh stacking ensemble on all data and export the base-estimator outputs
# that feed its final estimator (computed on the training rows themselves).
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
# Saved as a distinct, properly suffixed CSV (the old name had no extension).
pd.DataFrame(prob).to_csv("First_SF_XGBClassifier_CKSAAP_Train_Set_ILTrain_Probability.csv")
print(prob)


In [None]:
First_SF_XGBClassifier_CKSAAP_Train_Set_ILTrain_Metrics

In [None]:
First_SF_XGBClassifier_CKSAAP_Train_Set_ILTrain_Metrics.to_csv('First_SF_XGBClassifier_CKSAAP_Set.csv')

# **Decision Tree Classifier**

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Single decision tree used for SHAP feature elimination; seeded so tie-breaking
# between equally good splits is reproducible.
clf = DecisionTreeClassifier(criterion='gini', splitter='best', random_state=0)

# Search space expressed in DecisionTreeClassifier's own parameters. A single tree
# has no 'n_estimators', and 'num_leaves' is a LightGBM name; the scikit-learn
# counterparts for limiting tree size are 'max_depth' and 'max_leaf_nodes'.
param_grid = {
    'max_depth': [3, 5, 7, 10],
    'max_leaf_nodes': [3, 5, 7, 10],
}
search = RandomizedSearchCV(clf, param_grid, random_state=0)

In [None]:
# SHAP-based recursive feature elimination (10-fold CV, ROC AUC) driven by the plain
# `clf` estimator; the `search` object built in the previous cell is not passed in.
# NOTE(review): X and y are whatever was assigned last. Running the notebook top to
# bottom, that is the XGBoost-selected subset, not the full CKSAAP matrix - confirm intent.
shap_elimination = ShapRFECV(
    clf, step=0.2, cv=10, scoring='roc_auc', n_jobs=3)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Elimination summary: number of features kept, their names and the mean validation score.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('SF_DecisionTreeClassifier.csv')

# **ML Model For Best Features of DT CLASSIFIER MODEL (CKSAAP)**

In [None]:
# Load the decision-tree-selected CKSAAP subset, coerce every column to numeric
# (unparseable cells become NaN), mark the label as categorical, then zero-fill NaNs.
First_SF_DecisionTreeClassifier_CKSAAP_Set_ILTrain = pd.read_csv(
    '/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CKSAAP/First_SF_DecisionTreeClassifier_CKSAAP_Train.csv'
).apply(pd.to_numeric, errors='coerce')
if 'Target' in First_SF_DecisionTreeClassifier_CKSAAP_Set_ILTrain.columns:
  First_SF_DecisionTreeClassifier_CKSAAP_Set_ILTrain['Target'] = First_SF_DecisionTreeClassifier_CKSAAP_Set_ILTrain['Target'].astype('category')

First_SF_DecisionTreeClassifier_CKSAAP_Set_ILTrain.fillna(0, inplace=True)
First_SF_DecisionTreeClassifier_CKSAAP_Set_ILTrain.dtypes

In [None]:
# Features / labels for the decision-tree-selected CKSAAP subset.
X = First_SF_DecisionTreeClassifier_CKSAAP_Set_ILTrain.drop(columns=['Target'])
y = First_SF_DecisionTreeClassifier_CKSAAP_Set_ILTrain['Target']

# Empty results table; one row is appended per evaluated classifier.
First_SF_DecisionTreeClassifier_CKSAAP_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): AUC is computed from hard class labels, not scores/probabilities,
  # so it reflects a single operating point rather than the full ROC curve.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix is indexed [true, predicted] with labels sorted ascending, so
  # row 0 is the negative class and row 1 the positive class
  # (assumes the larger label is the positive class - confirm).
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])   # TP / (TP + FN)
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])   # TN / (TN + FP)
  First_SF_DecisionTreeClassifier_CKSAAP_Set_ILTrain_Metrics.loc[len(First_SF_DecisionTreeClassifier_CKSAAP_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(First_SF_DecisionTreeClassifier_CKSAAP_Set_ILTrain_Metrics)
First_SF_DecisionTreeClassifier_CKSAAP_Set_ILTrain_Metrics.to_csv("First_SF_DecisionTreeClassifier_CKSAAP_Set_ILTrain_Metrics.csv")
# Fit a fresh stacking ensemble on all data and export the base-estimator outputs
# that feed its final estimator (computed on the training rows themselves).
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
# Written to its own file; it used to reuse the metrics file name and overwrite that CSV.
pd.DataFrame(prob).to_csv("First_SF_DecisionTreeClassifier_CKSAAP_Set_ILTrain_Probability.csv")
print(prob)

In [None]:
First_SF_DecisionTreeClassifier_CKSAAP_Set_ILTrain_Metrics

In [None]:
First_SF_DecisionTreeClassifier_CKSAAP_Set_ILTrain_Metrics.to_csv('First_SF_DecisionTreeClassifier_CKSAAP_Set.csv')

# **CTraid Feature Selection**

In [None]:
!pip install SHAP
!pip install probatus
!pip install lightgbm

In [None]:
from probatus.feature_elimination import ShapRFECV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import lightgbm
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

# **All Model For CTraid**

In [None]:
# Load the full CTraid training table, coerce every column to numeric
# (unparseable cells become NaN) and replace the NaNs with 0.
SF_CTraid_ILTrain  = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTraid_Train.csv')

for column in SF_CTraid_ILTrain.columns:
  SF_CTraid_ILTrain[column] = pd.to_numeric(SF_CTraid_ILTrain[column], errors='coerce')
  # Match the label column on its stripped name: the old test compared against
  # 'Target ' (trailing space) while the other loaders in this notebook use 'Target',
  # so the categorical cast was skipped unless the header really ended in a space.
  if str(column).strip() == 'Target':
    SF_CTraid_ILTrain[column] = SF_CTraid_ILTrain[column].astype('category')

SF_CTraid_ILTrain.fillna(0, inplace=True)
SF_CTraid_ILTrain.dtypes

In [None]:
import shap as SHAP
# SF_CTraid_ILTrain is deliberately not re-read here: the preceding cell already
# loaded this CSV and applied numeric coercion / NaN filling, and a second read_csv
# into the same name would silently throw that cleaning away.

In [None]:
# Column labels of the CTraid table as a plain Python list.
feature_names = SF_CTraid_ILTrain.columns.tolist()


In [None]:
SF_CTraid_ILTrain.info()

In [None]:
# Build the feature matrix and labels for SHAP feature elimination from the real
# CTraid training table. The earlier version generated random data with
# make_classification and only relabelled its columns with the CTraid names, so the
# elimination never saw the actual measurements.
# The label column is located by stripped name so a header such as 'Target ' also works.
target_col = next(
    (col for col in SF_CTraid_ILTrain.columns if str(col).strip() == 'Target'), None)
if target_col is None:
  raise KeyError("No 'Target' column found in SF_CTraid_ILTrain")

# Features: everything except the label, forced numeric with gaps filled by 0
# (same cleaning convention as the loading cells; repeated here because the frame
# may have been re-read from disk after cleaning).
X = (SF_CTraid_ILTrain
     .drop(columns=[target_col])
     .apply(pd.to_numeric, errors='coerce')
     .fillna(0))
# Labels as plain integers (assumes numeric class codes such as 0/1 - confirm).
y = SF_CTraid_ILTrain[target_col].astype(int)

In [None]:
feature_names = SF_CTraid_ILTrain.columns

In [None]:
# Preview the first rows of the first five feature columns (selected by position).
X.iloc[:, :5].head()


# **LGBM CLASSIFIER**

In [None]:
# Base LightGBM model for SHAP feature elimination; seeded so repeated runs
# produce the same trees and therefore the same feature ranking.
clf = lightgbm.LGBMClassifier(max_depth=5, class_weight='balanced', random_state=0)

# Hyper-parameter candidates sampled by the randomized search.
param_grid = {
    'n_estimators': [5, 7, 10],
    'num_leaves': [3, 5, 7, 10],
}
# Seeded as well, otherwise the sampled candidates differ between runs.
search = RandomizedSearchCV(clf, param_grid, random_state=0)

In [None]:
# SHAP-based recursive feature elimination with 10-fold CV, scored by ROC AUC.
# The randomized search object (not the bare classifier) is handed over as the model;
# presumably probatus re-tunes the hyper-parameters during elimination - confirm in its docs.
shap_elimination = ShapRFECV(
    clf=search, step=0.2, cv=10, scoring='roc_auc', n_jobs=3)
# `report` is the elimination summary table inspected in the following cells.
report = shap_elimination.fit_compute(X, y)

In [None]:
# Show the feature count, feature set and mean validation metric columns of the elimination report
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
# Save the summary columns of the elimination report to CSV.
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('SF_LGBMCLASSIFIER.csv')

# **ML Model For Best Features of LGBM MODEL (CTraid)**

# **First LGBM Set**

In [None]:
# Read the CSV for the first LGBM feature set.
First_SF_LGBMCLASSIFIER_Set_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTraid/First_SF_LGBMCLASSIFIER.csv')

# Force each column to numeric (unparseable cells become NaN); the label is also made categorical.
for name in First_SF_LGBMCLASSIFIER_Set_ILTrain.columns:
  coerced = pd.to_numeric(First_SF_LGBMCLASSIFIER_Set_ILTrain[name], errors='coerce')
  First_SF_LGBMCLASSIFIER_Set_ILTrain[name] = coerced.astype('category') if name == 'Target' else coerced

# Zero-fill the NaNs left by coercion, then show the resulting dtypes.
First_SF_LGBMCLASSIFIER_Set_ILTrain = First_SF_LGBMCLASSIFIER_Set_ILTrain.fillna(0)
First_SF_LGBMCLASSIFIER_Set_ILTrain.dtypes


In [None]:
# Predictors are every column except the class label.
X = First_SF_LGBMCLASSIFIER_Set_ILTrain.drop(['Target'], axis=1)
y = First_SF_LGBMCLASSIFIER_Set_ILTrain.Target

# Empty results table; one row is appended per evaluated classifier.
First_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
# Shuffled 10-fold split with a fixed seed, shared by every model below.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  # sklearn orders the matrix by sorted label with true classes on the rows, so a
  # binary matrix unravels to (tn, fp, fn, tp).
  # NOTE(review): assumes the positive class is the larger label (e.g. 1) — confirm.
  cm1 = confusion_matrix(y, y_pred)
  tn, fp, fn, tp = cm1.ravel()
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): scored on hard class labels rather than probabilities/decision scores.
  auc = roc_auc_score(y, y_pred)
  sensitivity = tp/(tp+fn)  # recall on the positive class
  specificity = tn/(tn+fp)  # recall on the negative class
  First_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.loc[len(First_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(First_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics)
First_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv("First_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.csv")
# Fit the stacking ensemble on all rows and export what fit_transform returns.
# Saved under its own filename so the metrics CSV written above is not overwritten.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("First_SF_LGBMCLASSIFIER_Set_ILTrain_Stacking_Outputs.csv")
print(prob)

In [None]:
# Display the per-classifier cross-validation metrics table.
First_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics

In [None]:
# Save the metrics table under the results filename.
First_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv('First_SF_LGBMCLASSIFIER_Set_CTraid_Set_ML Result.csv')

# **Second LGBM Set**

In [None]:
# Read the CSV for the second LGBM feature set.
Second_SF_LGBMCLASSIFIER_Set_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTraid/Second_SF_LGBMCLASSIFIER.csv')

# Force each column to numeric (unparseable cells become NaN); the label is also made categorical.
for name in Second_SF_LGBMCLASSIFIER_Set_ILTrain.columns:
  coerced = pd.to_numeric(Second_SF_LGBMCLASSIFIER_Set_ILTrain[name], errors='coerce')
  Second_SF_LGBMCLASSIFIER_Set_ILTrain[name] = coerced.astype('category') if name == 'Target' else coerced

# Zero-fill the NaNs left by coercion, then show the resulting dtypes.
Second_SF_LGBMCLASSIFIER_Set_ILTrain = Second_SF_LGBMCLASSIFIER_Set_ILTrain.fillna(0)
Second_SF_LGBMCLASSIFIER_Set_ILTrain.dtypes

In [None]:
# Predictors are every column except the class label.
X = Second_SF_LGBMCLASSIFIER_Set_ILTrain.drop(['Target'], axis=1)
y = Second_SF_LGBMCLASSIFIER_Set_ILTrain.Target

# Empty results table; one row is appended per evaluated classifier.
Second_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
# Shuffled 10-fold split with a fixed seed, shared by every model below.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  # sklearn orders the matrix by sorted label with true classes on the rows, so a
  # binary matrix unravels to (tn, fp, fn, tp).
  # NOTE(review): assumes the positive class is the larger label (e.g. 1) — confirm.
  cm1 = confusion_matrix(y, y_pred)
  tn, fp, fn, tp = cm1.ravel()
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): scored on hard class labels rather than probabilities/decision scores.
  auc = roc_auc_score(y, y_pred)
  sensitivity = tp/(tp+fn)  # recall on the positive class
  specificity = tn/(tn+fp)  # recall on the negative class
  Second_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.loc[len(Second_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(Second_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics)
Second_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv("Second_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.csv")
# Fit the stacking ensemble on all rows and export what fit_transform returns.
# Saved under its own filename so the metrics CSV written above is not overwritten.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("Second_SF_LGBMCLASSIFIER_Set_ILTrain_Stacking_Outputs.csv")
print(prob)

In [None]:
# Display the per-classifier cross-validation metrics table.
Second_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics

In [None]:
# Save the metrics table under the results filename.
Second_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv('Second_SF_LGBMCLASSIFIER_Set_CTraid_Set_ML Result.csv')

# **Third LGBM Set**

In [None]:
# Read the CSV for the third LGBM feature set.
Third_SF_LGBMCLASSIFIER_Set_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTraid/Third_SF_LGBMCLASSIFIER.csv')

# Force each column to numeric (unparseable cells become NaN); the label is also made categorical.
for name in Third_SF_LGBMCLASSIFIER_Set_ILTrain.columns:
  coerced = pd.to_numeric(Third_SF_LGBMCLASSIFIER_Set_ILTrain[name], errors='coerce')
  Third_SF_LGBMCLASSIFIER_Set_ILTrain[name] = coerced.astype('category') if name == 'Target' else coerced

# Zero-fill the NaNs left by coercion, then show the resulting dtypes.
Third_SF_LGBMCLASSIFIER_Set_ILTrain = Third_SF_LGBMCLASSIFIER_Set_ILTrain.fillna(0)
Third_SF_LGBMCLASSIFIER_Set_ILTrain.dtypes

In [None]:
# Predictors are every column except the class label.
X = Third_SF_LGBMCLASSIFIER_Set_ILTrain.drop(['Target'], axis=1)
y = Third_SF_LGBMCLASSIFIER_Set_ILTrain.Target

# Empty results table; one row is appended per evaluated classifier.
Third_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
# Shuffled 10-fold split with a fixed seed, shared by every model below.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  # sklearn orders the matrix by sorted label with true classes on the rows, so a
  # binary matrix unravels to (tn, fp, fn, tp).
  # NOTE(review): assumes the positive class is the larger label (e.g. 1) — confirm.
  cm1 = confusion_matrix(y, y_pred)
  tn, fp, fn, tp = cm1.ravel()
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): scored on hard class labels rather than probabilities/decision scores.
  auc = roc_auc_score(y, y_pred)
  sensitivity = tp/(tp+fn)  # recall on the positive class
  specificity = tn/(tn+fp)  # recall on the negative class
  Third_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.loc[len(Third_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(Third_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics)
Third_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv("Third_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.csv")
# Fit the stacking ensemble on all rows and export what fit_transform returns.
# Saved under its own filename so the metrics CSV written above is not overwritten.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("Third_SF_LGBMCLASSIFIER_Set_ILTrain_Stacking_Outputs.csv")
print(prob)

In [None]:
# Display the per-classifier cross-validation metrics table.
Third_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics

In [None]:
# Save the metrics table under the results filename.
Third_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv('Third_SF_LGBMCLASSIFIER_Set_CTraid_Set_ML Result.csv')

# **Fourth LGBM Set**

In [None]:
# Read the CSV for the fourth LGBM feature set.
Fourth_SF_LGBMCLASSIFIER_Set_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTraid/Fourth_SF_LGBMCLASSIFIER.csv')

# Force each column to numeric (unparseable cells become NaN); the label is also made categorical.
for name in Fourth_SF_LGBMCLASSIFIER_Set_ILTrain.columns:
  coerced = pd.to_numeric(Fourth_SF_LGBMCLASSIFIER_Set_ILTrain[name], errors='coerce')
  Fourth_SF_LGBMCLASSIFIER_Set_ILTrain[name] = coerced.astype('category') if name == 'Target' else coerced

# Zero-fill the NaNs left by coercion, then show the resulting dtypes.
Fourth_SF_LGBMCLASSIFIER_Set_ILTrain = Fourth_SF_LGBMCLASSIFIER_Set_ILTrain.fillna(0)
Fourth_SF_LGBMCLASSIFIER_Set_ILTrain.dtypes

In [None]:
# Predictors are every column except the class label.
X = Fourth_SF_LGBMCLASSIFIER_Set_ILTrain.drop(['Target'], axis=1)
y = Fourth_SF_LGBMCLASSIFIER_Set_ILTrain.Target

# Empty results table; one row is appended per evaluated classifier.
Fourth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
# Shuffled 10-fold split with a fixed seed, shared by every model below.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  # sklearn orders the matrix by sorted label with true classes on the rows, so a
  # binary matrix unravels to (tn, fp, fn, tp).
  # NOTE(review): assumes the positive class is the larger label (e.g. 1) — confirm.
  cm1 = confusion_matrix(y, y_pred)
  tn, fp, fn, tp = cm1.ravel()
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): scored on hard class labels rather than probabilities/decision scores.
  auc = roc_auc_score(y, y_pred)
  sensitivity = tp/(tp+fn)  # recall on the positive class
  specificity = tn/(tn+fp)  # recall on the negative class
  Fourth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.loc[len(Fourth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(Fourth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics)
Fourth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv("Fourth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.csv")
# Fit the stacking ensemble on all rows and export what fit_transform returns.
# Saved under its own filename so the metrics CSV written above is not overwritten.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("Fourth_SF_LGBMCLASSIFIER_Set_ILTrain_Stacking_Outputs.csv")
print(prob)

In [None]:
# Display the per-classifier cross-validation metrics table.
Fourth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics

In [None]:
# Save the metrics table under the results filename.
Fourth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv('Fourth_SF_LGBMCLASSIFIER_Set_CTraid_Set_ML Result.csv')

# **Fifth LGBM Set**

In [None]:
# Read the CSV for the fifth LGBM feature set.
Fifth_SF_LGBMCLASSIFIER_Set_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTraid/Fifth_SF_LGBMCLASSIFIER.csv')

# Force each column to numeric (unparseable cells become NaN); the label is also made categorical.
for name in Fifth_SF_LGBMCLASSIFIER_Set_ILTrain.columns:
  coerced = pd.to_numeric(Fifth_SF_LGBMCLASSIFIER_Set_ILTrain[name], errors='coerce')
  Fifth_SF_LGBMCLASSIFIER_Set_ILTrain[name] = coerced.astype('category') if name == 'Target' else coerced

# Zero-fill the NaNs left by coercion, then show the resulting dtypes.
Fifth_SF_LGBMCLASSIFIER_Set_ILTrain = Fifth_SF_LGBMCLASSIFIER_Set_ILTrain.fillna(0)
Fifth_SF_LGBMCLASSIFIER_Set_ILTrain.dtypes

In [None]:
# Predictors are every column except the class label.
X = Fifth_SF_LGBMCLASSIFIER_Set_ILTrain.drop(['Target'], axis=1)
y = Fifth_SF_LGBMCLASSIFIER_Set_ILTrain.Target

# Empty results table; one row is appended per evaluated classifier.
Fifth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
# Shuffled 10-fold split with a fixed seed, shared by every model below.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  # sklearn orders the matrix by sorted label with true classes on the rows, so a
  # binary matrix unravels to (tn, fp, fn, tp).
  # NOTE(review): assumes the positive class is the larger label (e.g. 1) — confirm.
  cm1 = confusion_matrix(y, y_pred)
  tn, fp, fn, tp = cm1.ravel()
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): scored on hard class labels rather than probabilities/decision scores.
  auc = roc_auc_score(y, y_pred)
  sensitivity = tp/(tp+fn)  # recall on the positive class
  specificity = tn/(tn+fp)  # recall on the negative class
  Fifth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.loc[len(Fifth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(Fifth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics)
Fifth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv("Fifth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.csv")
# Fit the stacking ensemble on all rows and export what fit_transform returns.
# Saved under its own filename so the metrics CSV written above is not overwritten.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("Fifth_SF_LGBMCLASSIFIER_Set_ILTrain_Stacking_Outputs.csv")
print(prob)

In [None]:
# Display the per-classifier cross-validation metrics table.
Fifth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics

In [None]:
# Save the metrics table under the results filename.
Fifth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv('Fifth_SF_LGBMCLASSIFIER_Set_CTraid_Set_ML Result.csv')

# **Sixth LGBM Set**

In [None]:
# Read the CSV for the sixth LGBM feature set.
Sixth_SF_LGBMCLASSIFIER_Set_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTraid/Sixth_SF_LGBMCLASSIFIER.csv')

# Force each column to numeric (unparseable cells become NaN); the label is also made categorical.
for name in Sixth_SF_LGBMCLASSIFIER_Set_ILTrain.columns:
  coerced = pd.to_numeric(Sixth_SF_LGBMCLASSIFIER_Set_ILTrain[name], errors='coerce')
  Sixth_SF_LGBMCLASSIFIER_Set_ILTrain[name] = coerced.astype('category') if name == 'Target' else coerced

# Zero-fill the NaNs left by coercion, then show the resulting dtypes.
Sixth_SF_LGBMCLASSIFIER_Set_ILTrain = Sixth_SF_LGBMCLASSIFIER_Set_ILTrain.fillna(0)
Sixth_SF_LGBMCLASSIFIER_Set_ILTrain.dtypes

In [None]:
# Predictors are every column except the class label.
X = Sixth_SF_LGBMCLASSIFIER_Set_ILTrain.drop(['Target'], axis=1)
y = Sixth_SF_LGBMCLASSIFIER_Set_ILTrain.Target

# Empty results table; one row is appended per evaluated classifier.
Sixth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
# Shuffled 10-fold split with a fixed seed, shared by every model below.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  # sklearn orders the matrix by sorted label with true classes on the rows, so a
  # binary matrix unravels to (tn, fp, fn, tp).
  # NOTE(review): assumes the positive class is the larger label (e.g. 1) — confirm.
  cm1 = confusion_matrix(y, y_pred)
  tn, fp, fn, tp = cm1.ravel()
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): scored on hard class labels rather than probabilities/decision scores.
  auc = roc_auc_score(y, y_pred)
  sensitivity = tp/(tp+fn)  # recall on the positive class
  specificity = tn/(tn+fp)  # recall on the negative class
  Sixth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.loc[len(Sixth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(Sixth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics)
Sixth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv("Sixth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.csv")
# Fit the stacking ensemble on all rows and export what fit_transform returns.
# Saved under its own filename so the metrics CSV written above is not overwritten.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("Sixth_SF_LGBMCLASSIFIER_Set_ILTrain_Stacking_Outputs.csv")
print(prob)

In [None]:
# Display the per-classifier cross-validation metrics table.
Sixth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics

In [None]:
# Save the metrics table under the results filename.
Sixth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv('Sixth_SF_LGBMCLASSIFIER_Set_CTraid_Set_ML Result.csv')

# **Seventh LGBM Set**

In [None]:
# Read the CSV for the seventh LGBM feature set.
Seventh_SF_LGBMCLASSIFIER_Set_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTraid/Seventh_SF_LGBMCLASSIFIER.csv')

# Force each column to numeric (unparseable cells become NaN); the label is also made categorical.
for name in Seventh_SF_LGBMCLASSIFIER_Set_ILTrain.columns:
  coerced = pd.to_numeric(Seventh_SF_LGBMCLASSIFIER_Set_ILTrain[name], errors='coerce')
  Seventh_SF_LGBMCLASSIFIER_Set_ILTrain[name] = coerced.astype('category') if name == 'Target' else coerced

# Zero-fill the NaNs left by coercion, then show the resulting dtypes.
Seventh_SF_LGBMCLASSIFIER_Set_ILTrain = Seventh_SF_LGBMCLASSIFIER_Set_ILTrain.fillna(0)
Seventh_SF_LGBMCLASSIFIER_Set_ILTrain.dtypes

In [None]:
# Predictors are every column except the class label.
X = Seventh_SF_LGBMCLASSIFIER_Set_ILTrain.drop(['Target'], axis=1)
y = Seventh_SF_LGBMCLASSIFIER_Set_ILTrain.Target

# Empty results table; one row is appended per evaluated classifier.
Seventh_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
# Shuffled 10-fold split with a fixed seed, shared by every model below.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  # sklearn orders the matrix by sorted label with true classes on the rows, so a
  # binary matrix unravels to (tn, fp, fn, tp).
  # NOTE(review): assumes the positive class is the larger label (e.g. 1) — confirm.
  cm1 = confusion_matrix(y, y_pred)
  tn, fp, fn, tp = cm1.ravel()
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): scored on hard class labels rather than probabilities/decision scores.
  auc = roc_auc_score(y, y_pred)
  sensitivity = tp/(tp+fn)  # recall on the positive class
  specificity = tn/(tn+fp)  # recall on the negative class
  Seventh_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.loc[len(Seventh_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(Seventh_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics)
Seventh_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv("Seventh_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.csv")
# Fit the stacking ensemble on all rows and export what fit_transform returns.
# Saved under its own filename so the metrics CSV written above is not overwritten.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("Seventh_SF_LGBMCLASSIFIER_Set_ILTrain_Stacking_Outputs.csv")
print(prob)

In [None]:
# Display the per-classifier cross-validation metrics table.
Seventh_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics

In [None]:
# Save the metrics table under the results filename.
Seventh_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv('Seventh_SF_LGBMCLASSIFIER_Set_CTraid_Set_ML Result.csv')

# **Eighth LGBM Set**

In [None]:
# Read the CSV for the eighth LGBM feature set.
Eighth_SF_LGBMCLASSIFIER_Set_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTraid/Eighth_SF_LGBMCLASSIFIER.csv')

# Force each column to numeric (unparseable cells become NaN); the label is also made categorical.
for name in Eighth_SF_LGBMCLASSIFIER_Set_ILTrain.columns:
  coerced = pd.to_numeric(Eighth_SF_LGBMCLASSIFIER_Set_ILTrain[name], errors='coerce')
  Eighth_SF_LGBMCLASSIFIER_Set_ILTrain[name] = coerced.astype('category') if name == 'Target' else coerced

# Zero-fill the NaNs left by coercion, then show the resulting dtypes.
Eighth_SF_LGBMCLASSIFIER_Set_ILTrain = Eighth_SF_LGBMCLASSIFIER_Set_ILTrain.fillna(0)
Eighth_SF_LGBMCLASSIFIER_Set_ILTrain.dtypes

In [None]:
# Predictors are every column except the class label.
X = Eighth_SF_LGBMCLASSIFIER_Set_ILTrain.drop(['Target'], axis=1)
y = Eighth_SF_LGBMCLASSIFIER_Set_ILTrain.Target

# Empty results table; one row is appended per evaluated classifier.
Eighth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
# Shuffled 10-fold split with a fixed seed, shared by every model below.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  # sklearn orders the matrix by sorted label with true classes on the rows, so a
  # binary matrix unravels to (tn, fp, fn, tp).
  # NOTE(review): assumes the positive class is the larger label (e.g. 1) — confirm.
  cm1 = confusion_matrix(y, y_pred)
  tn, fp, fn, tp = cm1.ravel()
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): scored on hard class labels rather than probabilities/decision scores.
  auc = roc_auc_score(y, y_pred)
  sensitivity = tp/(tp+fn)  # recall on the positive class
  specificity = tn/(tn+fp)  # recall on the negative class
  Eighth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.loc[len(Eighth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(Eighth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics)
Eighth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv("Eighth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.csv")
# Fit the stacking ensemble on all rows and export what fit_transform returns.
# Saved under its own filename so the metrics CSV written above is not overwritten.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("Eighth_SF_LGBMCLASSIFIER_Set_ILTrain_Stacking_Outputs.csv")
print(prob)

In [None]:
# Display the per-classifier cross-validation metrics table.
Eighth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics

In [None]:
# Save the metrics table under the results filename.
Eighth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv('Eighth_SF_LGBMCLASSIFIER_Set_CTraid_Set_ML Result.csv')

# **Ninth LGBM Set**

In [None]:
# Read the CSV for the ninth LGBM feature set.
Ninth_SF_LGBMCLASSIFIER_Set_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTraid/Ninth_SF_LGBMCLASSIFIER.csv')

# Force each column to numeric (unparseable cells become NaN); the label is also made categorical.
for name in Ninth_SF_LGBMCLASSIFIER_Set_ILTrain.columns:
  coerced = pd.to_numeric(Ninth_SF_LGBMCLASSIFIER_Set_ILTrain[name], errors='coerce')
  Ninth_SF_LGBMCLASSIFIER_Set_ILTrain[name] = coerced.astype('category') if name == 'Target' else coerced

# Zero-fill the NaNs left by coercion, then show the resulting dtypes.
Ninth_SF_LGBMCLASSIFIER_Set_ILTrain = Ninth_SF_LGBMCLASSIFIER_Set_ILTrain.fillna(0)
Ninth_SF_LGBMCLASSIFIER_Set_ILTrain.dtypes

In [None]:
# Predictors are every column except the class label.
X = Ninth_SF_LGBMCLASSIFIER_Set_ILTrain.drop(['Target'], axis=1)
y = Ninth_SF_LGBMCLASSIFIER_Set_ILTrain.Target

# Empty results table; one row is appended per evaluated classifier.
Ninth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
# Shuffled 10-fold split with a fixed seed, shared by every model below.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  # sklearn orders the matrix by sorted label with true classes on the rows, so a
  # binary matrix unravels to (tn, fp, fn, tp).
  # NOTE(review): assumes the positive class is the larger label (e.g. 1) — confirm.
  cm1 = confusion_matrix(y, y_pred)
  tn, fp, fn, tp = cm1.ravel()
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): scored on hard class labels rather than probabilities/decision scores.
  auc = roc_auc_score(y, y_pred)
  sensitivity = tp/(tp+fn)  # recall on the positive class
  specificity = tn/(tn+fp)  # recall on the negative class
  Ninth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.loc[len(Ninth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(Ninth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics)
Ninth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv("Ninth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.csv")
# Fit the stacking ensemble on all rows and export what fit_transform returns.
# Saved under its own filename so the metrics CSV written above is not overwritten.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("Ninth_SF_LGBMCLASSIFIER_Set_ILTrain_Stacking_Outputs.csv")
print(prob)

In [None]:
# Display the per-classifier cross-validation metrics table.
Ninth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics

In [None]:
# Save the metrics table under the results filename.
Ninth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv('Ninth_SF_LGBMCLASSIFIER_Set_CTraid_Set_ML Result.csv')

# **Tenth LGBM Set**

In [None]:
# Read the CSV for the tenth LGBM feature set.
Tenth_SF_LGBMCLASSIFIER_Set_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTraid/Tenth_SF_LGBMCLASSIFIER.csv')

# Force each column to numeric (unparseable cells become NaN); the label is also made categorical.
for name in Tenth_SF_LGBMCLASSIFIER_Set_ILTrain.columns:
  coerced = pd.to_numeric(Tenth_SF_LGBMCLASSIFIER_Set_ILTrain[name], errors='coerce')
  Tenth_SF_LGBMCLASSIFIER_Set_ILTrain[name] = coerced.astype('category') if name == 'Target' else coerced

# Zero-fill the NaNs left by coercion, then show the resulting dtypes.
Tenth_SF_LGBMCLASSIFIER_Set_ILTrain = Tenth_SF_LGBMCLASSIFIER_Set_ILTrain.fillna(0)
Tenth_SF_LGBMCLASSIFIER_Set_ILTrain.dtypes

In [None]:
# Predictors are every column except the class label.
X = Tenth_SF_LGBMCLASSIFIER_Set_ILTrain.drop(['Target'], axis=1)
y = Tenth_SF_LGBMCLASSIFIER_Set_ILTrain.Target

# Empty results table; one row is appended per evaluated classifier.
Tenth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
# Shuffled 10-fold split with a fixed seed, shared by every model below.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  # sklearn orders the matrix by sorted label with true classes on the rows, so a
  # binary matrix unravels to (tn, fp, fn, tp).
  # NOTE(review): assumes the positive class is the larger label (e.g. 1) — confirm.
  cm1 = confusion_matrix(y, y_pred)
  tn, fp, fn, tp = cm1.ravel()
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): scored on hard class labels rather than probabilities/decision scores.
  auc = roc_auc_score(y, y_pred)
  sensitivity = tp/(tp+fn)  # recall on the positive class
  specificity = tn/(tn+fp)  # recall on the negative class
  Tenth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.loc[len(Tenth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(Tenth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics)
Tenth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv("Tenth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.csv")
# Fit the stacking ensemble on all rows and export what fit_transform returns.
# Saved under its own filename so the metrics CSV written above is not overwritten.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("Tenth_SF_LGBMCLASSIFIER_Set_ILTrain_Stacking_Outputs.csv")
print(prob)

In [None]:
# Display the per-classifier cross-validation metrics table.
Tenth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics

In [None]:
# Save the metrics table under the results filename.
Tenth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv('Tenth_SF_LGBMCLASSIFIER_Set_CTraid_Set_ML Result.csv')

# **Logistic Regression Model**

In [None]:
# Logistic-regression base model for the SHAP feature elimination.
clf = LogisticRegression(tol=0.0001, C=1.0)

# Search space limited to hyperparameters LogisticRegression accepts. The
# LightGBM names `n_estimators` and `num_leaves` would make the search raise an
# invalid-parameter error as soon as it is fit.
# NOTE(review): the candidate values are illustrative — tune for this dataset.
param_grid = {
    'C': [0.01, 0.1, 1.0, 10.0],
    'tol': [1e-5, 1e-4, 1e-3],
}
search = RandomizedSearchCV(clf, param_grid)

In [None]:
# Rebuild the full CTraid design matrix first: X and y were last rebound by the
# feature-subset evaluation cells above, so without this the elimination would
# run on that reduced subset instead of the complete descriptor set.
# NOTE(review): assumes SF_CTraid_ILTrain is still loaded and its label column is 'Target' — confirm.
X = SF_CTraid_ILTrain.drop(columns=['Target']).apply(pd.to_numeric, errors='coerce').fillna(0)
y = SF_CTraid_ILTrain['Target']

# SHAP-based recursive feature elimination with the logistic-regression model.
shap_elimination = ShapRFECV(
    clf, step=0.2, cv=10, scoring='roc_auc', n_jobs=3)
report = shap_elimination.fit_compute(X, y)

In [None]:

# Show the feature count, feature set and mean validation metric columns of the elimination report
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
# Save the summary columns of the elimination report to CSV.
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('SF_LogisticRegression.csv')

# **ML Model For Best Features of LOGISTIC REGRESSION MODEL (CTraid)**

# **CTraid LR First Set**

In [None]:
# Read the CSV for the first logistic-regression feature set.
First_SF_LogisticRegression_Set_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTraid/First_SF_LogisticRegression.csv')

# Force each column to numeric (unparseable cells become NaN); the label is also made categorical.
for name in First_SF_LogisticRegression_Set_ILTrain.columns:
  coerced = pd.to_numeric(First_SF_LogisticRegression_Set_ILTrain[name], errors='coerce')
  First_SF_LogisticRegression_Set_ILTrain[name] = coerced.astype('category') if name == 'Target' else coerced

# Zero-fill the NaNs left by coercion, then show the resulting dtypes.
First_SF_LogisticRegression_Set_ILTrain = First_SF_LogisticRegression_Set_ILTrain.fillna(0)
First_SF_LogisticRegression_Set_ILTrain.dtypes


In [None]:
# Predictors are every column except the class label.
X = First_SF_LogisticRegression_Set_ILTrain.drop(['Target'], axis=1)
y = First_SF_LogisticRegression_Set_ILTrain.Target

# Empty results table; one row is appended per evaluated classifier.
First_SF_LogisticRegression_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
# Shuffled 10-fold split with a fixed seed, shared by every model below.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  # sklearn orders the matrix by sorted label with true classes on the rows, so a
  # binary matrix unravels to (tn, fp, fn, tp).
  # NOTE(review): assumes the positive class is the larger label (e.g. 1) — confirm.
  cm1 = confusion_matrix(y, y_pred)
  tn, fp, fn, tp = cm1.ravel()
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): scored on hard class labels rather than probabilities/decision scores.
  auc = roc_auc_score(y, y_pred)
  sensitivity = tp/(tp+fn)  # recall on the positive class
  specificity = tn/(tn+fp)  # recall on the negative class
  First_SF_LogisticRegression_Set_ILTrain_Metrics.loc[len(First_SF_LogisticRegression_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(First_SF_LogisticRegression_Set_ILTrain_Metrics)
First_SF_LogisticRegression_Set_ILTrain_Metrics.to_csv("First_SF_LogisticRegression_Set_ILTrain_Metrics.csv")
# Fit the stacking ensemble on all rows and export what fit_transform returns.
# Saved under its own filename so the metrics CSV written above is not overwritten.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("First_SF_LogisticRegression_Set_ILTrain_Stacking_Outputs.csv")
print(prob)

In [None]:
# Display the per-classifier cross-validation metrics table.
First_SF_LogisticRegression_Set_ILTrain_Metrics

In [None]:
# Save the metrics table under the results filename.
First_SF_LogisticRegression_Set_ILTrain_Metrics.to_csv('First_SF_LogisticRegression_Set_CTraid_Set_ML Result.csv')

# **RANDOM FOREST CLASSIFIER MODEL**

In [None]:
# Shallow, class-weighted random forest used as the base model for feature elimination.
clf = RandomForestClassifier(max_depth=3, class_weight='balanced')

# `num_leaves` is a LightGBM name that RandomForestClassifier rejects at fit time;
# `max_leaf_nodes` is the scikit-learn parameter that caps the leaves per tree.
param_grid = {
    'n_estimators': [ 5, 7, 10],
    'max_leaf_nodes': [3, 5, 7, 10],
}
search = RandomizedSearchCV(clf, param_grid)

In [None]:
# Rebuild the full CTraid design matrix first: X and y were last rebound by the
# feature-subset evaluation cells above, so without this the elimination would
# run on that reduced subset instead of the complete descriptor set.
# NOTE(review): assumes SF_CTraid_ILTrain is still loaded and its label column is 'Target' — confirm.
X = SF_CTraid_ILTrain.drop(columns=['Target']).apply(pd.to_numeric, errors='coerce').fillna(0)
y = SF_CTraid_ILTrain['Target']

# SHAP-based recursive feature elimination with the random-forest model.
shap_elimination = ShapRFECV(
    clf, step=0.2, cv=10, scoring='roc_auc', n_jobs=2)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Show the num_features, features_set and val_metric_mean columns of the elimination report
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('SF_RandomForestClassifier.csv')

# **ML Model For Best Features of RF MODEL (CTraid)**

# **First RF Set**

In [None]:
# Load the "First RF Set" feature table from the CTraid folder on Drive.
First_SF_RandomForestClassifier_Set_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTraid/First_SF_RandomForestClassifier.csv')

# Coerce every column to numeric (unparseable cells become NaN), then mark the
# Target label column as categorical.
First_SF_RandomForestClassifier_Set_ILTrain = First_SF_RandomForestClassifier_Set_ILTrain.apply(pd.to_numeric, errors='coerce')
if 'Target' in First_SF_RandomForestClassifier_Set_ILTrain.columns:
  First_SF_RandomForestClassifier_Set_ILTrain['Target'] = First_SF_RandomForestClassifier_Set_ILTrain['Target'].astype('category')

# Replace the remaining NaNs with 0 and display the resulting column dtypes.
First_SF_RandomForestClassifier_Set_ILTrain.fillna(0, inplace=True)
First_SF_RandomForestClassifier_Set_ILTrain.dtypes

In [None]:
# Split the selected-feature table into predictors and the class label.
X = First_SF_RandomForestClassifier_Set_ILTrain.drop(['Target'], axis=1)
y = First_SF_RandomForestClassifier_Set_ILTrain.Target

# Empty results table; the loop below appends one row per classifier.
First_SF_RandomForestClassifier_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
# Shuffled 10-fold split with a fixed seed, shared by every classifier.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# Classifiers to compare, each with library defaults; `stack` is the
# StackingClassifier built near the top of the notebook.
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class labels: each row is predicted by a model fitted without it.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): y_pred holds hard labels, so this is the AUC of a single
  # operating point rather than of ranked scores.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix sorts the labels, so row 0 is the smaller label (negatives)
  # and row 1 the larger (positives). Sensitivity is therefore the row-1 recall
  # and specificity the row-0 recall. Assumes a binary Target whose positive
  # class is the larger label.
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
  First_SF_RandomForestClassifier_Set_ILTrain_Metrics.loc[len(First_SF_RandomForestClassifier_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(First_SF_RandomForestClassifier_Set_ILTrain_Metrics)
First_SF_RandomForestClassifier_Set_ILTrain_Metrics.to_csv("First_SF_RandomForestClassifier_Set_ILTrain_Metrics.csv")
# Fit the stacked ensemble on all rows and keep the base estimators' outputs for
# those same rows (in-sample). They are written to a separate file so the metrics
# CSV saved above is not overwritten.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("First_SF_RandomForestClassifier_Set_ILTrain_Stacking_Outputs.csv")
print(prob)

In [None]:
First_SF_RandomForestClassifier_Set_ILTrain_Metrics

In [None]:
First_SF_RandomForestClassifier_Set_ILTrain_Metrics.to_csv('First_SF_RandomForestClassifier_Set_CTraid_Set_ML Result.csv')

# **SVM**

In [None]:
from sklearn import svm

In [None]:
# Linear-kernel SVM used as the model for SHAP feature elimination.
clf = svm.SVC(C=0.2, kernel='linear', degree=3)

# SVC has neither `n_estimators` nor `num_leaves`; fitting a search over them
# raises an invalid-parameter error. Search the regularisation strength instead,
# bracketing the configured C=0.2.
param_grid = {
    'C': [0.1, 0.2, 0.5, 1.0],
}
# n_iter matches the grid size so the search does not warn about a too-small space.
# NOTE(review): `search` is only constructed here; the following cell passes `clf`
# (not `search`) to ShapRFECV.
search = RandomizedSearchCV(clf, param_grid, n_iter=len(param_grid['C']))

In [None]:
# SHAP-based recursive feature elimination with 10-fold CV scored by ROC AUC.
# step=0.2: per the probatus docs, the fraction of remaining features removed each round.
# NOTE(review): X and y are not assigned in this cell; in notebook order they come
# from the "First RF Set" evaluation cell (an already reduced feature subset) -
# confirm this is the intended input.
shap_elimination = ShapRFECV(clf, n_jobs=3, scoring='roc_auc', cv=10, step=0.2)
report = shap_elimination.fit_compute(X, y)

In [None]:

# Show the num_features, features_set and val_metric_mean columns of the elimination report
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('SF_SVCFfile.csv')

# **ML Model For Best Features of SVC MODEL (CTraid)**

# **First SVC Set**

In [None]:
# Load the "First SVC Set" feature table from the CTraid folder on Drive.
First_SF_SVCFfile_Set_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTraid/First_SF_SVCFfile.csv')

# Coerce every column to numeric (unparseable cells become NaN), then mark the
# Target label column as categorical.
First_SF_SVCFfile_Set_ILTrain = First_SF_SVCFfile_Set_ILTrain.apply(pd.to_numeric, errors='coerce')
if 'Target' in First_SF_SVCFfile_Set_ILTrain.columns:
  First_SF_SVCFfile_Set_ILTrain['Target'] = First_SF_SVCFfile_Set_ILTrain['Target'].astype('category')

# Replace the remaining NaNs with 0 and display the resulting column dtypes.
First_SF_SVCFfile_Set_ILTrain.fillna(0, inplace=True)
First_SF_SVCFfile_Set_ILTrain.dtypes

In [None]:
# Split the selected-feature table into predictors and the class label.
X = First_SF_SVCFfile_Set_ILTrain.drop(['Target'], axis=1)
y = First_SF_SVCFfile_Set_ILTrain.Target

# Empty results table; the loop below appends one row per classifier.
First_SF_SVCFfile_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
# Shuffled 10-fold split with a fixed seed, shared by every classifier.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# Classifiers to compare, each with library defaults; `stack` is the
# StackingClassifier built near the top of the notebook.
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class labels: each row is predicted by a model fitted without it.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): y_pred holds hard labels, so this is the AUC of a single
  # operating point rather than of ranked scores.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix sorts the labels, so row 0 is the smaller label (negatives)
  # and row 1 the larger (positives). Sensitivity is therefore the row-1 recall
  # and specificity the row-0 recall. Assumes a binary Target whose positive
  # class is the larger label.
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
  First_SF_SVCFfile_Set_ILTrain_Metrics.loc[len(First_SF_SVCFfile_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(First_SF_SVCFfile_Set_ILTrain_Metrics)
First_SF_SVCFfile_Set_ILTrain_Metrics.to_csv("First_SF_SVCFfile_Set_ILTrain_Metrics.csv")
# Fit the stacked ensemble on all rows and keep the base estimators' outputs for
# those same rows (in-sample). They are written to a separate file so the metrics
# CSV saved above is not overwritten.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("First_SF_SVCFfile_Set_ILTrain_Stacking_Outputs.csv")
print(prob)

In [None]:
First_SF_SVCFfile_Set_ILTrain_Metrics

In [None]:
First_SF_SVCFfile_Set_ILTrain_Metrics.to_csv('First_SF_SVCFfile_CTraid_Set_ML Result.csv')

# **XGB**

In [None]:
from xgboost import XGBClassifier

In [None]:
# XGBoost model used for SHAP feature elimination. The former C / kernel / degree
# arguments are SVC options that XGBoost does not define, so they are dropped.
clf = XGBClassifier()
# `num_leaves` is a LightGBM name; XGBoost's leaf-count limit is `max_leaves`.
param_grid = {
    'n_estimators': [ 5, 7, 10],
    'max_leaves': [3, 5, 7, 10],
}
# NOTE(review): `search` is only constructed here; the following cell passes `clf`
# (not `search`) to ShapRFECV.
search = RandomizedSearchCV(clf, param_grid)

In [None]:
# SHAP-based recursive feature elimination with 10-fold CV scored by ROC AUC.
# step=0.2: per the probatus docs, the fraction of remaining features removed each round.
# NOTE(review): X and y are not assigned in this cell; in notebook order they come
# from the "First SVC Set" evaluation cell (an already reduced feature subset) -
# confirm this is the intended input.
shap_elimination = ShapRFECV(clf, n_jobs=3, scoring='roc_auc', cv=10, step=0.2)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Show the num_features, features_set and val_metric_mean columns of the elimination report
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
# Save the num_features, features_set and val_metric_mean columns of the report to CSV
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('SF_XGBClassifier.csv')

# **ML Model For Best Features of XGB MODEL (CTraid)**

# **First XGB Set**

In [None]:
# Load the "First XGB Set" feature table from the CTraid folder on Drive.
First_SF_XGBClassifier_Set_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTraid/First_SF_XGBClassifier.csv')

# Coerce every column to numeric (unparseable cells become NaN), then mark the
# Target label column as categorical.
First_SF_XGBClassifier_Set_ILTrain = First_SF_XGBClassifier_Set_ILTrain.apply(pd.to_numeric, errors='coerce')
if 'Target' in First_SF_XGBClassifier_Set_ILTrain.columns:
  First_SF_XGBClassifier_Set_ILTrain['Target'] = First_SF_XGBClassifier_Set_ILTrain['Target'].astype('category')

# Replace the remaining NaNs with 0 and display the resulting column dtypes.
First_SF_XGBClassifier_Set_ILTrain.fillna(0, inplace=True)
First_SF_XGBClassifier_Set_ILTrain.dtypes

In [None]:
# Split the selected-feature table into predictors and the class label.
X = First_SF_XGBClassifier_Set_ILTrain.drop(['Target'], axis=1)
y = First_SF_XGBClassifier_Set_ILTrain.Target

# Empty results table; the loop below appends one row per classifier.
First_SF_XGBClassifier_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
# Shuffled 10-fold split with a fixed seed, shared by every classifier.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# Classifiers to compare, each with library defaults; `stack` is the
# StackingClassifier built near the top of the notebook.
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class labels: each row is predicted by a model fitted without it.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): y_pred holds hard labels, so this is the AUC of a single
  # operating point rather than of ranked scores.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix sorts the labels, so row 0 is the smaller label (negatives)
  # and row 1 the larger (positives). Sensitivity is therefore the row-1 recall
  # and specificity the row-0 recall. Assumes a binary Target whose positive
  # class is the larger label.
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
  First_SF_XGBClassifier_Set_ILTrain_Metrics.loc[len(First_SF_XGBClassifier_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(First_SF_XGBClassifier_Set_ILTrain_Metrics)
First_SF_XGBClassifier_Set_ILTrain_Metrics.to_csv("First_SF_XGBClassifier_Set_ILTrain_Metrics.csv")
# Fit the stacked ensemble on all rows and keep the base estimators' outputs for
# those same rows (in-sample). They are written to a separate file so the metrics
# CSV saved above is not overwritten.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("First_SF_XGBClassifier_Set_ILTrain_Stacking_Outputs.csv")
print(prob)

In [None]:
First_SF_XGBClassifier_Set_ILTrain_Metrics

In [None]:
First_SF_XGBClassifier_Set_ILTrain_Metrics.to_csv('First_SF_XGBClassifier_CTraid_Set_ML Result.csv')

# **DECISION TREE CLASSIFIER**

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Single decision tree (Gini criterion, best-split strategy) used as the model
# for SHAP feature elimination.
clf = DecisionTreeClassifier(criterion='gini', splitter='best')

# A single tree has neither `n_estimators` nor `num_leaves`; fitting a search over
# them raises an invalid-parameter error. Its leaf-count limit is `max_leaf_nodes`.
param_grid = {
    'max_leaf_nodes': [3, 5, 7, 10],
}
# n_iter matches the grid size so the search does not warn about a too-small space.
# NOTE(review): `search` is only constructed here; the following cell passes `clf`
# (not `search`) to ShapRFECV.
search = RandomizedSearchCV(clf, param_grid, n_iter=len(param_grid['max_leaf_nodes']))


In [None]:
# SHAP-based recursive feature elimination with 10-fold CV scored by ROC AUC.
# step=0.2: per the probatus docs, the fraction of remaining features removed each round.
# NOTE(review): X and y are not assigned in this cell; in notebook order they come
# from the "First XGB Set" evaluation cell (an already reduced feature subset) -
# confirm this is the intended input.
shap_elimination = ShapRFECV(clf, n_jobs=3, scoring='roc_auc', cv=10, step=0.2)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Show the num_features, features_set and val_metric_mean columns of the elimination report
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('SF_DecisionTreeClassifier.csv')

# **ML Model For Best Features of DTC MODEL (CTraid)**

# **First DTC Set**

In [None]:
# Load the "First DTC Set" feature table from the CTraid folder on Drive.
First_SF_DecisionTreeClassifier_Set_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTraid/First_SF_DecisionTreeClassifier.csv')

# Coerce every column to numeric (unparseable cells become NaN), then mark the
# Target label column as categorical.
First_SF_DecisionTreeClassifier_Set_ILTrain = First_SF_DecisionTreeClassifier_Set_ILTrain.apply(pd.to_numeric, errors='coerce')
if 'Target' in First_SF_DecisionTreeClassifier_Set_ILTrain.columns:
  First_SF_DecisionTreeClassifier_Set_ILTrain['Target'] = First_SF_DecisionTreeClassifier_Set_ILTrain['Target'].astype('category')

# Replace the remaining NaNs with 0 and display the resulting column dtypes.
First_SF_DecisionTreeClassifier_Set_ILTrain.fillna(0, inplace=True)
First_SF_DecisionTreeClassifier_Set_ILTrain.dtypes

In [None]:
# Split the selected-feature table into predictors and the class label.
X = First_SF_DecisionTreeClassifier_Set_ILTrain.drop(['Target'], axis=1)
y = First_SF_DecisionTreeClassifier_Set_ILTrain.Target

# Empty results table; the loop below appends one row per classifier.
First_SF_DecisionTreeClassifier_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
# Shuffled 10-fold split with a fixed seed, shared by every classifier.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# Classifiers to compare, each with library defaults; `stack` is the
# StackingClassifier built near the top of the notebook.
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class labels: each row is predicted by a model fitted without it.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): y_pred holds hard labels, so this is the AUC of a single
  # operating point rather than of ranked scores.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix sorts the labels, so row 0 is the smaller label (negatives)
  # and row 1 the larger (positives). Sensitivity is therefore the row-1 recall
  # and specificity the row-0 recall. Assumes a binary Target whose positive
  # class is the larger label.
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
  First_SF_DecisionTreeClassifier_Set_ILTrain_Metrics.loc[len(First_SF_DecisionTreeClassifier_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(First_SF_DecisionTreeClassifier_Set_ILTrain_Metrics)
First_SF_DecisionTreeClassifier_Set_ILTrain_Metrics.to_csv("First_SF_DecisionTreeClassifier_Set_ILTrain_Metrics.csv")
# Fit the stacked ensemble on all rows and keep the base estimators' outputs for
# those same rows (in-sample). They are written to a separate file so the metrics
# CSV saved above is not overwritten.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("First_SF_DecisionTreeClassifier_Set_ILTrain_Stacking_Outputs.csv")
print(prob)

In [None]:
First_SF_DecisionTreeClassifier_Set_ILTrain_Metrics

In [None]:
First_SF_DecisionTreeClassifier_Set_ILTrain_Metrics.to_csv('First_SF_DecisionTreeClassifier_CTraid_Set_ML Result.csv')

# **DPC FEATURE SELECTION**

In [None]:
!pip install SHAP
!pip install probatus
!pip install lightgbm

In [None]:
from probatus.feature_elimination import ShapRFECV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import lightgbm
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV


# **ALL MODEL FOR DPC**

In [None]:
# Load the full DPC training table from Drive.
SF_DPC_ILTrain  = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/DPC_Train.csv')

for column in SF_DPC_ILTrain.columns:
  # Coerce to numeric; unparseable cells become NaN.
  SF_DPC_ILTrain[column] = pd.to_numeric(SF_DPC_ILTrain[column], errors='coerce')
  # Compare the stripped header so both 'Target' and 'Target ' (trailing space)
  # are recognised as the label column, consistent with the other loaders.
  if str(column).strip() == 'Target':
    SF_DPC_ILTrain[column] = SF_DPC_ILTrain[column].astype('category')

# Replace the remaining NaNs with 0 and display the resulting column dtypes.
SF_DPC_ILTrain.fillna(0, inplace=True)
SF_DPC_ILTrain.dtypes

In [None]:
import shap as SHAP
# NOTE(review): this re-reads the same CSV as the previous cell, so SF_DPC_ILTrain
# is rebound to the raw file contents and the numeric coercion / fillna(0) applied
# in that cell no longer applies to this variable - confirm the re-read is intended.
SF_DPC_ILTrain  = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/DPC_Train.csv')

In [None]:
# Column headers of the DPC table as a plain Python list.
feature_names = SF_DPC_ILTrain.columns.tolist()

In [None]:
SF_DPC_ILTrain.info()

In [None]:
# Build the feature-elimination inputs from the real DPC table. A
# make_classification call here would generate random synthetic features and
# labels that merely carry the DPC column names, so the selection would never
# see the actual dataset.

# Match the label column on its stripped header so 'Target' and 'Target ' both work.
target_candidates = [col for col in SF_DPC_ILTrain.columns if str(col).strip() == 'Target']
if not target_candidates:
  raise KeyError("SF_DPC_ILTrain has no 'Target' column")
target_column = target_candidates[0]

# Same cleaning as the other loaders: unparseable cells -> NaN -> 0.
dpc_numeric = SF_DPC_ILTrain.apply(pd.to_numeric, errors='coerce').fillna(0)
X = dpc_numeric.drop(columns=[target_column])
# NOTE(review): assumes Target holds integer class codes (e.g. 0/1) - confirm.
y = dpc_numeric[target_column].astype(int)

In [None]:
feature_names = SF_DPC_ILTrain.columns

In [None]:
# Preview the first rows of the first five feature columns (selected by position).
X.iloc[:, :5].head()

# **LGBM CLASSIFIER**

In [None]:
# Class-balanced LightGBM classifier with tree depth capped at 5.
clf = lightgbm.LGBMClassifier(class_weight='balanced', max_depth=5)

# Candidate hyperparameter values; the randomized search samples combinations of these.
param_grid = dict(
    n_estimators=[5, 7, 10],
    num_leaves=[3, 5, 7, 10],
)
search = RandomizedSearchCV(estimator=clf, param_distributions=param_grid)

In [None]:
# SHAP-based recursive feature elimination with 10-fold CV scored by ROC AUC; the
# randomized search (not the bare classifier) is the model handed to ShapRFECV here.
# step=0.2: per the probatus docs, the fraction of remaining features removed each round.
# X and y are not assigned in this cell; they come from the DPC preparation cells above.
shap_elimination = ShapRFECV(clf=search, n_jobs=3, scoring='roc_auc', cv=10, step=0.2)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Show the num_features, features_set and val_metric_mean columns of the elimination report
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('SF_LGBMCLASSIFIER.csv')

# **ML Model For Best Features of LGBM Classifier (DPC)**

# **FIRST LGBM SET**

In [None]:
# Load the "First LGBM Set" feature table from the DPC folder on Drive.
First_SF_LGBMCLASSIFIER_Set_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/DPC/First_SF_LGBMCLASSIFIER.csv')

# Coerce every column to numeric (unparseable cells become NaN), then mark the
# Target label column as categorical.
First_SF_LGBMCLASSIFIER_Set_ILTrain = First_SF_LGBMCLASSIFIER_Set_ILTrain.apply(pd.to_numeric, errors='coerce')
if 'Target' in First_SF_LGBMCLASSIFIER_Set_ILTrain.columns:
  First_SF_LGBMCLASSIFIER_Set_ILTrain['Target'] = First_SF_LGBMCLASSIFIER_Set_ILTrain['Target'].astype('category')

# Replace the remaining NaNs with 0 and display the resulting column dtypes.
First_SF_LGBMCLASSIFIER_Set_ILTrain.fillna(0, inplace=True)
First_SF_LGBMCLASSIFIER_Set_ILTrain.dtypes

In [None]:
# Split the selected-feature table into predictors and the class label.
X = First_SF_LGBMCLASSIFIER_Set_ILTrain.drop(['Target'], axis=1)
y = First_SF_LGBMCLASSIFIER_Set_ILTrain.Target

# Empty results table; the loop below appends one row per classifier.
First_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
# Shuffled 10-fold split with a fixed seed, shared by every classifier.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# Classifiers to compare, each with library defaults; `stack` is the
# StackingClassifier built near the top of the notebook.
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class labels: each row is predicted by a model fitted without it.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): y_pred holds hard labels, so this is the AUC of a single
  # operating point rather than of ranked scores.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix sorts the labels, so row 0 is the smaller label (negatives)
  # and row 1 the larger (positives). Sensitivity is therefore the row-1 recall
  # and specificity the row-0 recall. Assumes a binary Target whose positive
  # class is the larger label.
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
  First_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.loc[len(First_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(First_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics)
First_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv("First_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.csv")
# Fit the stacked ensemble on all rows and keep the base estimators' outputs for
# those same rows (in-sample).
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
# NOTE(review): the output file carries a "CKSAAP_" prefix although this section
# works on DPC features - looks like a copy-paste leftover; confirm the name.
pd.DataFrame(prob).to_csv("CKSAAP_LGBM_First_Set_ILTrain_Metrics.csv")
print(prob)

In [None]:
First_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics

In [None]:
# Save the metrics table (not the input feature table), matching the sibling
# "DPC_<n>_SF_LGBMCLASSIFIER_Set.csv" cells for the other LGBM sets.
First_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv('DPC_First_SF_LGBMCLASSIFIER_Set.csv')

# **SECOND LGBM SET**

In [None]:
# Load the "Second LGBM Set" feature table from the DPC folder on Drive.
Second_SF_LGBMCLASSIFIER_Set_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/DPC/Second_SF_LGBMCLASSIFIER.csv')

# Coerce every column to numeric (unparseable cells become NaN), then mark the
# Target label column as categorical.
Second_SF_LGBMCLASSIFIER_Set_ILTrain = Second_SF_LGBMCLASSIFIER_Set_ILTrain.apply(pd.to_numeric, errors='coerce')
if 'Target' in Second_SF_LGBMCLASSIFIER_Set_ILTrain.columns:
  Second_SF_LGBMCLASSIFIER_Set_ILTrain['Target'] = Second_SF_LGBMCLASSIFIER_Set_ILTrain['Target'].astype('category')

# Replace the remaining NaNs with 0 and display the resulting column dtypes.
Second_SF_LGBMCLASSIFIER_Set_ILTrain.fillna(0, inplace=True)
Second_SF_LGBMCLASSIFIER_Set_ILTrain.dtypes

In [None]:
# Split the selected-feature table into predictors and the class label.
X = Second_SF_LGBMCLASSIFIER_Set_ILTrain.drop(['Target'], axis=1)
y = Second_SF_LGBMCLASSIFIER_Set_ILTrain.Target

# Empty results table; the loop below appends one row per classifier.
Second_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
# Shuffled 10-fold split with a fixed seed, shared by every classifier.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# Classifiers to compare, each with library defaults; `stack` is the
# StackingClassifier built near the top of the notebook.
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class labels: each row is predicted by a model fitted without it.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): y_pred holds hard labels, so this is the AUC of a single
  # operating point rather than of ranked scores.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix sorts the labels, so row 0 is the smaller label (negatives)
  # and row 1 the larger (positives). Sensitivity is therefore the row-1 recall
  # and specificity the row-0 recall. Assumes a binary Target whose positive
  # class is the larger label.
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
  Second_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.loc[len(Second_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(Second_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics)
Second_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv("Second_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.csv")
# Fit the stacked ensemble on all rows and keep the base estimators' outputs for
# those same rows (in-sample). They are written to a separate file so the metrics
# CSV saved above is not overwritten.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("Second_SF_LGBMCLASSIFIER_Set_ILTrain_Stacking_Outputs.csv")
print(prob)

In [None]:
Second_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics

In [None]:
Second_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv('DPC_Second_SF_LGBMCLASSIFIER_Set.csv')

# **Third LGBM SET**

In [None]:
# Load the "Third LGBM Set" feature table from the DPC folder on Drive.
Third_SF_LGBMCLASSIFIER_Set_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/DPC/Third_SF_LGBMCLASSIFIER.csv')

# Coerce every column to numeric (unparseable cells become NaN), then mark the
# Target label column as categorical.
Third_SF_LGBMCLASSIFIER_Set_ILTrain = Third_SF_LGBMCLASSIFIER_Set_ILTrain.apply(pd.to_numeric, errors='coerce')
if 'Target' in Third_SF_LGBMCLASSIFIER_Set_ILTrain.columns:
  Third_SF_LGBMCLASSIFIER_Set_ILTrain['Target'] = Third_SF_LGBMCLASSIFIER_Set_ILTrain['Target'].astype('category')

# Replace the remaining NaNs with 0 and display the resulting column dtypes.
Third_SF_LGBMCLASSIFIER_Set_ILTrain.fillna(0, inplace=True)
Third_SF_LGBMCLASSIFIER_Set_ILTrain.dtypes

In [None]:
# Split the selected-feature table into predictors and the class label.
X = Third_SF_LGBMCLASSIFIER_Set_ILTrain.drop(['Target'], axis=1)
y = Third_SF_LGBMCLASSIFIER_Set_ILTrain.Target

# Empty results table; the loop below appends one row per classifier.
Third_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
# Shuffled 10-fold split with a fixed seed, shared by every classifier.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# Classifiers to compare, each with library defaults; `stack` is the
# StackingClassifier built near the top of the notebook.
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class labels: each row is predicted by a model fitted without it.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): y_pred holds hard labels, so this is the AUC of a single
  # operating point rather than of ranked scores.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix sorts the labels, so row 0 is the smaller label (negatives)
  # and row 1 the larger (positives). Sensitivity is therefore the row-1 recall
  # and specificity the row-0 recall. Assumes a binary Target whose positive
  # class is the larger label.
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
  Third_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.loc[len(Third_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(Third_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics)
Third_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv("Third_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.csv")
# Fit the stacked ensemble on all rows and keep the base estimators' outputs for
# those same rows (in-sample). They are written to a separate file so the metrics
# CSV saved above is not overwritten.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("Third_SF_LGBMCLASSIFIER_Set_ILTrain_Stacking_Outputs.csv")
print(prob)

In [None]:
Third_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics

In [None]:
Third_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv('DPC_Third_SF_LGBMCLASSIFIER_Set.csv')

# **FOURTH LGBM SET**

In [None]:
# Load the "Fourth LGBM Set" feature table from the DPC folder on Drive.
Fourth_SF_LGBMCLASSIFIER_Set_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/DPC/Fourth_SF_LGBMCLASSIFIER.csv')

# Coerce every column to numeric (unparseable cells become NaN), then mark the
# Target label column as categorical.
Fourth_SF_LGBMCLASSIFIER_Set_ILTrain = Fourth_SF_LGBMCLASSIFIER_Set_ILTrain.apply(pd.to_numeric, errors='coerce')
if 'Target' in Fourth_SF_LGBMCLASSIFIER_Set_ILTrain.columns:
  Fourth_SF_LGBMCLASSIFIER_Set_ILTrain['Target'] = Fourth_SF_LGBMCLASSIFIER_Set_ILTrain['Target'].astype('category')

# Replace the remaining NaNs with 0 and display the resulting column dtypes.
Fourth_SF_LGBMCLASSIFIER_Set_ILTrain.fillna(0, inplace=True)
Fourth_SF_LGBMCLASSIFIER_Set_ILTrain.dtypes

In [None]:
# Split the selected-feature table into predictors and the class label.
X = Fourth_SF_LGBMCLASSIFIER_Set_ILTrain.drop(['Target'], axis=1)
y = Fourth_SF_LGBMCLASSIFIER_Set_ILTrain.Target

# Empty results table; the loop below appends one row per classifier.
Fourth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
# Shuffled 10-fold split with a fixed seed, shared by every classifier.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# Classifiers to compare, each with library defaults; `stack` is the
# StackingClassifier built near the top of the notebook.
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class labels: each row is predicted by a model fitted without it.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): y_pred holds hard labels, so this is the AUC of a single
  # operating point rather than of ranked scores.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix sorts the labels, so row 0 is the smaller label (negatives)
  # and row 1 the larger (positives). Sensitivity is therefore the row-1 recall
  # and specificity the row-0 recall. Assumes a binary Target whose positive
  # class is the larger label.
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
  Fourth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.loc[len(Fourth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(Fourth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics)
Fourth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv("Fourth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.csv")
# Fit the stacked ensemble on all rows and keep the base estimators' outputs for
# those same rows (in-sample). They are written to a separate file so the metrics
# CSV saved above is not overwritten.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("Fourth_SF_LGBMCLASSIFIER_Set_ILTrain_Stacking_Outputs.csv")
print(prob)

In [None]:
Fourth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics

In [None]:
Fourth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv('DPC_Fourth_SF_LGBMCLASSIFIER_Set.csv')

# **Fifth LGBM SET**

In [None]:
# Load the "Fifth LGBM Set" feature table from the DPC folder on Drive.
Fifth_SF_LGBMCLASSIFIER_Set_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/DPC/Fifth_SF_LGBMCLASSIFIER.csv')

# Coerce every column to numeric (unparseable cells become NaN), then mark the
# Target label column as categorical.
Fifth_SF_LGBMCLASSIFIER_Set_ILTrain = Fifth_SF_LGBMCLASSIFIER_Set_ILTrain.apply(pd.to_numeric, errors='coerce')
if 'Target' in Fifth_SF_LGBMCLASSIFIER_Set_ILTrain.columns:
  Fifth_SF_LGBMCLASSIFIER_Set_ILTrain['Target'] = Fifth_SF_LGBMCLASSIFIER_Set_ILTrain['Target'].astype('category')

# Replace the remaining NaNs with 0 and display the resulting column dtypes.
Fifth_SF_LGBMCLASSIFIER_Set_ILTrain.fillna(0, inplace=True)
Fifth_SF_LGBMCLASSIFIER_Set_ILTrain.dtypes

In [None]:
# Split the selected-feature table into predictors and the class label.
X = Fifth_SF_LGBMCLASSIFIER_Set_ILTrain.drop(['Target'], axis=1)
y = Fifth_SF_LGBMCLASSIFIER_Set_ILTrain.Target

# Empty results table; the loop below appends one row per classifier.
Fifth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
# Shuffled 10-fold split with a fixed seed, shared by every classifier.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# Classifiers to compare, each with library defaults; `stack` is the
# StackingClassifier built near the top of the notebook.
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class labels: each row is predicted by a model fitted without it.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): y_pred holds hard labels, so this is the AUC of a single
  # operating point rather than of ranked scores.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix sorts the labels, so row 0 is the smaller label (negatives)
  # and row 1 the larger (positives). Sensitivity is therefore the row-1 recall
  # and specificity the row-0 recall. Assumes a binary Target whose positive
  # class is the larger label.
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
  Fifth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.loc[len(Fifth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(Fifth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics)
Fifth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv("Fifth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.csv")
# Fit the stacked ensemble on all rows and keep the base estimators' outputs for
# those same rows (in-sample). They are written to a separate file so the metrics
# CSV saved above is not overwritten.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("Fifth_SF_LGBMCLASSIFIER_Set_ILTrain_Stacking_Outputs.csv")
print(prob)

In [None]:
Fifth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics

In [None]:
Fifth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv('DPC_Fifth_SF_LGBMCLASSIFIER_Set.csv')

# **Sixth LGBM SET**

In [None]:
# Load the "Sixth LGBM Set" feature table from the DPC folder on Drive.
Sixth_SF_LGBMCLASSIFIER_Set_ILTrain = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/DPC/Sixth_SF_LGBMCLASSIFIER.csv')

# Coerce every column to numeric (unparseable cells become NaN), then mark the
# Target label column as categorical.
Sixth_SF_LGBMCLASSIFIER_Set_ILTrain = Sixth_SF_LGBMCLASSIFIER_Set_ILTrain.apply(pd.to_numeric, errors='coerce')
if 'Target' in Sixth_SF_LGBMCLASSIFIER_Set_ILTrain.columns:
  Sixth_SF_LGBMCLASSIFIER_Set_ILTrain['Target'] = Sixth_SF_LGBMCLASSIFIER_Set_ILTrain['Target'].astype('category')

# Replace the remaining NaNs with 0 and display the resulting column dtypes.
Sixth_SF_LGBMCLASSIFIER_Set_ILTrain.fillna(0, inplace=True)
Sixth_SF_LGBMCLASSIFIER_Set_ILTrain.dtypes

In [None]:
# Split the selected-feature table into predictors and the class label.
X = Sixth_SF_LGBMCLASSIFIER_Set_ILTrain.drop(['Target'], axis=1)
y = Sixth_SF_LGBMCLASSIFIER_Set_ILTrain.Target

# Empty results table; the loop below appends one row per classifier.
Sixth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
# Shuffled 10-fold split with a fixed seed, shared by every classifier.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# Classifiers to compare, each with library defaults; `stack` is the
# StackingClassifier built near the top of the notebook.
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold class labels: each row is predicted by a model fitted without it.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): y_pred holds hard labels, so this is the AUC of a single
  # operating point rather than of ranked scores.
  auc = roc_auc_score(y, y_pred)
  # confusion_matrix sorts the labels, so row 0 is the smaller label (negatives)
  # and row 1 the larger (positives). Sensitivity is therefore the row-1 recall
  # and specificity the row-0 recall. Assumes a binary Target whose positive
  # class is the larger label.
  sensitivity = cm1[1,1]/(cm1[1,0]+cm1[1,1])
  specificity = cm1[0,0]/(cm1[0,0]+cm1[0,1])
  Sixth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.loc[len(Sixth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.index)] = [model,Accuracy, mcc,auc, sensitivity, specificity]

print(Sixth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics)
Sixth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv("Sixth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.csv")
# Fit the stacked ensemble on all rows and keep the base estimators' outputs for
# those same rows (in-sample). They are written to a dedicated .csv file (the
# name previously had no extension).
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("Sixth_SF_LGBMCLASSIFIER_Set_ILTrain_Stacking_Outputs.csv")
print(prob)

In [None]:
Sixth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics

In [None]:
Sixth_SF_LGBMCLASSIFIER_Set_ILTrain_Metrics.to_csv('DPC_Sixth_SF_LGBMCLASSIFIER_Set.csv')

# **LRM**

In [None]:
# Logistic regression used as the model for SHAP feature elimination.
clf = LogisticRegression(tol=0.0001, C=1.0)

# LogisticRegression has neither `n_estimators` nor `num_leaves`; fitting a search
# over them raises an invalid-parameter error. Search the regularisation strength
# instead, bracketing the configured C=1.0.
param_grid = {
    'C': [0.01, 0.1, 1.0, 10.0],
}
# n_iter matches the grid size so the search does not warn about a too-small space.
# NOTE(review): `search` is only constructed here; the following cell passes `clf`
# (not `search`) to ShapRFECV.
search = RandomizedSearchCV(clf, param_grid, n_iter=len(param_grid['C']))

In [None]:
# SHAP-based recursive feature elimination with 10-fold CV scored by ROC AUC.
# step=0.2: per the probatus docs, the fraction of remaining features removed each round.
# NOTE(review): X and y are not assigned in this cell; in notebook order they come
# from the "Sixth LGBM Set" evaluation cell (an already reduced feature subset),
# not from the full DPC table - confirm this is the intended input.
shap_elimination = ShapRFECV(clf, n_jobs=3, scoring='roc_auc', cv=10, step=0.2)
report = shap_elimination.fit_compute(X, y)


In [None]:
# Show the three report columns of interest: number of features, the feature set,
# and the mean validation metric.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
# Export the same three report columns to CSV.
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('SF_LogisticRegression.csv')

# **ML Model For Best Features of LR MODEL (DPC)**

# **First LR Model**

In [None]:
# Read First_SF_LogisticRegression.csv from Drive and coerce every column to a
# numeric dtype; cells that cannot be parsed become NaN.
First_SF_LogisticRegression_Set_ILTrain = (
    pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/DPC/First_SF_LogisticRegression.csv')
    .apply(pd.to_numeric, errors='coerce')
)
# The label column, when present, is stored as a categorical.
if 'Target' in First_SF_LogisticRegression_Set_ILTrain.columns:
  First_SF_LogisticRegression_Set_ILTrain['Target'] = First_SF_LogisticRegression_Set_ILTrain['Target'].astype('category')

# Remaining NaNs are replaced with 0.
First_SF_LogisticRegression_Set_ILTrain = First_SF_LogisticRegression_Set_ILTrain.fillna(0)
First_SF_LogisticRegression_Set_ILTrain.dtypes


In [None]:
# Features / label for this feature subset.
X = First_SF_LogisticRegression_Set_ILTrain.drop(['Target'], axis=1)
y = First_SF_LogisticRegression_Set_ILTrain.Target

# Empty results table; one row is appended per classifier below.
First_SF_LogisticRegression_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# Classifiers under comparison (library defaults); `stack` is the StackingClassifier
# defined near the top of the notebook.
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold predictions from the 10-fold split.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # sklearn's binary confusion matrix is [[TN, FP], [FN, TP]] (rows = true label,
  # columns = prediction, labels sorted), so row 1 gives sensitivity and row 0
  # specificity. assumes the larger Target value is the positive class - TODO confirm
  tn, fp, fn, tp = cm1.ravel()
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): computed from hard class predictions, not scores/probabilities.
  auc = roc_auc_score(y, y_pred)
  sensitivity = tp / (tp + fn)
  specificity = tn / (tn + fp)
  First_SF_LogisticRegression_Set_ILTrain_Metrics.loc[len(First_SF_LogisticRegression_Set_ILTrain_Metrics.index)] = [model, Accuracy, mcc, auc, sensitivity, specificity]

print(First_SF_LogisticRegression_Set_ILTrain_Metrics)
First_SF_LogisticRegression_Set_ILTrain_Metrics.to_csv("First_SF_LogisticRegression_Set_ILTrain_Metrics.csv")
# Fit the stack on all rows and save the base learners' outputs. They go to their own
# .csv file so they are not mistaken for (or written over) the metrics table.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("First_SF_LogisticRegression_Set_ILTrain_StackingTransform.csv")
print(prob)

In [None]:
# Display the metrics table as the cell output.
First_SF_LogisticRegression_Set_ILTrain_Metrics

In [None]:
# Save the metrics table under a DPC-prefixed file name.
First_SF_LogisticRegression_Set_ILTrain_Metrics.to_csv('DPC_First_SF_LogisticRegression_Set.csv')

## **RF Classifier**

In [None]:
# Estimator passed to ShapRFECV below.
clf = RandomForestClassifier(max_depth=3, class_weight='balanced')

# RandomForestClassifier has no num_leaves parameter (that is a LightGBM name), so
# fitting `search` with it would fail; max_leaf_nodes is the scikit-learn setting
# that caps the number of leaves per tree.
param_grid = {
    'n_estimators': [5, 7, 10],
    'max_leaf_nodes': [3, 5, 7, 10],
}
search = RandomizedSearchCV(clf, param_grid)

In [None]:
# SHAP-based feature elimination with the plain `clf`; the `search` object built in
# the previous cell is not passed in.
# NOTE(review): X and y are not reassigned in this section, so this runs on whatever
# table the most recently executed cell left in X / y - confirm that is intended.
shap_elimination = ShapRFECV(
    clf, step=0.2, cv=10, scoring='roc_auc', n_jobs=2)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Show the three report columns of interest: number of features, the feature set,
# and the mean validation metric.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
# Export the same three report columns to CSV.
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('SF_RandomForestClassifier.csv')


# **FIRST RF Set**

In [None]:
# Read First_SF_RandomForestClassifier.csv from Drive and coerce every column to a
# numeric dtype; cells that cannot be parsed become NaN.
First_SF_RandomForestClassifier_Set_ILTrain = (
    pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/DPC/First_SF_RandomForestClassifier.csv')
    .apply(pd.to_numeric, errors='coerce')
)
# The label column, when present, is stored as a categorical.
if 'Target' in First_SF_RandomForestClassifier_Set_ILTrain.columns:
  First_SF_RandomForestClassifier_Set_ILTrain['Target'] = First_SF_RandomForestClassifier_Set_ILTrain['Target'].astype('category')

# Remaining NaNs are replaced with 0.
First_SF_RandomForestClassifier_Set_ILTrain = First_SF_RandomForestClassifier_Set_ILTrain.fillna(0)
First_SF_RandomForestClassifier_Set_ILTrain.dtypes

In [None]:
# Features / label for this feature subset.
X = First_SF_RandomForestClassifier_Set_ILTrain.drop(['Target'], axis=1)
y = First_SF_RandomForestClassifier_Set_ILTrain.Target

# Empty results table; one row is appended per classifier below.
First_SF_RandomForestClassifier_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# Classifiers under comparison (library defaults); `stack` is the StackingClassifier
# defined near the top of the notebook.
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold predictions from the 10-fold split.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # sklearn's binary confusion matrix is [[TN, FP], [FN, TP]] (rows = true label,
  # columns = prediction, labels sorted), so row 1 gives sensitivity and row 0
  # specificity. assumes the larger Target value is the positive class - TODO confirm
  tn, fp, fn, tp = cm1.ravel()
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): computed from hard class predictions, not scores/probabilities.
  auc = roc_auc_score(y, y_pred)
  sensitivity = tp / (tp + fn)
  specificity = tn / (tn + fp)
  First_SF_RandomForestClassifier_Set_ILTrain_Metrics.loc[len(First_SF_RandomForestClassifier_Set_ILTrain_Metrics.index)] = [model, Accuracy, mcc, auc, sensitivity, specificity]

print(First_SF_RandomForestClassifier_Set_ILTrain_Metrics)
First_SF_RandomForestClassifier_Set_ILTrain_Metrics.to_csv("First_SF_RandomForestClassifier_Set_ILTrain_Metrics.csv")
# Fit the stack on all rows and save the base learners' outputs. They go to their own
# .csv file so they are not mistaken for (or written over) the metrics table.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("First_SF_RandomForestClassifier_Set_ILTrain_StackingTransform.csv")
print(prob)

In [None]:
# Display the metrics table as the cell output.
First_SF_RandomForestClassifier_Set_ILTrain_Metrics

In [None]:
# Save the metrics table under a DPC-prefixed file name.
First_SF_RandomForestClassifier_Set_ILTrain_Metrics.to_csv('DPC_First_SF_RandomForestClassifier_Set.csv')

# **SVM**

In [None]:
from sklearn import svm

In [None]:
# Estimator passed to ShapRFECV below.
clf = svm.SVC(C=0.2, kernel='linear', degree=3)

# The grid used to list n_estimators / num_leaves (tree-ensemble settings); SVC
# accepts neither, so fitting `search` would fail with an invalid-parameter error.
# Search over the regularisation strength around the configured C instead.
param_grid = {
    'C': [0.05, 0.2, 1.0, 5.0],
}
# n_iter matches the four candidates so the search does not warn about a small grid.
search = RandomizedSearchCV(clf, param_grid, n_iter=4)


In [None]:
# SHAP-based feature elimination with the plain `clf`; the `search` object built in
# the previous cell is not passed in.
# NOTE(review): X and y are not reassigned in this section, so this runs on whatever
# table the most recently executed cell left in X / y - confirm that is intended.
shap_elimination = ShapRFECV(
    clf, step=0.2, cv=10, scoring='roc_auc', n_jobs=3)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Show the three report columns of interest: number of features, the feature set,
# and the mean validation metric.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
# Export the same three report columns to CSV.
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('SF_SVCFfile.csv')

# **First SVC Set**

In [None]:
# Read First-SF_SVCFfile.csv from Drive and coerce every column to a numeric dtype;
# cells that cannot be parsed become NaN.
First_SF_SVCFfile_Set_ILTrain = (
    pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/DPC/First-SF_SVCFfile.csv')
    .apply(pd.to_numeric, errors='coerce')
)
# The label column, when present, is stored as a categorical.
if 'Target' in First_SF_SVCFfile_Set_ILTrain.columns:
  First_SF_SVCFfile_Set_ILTrain['Target'] = First_SF_SVCFfile_Set_ILTrain['Target'].astype('category')

# Remaining NaNs are replaced with 0.
First_SF_SVCFfile_Set_ILTrain = First_SF_SVCFfile_Set_ILTrain.fillna(0)
First_SF_SVCFfile_Set_ILTrain.dtypes

In [None]:
# Features / label for this feature subset.
X = First_SF_SVCFfile_Set_ILTrain.drop(['Target'], axis=1)
y = First_SF_SVCFfile_Set_ILTrain.Target

# Empty results table; one row is appended per classifier below.
First_SF_SVCFfile_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# Classifiers under comparison (library defaults); `stack` is the StackingClassifier
# defined near the top of the notebook.
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold predictions from the 10-fold split.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # sklearn's binary confusion matrix is [[TN, FP], [FN, TP]] (rows = true label,
  # columns = prediction, labels sorted), so row 1 gives sensitivity and row 0
  # specificity. assumes the larger Target value is the positive class - TODO confirm
  tn, fp, fn, tp = cm1.ravel()
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): computed from hard class predictions, not scores/probabilities.
  auc = roc_auc_score(y, y_pred)
  sensitivity = tp / (tp + fn)
  specificity = tn / (tn + fp)
  First_SF_SVCFfile_Set_ILTrain_Metrics.loc[len(First_SF_SVCFfile_Set_ILTrain_Metrics.index)] = [model, Accuracy, mcc, auc, sensitivity, specificity]

print(First_SF_SVCFfile_Set_ILTrain_Metrics)
First_SF_SVCFfile_Set_ILTrain_Metrics.to_csv("First_SF_SVCFfile_Set_ILTrain_Metrics.csv")
# Fit the stack on all rows and save the base learners' outputs. They go to their own
# .csv file so they are not mistaken for (or written over) the metrics table.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("First_SF_SVCFfile_Set_ILTrain_StackingTransform.csv")
print(prob)

In [None]:
# Display the metrics table as the cell output.
First_SF_SVCFfile_Set_ILTrain_Metrics

In [None]:
# Save the metrics table under a DPC-prefixed file name.
First_SF_SVCFfile_Set_ILTrain_Metrics.to_csv('DPC_First_SF_SVCFfile_Set.csv')

# **XGB**

In [None]:
from xgboost import XGBClassifier

In [None]:
# Estimator passed to ShapRFECV below. C / kernel / degree are SVC arguments that
# were carried over from the SVM section; XGBClassifier has no such settings, so the
# model is built with its defaults.
clf = XGBClassifier()
# num_leaves is a LightGBM parameter name; tree size is searched through max_depth.
param_grid = {
    'n_estimators': [5, 7, 10],
    'max_depth': [3, 5, 7, 10],
}
search = RandomizedSearchCV(clf, param_grid)

In [None]:
# SHAP-based feature elimination with the plain `clf`; the `search` object built in
# the previous cell is not passed in.
# NOTE(review): X and y are not reassigned in this section, so this runs on whatever
# table the most recently executed cell left in X / y - confirm that is intended.
shap_elimination = ShapRFECV(
    clf, step=0.2, cv=10, scoring='roc_auc', n_jobs=3)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Show the three report columns of interest: number of features, the feature set,
# and the mean validation metric.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
# Export the same three report columns to CSV.
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('SF_XGBClassifier.csv')

# **FIRST XGB Set**

In [None]:
# Read First_SF_XGBClassifier.csv from Drive and coerce every column to a numeric
# dtype; cells that cannot be parsed become NaN.
First_SF_XGBClassifier_Set_ILTrain = (
    pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/DPC/First_SF_XGBClassifier.csv')
    .apply(pd.to_numeric, errors='coerce')
)
# The label column, when present, is stored as a categorical.
if 'Target' in First_SF_XGBClassifier_Set_ILTrain.columns:
  First_SF_XGBClassifier_Set_ILTrain['Target'] = First_SF_XGBClassifier_Set_ILTrain['Target'].astype('category')

# Remaining NaNs are replaced with 0.
First_SF_XGBClassifier_Set_ILTrain = First_SF_XGBClassifier_Set_ILTrain.fillna(0)
First_SF_XGBClassifier_Set_ILTrain.dtypes

In [None]:
# Features / label for this feature subset.
X = First_SF_XGBClassifier_Set_ILTrain.drop(['Target'], axis=1)
y = First_SF_XGBClassifier_Set_ILTrain.Target

# Empty results table; one row is appended per classifier below.
First_SF_XGBClassifier_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# Classifiers under comparison (library defaults); `stack` is the StackingClassifier
# defined near the top of the notebook.
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold predictions from the 10-fold split.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # sklearn's binary confusion matrix is [[TN, FP], [FN, TP]] (rows = true label,
  # columns = prediction, labels sorted), so row 1 gives sensitivity and row 0
  # specificity. assumes the larger Target value is the positive class - TODO confirm
  tn, fp, fn, tp = cm1.ravel()
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): computed from hard class predictions, not scores/probabilities.
  auc = roc_auc_score(y, y_pred)
  sensitivity = tp / (tp + fn)
  specificity = tn / (tn + fp)
  First_SF_XGBClassifier_Set_ILTrain_Metrics.loc[len(First_SF_XGBClassifier_Set_ILTrain_Metrics.index)] = [model, Accuracy, mcc, auc, sensitivity, specificity]

# Print this section's table (the SVC section's table was being printed here).
print(First_SF_XGBClassifier_Set_ILTrain_Metrics)
First_SF_XGBClassifier_Set_ILTrain_Metrics.to_csv("First_SF_XGBClassifier_Set_ILTrain_Metrics.csv")
# Fit the stack on all rows and save the base learners' outputs. They go to their own
# .csv file so they are not mistaken for (or written over) the metrics table.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("First_SF_XGBClassifier_Set_ILTrain_StackingTransform.csv")
print(prob)

In [None]:
# Display the metrics table as the cell output.
First_SF_XGBClassifier_Set_ILTrain_Metrics

In [None]:
# Save the metrics table under a DPC-prefixed file name.
First_SF_XGBClassifier_Set_ILTrain_Metrics.to_csv('DPC_First_SF_XGBClassifier_Set.csv')

# **DT Classifier**

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Estimator passed to ShapRFECV below.
clf = DecisionTreeClassifier(criterion='gini', splitter='best')

# A single decision tree has neither n_estimators nor num_leaves, so fitting `search`
# with the old grid would fail; max_leaf_nodes is the scikit-learn setting that caps
# the number of leaves.
param_grid = {
    'max_leaf_nodes': [3, 5, 7, 10],
}
# n_iter matches the four candidates so the search does not warn about a small grid.
search = RandomizedSearchCV(clf, param_grid, n_iter=4)

In [None]:
# SHAP-based feature elimination with the plain `clf`; the `search` object built in
# the previous cell is not passed in.
# NOTE(review): X and y are not reassigned in this section, so this runs on whatever
# table the most recently executed cell left in X / y - confirm that is intended.
shap_elimination = ShapRFECV(
    clf, step=0.2, cv=10, scoring='roc_auc', n_jobs=3)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Show the three report columns of interest: number of features, the feature set,
# and the mean validation metric.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
# Export the same three report columns to CSV.
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('SF_DecisionTreeClassifier.csv')

# **First DT Set**

In [None]:
# Read First_SF_DecisionTreeClassifier.csv from Drive and coerce every column to a
# numeric dtype; cells that cannot be parsed become NaN.
First_SF_DecisionTreeClassifier_Set_ILTrain = (
    pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/DPC/First_SF_DecisionTreeClassifier.csv')
    .apply(pd.to_numeric, errors='coerce')
)
# The label column, when present, is stored as a categorical.
if 'Target' in First_SF_DecisionTreeClassifier_Set_ILTrain.columns:
  First_SF_DecisionTreeClassifier_Set_ILTrain['Target'] = First_SF_DecisionTreeClassifier_Set_ILTrain['Target'].astype('category')

# Remaining NaNs are replaced with 0.
First_SF_DecisionTreeClassifier_Set_ILTrain = First_SF_DecisionTreeClassifier_Set_ILTrain.fillna(0)
First_SF_DecisionTreeClassifier_Set_ILTrain.dtypes

In [None]:
# Features / label for this feature subset.
X = First_SF_DecisionTreeClassifier_Set_ILTrain.drop(['Target'], axis=1)
y = First_SF_DecisionTreeClassifier_Set_ILTrain.Target

# Empty results table; one row is appended per classifier below.
First_SF_DecisionTreeClassifier_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# Classifiers under comparison (library defaults); `stack` is the StackingClassifier
# defined near the top of the notebook.
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold predictions from the 10-fold split.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # sklearn's binary confusion matrix is [[TN, FP], [FN, TP]] (rows = true label,
  # columns = prediction, labels sorted), so row 1 gives sensitivity and row 0
  # specificity. assumes the larger Target value is the positive class - TODO confirm
  tn, fp, fn, tp = cm1.ravel()
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): computed from hard class predictions, not scores/probabilities.
  auc = roc_auc_score(y, y_pred)
  sensitivity = tp / (tp + fn)
  specificity = tn / (tn + fp)
  First_SF_DecisionTreeClassifier_Set_ILTrain_Metrics.loc[len(First_SF_DecisionTreeClassifier_Set_ILTrain_Metrics.index)] = [model, Accuracy, mcc, auc, sensitivity, specificity]

print(First_SF_DecisionTreeClassifier_Set_ILTrain_Metrics)
First_SF_DecisionTreeClassifier_Set_ILTrain_Metrics.to_csv("First_SF_DecisionTreeClassifier_Set_ILTrain_Metrics.csv")
# Fit the stack on all rows and save the base learners' outputs. They go to their own
# .csv file so they are not mistaken for (or written over) the metrics table.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("First_SF_DecisionTreeClassifier_Set_ILTrain_StackingTransform.csv")
print(prob)

In [None]:
# Display the metrics table as the cell output.
First_SF_DecisionTreeClassifier_Set_ILTrain_Metrics

In [None]:
# Save the metrics table under a DPC-prefixed file name.
First_SF_DecisionTreeClassifier_Set_ILTrain_Metrics.to_csv('DPC_First_SF_DecisionTreeClassifier_Set.csv')

# **CTDC FEATURE SELECTION**

In [None]:
!pip install SHAP
!pip install probatus
!pip install lightgbm

In [None]:
from probatus.feature_elimination import ShapRFECV
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
import numpy as np
import pandas as pd
import lightgbm
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import GridSearchCV

In [None]:
# Raw CTDC training table from Drive.
SF_CTDC_ILTrain  = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTDC_Train.csv')

for column in SF_CTDC_ILTrain.columns:
  # Unparseable cells become NaN and are zero-filled below.
  SF_CTDC_ILTrain[column] = pd.to_numeric(SF_CTDC_ILTrain[column], errors='coerce')
  # Compare on the stripped header so the label column is recognised whether or not
  # the CSV header carries stray whitespace (every other section matches 'Target').
  if str(column).strip() == 'Target':
    SF_CTDC_ILTrain[column] = SF_CTDC_ILTrain[column].astype('category')

SF_CTDC_ILTrain.fillna(0, inplace=True)
SF_CTDC_ILTrain.dtypes

In [None]:
import shap as SHAP
# NOTE(review): this re-reads the raw CSV into the same name, so the numeric coercion,
# categorical cast and fillna applied in the previous cell are discarded from here on.
SF_CTDC_ILTrain  = pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTDC_Train.csv')

In [None]:
# Plain Python list of every column name in the CTDC table.
feature_names = SF_CTDC_ILTrain.columns.tolist()

In [None]:
# Print pandas' summary (columns, non-null counts, dtypes) of the CTDC table.
SF_CTDC_ILTrain.info()

In [None]:
# Build the feature matrix and labels from the real CTDC table. This cell used to
# call sklearn's make_classification and relabel the synthetic columns with the CTDC
# column names, so the feature elimination below never saw the actual data.
target_col = next((c for c in SF_CTDC_ILTrain.columns if str(c).strip() == 'Target'), None)
if target_col is None:
  raise KeyError("CTDC_Train.csv has no 'Target' column")

# Cast to object first so categorical columns are also handled, then coerce to numeric
# (unparseable cells -> NaN -> 0), mirroring the cleaning used in the other sections.
ctdc_numeric = SF_CTDC_ILTrain.astype(object).apply(pd.to_numeric, errors='coerce').fillna(0)
X = ctdc_numeric.drop(columns=[target_col])
# assumes Target holds integer class labels (e.g. 0 / 1) - TODO confirm
y = ctdc_numeric[target_col].astype(int)

In [None]:
# Rebinds feature_names to the table's column Index (all columns of SF_CTDC_ILTrain).
feature_names = SF_CTDC_ILTrain.columns

In [None]:
# Peek at the first rows of X for the first five names in feature_names.
X[feature_names[:5]].head()

# **LGBM CLASSIFIER**

In [None]:
# LightGBM base model: depth-limited trees with balanced class weights.
clf = lightgbm.LGBMClassifier(class_weight='balanced', max_depth=5)

# Candidate values sampled by the randomized search wrapped around clf.
param_grid = dict(
    n_estimators=[5, 7, 10],
    num_leaves=[3, 5, 7, 10],
)
search = RandomizedSearchCV(estimator=clf, param_distributions=param_grid)


In [None]:
# SHAP-based feature elimination; here the RandomizedSearchCV wrapper (`search`) is
# the estimator handed to ShapRFECV, and it runs on the X / y currently in scope.
shap_elimination = ShapRFECV(
    clf=search, step=0.2, cv=10, scoring='roc_auc', n_jobs=3)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Show the three report columns of interest: number of features, the feature set,
# and the mean validation metric.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
# Export the same three report columns to CSV.
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('CTDC_SF_LGBMCLASSIFIER.csv')

# **ML Model For Best Features of LGBM Classifier (CTDC)**

# **LGBM FIRST SET**

In [None]:
# Read CTDC_SF_FIRSTLGBMCLASSIFIER.csv from Drive and coerce every column to a
# numeric dtype; cells that cannot be parsed become NaN.
CTDC_SF_FIRSTLGBMCLASSIFIER = (
    pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTDC/CTDC_SF_FIRSTLGBMCLASSIFIER.csv')
    .apply(pd.to_numeric, errors='coerce')
)
# The label column, when present, is stored as a categorical.
if 'Target' in CTDC_SF_FIRSTLGBMCLASSIFIER.columns:
  CTDC_SF_FIRSTLGBMCLASSIFIER['Target'] = CTDC_SF_FIRSTLGBMCLASSIFIER['Target'].astype('category')

# Remaining NaNs are replaced with 0.
CTDC_SF_FIRSTLGBMCLASSIFIER = CTDC_SF_FIRSTLGBMCLASSIFIER.fillna(0)
CTDC_SF_FIRSTLGBMCLASSIFIER.dtypes

In [None]:
# Features / label for this feature subset.
X = CTDC_SF_FIRSTLGBMCLASSIFIER.drop(['Target'], axis=1)
y = CTDC_SF_FIRSTLGBMCLASSIFIER.Target

# Empty results table; one row is appended per classifier below.
CTDC_SF_FIRSTLGBMCLASSIFIER_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# Classifiers under comparison (library defaults); `stack` is the StackingClassifier
# defined near the top of the notebook.
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold predictions from the 10-fold split.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # sklearn's binary confusion matrix is [[TN, FP], [FN, TP]] (rows = true label,
  # columns = prediction, labels sorted), so row 1 gives sensitivity and row 0
  # specificity. assumes the larger Target value is the positive class - TODO confirm
  tn, fp, fn, tp = cm1.ravel()
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): computed from hard class predictions, not scores/probabilities.
  auc = roc_auc_score(y, y_pred)
  sensitivity = tp / (tp + fn)
  specificity = tn / (tn + fp)
  CTDC_SF_FIRSTLGBMCLASSIFIER_Metrics.loc[len(CTDC_SF_FIRSTLGBMCLASSIFIER_Metrics.index)] = [model, Accuracy, mcc, auc, sensitivity, specificity]

print(CTDC_SF_FIRSTLGBMCLASSIFIER_Metrics)
CTDC_SF_FIRSTLGBMCLASSIFIER_Metrics.to_csv("CTDC_SF_FIRSTLGBMCLASSIFIER_Metrics.csv")
# Fit the stack on all rows and save the base learners' outputs. They go to their own
# file: writing them to the ..._Metrics.csv path replaced the metrics table saved on
# the line above.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("CTDC_SF_FIRSTLGBMCLASSIFIER_StackingTransform.csv")
print(prob)

In [None]:
# Display the metrics table as the cell output.
CTDC_SF_FIRSTLGBMCLASSIFIER_Metrics

In [None]:
# Writes the metrics table to the same file name the evaluation cell above writes to,
# replacing whatever that cell left there.
CTDC_SF_FIRSTLGBMCLASSIFIER_Metrics.to_csv('CTDC_SF_FIRSTLGBMCLASSIFIER_Metrics.csv')

# **LOGISTIC REGRESSION CLASSIFIER**

In [None]:
# Estimator passed to ShapRFECV below.
clf = LogisticRegression(tol=0.0001, C=1.0)

# The grid used to list n_estimators / num_leaves (tree-ensemble settings);
# LogisticRegression accepts neither, so fitting `search` would fail with an
# invalid-parameter error. Search over the regularisation strength instead.
param_grid = {
    'C': [0.01, 0.1, 1.0, 10.0],
}
# n_iter matches the four candidates so the search does not warn about a small grid.
search = RandomizedSearchCV(clf, param_grid, n_iter=4)


In [None]:
# SHAP-based feature elimination with the plain `clf`; the `search` object built in
# the previous cell is not passed in.
# NOTE(review): X and y are not reassigned in this section, so this runs on whatever
# table the most recently executed cell left in X / y - confirm that is intended.
shap_elimination = ShapRFECV(
    clf, step=0.2, cv=10, scoring='roc_auc', n_jobs=3)
report = shap_elimination.fit_compute(X, y)


In [None]:
# Show the three report columns of interest: number of features, the feature set,
# and the mean validation metric.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
# Export the same three report columns to CSV.
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('CTDC_SF_LogisticRegression.csv')

# **ML Model For Best Features of LOGISTIC REGRESSION MODEL (CTDC)**

In [None]:
# Read CTDC_SF_LogisticRegression.csv from Drive and coerce every column to a
# numeric dtype; cells that cannot be parsed become NaN.
CTDC_SF_LogisticRegression_Set_ILTrain = (
    pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTDC/CTDC_SF_LogisticRegression.csv')
    .apply(pd.to_numeric, errors='coerce')
)
# The label column, when present, is stored as a categorical.
if 'Target' in CTDC_SF_LogisticRegression_Set_ILTrain.columns:
  CTDC_SF_LogisticRegression_Set_ILTrain['Target'] = CTDC_SF_LogisticRegression_Set_ILTrain['Target'].astype('category')

# Remaining NaNs are replaced with 0.
CTDC_SF_LogisticRegression_Set_ILTrain = CTDC_SF_LogisticRegression_Set_ILTrain.fillna(0)
CTDC_SF_LogisticRegression_Set_ILTrain.dtypes

In [None]:
# Features / label for this feature subset.
X = CTDC_SF_LogisticRegression_Set_ILTrain.drop(['Target'], axis=1)
y = CTDC_SF_LogisticRegression_Set_ILTrain.Target

# Empty results table; one row is appended per classifier below.
CTDC_SF_LogisticRegression_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# Classifiers under comparison (library defaults); `stack` is the StackingClassifier
# defined near the top of the notebook.
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold predictions from the 10-fold split.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # sklearn's binary confusion matrix is [[TN, FP], [FN, TP]] (rows = true label,
  # columns = prediction, labels sorted), so row 1 gives sensitivity and row 0
  # specificity. assumes the larger Target value is the positive class - TODO confirm
  tn, fp, fn, tp = cm1.ravel()
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): computed from hard class predictions, not scores/probabilities.
  auc = roc_auc_score(y, y_pred)
  sensitivity = tp / (tp + fn)
  specificity = tn / (tn + fp)
  CTDC_SF_LogisticRegression_Set_ILTrain_Metrics.loc[len(CTDC_SF_LogisticRegression_Set_ILTrain_Metrics.index)] = [model, Accuracy, mcc, auc, sensitivity, specificity]

print(CTDC_SF_LogisticRegression_Set_ILTrain_Metrics)
CTDC_SF_LogisticRegression_Set_ILTrain_Metrics.to_csv("CTDC_SF_LogisticRegression_Set_ILTrain_Metrics.csv")
# Fit the stack on all rows and save the base learners' outputs. They go to their own
# file: writing them to the ..._Metrics.csv path replaced the metrics table saved on
# the line above.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("CTDC_SF_LogisticRegression_Set_ILTrain_StackingTransform.csv")
print(prob)

In [None]:
# Display the metrics table as the cell output.
CTDC_SF_LogisticRegression_Set_ILTrain_Metrics

In [None]:
# Save a second copy of the metrics table as the "Result" file.
CTDC_SF_LogisticRegression_Set_ILTrain_Metrics.to_csv('CTDC_SF_LogisticRegression_Set_ILTrain_Metrics Result.csv')

# **RANDOM FOREST CLASSIFIER**

In [None]:
# Estimator passed to ShapRFECV below.
clf = RandomForestClassifier(max_depth=3, class_weight='balanced')

# RandomForestClassifier has no num_leaves parameter (that is a LightGBM name), so
# fitting `search` with it would fail; max_leaf_nodes is the scikit-learn setting
# that caps the number of leaves per tree.
param_grid = {
    'n_estimators': [5, 7, 10],
    'max_leaf_nodes': [3, 5, 7, 10],
}
search = RandomizedSearchCV(clf, param_grid)


In [None]:
# SHAP-based feature elimination with the plain `clf`; the `search` object built in
# the previous cell is not passed in.
# NOTE(review): X and y are not reassigned in this section, so this runs on whatever
# table the most recently executed cell left in X / y - confirm that is intended.
shap_elimination = ShapRFECV(
    clf, step=0.2, cv=10, scoring='roc_auc', n_jobs=2)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Show the three report columns of interest: number of features, the feature set,
# and the mean validation metric.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
# Export the report columns under a CTDC-prefixed name, like the other CTDC reports;
# the unprefixed 'SF_RandomForestClassifier.csv' is the DPC section's output file and
# was being overwritten here.
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('CTDC_SF_RandomForestClassifier.csv')

# **ML Model For Best Features of RANDOM FOREST CLASSIFIER MODEL (CTDC)**

In [None]:
# Read CTDC_SF_RandomForestClassifier.csv from Drive and coerce every column to a
# numeric dtype; cells that cannot be parsed become NaN.
CTDC_SF_RandomForestClassifier_Set_ILTrain = (
    pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTDC/CTDC_SF_RandomForestClassifier.csv')
    .apply(pd.to_numeric, errors='coerce')
)
# The label column, when present, is stored as a categorical.
if 'Target' in CTDC_SF_RandomForestClassifier_Set_ILTrain.columns:
  CTDC_SF_RandomForestClassifier_Set_ILTrain['Target'] = CTDC_SF_RandomForestClassifier_Set_ILTrain['Target'].astype('category')

# Remaining NaNs are replaced with 0.
CTDC_SF_RandomForestClassifier_Set_ILTrain = CTDC_SF_RandomForestClassifier_Set_ILTrain.fillna(0)
CTDC_SF_RandomForestClassifier_Set_ILTrain.dtypes


In [None]:
# Features / label for this feature subset.
X = CTDC_SF_RandomForestClassifier_Set_ILTrain.drop(['Target'], axis=1)
y = CTDC_SF_RandomForestClassifier_Set_ILTrain.Target

# Empty results table; one row is appended per classifier below.
CTDC_SF_RandomForestClassifier_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# Classifiers under comparison (library defaults); `stack` is the StackingClassifier
# defined near the top of the notebook.
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold predictions from the 10-fold split.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # sklearn's binary confusion matrix is [[TN, FP], [FN, TP]] (rows = true label,
  # columns = prediction, labels sorted), so row 1 gives sensitivity and row 0
  # specificity. assumes the larger Target value is the positive class - TODO confirm
  tn, fp, fn, tp = cm1.ravel()
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): computed from hard class predictions, not scores/probabilities.
  auc = roc_auc_score(y, y_pred)
  sensitivity = tp / (tp + fn)
  specificity = tn / (tn + fp)
  CTDC_SF_RandomForestClassifier_Set_ILTrain_Metrics.loc[len(CTDC_SF_RandomForestClassifier_Set_ILTrain_Metrics.index)] = [model, Accuracy, mcc, auc, sensitivity, specificity]

print(CTDC_SF_RandomForestClassifier_Set_ILTrain_Metrics)
CTDC_SF_RandomForestClassifier_Set_ILTrain_Metrics.to_csv("CTDC_SF_RandomForestClassifier_Set_ILTrain_Metrics.csv")
# Fit the stack on all rows and save the base learners' outputs. They go to their own
# .csv file so they are not mistaken for (or written over) the metrics table.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("CTDC_SF_RandomForestClassifier_Set_ILTrain_StackingTransform.csv")
print(prob)

In [None]:
# Display the metrics table as the cell output.
CTDC_SF_RandomForestClassifier_Set_ILTrain_Metrics

In [None]:
# Save a second copy of the metrics table as the "_Result" file.
CTDC_SF_RandomForestClassifier_Set_ILTrain_Metrics.to_csv('CTDC_SF_RandomForestClassifier_Set_ILTrain_Metrics_Result.csv')

# **SVM**

In [None]:
from sklearn import svm

In [None]:
# Estimator passed to ShapRFECV below.
clf = svm.SVC(C=0.2, kernel='linear', degree=3)

# The grid used to list n_estimators / num_leaves (tree-ensemble settings); SVC
# accepts neither, so fitting `search` would fail with an invalid-parameter error.
# Search over the regularisation strength around the configured C instead.
param_grid = {
    'C': [0.05, 0.2, 1.0, 5.0],
}
# n_iter matches the four candidates so the search does not warn about a small grid.
search = RandomizedSearchCV(clf, param_grid, n_iter=4)

In [None]:
# SHAP-based feature elimination with the plain `clf`; the `search` object built in
# the previous cell is not passed in.
# NOTE(review): X and y are not reassigned in this section, so this runs on whatever
# table the most recently executed cell left in X / y - confirm that is intended.
shap_elimination = ShapRFECV(
    clf, step=0.2, cv=10, scoring='roc_auc', n_jobs=3)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Show the three report columns of interest: number of features, the feature set,
# and the mean validation metric.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
# Export the report columns under a CTDC-prefixed name, like the other CTDC reports;
# the unprefixed 'SF_SVCFfile.csv' is the DPC section's output file and was being
# overwritten here.
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('CTDC_SF_SVCFfile.csv')

# **ML Model For Best Features of SVC MODEL (CTDC)**

In [None]:
# Read CTDC_SF_SVCFfile.csv from Drive and coerce every column to a numeric dtype;
# cells that cannot be parsed become NaN.
CTDC_SF_SVCFfile_Set_ILTrain = (
    pd.read_csv('/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTDC/CTDC_SF_SVCFfile.csv')
    .apply(pd.to_numeric, errors='coerce')
)
# The label column, when present, is stored as a categorical.
if 'Target' in CTDC_SF_SVCFfile_Set_ILTrain.columns:
  CTDC_SF_SVCFfile_Set_ILTrain['Target'] = CTDC_SF_SVCFfile_Set_ILTrain['Target'].astype('category')

# Remaining NaNs are replaced with 0.
CTDC_SF_SVCFfile_Set_ILTrain = CTDC_SF_SVCFfile_Set_ILTrain.fillna(0)
CTDC_SF_SVCFfile_Set_ILTrain.dtypes

In [None]:
# Features / label for this feature subset.
X = CTDC_SF_SVCFfile_Set_ILTrain.drop(['Target'], axis=1)
y = CTDC_SF_SVCFfile_Set_ILTrain.Target

# Empty results table; one row is appended per classifier below.
CTDC_SF_SVCFfile_Set_ILTrain_Metrics = pd.DataFrame(
    columns=['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity'])
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# Classifiers under comparison (library defaults); `stack` is the StackingClassifier
# defined near the top of the notebook.
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]
for model in models:
  # Out-of-fold predictions from the 10-fold split.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # sklearn's binary confusion matrix is [[TN, FP], [FN, TP]] (rows = true label,
  # columns = prediction, labels sorted), so row 1 gives sensitivity and row 0
  # specificity. assumes the larger Target value is the positive class - TODO confirm
  tn, fp, fn, tp = cm1.ravel()
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): computed from hard class predictions, not scores/probabilities.
  auc = roc_auc_score(y, y_pred)
  sensitivity = tp / (tp + fn)
  specificity = tn / (tn + fp)
  CTDC_SF_SVCFfile_Set_ILTrain_Metrics.loc[len(CTDC_SF_SVCFfile_Set_ILTrain_Metrics.index)] = [model, Accuracy, mcc, auc, sensitivity, specificity]

print(CTDC_SF_SVCFfile_Set_ILTrain_Metrics)
CTDC_SF_SVCFfile_Set_ILTrain_Metrics.to_csv("CTDC_SF_SVCFfile_Set_ILTrain_Metrics.csv")
# Fit the stack on all rows and save the base learners' outputs. They go to their own
# .csv file so they are not mistaken for (or written over) the metrics table.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
pd.DataFrame(prob).to_csv("CTDC_SF_SVCFfile_Set_ILTrain_StackingTransform.csv")
print(prob)

In [None]:
# Display the metrics table as the cell output.
CTDC_SF_SVCFfile_Set_ILTrain_Metrics

In [None]:
# Save a second copy of the metrics table as the "_Result" file.
CTDC_SF_SVCFfile_Set_ILTrain_Metrics.to_csv('CTDC_SF_SVCFfile_Set_ILTrain_Metrics_Result.csv')

# **XGB CLASSIFIER**

In [None]:
from xgboost import XGBClassifier

In [None]:
# Estimator passed to ShapRFECV below. C / kernel / degree are SVC arguments that
# were carried over from the SVM section; XGBClassifier has no such settings, so the
# model is built with its defaults.
clf = XGBClassifier()
# num_leaves is a LightGBM parameter name; tree size is searched through max_depth.
param_grid = {
    'n_estimators': [5, 7, 10],
    'max_depth': [3, 5, 7, 10],
}
search = RandomizedSearchCV(clf, param_grid)

In [None]:
# SHAP-based feature elimination with the plain `clf`; the `search` object built in
# the previous cell is not passed in.
# NOTE(review): X and y are not reassigned in this section, so this runs on whatever
# table the most recently executed cell left in X / y - confirm that is intended.
shap_elimination = ShapRFECV(
    clf, step=0.2, cv=10, scoring='roc_auc', n_jobs=3)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Show the three report columns of interest: number of features, the feature set,
# and the mean validation metric.
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
# Export the 'num_features', 'features_set' and 'val_metric_mean' report columns
# (all rows) to SF_XGBClassifier.csv.
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('SF_XGBClassifier.csv')

# **ML MODEL FOR FIRST XGB CLASSIFIER**

In [None]:
# Read CTDC_SF_FIRSTXGBClassifier.csv and coerce every column to numeric;
# values that cannot be parsed become NaN.
CTDC_SF_FIRSTXGBClassifier_Set_ILTrain = pd.read_csv(
    '/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTDC/CTDC_SF_FIRSTXGBClassifier.csv'
).apply(pd.to_numeric, errors='coerce')

# The label column (when present) is stored as a categorical.
if 'Target' in CTDC_SF_FIRSTXGBClassifier_Set_ILTrain.columns:
  CTDC_SF_FIRSTXGBClassifier_Set_ILTrain['Target'] = (
      CTDC_SF_FIRSTXGBClassifier_Set_ILTrain['Target'].astype('category'))

# Replace the NaNs left by coercion with 0, then show the resulting dtypes.
CTDC_SF_FIRSTXGBClassifier_Set_ILTrain = CTDC_SF_FIRSTXGBClassifier_Set_ILTrain.fillna(0)
CTDC_SF_FIRSTXGBClassifier_Set_ILTrain.dtypes

In [None]:
# Features are every column except the label; the label is the 'Target' column.
X = CTDC_SF_FIRSTXGBClassifier_Set_ILTrain.drop(['Target'], axis=1)
y = CTDC_SF_FIRSTXGBClassifier_Set_ILTrain.Target

# Shuffled 10-fold split with a fixed seed, shared by all models below.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]

metric_columns = ['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity']
metric_rows = []
for model in models:
  # Out-of-fold predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # For a binary problem sklearn orders the flattened matrix as tn, fp, fn, tp
  # (rows = true label, columns = predicted label, labels sorted ascending).
  # Assumes the larger label is the positive class, as roc_auc_score does.
  tn, fp, fn, tp = cm1.ravel()
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): AUC is computed from hard class predictions, not scores.
  auc = roc_auc_score(y, y_pred)
  # Previously these two were swapped (row 0 is the negative class).
  sensitivity = tp / (tp + fn)
  specificity = tn / (tn + fp)
  metric_rows.append(dict(zip(metric_columns,
                              [model, Accuracy, mcc, auc, sensitivity, specificity])))

# Build the table once instead of growing an empty frame row by row.
CTDC_SF_FIRSTXGBClassifier_Set_ILTrain_Metrics = pd.DataFrame(metric_rows, columns=metric_columns)

print(CTDC_SF_FIRSTXGBClassifier_Set_ILTrain_Metrics)
CTDC_SF_FIRSTXGBClassifier_Set_ILTrain_Metrics.to_csv("CTDC_SF_FIRSTXGBClassifier_Set_ILTrain_Metrics.csv")
# Fit the stacking ensemble on the full data and keep the base-estimator outputs.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
# NOTE(review): this extension-less file holds the stacking transform output,
# not metrics, despite its name.
pd.DataFrame(prob).to_csv("CTDC_SF_FIRSTXGBClassifier_Set_ILTrain_Metrics")
print(prob)

In [None]:
# Show the per-classifier metrics table as this cell's output.
CTDC_SF_FIRSTXGBClassifier_Set_ILTrain_Metrics

In [None]:
# Save the per-classifier metrics table under its result file name.
CTDC_SF_FIRSTXGBClassifier_Set_ILTrain_Metrics.to_csv('CTDC_First_SF_XGBClassifier_Set_Result.csv')

# **ML MODEL FOR SECOND XGB CLASSIFIER**

In [None]:
# Read CTDC_SF_SECONDXGBClassifier.csv and coerce every column to numeric;
# values that cannot be parsed become NaN.
CTDC_SF_SECONDXGBClassifier_Set_ILTrain = pd.read_csv(
    '/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTDC/CTDC_SF_SECONDXGBClassifier.csv'
).apply(pd.to_numeric, errors='coerce')

# The label column (when present) is stored as a categorical.
if 'Target' in CTDC_SF_SECONDXGBClassifier_Set_ILTrain.columns:
  CTDC_SF_SECONDXGBClassifier_Set_ILTrain['Target'] = (
      CTDC_SF_SECONDXGBClassifier_Set_ILTrain['Target'].astype('category'))

# Replace the NaNs left by coercion with 0, then show the resulting dtypes.
CTDC_SF_SECONDXGBClassifier_Set_ILTrain = CTDC_SF_SECONDXGBClassifier_Set_ILTrain.fillna(0)
CTDC_SF_SECONDXGBClassifier_Set_ILTrain.dtypes

In [None]:
# Features are every column except the label; the label is the 'Target' column.
X = CTDC_SF_SECONDXGBClassifier_Set_ILTrain.drop(['Target'], axis=1)
y = CTDC_SF_SECONDXGBClassifier_Set_ILTrain.Target

# Shuffled 10-fold split with a fixed seed, shared by all models below.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]

metric_columns = ['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity']
metric_rows = []
for model in models:
  # Out-of-fold predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # For a binary problem sklearn orders the flattened matrix as tn, fp, fn, tp
  # (rows = true label, columns = predicted label, labels sorted ascending).
  # Assumes the larger label is the positive class, as roc_auc_score does.
  tn, fp, fn, tp = cm1.ravel()
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): AUC is computed from hard class predictions, not scores.
  auc = roc_auc_score(y, y_pred)
  # Previously these two were swapped (row 0 is the negative class).
  sensitivity = tp / (tp + fn)
  specificity = tn / (tn + fp)
  metric_rows.append(dict(zip(metric_columns,
                              [model, Accuracy, mcc, auc, sensitivity, specificity])))

# Build the table once instead of growing an empty frame row by row.
CTDC_SF_SECONDXGBClassifier_Set_ILTrain_Metrics = pd.DataFrame(metric_rows, columns=metric_columns)

print(CTDC_SF_SECONDXGBClassifier_Set_ILTrain_Metrics)
CTDC_SF_SECONDXGBClassifier_Set_ILTrain_Metrics.to_csv("CTDC_SF_SECONDXGBClassifier_Set_ILTrain_Metrics.csv")
# Fit the stacking ensemble on the full data and keep the base-estimator outputs.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
# NOTE(review): this extension-less file holds the stacking transform output,
# not metrics, despite its name.
pd.DataFrame(prob).to_csv("CTDC_SF_SECONDXGBClassifier_Set_ILTrain_Metrics")
print(prob)

In [None]:
# Show the per-classifier metrics table as this cell's output.
CTDC_SF_SECONDXGBClassifier_Set_ILTrain_Metrics

In [None]:
# Save the per-classifier metrics table to CSV.
CTDC_SF_SECONDXGBClassifier_Set_ILTrain_Metrics.to_csv('CTDC_SF_SECONDXGBClassifier_Set_ILTrain_Metrics.csv')

# **DT CLASSIFIER**

In [None]:
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Base decision tree handed to ShapRFECV in the next cell.
clf = DecisionTreeClassifier(criterion='gini', splitter='best')

# Candidate hyper-parameters. The former keys 'n_estimators' and 'num_leaves' are
# not DecisionTreeClassifier parameters, so fitting the search would fail;
# they are replaced by the tree's own size controls with the same candidate values.
param_grid = {
    'max_depth': [5, 7, 10],
    'max_leaf_nodes': [3, 5, 7, 10],
}
# NOTE(review): `search` is constructed but never fitted in this cell; the
# feature-elimination cell below passes `clf`, not `search`.
search = RandomizedSearchCV(clf, param_grid)

In [None]:
# SHAP-based recursive feature elimination with `clf`.
# NOTE(review): X and y are whatever the most recently executed cell assigned.
shap_elimination = ShapRFECV(
    clf,
    step=0.2,
    cv=10,
    scoring='roc_auc',
    n_jobs=3,
)
report = shap_elimination.fit_compute(X, y)

In [None]:
# Show the 'num_features', 'features_set' and 'val_metric_mean' columns of the
# ShapRFECV report (all rows, not just the first five).
report[['num_features', 'features_set', 'val_metric_mean']]

In [None]:
# Export the 'num_features', 'features_set' and 'val_metric_mean' report columns
# (all rows) to SF_DecisionTreeClassifier.csv.
report[['num_features', 'features_set', 'val_metric_mean']].to_csv('SF_DecisionTreeClassifier.csv')

# **ML MODEL FOR DT CLASSIFIER**

In [None]:
# Read CTDC_SF_DecisionTreeClassifier.csv and coerce every column to numeric;
# values that cannot be parsed become NaN.
CTDC_SF_DecisionTreeClassifier_Set_ILTrain = pd.read_csv(
    '/content/drive/MyDrive/Masters Thesis/IL13 Data Set/CTDC/CTDC_SF_DecisionTreeClassifier.csv'
).apply(pd.to_numeric, errors='coerce')

# The label column (when present) is stored as a categorical.
if 'Target' in CTDC_SF_DecisionTreeClassifier_Set_ILTrain.columns:
  CTDC_SF_DecisionTreeClassifier_Set_ILTrain['Target'] = (
      CTDC_SF_DecisionTreeClassifier_Set_ILTrain['Target'].astype('category'))

# Replace the NaNs left by coercion with 0, then show the resulting dtypes.
CTDC_SF_DecisionTreeClassifier_Set_ILTrain = CTDC_SF_DecisionTreeClassifier_Set_ILTrain.fillna(0)
CTDC_SF_DecisionTreeClassifier_Set_ILTrain.dtypes

In [None]:
# Features are every column except the label; the label is the 'Target' column.
X = CTDC_SF_DecisionTreeClassifier_Set_ILTrain.drop(['Target'], axis=1)
y = CTDC_SF_DecisionTreeClassifier_Set_ILTrain.Target

# Shuffled 10-fold split with a fixed seed, shared by all models below.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]

metric_columns = ['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity']
metric_rows = []
for model in models:
  # Out-of-fold predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # For a binary problem sklearn orders the flattened matrix as tn, fp, fn, tp
  # (rows = true label, columns = predicted label, labels sorted ascending).
  # Assumes the larger label is the positive class, as roc_auc_score does.
  tn, fp, fn, tp = cm1.ravel()
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): AUC is computed from hard class predictions, not scores.
  auc = roc_auc_score(y, y_pred)
  # Previously these two were swapped (row 0 is the negative class).
  sensitivity = tp / (tp + fn)
  specificity = tn / (tn + fp)
  metric_rows.append(dict(zip(metric_columns,
                              [model, Accuracy, mcc, auc, sensitivity, specificity])))

# Build the table once instead of growing an empty frame row by row.
CTDC_SF_DecisionTreeClassifier_Set_ILTrain_Metrics = pd.DataFrame(metric_rows, columns=metric_columns)

print(CTDC_SF_DecisionTreeClassifier_Set_ILTrain_Metrics)
CTDC_SF_DecisionTreeClassifier_Set_ILTrain_Metrics.to_csv("CTDC_SF_DecisionTreeClassifier_Set_ILTrain_Metrics.csv")
# Fit the stacking ensemble on the full data and keep the base-estimator outputs.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
# NOTE(review): this extension-less file holds the stacking transform output,
# not metrics, despite its name.
pd.DataFrame(prob).to_csv("CTDC_SF_DecisionTreeClassifier_Set_ILTrain_Metrics")
print(prob)

In [None]:
# Show the per-classifier metrics table as this cell's output.
CTDC_SF_DecisionTreeClassifier_Set_ILTrain_Metrics

In [None]:
# Save the decision-tree feature-set metrics. This cell previously re-saved the
# SECOND-XGB metrics table (copy-paste slip), so the DT results never got a
# "_Result" file.
CTDC_SF_DecisionTreeClassifier_Set_ILTrain_Metrics.to_csv('CTDC_SF_DecisionTreeClassifier_Set_ILTrain_Metrics_Result.csv')

# **Final COMBINATION (CKSAAP,CTDC,DPC) ML MODEL**

In [None]:
# Read the combined CTDC/CKSAAP/DPC training CSV and coerce every column to
# numeric; values that cannot be parsed become NaN.
Final_Combined_ILTrain = pd.read_csv(
    '/content/drive/MyDrive/Masters Thesis/IL13 Data Set/Final_Combined(CTDC_CKSAAP_DPC)_Train.csv'
).apply(pd.to_numeric, errors='coerce')

# The label column (when present) is stored as a categorical.
if 'Target' in Final_Combined_ILTrain.columns:
  Final_Combined_ILTrain['Target'] = Final_Combined_ILTrain['Target'].astype('category')

# Replace the NaNs left by coercion with 0, then show the resulting dtypes.
Final_Combined_ILTrain = Final_Combined_ILTrain.fillna(0)
Final_Combined_ILTrain.dtypes

In [None]:
# Features are every column except the label; the label is the 'Target' column.
X = Final_Combined_ILTrain.drop(['Target'], axis=1)
y = Final_Combined_ILTrain.Target

# Shuffled 10-fold split with a fixed seed, shared by all models below.
cv = KFold(n_splits=10, random_state=1, shuffle=True)
# create model
models = [LogisticRegression(),
          RandomForestClassifier(),
          SVC(),
          XGBClassifier(),
          DecisionTreeClassifier(),
          KNeighborsClassifier(),
          LGBMClassifier(),
          stack]

metric_columns = ['Classifier', 'Accuracy', 'mcc', 'auc', 'sensitivity', 'specificity']
metric_rows = []
for model in models:
  # Out-of-fold predictions for every sample.
  y_pred = cross_val_predict(model, X, y, cv=cv, n_jobs=-1)
  cm1 = confusion_matrix(y, y_pred)
  # For a binary problem sklearn orders the flattened matrix as tn, fp, fn, tp
  # (rows = true label, columns = predicted label, labels sorted ascending).
  # Assumes the larger label is the positive class, as roc_auc_score does.
  tn, fp, fn, tp = cm1.ravel()
  # report performance
  Accuracy = accuracy_score(y, y_pred)
  mcc = matthews_corrcoef(y, y_pred)
  # NOTE(review): AUC is computed from hard class predictions, not scores.
  auc = roc_auc_score(y, y_pred)
  # Previously these two were swapped (row 0 is the negative class).
  sensitivity = tp / (tp + fn)
  specificity = tn / (tn + fp)
  metric_rows.append(dict(zip(metric_columns,
                              [model, Accuracy, mcc, auc, sensitivity, specificity])))

# Build the table once instead of growing an empty frame row by row.
Final_Combined_ILTrain_Metrics = pd.DataFrame(metric_rows, columns=metric_columns)

print(Final_Combined_ILTrain_Metrics)
Final_Combined_ILTrain_Metrics.to_csv("Final_Combined_ILTrain_Metrics.csv")
# Fit the stacking ensemble on the full data and keep the base-estimator outputs.
clf = StackingClassifier( estimators=estimators, final_estimator=LogisticRegression())
prob = clf.fit_transform(X, y)
# Written to its own file: the previous target "Final_Combined_ILTrain_Metrics.csv"
# overwrote the metrics CSV saved just above.
pd.DataFrame(prob).to_csv("Final_Combined_ILTrain_Stacking_Output.csv")
print(prob)


In [None]:
# Show the per-classifier metrics table as this cell's output.
Final_Combined_ILTrain_Metrics

In [None]:
# Save the per-classifier metrics table to Final_Combined_ILTrain_Metrics.csv.
Final_Combined_ILTrain_Metrics.to_csv('Final_Combined_ILTrain_Metrics.csv')