In [2]:
# read df from excel file
import pandas as pd

# Load the raw dataset from the Excel workbook; the path is relative to the
# notebook's working directory.
df = pd.read_excel('../data/Z-Alizadeh sani dataset.xlsx')

# Column that preprocessing discards.
df_remove_col = ["BBB"]

# Columns that preprocessing min-max scales (order defines output column order).
df_numeric_features = [
    "Weight", "Length", "BMI", "BP", "PR", "Age",
    "FBS", "CR", "TG", "LDL", "HDL", "BUN", "ESR", "HB", "K", "Na",
    "WBC", "Lymph", "Neut", "PLT",
    "EF-TTE", "Region RWMA",
]

# Columns that preprocessing one-hot encodes (order defines output column order).
df_categorical_features = [
    "Sex", "DM", "HTN", "Current Smoker", "EX-Smoker", "FH", "Obesity",
    "CRF", "CVA", "Airway disease", "Thyroid Disease", "CHF", "DLP",
    "Edema", "Weak Peripheral Pulse", "Lung rales",
    "Systolic Murmur", "Diastolic Murmur",
    "Typical Chest Pain", "Dyspnea", "Function Class",
    "Atypical", "Nonanginal", "Exertional CP", "LowTH Ang",
    "Q Wave", "St Elevation", "St Depression", "Tinversion",
    "LVH", "Poor R Progression", "VHD",
]

# Target column (kept as a one-element list so it selects a DataFrame).
df_lable_name = ["Cath"]

# Sanity check: (rows, columns) of the raw frame as loaded.
df.shape

(303, 56)

In [3]:
def one_hot_encode(df):
    """One-hot encode every column of a categorical DataFrame.

    Parameters:
        df: DataFrame whose columns are all categorical.

    Returns:
        A new DataFrame (fresh default index) of dense indicator values with
        one column per (input column, category) pair, named by the encoder's
        ``get_feature_names_out``.
    """
    from sklearn.preprocessing import OneHotEncoder

    # The ``sparse=`` keyword was renamed to ``sparse_output=`` and later
    # removed from scikit-learn, so neither spelling works on every version.
    # Use the default (sparse) output and densify it here instead.
    enc = OneHotEncoder()
    encoded = enc.fit_transform(df)
    if hasattr(encoded, "toarray"):
        encoded = encoded.toarray()

    # Name the outputs after the columns actually passed in rather than a
    # module-level list, so the function works for any categorical frame.
    col_name = enc.get_feature_names_out(list(df.columns))
    return pd.DataFrame(encoded, columns=col_name)

def scale_df(df):
    """Min-max scale every column of a numeric DataFrame.

    Parameters:
        df: DataFrame whose columns are all numeric.

    Returns:
        A new DataFrame (fresh default index) with the same column names as
        ``df`` and each column rescaled by ``MinMaxScaler``.
    """
    from sklearn.preprocessing import MinMaxScaler

    scaled = MinMaxScaler().fit_transform(df)
    # Label the output with the input's own columns (not a module-level list)
    # so names can never drift from the data they describe.
    return pd.DataFrame(scaled, columns=df.columns)

def label_binarize(df):
    """Binarize the label column with scikit-learn's ``LabelBinarizer``.

    Parameters:
        df: single-column DataFrame holding the class labels.

    Returns:
        A new DataFrame (fresh default index) carrying the same column name as
        ``df`` and the binarized labels. This assumes a two-class label, for
        which the binarizer emits exactly one output column.
    """
    from sklearn.preprocessing import LabelBinarizer

    encoded = LabelBinarizer().fit_transform(df)
    # Reuse the input's column name instead of a module-level constant.
    return pd.DataFrame(encoded, columns=df.columns)

def preprocess_df(df):
    """Build the model-ready frame from the raw dataset.

    Drops the columns listed in ``df_remove_col``, min-max scales the columns
    in ``df_numeric_features``, one-hot encodes the columns in
    ``df_categorical_features`` and binarizes the ``df_lable_name`` column.

    Parameters:
        df: raw DataFrame containing all of the columns named above.

    Returns:
        A new DataFrame laid out as [scaled numeric | one-hot categorical |
        label], with the label as the last column. The input frame is left
        unmodified.

    Raises:
        KeyError: if a column to drop or select is missing from ``df``.
    """
    # Non-mutating drop: the caller's frame stays intact, so calling this
    # function again on the same raw frame does not fail.
    df = df.drop(columns=df_remove_col)

    numeric_part = scale_df(df[df_numeric_features])
    categorical_part = one_hot_encode(df[df_categorical_features])
    label_part = label_binarize(df[df_lable_name])

    # All three helpers return frames with a fresh default index, so a
    # column-wise concat lines the rows up positionally.
    return pd.concat([numeric_part, categorical_part, label_part], axis=1)

# Replace the raw frame with its preprocessed version; from here on `df` holds
# scaled numeric columns, one-hot categorical columns and the label column.
# NOTE(review): `df` is rebound, so this cell cannot be re-run without first
# re-running the load cell.
df = preprocess_df(df)
df.head()

Unnamed: 0,Weight,Length,BMI,BP,PR,Age,FBS,CR,TG,LDL,...,Tinversion_1,LVH_N,LVH_Y,Poor R Progression_N,Poor R Progression_Y,VHD_Moderate,VHD_N,VHD_Severe,VHD_mild,Cath
0,0.583333,0.729167,0.494721,0.2,0.5,0.410714,0.08284,0.117647,0.210267,0.640187,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
1,0.305556,0.354167,0.451314,0.5,0.5,0.660714,0.053254,0.294118,0.268509,0.481308,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
2,0.083333,0.5,0.086105,0.1,0.833333,0.428571,0.068047,0.294118,0.065153,0.242991,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0
3,0.263889,0.375,0.382846,0.1,0.5,0.642857,0.047337,0.411765,0.025666,0.172897,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1
4,0.541667,0.270833,0.836058,0.2,0.5,0.357143,0.12426,0.294118,0.131293,0.429907,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1


In [4]:
# Persist the preprocessed frame next to the raw data.
# NOTE(review): `index=False` is not passed, so the row index is written as an
# extra first column — confirm that readers of this file expect it.
df.to_excel('../data/Z-Alizadeh sani dataset_preprocessed.xlsx')

In [5]:
# how to get the train and test set for each fold
# for train_index , test_index in kf.split(x):
#     X_train , X_test = x.iloc[train_index,:],x.iloc[test_index,:]
#     y_train , y_test = y[train_index] , y[test_index]
#     print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [6]:
from sklearn.ensemble import RandomForestClassifier # RF
from sklearn.ensemble import ExtraTreesClassifier # ET
from sklearn.ensemble import AdaBoostClassifier # ADB
from sklearn.svm import SVC # svc
from sklearn.neural_network import MLPClassifier # MLP
from xgboost import XGBClassifier # XGB
from sklearn.gaussian_process import GaussianProcessClassifier # GPC
from sklearn.naive_bayes import GaussianNB # GNB
from sklearn.linear_model import LogisticRegression # LR
from sklearn.ensemble import GradientBoostingClassifier # GBC

# Base-level classifiers keyed by short name; the estimator objects are shared
# and re-fitted in place wherever this dict is used.
# Stochastic estimators are seeded so repeated runs train the same models.
l = {
    'RF': RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0),
    'GNB': GaussianNB(priors=None, var_smoothing=1e-09),
    'ADB': AdaBoostClassifier(n_estimators=50, random_state=0),
    'ET': ExtraTreesClassifier(n_estimators=100, criterion='gini', min_samples_split=2, random_state=0),
    # `loss` is left at its default (log-loss): the old name 'deviance' was
    # removed from newer scikit-learn and the new name is missing from older
    # releases, while the default means the same thing on both.
    'GB': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=0),
    'MLP': MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.00001, random_state=0),
    # `max_depth` was misspelled `maxdepth`, so the intended depth limit was
    # never applied.
    # NOTE(review): `eta` is an alias of `learning_rate`, yet the two are given
    # different values here — confirm which one is intended and drop the other.
    'XGB': XGBClassifier(random_state=1, learning_rate=0.5, n_estimators=7, max_depth=5, eta=0.05, objective='binary:logistic'),
    'LR': LogisticRegression(solver='newton-cg', C=100),
}


In [10]:
%%capture
import numpy as np

def algorithm1(R, S, n_splits=10):
    '''
    The Process of Building Base-Level Model

    For every classifier in the module-level dict ``l``, run an inner
    (unshuffled) K-fold over R: fit on the inner training part, predict the
    inner validation part and predict S. The last column of R and S is used
    as the label; all other columns are features.

        parameters:
            R: nine folds as train set (DataFrame, label in the last column)
            S: One fold as test set (DataFrame, label in the last column)
            n_splits: number of inner folds (default 10)

        return:
            train: {name: element-wise sum over inner folds of the
                    validation-fold predictions}
            test: {name: mean over inner folds of the predictions on S}

    R is truncated to a multiple of ``n_splits`` rows so that every inner
    validation fold has the same length and the predictions can be stacked.

    NOTE(review): summing validation predictions element-wise combines
    predictions made for different rows; stacking usually concatenates the
    out-of-fold predictions instead. That would change the length of the
    returned arrays, so the behaviour is kept as-is here — confirm against
    the consumer of ``train``.
    '''
    # Imported locally so the function does not depend on an import that
    # appears later in the cell.
    from sklearn.model_selection import KFold

    kf1 = KFold(n_splits=n_splits)
    train = {}
    test = {}

    # Keep the largest multiple of n_splits. Slicing with a negative remainder
    # (R.iloc[:-rem]) would return an EMPTY frame when the remainder is 0.
    usable_rows = len(R) - (len(R) % n_splits)
    R = R.iloc[:usable_rows, :]

    # Features of the held-out fold do not change across inner folds.
    S_x = S.iloc[:, :-1]

    for classifier in l:
        train_l = []
        test_l = []
        for train_index1, validation_index1 in kf1.split(R):
            R_kt = R.iloc[train_index1, :]       # inner train set
            R_kv = R.iloc[validation_index1, :]  # inner validation set

            # use R_kt to train the current base classifier
            l[classifier].fit(R_kt.iloc[:, :-1], R_kt.iloc[:, -1])

            train_l.append(l[classifier].predict(R_kv.iloc[:, :-1]))
            test_l.append(l[classifier].predict(S_x))

        train[classifier] = np.sum(np.array(train_l), axis=0)
        test[classifier] = np.sum(np.array(test_l), axis=0) / n_splits

    return train, test
    
    

# split into 10 folds
from sklearn.model_selection import KFold

k = 10
# Shuffle with a fixed seed so the outer split (and therefore every result
# below) is the same on each Restart & Run All.
kf = KFold(n_splits=k, shuffle=True, random_state=0)

kf_splits = kf.split(df)  # yields (train row positions, test row positions)


# One [train, test] pair of base-level outputs per outer fold.
alg_1_result = []
for train_index, test_index in kf_splits:
    r = df.iloc[train_index, :]  # k-1 folds used to build the base models
    s = df.iloc[test_index, :]   # held-out fold

    (train, test) = algorithm1(r, s)
    alg_1_result.append([train, test])


    

In [19]:
# Display the collected base-level outputs: a list with one [train, test] pair
# of {classifier name: array} dicts per outer fold.
alg_1_result

[[{'RF': array([1, 1, 1, 1, 3, 3, 5, 2, 2, 1, 2, 1, 6, 3, 4, 2, 3, 4, 2, 1, 3, 2,
          4, 1, 3, 2, 3]),
   'GNB': array([ 8,  7,  8,  7,  8,  8,  8,  8,  7,  8,  8,  8,  9,  8,  8,  7, 10,
           8,  7,  8,  6,  8, 10,  9,  9, 10,  8]),
   'ADB': array([2, 2, 2, 1, 2, 3, 5, 2, 2, 1, 1, 6, 5, 3, 2, 1, 5, 3, 2, 3, 4, 4,
          5, 2, 5, 3, 4]),
   'ET': array([1, 2, 1, 0, 4, 3, 4, 3, 3, 0, 3, 4, 6, 4, 4, 2, 4, 4, 1, 2, 3, 3,
          5, 2, 5, 2, 3]),
   'GB': array([0, 1, 2, 0, 4, 3, 4, 3, 2, 0, 2, 3, 6, 5, 3, 1, 5, 4, 2, 1, 4, 2,
          3, 3, 3, 2, 3]),
   'MLP': array([0, 2, 2, 1, 3, 3, 5, 2, 3, 1, 3, 5, 6, 4, 5, 1, 4, 2, 2, 3, 2, 4,
          5, 4, 3, 3, 2]),
   'XGB': array([1, 1, 2, 2, 4, 2, 5, 3, 2, 0, 5, 3, 7, 5, 4, 1, 4, 4, 3, 1, 4, 2,
          4, 2, 4, 2, 4]),
   'LR': array([1, 2, 2, 2, 3, 2, 5, 3, 4, 1, 4, 5, 6, 4, 5, 2, 4, 2, 2, 3, 2, 2,
          4, 3, 4, 3, 3])},
  {'RF': array([0.4, 0. , 0. , 0. , 0. , 0. , 0.1, 0.1, 0. , 0. , 0. , 0.7, 0.1,
          0. , 