In [136]:
# read df from excel file
import pandas as pd

# Load the raw spreadsheet into a DataFrame.
df = pd.read_excel('../data/Z-Alizadeh sani dataset.xlsx')

# Column that is dropped before preprocessing.
df_remove_col = ["BBB"]

# Columns handled as numeric features (order defines the output column order).
df_numeric_features = [
    "Weight", "Length", "BMI", "BP", "PR", "Age",
    "FBS", "CR", "TG", "LDL", "HDL", "BUN",
    "ESR", "HB", "K", "Na", "WBC", "Lymph",
    "Neut", "PLT", "EF-TTE", "Region RWMA",
]

# Columns handled as categorical features (order defines the output column order).
df_categorical_features = [
    "Sex", "DM", "HTN", "Current Smoker", "EX-Smoker", "FH",
    "Obesity", "CRF", "CVA", "Airway disease", "Thyroid Disease", "CHF",
    "DLP", "Edema", "Weak Peripheral Pulse", "Lung rales",
    "Systolic Murmur", "Diastolic Murmur", "Typical Chest Pain", "Dyspnea",
    "Function Class", "Atypical", "Nonanginal", "Exertional CP",
    "LowTH Ang", "Q Wave", "St Elevation", "St Depression",
    "Tinversion", "LVH", "Poor R Progression", "VHD",
]

# Name of the target column, kept as a one-element list so that selecting
# with it returns a DataFrame.
df_lable_name = ["Cath"]

# Display (rows, columns) of the raw frame as the cell output.
df.shape

(303, 56)

In [137]:
def one_hot_encode(df):
    from sklearn.preprocessing import OneHotEncoder
    enc = OneHotEncoder(sparse=False)
    enc.fit(df)
    col_name = enc.get_feature_names_out(df_categorical_features)
    df = pd.DataFrame(enc.transform(df), columns = col_name)
    return df

def scale_df(df):
    """Min-max scale every column of ``df`` with scikit-learn's MinMaxScaler.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing only numeric columns.

    Returns
    -------
    pandas.DataFrame
        Scaled values with the same column labels as ``df`` and a fresh
        default integer index.
    """
    from sklearn.preprocessing import MinMaxScaler

    scaled = MinMaxScaler().fit_transform(df)
    # Label the result from the input frame instead of the notebook-level
    # numeric feature list, so the helper works for any numeric column subset.
    return pd.DataFrame(scaled, columns=df.columns)

def label_binarize(df):
    """Encode the label column(s) in ``df`` with scikit-learn's LabelBinarizer.

    The encoded values are returned as a new DataFrame whose columns are
    named by the notebook-level ``df_lable_name`` list.
    """
    from sklearn.preprocessing import LabelBinarizer

    binarizer = LabelBinarizer()
    # NOTE(review): which class ends up encoded as 1 is decided by the
    # binarizer's class ordering - confirm it matches the intended positive class.
    encoded = binarizer.fit(df).transform(df)
    return pd.DataFrame(encoded, columns=df_lable_name)

def preprocess_df(df):
    """Build the model-ready frame from the raw dataset.

    Steps: drop the columns listed in ``df_remove_col``, scale the numeric
    features, one-hot encode the categorical features, binarize the label,
    then concatenate the three parts column-wise (label last).

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame containing every column named in ``df_numeric_features``,
        ``df_categorical_features`` and ``df_lable_name``.

    Returns
    -------
    pandas.DataFrame
        Numeric columns, then one-hot columns, then the label column.
        The input frame is left untouched.
    """
    # Non-mutating drop: the caller's frame keeps its columns, and a column
    # that is already absent does not abort preprocessing.
    df = df.drop(columns=df_remove_col, errors="ignore")

    df_numberic = scale_df(df[df_numeric_features])
    df_categorical = one_hot_encode(df[df_categorical_features])
    df_lable = label_binarize(df[df_lable_name])

    # Each helper returns a frame with a fresh default index, so the
    # column-wise concat lines the rows up positionally.
    return pd.concat([df_numberic, df_categorical, df_lable], axis=1)

# Replace the raw frame with its preprocessed version and preview it.
# NOTE: this rebinds `df`, so the cell cannot be re-run on its own - the raw
# columns that preprocess_df selects no longer exist afterwards; re-run the
# loading cell first.
df = preprocess_df(df)
df.head()

Unnamed: 0,Weight,Length,BMI,BP,PR,Age,FBS,CR,TG,LDL,...,Tinversion_1,LVH_N,LVH_Y,Poor R Progression_N,Poor R Progression_Y,VHD_Moderate,VHD_N,VHD_Severe,VHD_mild,Cath
0,0.583333,0.729167,0.494721,0.2,0.5,0.410714,0.08284,0.117647,0.210267,0.640187,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
1,0.305556,0.354167,0.451314,0.5,0.5,0.660714,0.053254,0.294118,0.268509,0.481308,...,1.0,1.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0
2,0.083333,0.5,0.086105,0.1,0.833333,0.428571,0.068047,0.294118,0.065153,0.242991,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0
3,0.263889,0.375,0.382846,0.1,0.5,0.642857,0.047337,0.411765,0.025666,0.172897,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1
4,0.541667,0.270833,0.836058,0.2,0.5,0.357143,0.12426,0.294118,0.131293,0.429907,...,0.0,1.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,1


In [139]:
# Persist the preprocessed frame. With to_excel's default index=True the row
# index is written as the first, unnamed column of the sheet.
df.to_excel('../data/Z-Alizadeh sani dataset_preprocessed.xlsx')

In [None]:
# Split the samples into 10 shuffled folds.
from sklearn.model_selection import KFold

k = 10
# A fixed seed makes the shuffle reproducible. Because kf.split(x) is called
# once per classifier later on, it also guarantees every classifier is
# evaluated on the same partitions.
kf = KFold(n_splits=k, shuffle=True, random_state=0)

# Features are every column except the last; the label is the last column.
x = df.iloc[:, :-1]
y = df.iloc[:, -1]


In [None]:
# Example: how to obtain the train and test sets for each fold
# for train_index , test_index in kf.split(x):
#     X_train , X_test = x.iloc[train_index,:],x.iloc[test_index,:]
#     y_train , y_test = y[train_index] , y[test_index]
#     print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [None]:
from sklearn.ensemble import RandomForestClassifier # RF
from sklearn.ensemble import ExtraTreesClassifier # ET
from sklearn.ensemble import AdaBoostClassifier # ADB
from sklearn.svm import SVC # svc
from sklearn.neural_network import MLPClassifier # MLP
from xgboost import XGBModel # XGB
from sklearn.gaussian_process import GaussianProcessClassifier # GPC
from sklearn.naive_bayes import GaussianNB # GNB
from sklearn.linear_model import LogisticRegression # LR
from sklearn.ensemble import GradientBoostingClassifier # GBC

# The import list above only brings in the generic XGBModel base estimator;
# the classifier wrapper is imported here so this block stays self-contained.
from xgboost import XGBClassifier

# Candidate classifiers keyed by a short display name.
l = {
    'RF': RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0),
    'GNB': GaussianNB(priors=None, var_smoothing=1e-09),
    'ADB': AdaBoostClassifier(n_estimators=50, random_state=0),
    # Seeded like the other ensembles so repeated runs are comparable.
    'ET': ExtraTreesClassifier(n_estimators=100, criterion='gini', min_samples_split=2, random_state=0),
    # `loss` is left at its default: the old name 'deviance' is rejected by
    # newer scikit-learn releases, and the default is the same loss in both.
    'GB': GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, random_state=0),
    'MLP': MLPClassifier(hidden_layer_sizes=(100,), activation='relu', solver='adam', alpha=0.00001, random_state=0),
    # Fixes: classifier wrapper instead of XGBModel, and `max_depth` instead of
    # the misspelled `maxdepth`.
    # NOTE(review): the original also passed eta=0.05, which is an alias of
    # learning_rate; only learning_rate=0.5 is kept - confirm the intended step size.
    'XGB': XGBClassifier(random_state=1, learning_rate=0.5, n_estimators=7, max_depth=5, objective='binary:logistic'),
    # 'newtoncg' is not a valid solver name; the solver is spelled 'newton-cg'.
    'LR': LogisticRegression(solver='newton-cg', C=100),
}


In [None]:
# Iterate over every classifier and every fold, materialising the train/test
# split for each. Fitting and scoring are still disabled (commented out).
for classifier in l:
    # print(classifier)
    for train_index, test_index in kf.split(x):
        # kf.split yields positional indices, so both the features and the
        # labels are selected with .iloc (plain y[...] would do a label lookup).
        X_train, X_test = x.iloc[train_index, :], x.iloc[test_index, :]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # l[classifier].fit(X_train, y_train)
        # print(classifier, l[classifier].score(X_test, y_test))
    # print('\n')

(303, 23857)