In [1]:
import pandas as pd
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
import numpy as np
from imblearn.over_sampling import SMOTE
import joblib

# random seed
seed = 123

In [2]:
# Read original dataset
data_df = pd.read_csv("data/data.csv")
data_df.sample(frac=1, random_state=seed)
data_df = data_df.dropna(subset=["Diagnosis"])
data_df.fillna(-1, inplace = True)
data_df = data_df[['Age', 
             'BMI', 
             'Sex', 
             'Height',
             'Weight',
             'Appendix_Diameter',
             'Migratory_Pain', 
             'Lower_Right_Abd_Pain',
             'Contralateral_Rebound_Tenderness',
             'Coughing_Pain',
             'Loss_of_Appetite',
             'Body_Temperature',
             'WBC_Count',
             'Neutrophil_Percentage',
             'Segmented_Neutrophils',
             'Neutrophilia',
             'RBC_Count',
             'Hemoglobin',
             'RDW',
             'Thrombocyte_Count',
             'Ketones_in_Urine',
             'RBC_in_Urine',
             'WBC_in_Urine',
             'CRP',
             'Dysuria',
             'Stool',
             'Peritonitis',
             'Psoas_Sign',
             'Diagnosis'
            ]]
    
val_map = [('Sex', {'male':0, 'female':1}), 
                    ('Migratory_Pain', {'yes': 2, 'no':3}), 
                    ('Lower_Right_Abd_Pain', {'yes': 2, 'no':3}),
                    ('Contralateral_Rebound_Tenderness', {'yes': 2, 'no':3}),
                    ('Coughing_Pain', {'yes': 2, 'no':3}),
                    ('Loss_of_Appetite', {'yes': 2, 'no':3}),
                    ('Neutrophilia', {'yes': 2, 'no':3}),
                    ('Ketones_in_Urine', {'no': 3, '+':4, '++':5, '+++':6}),
                    ('RBC_in_Urine', {'no': 3, '+':4, '++':5, '+++':6}),
                    ('WBC_in_Urine', {'no': 3, '+':4, '++':5, '+++':6}),
                    ('Dysuria', {'yes': 2, 'no':3}),
                    ('Stool', {'normal': 7, 'constipation':8, 'diarrhea':9, 'constipation, diarrhea':10}),
                    ('Peritonitis', {'no': 3, 'local':11, 'generalized':12}),
                    ('Psoas_Sign', {'yes': 2, 'no':3}),
                    ('Diagnosis', {'appendicitis':1, 'no appendicitis': 0})]
for column, themap in val_map:
    data_df[column] = data_df[column].replace(themap)
print(data_df)
    
#X = X.astype(float)
#X = X.round(3)


      Age   BMI  Sex  Height  Weight  Appendix_Diameter  Migratory_Pain  \
0   12.68 16.90    1  148.00   37.00               7.10               3   
1   14.10 31.90    0  147.00   69.50              -1.00               2   
2   14.14 23.30    1  163.00   62.00              -1.00               3   
3   16.37 20.60    1  165.00   56.00              -1.00               2   
4   11.08 16.90    1  163.00   45.00               7.00               3   
..    ...   ...  ...     ...     ...                ...             ...   
777 12.41 25.25    1  166.50   70.00               7.50               2   
778 17.09 20.43    1  158.00   51.00              -1.00               3   
779 14.99 19.91    1  152.00   46.00              -1.00               3   
780  7.20 14.30    0  129.30   23.90              14.00               2   
781 11.51 18.17    0  146.50   39.00               8.00               3   

     Lower_Right_Abd_Pain  Contralateral_Rebound_Tenderness  Coughing_Pain  \
0                    

In [4]:
'''#Random oversampling
minority_class = data_df[data_df["Diagnosis"] == 0]
majority_class = data_df[data_df["Diagnosis"] == 1]
minority_upsampled = resample(minority_class, replace=True, n_samples=len(majority_class),random_state=42)
balanced_data = pd.concat([majority_class, minority_upsampled])
data_df = balanced_data
print(data_df)'''

      Age   BMI  Sex  Height  Weight  Appendix_Diameter  Migratory_Pain  \
0   12.68 16.90    1  148.00   37.00               7.10               3   
4   11.08 16.90    1  163.00   45.00               7.00               3   
9   14.34 14.90    0  174.00   45.50               8.00               3   
10  11.87 15.70    0  147.00   34.00               9.00               2   
14  15.98 19.70    1  164.00   53.00               9.00               3   
..    ...   ...  ...     ...     ...                ...             ...   
160 14.11 18.50    1  156.00   45.00              -1.00               3   
155 14.99 20.80    1  161.00   54.00               5.10               3   
197 16.58 18.80    1  158.00   47.00              -1.00               3   
179 17.30 22.70    0  173.00   68.00               4.00               3   
324  6.67 25.04    1  124.00   38.50               3.50               3   

     Lower_Right_Abd_Pain  Contralateral_Rebound_Tenderness  Coughing_Pain  \
0                    

In [39]:
#Random undersampling
minority_class = data_df[data_df["Diagnosis"] == 0]
majority_class = data_df[data_df["Diagnosis"] == 1]
majority_downsampled = resample(majority_class, replace=False, n_samples=len(minority_class),random_state=42)
balanced_data = pd.concat([minority_class, majority_downsampled])
data_df = balanced_data
print(data_df)


      Age   BMI  Sex  Height  Weight  Appendix_Diameter  Migratory_Pain  \
1   14.10 31.90    0  147.00   69.50              -1.00               2   
2   14.14 23.30    1  163.00   62.00              -1.00               3   
3   16.37 20.60    1  165.00   56.00              -1.00               2   
5   11.05 30.70    0  121.00   45.00              -1.00               2   
6    8.98 19.40    1  140.00   38.50              -1.00               3   
..    ...   ...  ...     ...     ...                ...             ...   
537  9.04 16.33    1  140.00   32.00              11.00               3   
237  8.67 15.99    1  133.50   28.50               8.00               3   
118 12.59 16.70    0  148.50   36.50               7.50               3   
556  8.20 13.75    1  133.50   24.50               8.00               3   
722 11.38 21.08    0  154.00   50.00              12.00               3   

     Lower_Right_Abd_Pain  Contralateral_Rebound_Tenderness  Coughing_Pain  \
1                    

In [3]:
X = data_df[['Age', 
             'BMI', 
             'Sex', 
             'Height',
             'Weight',
             'Appendix_Diameter',
             'Migratory_Pain', 
             'Lower_Right_Abd_Pain',
             'Contralateral_Rebound_Tenderness',
             'Coughing_Pain',
             'Loss_of_Appetite',
             'Body_Temperature',
             'WBC_Count',
             'Neutrophil_Percentage',
             'Segmented_Neutrophils',
             'Neutrophilia',
             'RBC_Count',
             'Hemoglobin',
             'RDW',
             'Thrombocyte_Count',
             'Ketones_in_Urine',
             'RBC_in_Urine',
             'WBC_in_Urine',
             'CRP',
             'Dysuria',
             'Stool',
             'Peritonitis',
             'Psoas_Sign'
            ]]
             
y = data_df[['Diagnosis']]

# split data into train and test sets
# 70% training and 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed, stratify=y)




In [4]:
X_train["Diagnosis"] = y_train["Diagnosis"]


#Random oversampling
minority_class = X_train[X_train["Diagnosis"] == 0]
majority_class = X_train[X_train["Diagnosis"] == 1]
minority_upsampled = resample(minority_class, replace=True, n_samples=len(majority_class),random_state=42)
balanced_data = pd.concat([majority_class, minority_upsampled])
X_train = balanced_data
y_train = X_train["Diagnosis"]
X_train = X_train.drop(columns=["Diagnosis"])


X_train.to_csv("data/train_sample_over.csv")
y_train.to_csv("data/train_diagnosis_over.csv")
X_test.to_csv("data/test_sample_over.csv")
y_test.to_csv("data/test_diagnosis_over.csv")



In [4]:
X_train["Diagnosis"] = y_train["Diagnosis"]

# Synthetic Minority Over-sampling Technique (SMOTE)
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X_train, y_train)

y_train = X_train["Diagnosis"]
X_train = X_train.drop(columns=["Diagnosis"])

X_train.to_csv("data/train_sample_smote.csv")
y_train.to_csv("data/train_diagnosis_smote.csv")
X_test.to_csv("data/test_sample_smote.csv")
y_test.to_csv("data/test_diagnosis_smote.csv")



In [5]:
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
print(models)
with open("results_models/results_lazypredict.txt", "w") as file_write:
    file_write.write(str(models))

100%|██████████████████████████████████████████| 29/29 [00:01<00:00, 17.39it/s]

[LightGBM] [Info] Number of positive: 370, number of negative: 370
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000389 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2349
[LightGBM] [Info] Number of data points in the train set: 740, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
RandomForestClassifier             0.92               0.93     0.93      0.92   
BaggingClassifier                  0.92               0.92     0.92      0.92   
LGBMClassifier                     0.91               0.92     0.92      0.91   
ExtraTreesClassifier               0.91               0.92     0.92      0.91   
XGBClassifier    




In [6]:
#Cross validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
count = 1
for train_idx, test_idx in cv.split(X,y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    models,predictions = clf.fit(X_train, X_test, y_train, y_test)
    print(models)
    
    with open("results_models/results_lazypredict_cross"+str(count)+".txt", "w") as file_write:
        file_write.write(str(models))
        count = count + 1

100%|██████████████████████████████████████████| 29/29 [00:01<00:00, 18.62it/s]


[LightGBM] [Info] Number of positive: 370, number of negative: 254
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000186 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1566
[LightGBM] [Info] Number of data points in the train set: 624, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.592949 -> initscore=0.376169
[LightGBM] [Info] Start training from score 0.376169
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LGBMClassifier                     0.90               0.89     0.89      0.90   
RandomForestClassifier             0.90               0.88     0.88      0.90   
BaggingClassifier                  0.88               0.87     0.87      0.88   
ExtraTreesClassifier               0.88               0.87     0.87      0.88   
LinearDiscriminantAnalysis  

100%|██████████████████████████████████████████| 29/29 [00:01<00:00, 20.13it/s]


[LightGBM] [Info] Number of positive: 370, number of negative: 254
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000139 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1579
[LightGBM] [Info] Number of data points in the train set: 624, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.592949 -> initscore=0.376169
[LightGBM] [Info] Start training from score 0.376169
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
XGBClassifier                      0.94               0.93     0.93      0.94   
LGBMClassifier                     0.92               0.92     0.92      0.92   
AdaBoostClassifier                 0.92               0.91     0.91      0.92   
RandomForestClassifier             0.92               0.91     0.91      0.92   
BaggingClassifier           

100%|██████████████████████████████████████████| 29/29 [00:01<00:00, 20.02it/s]


[LightGBM] [Info] Number of positive: 370, number of negative: 254
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000219 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1564
[LightGBM] [Info] Number of data points in the train set: 624, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.592949 -> initscore=0.376169
[LightGBM] [Info] Start training from score 0.376169
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
AdaBoostClassifier                 0.93               0.92     0.92      0.93   
LGBMClassifier                     0.92               0.91     0.91      0.92   
XGBClassifier                      0.92               0.91     0.91      0.92   
BaggingClassifier                  0.90               0.90     0.90      0.90   
SGDClassifier               

100%|██████████████████████████████████████████| 29/29 [00:01<00:00, 20.55it/s]


[LightGBM] [Info] Number of positive: 371, number of negative: 253
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000241 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1562
[LightGBM] [Info] Number of data points in the train set: 624, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.594551 -> initscore=0.382813
[LightGBM] [Info] Start training from score 0.382813
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
XGBClassifier                      0.92               0.92     0.92      0.92   
LGBMClassifier                     0.90               0.90     0.90      0.90   
AdaBoostClassifier                 0.89               0.89     0.89      0.89   
RandomForestClassifier             0.89               0.89     0.89      0.89   
BaggingClassifier           

100%|██████████████████████████████████████████| 29/29 [00:01<00:00, 22.12it/s]

[LightGBM] [Info] Number of positive: 371, number of negative: 253
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000262 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1575
[LightGBM] [Info] Number of data points in the train set: 624, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.594551 -> initscore=0.382813
[LightGBM] [Info] Start training from score 0.382813
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
RandomForestClassifier             0.94               0.93     0.93      0.94   
XGBClassifier                      0.92               0.93     0.93      0.92   
LGBMClassifier                     0.92               0.92     0.92      0.92   
ExtraTreesClassifier               0.92               0.91     0.91      0.92   
AdaBoostClassifier          


