In [2]:
import pandas as pd
from lazypredict.Supervised import LazyClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.utils import resample
import numpy as np
from imblearn.over_sampling import SMOTE
import joblib

# random seed
seed = 123

In [3]:
# Read original dataset
data_df = pd.read_csv("data/data.csv")
data_df.sample(frac=1, random_state=seed)
data_df = data_df.dropna(subset=["Diagnosis"])
data_df.fillna(-1, inplace = True)
data_df = data_df[['Age', 
             'BMI', 
             'Sex', 
             'Height',
             'Weight',
             'Appendix_Diameter',
             'Migratory_Pain', 
             'Lower_Right_Abd_Pain',
             'Contralateral_Rebound_Tenderness',
             'Coughing_Pain',
             'Loss_of_Appetite',
             'Body_Temperature',
             'WBC_Count',
             'Neutrophil_Percentage',
             'Segmented_Neutrophils',
             'Neutrophilia',
             'RBC_Count',
             'Hemoglobin',
             'RDW',
             'Thrombocyte_Count',
             'Ketones_in_Urine',
             'RBC_in_Urine',
             'WBC_in_Urine',
             'CRP',
             'Dysuria',
             'Stool',
             'Peritonitis',
             'Psoas_Sign',
             'Diagnosis'
            ]]
    
val_map = [('Sex', {'male':0, 'female':1}), 
                    ('Migratory_Pain', {'yes': 2, 'no':3}), 
                    ('Lower_Right_Abd_Pain', {'yes': 2, 'no':3}),
                    ('Contralateral_Rebound_Tenderness', {'yes': 2, 'no':3}),
                    ('Coughing_Pain', {'yes': 2, 'no':3}),
                    ('Loss_of_Appetite', {'yes': 2, 'no':3}),
                    ('Neutrophilia', {'yes': 2, 'no':3}),
                    ('Ketones_in_Urine', {'no': 3, '+':4, '++':5, '+++':6}),
                    ('RBC_in_Urine', {'no': 3, '+':4, '++':5, '+++':6}),
                    ('WBC_in_Urine', {'no': 3, '+':4, '++':5, '+++':6}),
                    ('Dysuria', {'yes': 2, 'no':3}),
                    ('Stool', {'normal': 7, 'constipation':8, 'diarrhea':9, 'constipation, diarrhea':10}),
                    ('Peritonitis', {'no': 3, 'local':11, 'generalized':12}),
                    ('Psoas_Sign', {'yes': 2, 'no':3}),
                    ('Diagnosis', {'appendicitis':1, 'no appendicitis': 0})]
for column, themap in val_map:
    data_df[column] = data_df[column].replace(themap)
print(data_df)
    
#X = X.astype(float)
#X = X.round(3)


      Age   BMI  Sex  Height  Weight  Appendix_Diameter  Migratory_Pain  \
0   12.68 16.90    1  148.00   37.00               7.10               3   
1   14.10 31.90    0  147.00   69.50              -1.00               2   
2   14.14 23.30    1  163.00   62.00              -1.00               3   
3   16.37 20.60    1  165.00   56.00              -1.00               2   
4   11.08 16.90    1  163.00   45.00               7.00               3   
..    ...   ...  ...     ...     ...                ...             ...   
777 12.41 25.25    1  166.50   70.00               7.50               2   
778 17.09 20.43    1  158.00   51.00              -1.00               3   
779 14.99 19.91    1  152.00   46.00              -1.00               3   
780  7.20 14.30    0  129.30   23.90              14.00               2   
781 11.51 18.17    0  146.50   39.00               8.00               3   

     Lower_Right_Abd_Pain  Contralateral_Rebound_Tenderness  Coughing_Pain  \
0                    

In [4]:
#Random oversampling
minority_class = data_df[data_df["Diagnosis"] == 0]
majority_class = data_df[data_df["Diagnosis"] == 1]
minority_upsampled = resample(minority_class, replace=True, n_samples=len(majority_class),random_state=42)
balanced_data = pd.concat([majority_class, minority_upsampled])
data_df = balanced_data
print(data_df)

      Age   BMI  Sex  Height  Weight  Appendix_Diameter  Migratory_Pain  \
0   12.68 16.90    1  148.00   37.00               7.10               3   
4   11.08 16.90    1  163.00   45.00               7.00               3   
9   14.34 14.90    0  174.00   45.50               8.00               3   
10  11.87 15.70    0  147.00   34.00               9.00               2   
14  15.98 19.70    1  164.00   53.00               9.00               3   
..    ...   ...  ...     ...     ...                ...             ...   
160 14.11 18.50    1  156.00   45.00              -1.00               3   
155 14.99 20.80    1  161.00   54.00               5.10               3   
197 16.58 18.80    1  158.00   47.00              -1.00               3   
179 17.30 22.70    0  173.00   68.00               4.00               3   
324  6.67 25.04    1  124.00   38.50               3.50               3   

     Lower_Right_Abd_Pain  Contralateral_Rebound_Tenderness  Coughing_Pain  \
0                    

In [39]:
#Random undersampling
minority_class = data_df[data_df["Diagnosis"] == 0]
majority_class = data_df[data_df["Diagnosis"] == 1]
majority_downsampled = resample(majority_class, replace=False, n_samples=len(minority_class),random_state=42)
balanced_data = pd.concat([minority_class, majority_downsampled])
data_df = balanced_data
print(data_df)


      Age   BMI  Sex  Height  Weight  Appendix_Diameter  Migratory_Pain  \
1   14.10 31.90    0  147.00   69.50              -1.00               2   
2   14.14 23.30    1  163.00   62.00              -1.00               3   
3   16.37 20.60    1  165.00   56.00              -1.00               2   
5   11.05 30.70    0  121.00   45.00              -1.00               2   
6    8.98 19.40    1  140.00   38.50              -1.00               3   
..    ...   ...  ...     ...     ...                ...             ...   
537  9.04 16.33    1  140.00   32.00              11.00               3   
237  8.67 15.99    1  133.50   28.50               8.00               3   
118 12.59 16.70    0  148.50   36.50               7.50               3   
556  8.20 13.75    1  133.50   24.50               8.00               3   
722 11.38 21.08    0  154.00   50.00              12.00               3   

     Lower_Right_Abd_Pain  Contralateral_Rebound_Tenderness  Coughing_Pain  \
1                    

In [5]:
X = data_df[['Age', 
             'BMI', 
             'Sex', 
             'Height',
             'Weight',
             'Appendix_Diameter',
             'Migratory_Pain', 
             'Lower_Right_Abd_Pain',
             'Contralateral_Rebound_Tenderness',
             'Coughing_Pain',
             'Loss_of_Appetite',
             'Body_Temperature',
             'WBC_Count',
             'Neutrophil_Percentage',
             'Segmented_Neutrophils',
             'Neutrophilia',
             'RBC_Count',
             'Hemoglobin',
             'RDW',
             'Thrombocyte_Count',
             'Ketones_in_Urine',
             'RBC_in_Urine',
             'WBC_in_Urine',
             'CRP',
             'Dysuria',
             'Stool',
             'Peritonitis',
             'Psoas_Sign'
            ]]
             
y = data_df[['Diagnosis']]

'''# Synthetic Minority Over-sampling Technique (SMOTE)
smote = SMOTE(random_state=42)
X, y = smote.fit_resample(X,y)'''
# split data into train and test sets
# 70% training and 20% test
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=seed, stratify=y)

print(X_test)
print(y_test)
X_test.to_csv("data/test_sample.csv")
y_test.to_csv("data/test_diagnosis.csv")



      Age   BMI  Sex  Height  Weight  Appendix_Diameter  Migratory_Pain  \
157  7.72 20.10    0  129.50   33.80               5.50               3   
6    8.98 19.40    1  140.00   38.50              -1.00               3   
423  9.56 15.94    1  133.70   28.50               6.60               3   
138 10.65 13.30    1  136.50   25.00               8.20               3   
575  8.10 14.81    0  135.00   27.00               6.00               2   
..    ...   ...  ...     ...     ...                ...             ...   
53   7.20 14.10    1  125.00   22.00               8.50               3   
269 12.55 20.32    0  161.50   53.00              -1.00               3   
408 17.37 22.64    1  153.00   53.00              -1.00               3   
232 14.29 18.32    1  167.50   51.40               9.00               3   
521  7.44 15.09    1  130.00   25.50               9.00               3   

     Lower_Right_Abd_Pain  Contralateral_Rebound_Tenderness  Coughing_Pain  \
157                  

In [6]:
clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
models,predictions = clf.fit(X_train, X_test, y_train, y_test)
print(models)
with open("results_models/results_lazypredict.txt", "w") as file_write:
    file_write.write(str(models))

100%|██████████████████████████████████████████| 29/29 [00:02<00:00, 13.51it/s]

[LightGBM] [Info] Number of positive: 370, number of negative: 370
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000851 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1691
[LightGBM] [Info] Number of data points in the train set: 740, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LGBMClassifier                     0.95               0.95     0.95      0.95   
ExtraTreesClassifier               0.95               0.95     0.95      0.95   
XGBClassifier                      0.95               0.95     0.95      0.95   
RandomForestClassifier             0.95               0.95     0.95      0.95   
AdaBoostClassifie




In [7]:
#Cross validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
count = 1
for train_idx, test_idx in cv.split(X,y):
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    clf = LazyClassifier(verbose=0, ignore_warnings=True, custom_metric=None)
    models,predictions = clf.fit(X_train, X_test, y_train, y_test)
    print(models)
    
    with open("results_models/results_lazypredict_cross"+str(count)+".txt", "w") as file_write:
        file_write.write(str(models))
        count = count + 1

100%|██████████████████████████████████████████| 29/29 [00:01<00:00, 18.44it/s]


[LightGBM] [Info] Number of positive: 370, number of negative: 370
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000129 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1627
[LightGBM] [Info] Number of data points in the train set: 740, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500000 -> initscore=0.000000
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
ExtraTreesClassifier               0.95               0.95     0.95      0.95   
XGBClassifier                      0.94               0.94     0.94      0.94   
BaggingClassifier                  0.92               0.92     0.92      0.92   
LGBMClassifier                     0.91               0.91     0.91      0.91   
DecisionTreeClass

100%|██████████████████████████████████████████| 29/29 [00:01<00:00, 17.26it/s]


[LightGBM] [Info] Number of positive: 370, number of negative: 371
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000292 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1623
[LightGBM] [Info] Number of data points in the train set: 741, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499325 -> initscore=-0.002699
[LightGBM] [Info] Start training from score -0.002699
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
XGBClassifier                      0.97               0.97     0.97      0.97   
LGBMClassifier                     0.96               0.96     0.96      0.96   
RandomForestClassifier             0.96               0.96     0.96      0.96   
DecisionTreeClassifier             0.96               0.96     0.96      0.96   
BaggingClassifier         

100%|██████████████████████████████████████████| 29/29 [00:01<00:00, 17.58it/s]


[LightGBM] [Info] Number of positive: 370, number of negative: 371
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000300 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1627
[LightGBM] [Info] Number of data points in the train set: 741, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.499325 -> initscore=-0.002699
[LightGBM] [Info] Start training from score -0.002699
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LGBMClassifier                     0.95               0.95     0.95      0.95   
BaggingClassifier                  0.94               0.94     0.94      0.94   
XGBClassifier                      0.94               0.94     0.94      0.94   
AdaBoostClassifier                 0.93               0.93     0.93      0.93   
RandomForestClassifier    

100%|██████████████████████████████████████████| 29/29 [00:01<00:00, 17.63it/s]


[LightGBM] [Info] Number of positive: 371, number of negative: 370
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000273 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1678
[LightGBM] [Info] Number of data points in the train set: 741, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500675 -> initscore=0.002699
[LightGBM] [Info] Start training from score 0.002699
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
ExtraTreesClassifier               0.93               0.93     0.93      0.93   
XGBClassifier                      0.92               0.92     0.92      0.92   
SVC                                0.92               0.92     0.92      0.92   
RandomForestClassifier             0.91               0.91     0.91      0.91   
NuSVC                       

100%|██████████████████████████████████████████| 29/29 [00:01<00:00, 18.92it/s]

[LightGBM] [Info] Number of positive: 371, number of negative: 370
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000244 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1711
[LightGBM] [Info] Number of data points in the train set: 741, number of used features: 28
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.500675 -> initscore=0.002699
[LightGBM] [Info] Start training from score 0.002699
                               Accuracy  Balanced Accuracy  ROC AUC  F1 Score  \
Model                                                                           
LGBMClassifier                     0.96               0.96     0.96      0.96   
RandomForestClassifier             0.95               0.95     0.95      0.95   
ExtraTreesClassifier               0.95               0.95     0.95      0.95   
XGBClassifier                      0.93               0.93     0.93      0.93   
AdaBoostClassifier          


