# Notebook for Kaggle data "Horse survival data"

* This is a solution to the Kaggle "Horse Survival data" problem

    * The feature exploration is not shown in this notebook. 


## Requirements
- Install pandas, numpy and scikit-learn. pycaret and lazypredict are optional: they are only needed for the model-comparison sections at the end of the notebook
- Download the data from [Kaggle](https://www.kaggle.com/datasets/yasserh/horse-survival-dataset?select=horse.csv) and save it locally as "horse.csv" (if you change the name, update the `HORSE_SURVIVAL_DATA_CSV` global variable below)


In [15]:
import pandas as pd
import numpy as np
#import seaborn as sns

from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import RobustScaler # ,StandardScalar
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer
from sklearn.impute import SimpleImputer


# TODO: Change this to point to the actual data file.
# Path of the horse-survival CSV; it is passed to pd.read_csv in the data-loading cell.
HORSE_SURVIVAL_DATA_CSV = 'horse.csv'

In [16]:
# Read the raw dataset; the prediction target is the "outcome" column.
data_df = pd.read_csv(HORSE_SURVIVAL_DATA_CSV)
Y = data_df['outcome']

# Build the feature frame by dropping, in a single non-mutating call:
#   - hospital_number: in real life there might be correlation between hospital and
#     outcomes, but in this case the correlation is very small
#   - outcome: the target itself must not be among the inputs
#   - lesion_2, lesion_3: these columns don't have much data and there is very little
#     correlation with the outcome
X = data_df.drop(columns=['hospital_number', 'outcome', 'lesion_2', 'lesion_3'])


In [17]:
# Partition features and target into train and test portions.
# The fixed random_state keeps the partition identical on every run.
(X_train_df, X_test_df,
 Y_train_df, Y_test_df) = train_test_split(X, Y, random_state=61)

# Preview the first training rows.
X_train_df.head()

Unnamed: 0,surgery,age,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,pain,...,nasogastric_reflux_ph,rectal_exam_feces,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,cp_data
36,no,adult,38.3,112.0,16.0,,reduced,bright_red,more_3_sec,,...,,,distend_large,51.0,6.0,cloudy,1.0,no,5205,yes
145,yes,adult,38.2,48.0,,normal,reduced,pale_pink,less_3_sec,mild_pain,...,2.0,absent,distend_large,42.0,71.0,,,yes,3111,no
53,no,adult,38.6,40.0,20.0,,,,less_3_sec,,...,,,,41.0,6.4,,,no,3111,yes
117,no,adult,39.5,,,cool,reduced,pale_cyanotic,more_3_sec,mild_pain,...,5.5,absent,distend_large,,6.7,clear,,yes,4205,no
90,no,adult,38.0,52.0,16.0,,,,,depressed,...,1.0,normal,normal,53.0,86.0,,,yes,2322,no


In [18]:
column_encoders = {
    'surgery' : ['no', 'yes'],
    'age' : ['young', 'adult'],
    'temp_of_extremities' : ['cool', 'normal', 'warm'],
    'peripheral_pulse' : ['absent', 'reduced', 'normal', 'increased'],
    'mucous_membrane' : ['dark_cyanotic', 'pale_cyanotic', 'bright_red', 'bright_pink', 'pale_pink', 'normal_pink'],
    'capillary_refill_time' : ['less_3_sec', 'more_3_sec'],
    'pain' : ['alert', 'mild_pain', 'depressed', 'severe_pain'],
    'peristalsis' : ['absent', 'hypomotile', 'hypermotile', 'normal'],
    'abdominal_distention' : ['none', 'slight', 'moderate', 'severe'],
    'nasogastric_tube' : ['none', 'slight', 'significant'],
    'nasogastric_reflux' : ['less_1_liter', 'more_1_liter'],
    'rectal_exam_feces': ['absent', 'decreased', 'normal', 'increased'],
    'abdomo_appearance' : ['clear', 'cloudy', 'serosanguious'],
    'surgical_lesion' : ['no', 'yes'],
    'cp_data' : ['no', 'yes']
}


def build_column_transformer():
    """
    Build the ColumnTransformer that encodes and scales the feature columns.

    Steps, in output order:
      * one OrdinalEncoder per key of the module-level ``column_encoders`` dict,
        using that key's list as the category order (integer output);
      * a OneHotEncoder for ``abdomen`` (dense integer output). A category that was
        not present when the encoder was fitted is encoded as all zeros instead of
        raising, so a rare value missing from a small training fold does not break
        ``transform``;
      * every remaining column goes through RobustScaler.

    Returns:
        An unfitted ``ColumnTransformer``.
    """
    ordinal_steps = [
        (column, OrdinalEncoder(dtype='int', categories=[ordered_values]), [column])
        for column, ordered_values in column_encoders.items()
    ]
    abdomen_step = (
        'abdomen',
        OneHotEncoder(dtype='int', sparse_output=False, handle_unknown='ignore'),
        ['abdomen'],
    )
    return ColumnTransformer(
        ordinal_steps + [abdomen_step],
        remainder=RobustScaler(),
        verbose_feature_names_out=True,
    )

column_transformer = build_column_transformer()

In [19]:


# Cleanup rules per categorical column:
#   column -> (value substituted for missing entries, {raw spelling: canonical spelling})
# Aliases are matched against the lower-cased text.
_CATEGORICAL_CLEANUP_RULES = {
    'temp_of_extremities': ('normal', {'cold': 'cool', 'none': 'normal'}),
    'peripheral_pulse': ('normal', {}),
    'mucous_membrane': ('normal_pink', {}),
    'capillary_refill_time': ('less_3_sec', {'3': 'less_3_sec'}),
    'pain': ('alert', {'slight': 'mild_pain', 'extreme_pain': 'severe_pain'}),
    'peristalsis': ('normal', {'distend_small': 'normal'}),
    'abdominal_distention': ('none', {}),
    'nasogastric_tube': ('none', {}),
    'nasogastric_reflux': ('less_1_liter', {'slight': 'less_1_liter', 'none': 'less_1_liter'}),
    'rectal_exam_feces': ('normal', {'serosanguious': 'normal'}),
    'abdomen': ('normal', {'other': 'normal'}),
    'abdomo_appearance': ('clear', {}),
}


def column_custom_preprocessor(Xorig):
    """
    Fill defaults and normalise the spelling of the categorical text columns.

    For each column named in ``_CATEGORICAL_CLEANUP_RULES`` that is present in the
    frame: missing entries are replaced by the column's default, the text is
    lower-cased, and known alternative spellings are mapped to their canonical value.
    Columns that are absent are skipped, so the function keeps working when a
    column has been removed as part of model training (low correlation, data
    scientist choice, or no real-world evidence). All other columns are returned
    unchanged.

    Args:
        Xorig: pandas DataFrame of raw features. It is not modified.

    Returns:
        A cleaned copy of ``Xorig`` with the same columns and index.
    """
    X = Xorig.copy()

    # A capillary refill time recorded as the number 3 must become text *before*
    # lower-casing: `.str.lower()` turns non-string entries into NaN, which would
    # otherwise silently discard the value.
    if 'capillary_refill_time' in X.columns:
        X['capillary_refill_time'] = X['capillary_refill_time'].replace(3, '3')

    for column, (missing_default, aliases) in _CATEGORICAL_CLEANUP_RULES.items():
        if column not in X.columns:
            continue
        cleaned = X[column].fillna(missing_default).str.lower()
        if aliases:
            cleaned = cleaned.replace(aliases)
        X[column] = cleaned

    return X


# Expose the cleanup function as a pipeline step; 'one-to-one' means the step
# reports its output feature names as identical to its input names.
preprocess_transformer = FunctionTransformer(
    func=column_custom_preprocessor,
    feature_names_out='one-to-one',
)


In [20]:
# Feature preparation: clean the categorical text, encode/scale the columns,
# then fill any NaN that is still present with the column mean.
transform_pipeline = Pipeline([
    ('preprocess_transformer', preprocess_transformer),
    ('column_transform', column_transformer),
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean')),
])

###
# Random forest classifier
###
# The forest is seeded so that the fitted model (and therefore its score) is
# repeatable after a kernel restart; 61 is the seed already used for the
# train/test split.
clf = RandomForestClassifier(random_state=61)
random_forest_pipeline = Pipeline([
    ('transform_pipeline', transform_pipeline),
    ('clf', clf),
])
random_forest_pipeline.fit(X_train_df, Y_train_df)


In [21]:
# Show the names of the feature columns emitted by the fitted transform pipeline.
transform_pipeline.get_feature_names_out()

array(['surgery__surgery', 'age__age',
       'temp_of_extremities__temp_of_extremities',
       'peripheral_pulse__peripheral_pulse',
       'mucous_membrane__mucous_membrane',
       'capillary_refill_time__capillary_refill_time', 'pain__pain',
       'peristalsis__peristalsis',
       'abdominal_distention__abdominal_distention',
       'nasogastric_tube__nasogastric_tube',
       'nasogastric_reflux__nasogastric_reflux',
       'rectal_exam_feces__rectal_exam_feces',
       'abdomo_appearance__abdomo_appearance',
       'surgical_lesion__surgical_lesion', 'cp_data__cp_data',
       'abdomen__abdomen_distend_large', 'abdomen__abdomen_distend_small',
       'abdomen__abdomen_firm', 'abdomen__abdomen_normal',
       'remainder__rectal_temp', 'remainder__pulse',
       'remainder__respiratory_rate', 'remainder__nasogastric_reflux_ph',
       'remainder__packed_cell_volume', 'remainder__total_protein',
       'remainder__abdomo_protein', 'remainder__lesion_1'], dtype=object)

In [22]:
# Score the fitted random-forest pipeline on the held-out test split.
random_forest_pipeline.score(X_test_df, Y_test_df)

0.76

# Use PyCaret to get the best model

In [23]:
from pycaret.classification import compare_models, models, setup

# Give PyCaret the full feature frame and target. Its built-in preprocessing is
# switched off and the notebook's own transform_pipeline is supplied instead.
s = setup(
    X,
    target=Y,
    session_id=123,
    preprocess=False,
    custom_pipeline=transform_pipeline,
)

# Compare the candidate classifiers, leaving LightGBM out.
best = compare_models(exclude=['lightgbm'])



Unnamed: 0,Description,Value
0,Session id,123
1,Target,outcome
2,Target type,Multiclass
3,Original data shape,"(299, 25)"
4,Transformed data shape,"(299, 28)"
5,Transformed train set shape,"(209, 28)"
6,Transformed test set shape,"(90, 28)"
7,Numeric features,8
8,Categorical features,16
9,Rows with missing values,98.0%


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ridge,Ridge Classifier,0.7131,0.0,0.7131,0.6538,0.673,0.4383,0.4603,0.016
lr,Logistic Regression,0.7033,0.7794,0.7033,0.6671,0.6773,0.432,0.4441,0.184
rf,Random Forest Classifier,0.7031,0.8405,0.7031,0.6665,0.6687,0.4162,0.4415,0.041
lda,Linear Discriminant Analysis,0.6986,0.7649,0.6986,0.6691,0.6764,0.434,0.4453,0.017
et,Extra Trees Classifier,0.6986,0.8521,0.6986,0.6721,0.6701,0.42,0.4378,0.042
gbc,Gradient Boosting Classifier,0.689,0.8149,0.689,0.6675,0.6714,0.4287,0.4393,0.053
knn,K Neighbors Classifier,0.6502,0.7485,0.6502,0.6084,0.6242,0.3534,0.3627,0.098
dt,Decision Tree Classifier,0.6412,0.695,0.6412,0.6548,0.6377,0.3653,0.3747,0.017
svm,SVM - Linear Kernel,0.6174,0.0,0.6174,0.6296,0.589,0.2595,0.2993,0.018
ada,Ada Boost Classifier,0.6129,0.7416,0.6129,0.6346,0.6117,0.305,0.3123,0.03


# Use Lazypredict to get the best model

In [24]:
from lazypredict.Supervised import LazyClassifier

# Fit LazyPredict's batch of classifiers on the features produced by the
# already-fitted transform_pipeline, then display the comparison table.
lazypredict_clf = LazyClassifier(ignore_warnings=True, custom_metric=None)
models2, predictions2 = lazypredict_clf.fit(
    transform_pipeline.transform(X_train_df),
    transform_pipeline.transform(X_test_df),
    Y_train_df,
    Y_test_df,
)
models2

100%|██████████| 29/29 [00:00<00:00, 48.90it/s]

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000058 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 287
[LightGBM] [Info] Number of data points in the train set: 224, number of used features: 25
[LightGBM] [Info] Start training from score -1.386294
[LightGBM] [Info] Start training from score -1.977659
[LightGBM] [Info] Start training from score -0.491665





Unnamed: 0_level_0,Accuracy,Balanced Accuracy,ROC AUC,F1 Score,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
BaggingClassifier,0.81,0.72,,0.8,0.03
BernoulliNB,0.77,0.71,,0.77,0.01
NearestCentroid,0.72,0.7,,0.72,0.03
DecisionTreeClassifier,0.71,0.66,,0.71,0.01
LGBMClassifier,0.76,0.65,,0.74,0.04
RandomForestClassifier,0.75,0.62,,0.71,0.09
GaussianNB,0.6,0.62,,0.59,0.01
LogisticRegression,0.69,0.6,,0.68,0.02
LabelPropagation,0.71,0.6,,0.68,0.01
LabelSpreading,0.71,0.6,,0.68,0.01
