In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

%matplotlib inline

In [2]:
horse_df = pd.read_csv("horse.csv")
horse_df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no


In [3]:
horse_df.shape

(299, 28)

In [4]:
horse_df.isna().sum()

surgery                    0
age                        0
hospital_number            0
rectal_temp               60
pulse                     24
respiratory_rate          58
temp_of_extremities       56
peripheral_pulse          69
mucous_membrane           47
capillary_refill_time     32
pain                      55
peristalsis               44
abdominal_distention      56
nasogastric_tube         104
nasogastric_reflux       106
nasogastric_reflux_ph    246
rectal_exam_feces        102
abdomen                  118
packed_cell_volume        29
total_protein             33
abdomo_appearance        165
abdomo_protein           198
outcome                    0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
dtype: int64

###  Extracting target values

In [5]:
target = horse_df.outcome
horse_df = horse_df.drop(columns=['outcome'], axis=1)
horse_df.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,abdomen,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,distend_large,45.0,8.4,,,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,other,50.0,85.0,cloudy,2.0,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,normal,33.0,6.7,,,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,,48.0,7.2,serosanguious,5.3,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,,74.0,7.4,,,no,4300,0,0,no


#### Filtering categorical data as doesn;t go well with Decision tree.

In [6]:
horse_df.dtypes

surgery                   object
age                       object
hospital_number            int64
rectal_temp              float64
pulse                    float64
respiratory_rate         float64
temp_of_extremities       object
peripheral_pulse          object
mucous_membrane           object
capillary_refill_time     object
pain                      object
peristalsis               object
abdominal_distention      object
nasogastric_tube          object
nasogastric_reflux        object
nasogastric_reflux_ph    float64
rectal_exam_feces         object
abdomen                   object
packed_cell_volume       float64
total_protein            float64
abdomo_appearance         object
abdomo_protein           float64
surgical_lesion           object
lesion_1                   int64
lesion_2                   int64
lesion_3                   int64
cp_data                   object
dtype: object

####  Identified categorical values where type as object.

In [7]:
categs = ['surgery', 'age', 'temp_of_extremities', 'peripheral_pulse', 'mucous_membrane', 'capillary_refill_time', 'pain', 'peristalsis', 'abdominal_distention', 'nasogastric_tube', 'nasogastric_reflux', 'rectal_exam_feces', 'abdomen', 'abdomo_appearance', 'surgical_lesion', 'cp_data']

for catg in categs:
    print("Unique values in ",catg," => ",horse_df[catg].unique())

Unique values in  surgery  =>  ['no' 'yes']
Unique values in  age  =>  ['adult' 'young']
Unique values in  temp_of_extremities  =>  ['cool' nan 'normal' 'cold' 'warm']
Unique values in  peripheral_pulse  =>  ['reduced' nan 'normal' 'absent' 'increased']
Unique values in  mucous_membrane  =>  [nan 'pale_cyanotic' 'pale_pink' 'dark_cyanotic' 'normal_pink'
 'bright_red' 'bright_pink']
Unique values in  capillary_refill_time  =>  ['more_3_sec' 'less_3_sec' nan '3']
Unique values in  pain  =>  ['extreme_pain' 'mild_pain' 'depressed' nan 'severe_pain' 'alert']
Unique values in  peristalsis  =>  ['absent' 'hypomotile' nan 'hypermotile' 'normal']
Unique values in  abdominal_distention  =>  ['severe' 'slight' 'none' nan 'moderate']
Unique values in  nasogastric_tube  =>  [nan 'none' 'slight' 'significant']
Unique values in  nasogastric_reflux  =>  [nan 'less_1_liter' 'none' 'more_1_liter']
Unique values in  rectal_exam_feces  =>  ['decreased' 'absent' 'normal' nan 'increased']
Unique values in 

###  Converting categorical values to numerical

In [8]:
horse_cleaned_df = pd.get_dummies(horse_df, columns=categs)
horse_cleaned_df.sample(n=5)

Unnamed: 0,hospital_number,rectal_temp,pulse,respiratory_rate,nasogastric_reflux_ph,packed_cell_volume,total_protein,abdomo_protein,lesion_1,lesion_2,...,abdomen_firm,abdomen_normal,abdomen_other,abdomo_appearance_clear,abdomo_appearance_cloudy,abdomo_appearance_serosanguious,surgical_lesion_no,surgical_lesion_yes,cp_data_no,cp_data_yes
202,529685,37.2,36.0,9.0,,35.0,5.7,,31110,0,...,0,1,0,0,0,0,1,0,1,0
211,5294369,38.9,120.0,30.0,3.0,47.0,6.3,,2124,0,...,0,0,0,1,0,0,1,0,1,0
283,530439,38.5,54.0,,,40.0,6.8,7.0,0,0,...,0,0,0,0,1,0,1,0,0,1
212,528183,37.6,45.0,12.0,,39.0,7.0,1.5,2112,0,...,0,0,0,0,1,0,0,1,0,1
255,533871,37.5,72.0,,,35.0,65.0,2.0,7209,0,...,0,0,0,0,1,0,0,1,1,0


## Preparing train test data

In [9]:
x_features = horse_cleaned_df.values
y_target = target.values

### Converting target to numeric so that decision tree get better support

In [10]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [11]:
y_target = encoder.fit_transform(y_target)

In [12]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x_features, y_target, test_size=0.3, random_state=1, stratify=y_target)
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((209, 67), (90, 67), (209,), (90,))

In [13]:
# Decision Tree accuracy very sensitive to null or missing values so we need to take care of them
from sklearn.impute import SimpleImputer

imptr = SimpleImputer(missing_values=np.nan, strategy="most_frequent")
x_train = imptr.fit_transform(x_train)
x_test = imptr.fit_transform(x_test)

In [14]:
from sklearn.tree import DecisionTreeClassifier


dtc = DecisionTreeClassifier()

In [15]:
dtc.fit(x_train, y_train)

DecisionTreeClassifier()

In [16]:
y_predict = dtc.predict(x_test)

In [17]:
from sklearn.metrics import accuracy_score
acc = accuracy_score(y_predict, y_test) * 100

print("Decision Tree Accuracy: {:0.2f}%.".format(acc))


Decision Tree Accuracy: 54.44%.


# Trying Random Forest in contast of Decision Tree

In [18]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(x_train, y_train)
y_rfc_predict =rfc.predict(x_test)

In [19]:
acc = accuracy_score(y_rfc_predict, y_test)
print("Random Forest Accuracy: {:0.2f}%.".format(acc))

Random Forest Accuracy: 0.72%.


#  Observations:
1. Random forest gives better accuracy
2. For both Decision tree and Random forest we need to impute nan values.