In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score


In [4]:
data = pd.read_csv('horse.csv')


In [5]:
data.head()

Unnamed: 0,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,capillary_refill_time,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,outcome,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data
0,no,adult,530101,38.5,66.0,28.0,cool,reduced,,more_3_sec,...,45.0,8.4,,,died,no,11300,0,0,no
1,yes,adult,534817,39.2,88.0,20.0,,,pale_cyanotic,less_3_sec,...,50.0,85.0,cloudy,2.0,euthanized,no,2208,0,0,no
2,no,adult,530334,38.3,40.0,24.0,normal,normal,pale_pink,less_3_sec,...,33.0,6.7,,,lived,no,0,0,0,yes
3,yes,young,5290409,39.1,164.0,84.0,cold,normal,dark_cyanotic,more_3_sec,...,48.0,7.2,serosanguious,5.3,died,yes,2208,0,0,yes
4,no,adult,530255,37.3,104.0,35.0,,,dark_cyanotic,more_3_sec,...,74.0,7.4,,,died,no,4300,0,0,no


In [6]:
data.isna().sum()

surgery                    0
age                        0
hospital_number            0
rectal_temp               60
pulse                     24
respiratory_rate          58
temp_of_extremities       56
peripheral_pulse          69
mucous_membrane           47
capillary_refill_time     32
pain                      55
peristalsis               44
abdominal_distention      56
nasogastric_tube         104
nasogastric_reflux       106
nasogastric_reflux_ph    246
rectal_exam_feces        102
abdomen                  118
packed_cell_volume        29
total_protein             33
abdomo_appearance        165
abdomo_protein           198
outcome                    0
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
dtype: int64

In [7]:
data.outcome.value_counts()

lived         178
died           77
euthanized     44
Name: outcome, dtype: int64

In [8]:
target = data['outcome']

In [9]:
features = data.drop(['outcome'], axis = 1)

In [10]:
features_t = pd.get_dummies(features)

In [25]:
X_train, X_test, y_train, y_test = train_test_split(features_t, target, random_state = 10)

In [26]:
imputer = SimpleImputer(missing_values = np.nan, strategy = 'most_frequent')

In [27]:
X_train = imputer.fit_transform(X_train)
X_test = imputer.fit_transform(X_test)

In [28]:
my_dt_model = DecisionTreeClassifier(class_weight={'died':1, 'euthanized':2, 'lived':1}, max_depth = 3, max_leaf_nodes=6, min_samples_leaf=1, min_samples_split=2)

In [29]:
my_dt_model.fit(X_train, y_train)

In [30]:
y_pred = my_dt_model.predict(X_test)

In [31]:
print(accuracy_score(y_test,y_pred))

0.6133333333333333


## Voting Classifier

In [32]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier

In [42]:
log_clf = LogisticRegression(solver='lbfgs', max_iter = 1000)
svm_clf = SVC()
dt_clf = DecisionTreeClassifier()

In [43]:
voting_clf = VotingClassifier(estimators=[('lr', log_clf), ('svc', svm_clf), ('dt', dt_clf)])

In [44]:
voting_clf.fit(X_train, y_train)

In [47]:
for clf in (log_clf, svm_clf, dt_clf, voting_clf):
  clf.fit(X_train, y_train)
  y_pred = clf.predict(X_test)
  print(clf.__class__.__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.6266666666666667
SVC 0.6533333333333333
DecisionTreeClassifier 0.6133333333333333
VotingClassifier 0.6533333333333333


## Bagging

In [48]:
from sklearn.ensemble import BaggingClassifier

In [63]:
bag_clf = BaggingClassifier(SVC(), n_estimators = 150, bootstrap=True)

In [64]:
bag_clf.fit(X_train, y_train)

In [65]:
y_pred = bag_clf.predict(X_test)

In [66]:
print(accuracy_score(y_test, y_pred))

0.6533333333333333


In [67]:
bag_clf_log = BaggingClassifier(LogisticRegression(solver='lbfgs', max_iter = 1000), n_estimators = 150, bootstrap=True)

In [68]:
bag_clf_log.fit(X_train, y_train)

In [69]:
y_pred_log = bag_clf_log.predict(X_test)

In [70]:
print(accuracy_score(y_test, y_pred_log))

0.6266666666666667
