In [1]:
import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split 
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score

In [2]:
# Location of the CSV file that this notebook analyses (fetched over HTTP).
url = "https://raw.githubusercontent.com/hernanChain/csvFiles/master/finalhorse.csv"
# Parse the remote CSV into a DataFrame; the following cells select columns from `data`.
data = pd.read_csv(filepath_or_buffer=url)

In [3]:
# Convert the target column to strings, writing the result back onto `data`.
data["cp_data"] = data["cp_data"].astype(str)

# Columns of `data` that are fed to the model as input features.
features_columns = [
    'surgery?',
    'Age',
    'rectal_temperature',
    'pulse',
    'respiratory_rate',
    'temperature_of_extremities',
    'peripheral_pulse',
    'mucous_membranes',
    'capillary_refill_time',
    'pain',
    'peristalsis',
    'abdominal_distension',
    'nasogastric_tube',
    'nasogastric_reflux',
    'nasogastric_reflux_PH',
    'rectal_examination',
    'abdomen',
    'packed_cell_volume',
    'total_protein',
    'abdominocentesis_appearance',
    'abdomcentesis_total_protein',
    'outcome',
    'surgical_lesion?',
    'type_of_lesion_1',
    'type_of_lesion_2',
    'type_of_lesion_3',
]

# Target vector (the string-typed cp_data column) and feature matrix.
y = data["cp_data"]
X = data[features_columns]

In [4]:
# Hold-out split: 70% of the rows for training, 30% for testing.
# random_state is pinned so the same rows land in each part on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [5]:
# Create the Decision Tree classifier object.
# random_state is fixed because scikit-learn randomly permutes the candidate
# features at every split (even with the default "best" splitter), so an
# unseeded tree -- and the accuracy reported below -- can change between runs.
clf = DecisionTreeClassifier(random_state=1)
# Train the Decision Tree classifier on the training part of the split
# (fit() trains the estimator in place, so no re-assignment is needed).
clf.fit(X_train, y_train)
# Predict the response for the held-out test rows.
y_pred = clf.predict(X_test)
# Accuracy on the single train/test split (fraction of correct predictions).
score_pred = accuracy_score(y_test, y_pred)

In [6]:
# Create the splitter used for cross validation: 5 folds, rows taken in their
# original order (shuffle=False).
# The train/test index pairs are produced lazily by kf.split(X) in the loop
# that consumes them; calling kf.split(X) here on its own only builds a
# generator that is thrown away, so that call has been dropped.
kf = KFold(n_splits=5, shuffle=False)

In [7]:
# Accuracy of the model trained on each fold; one value is appended per fold.
accuracy_model = []

# Iterate over each train-test split
for train_index, test_index in kf.split(X):
    # kf.split yields positional row numbers, so both X and y are sliced with
    # .iloc. Plain y[train_index] would treat them as index labels, which is
    # only correct while the index happens to be the default 0..n-1 range.
    # Fold-local names are used so the hold-out split (X_train, X_test,
    # y_train, y_test) created earlier is not overwritten by this loop.
    X_fold_train, X_fold_test = X.iloc[train_index], X.iloc[test_index]
    y_fold_train, y_fold_test = y.iloc[train_index], y.iloc[test_index]
    # Train a fresh, unfitted tree with the same hyperparameters as clf, so the
    # classifier fitted on the hold-out split is left untouched.
    model = DecisionTreeClassifier(**clf.get_params()).fit(X_fold_train, y_fold_train)
    # Append this fold's accuracy (fraction of correctly classified rows).
    accuracy_model.append(accuracy_score(y_fold_test, model.predict(X_fold_test)))


In [8]:
# Report the hold-out accuracy, the per-fold accuracies and their mean,
# one "label value" line each.
for label, value in (
    ("Accuracy with Splitting: ", score_pred),
    ("Accuracies with Cross Validation: ", accuracy_model),
    ("Accuracy with Cross Validation: ", np.mean(accuracy_model)),
):
    print(label, value)

Accuracy with Splitting:  0.7777777777777778
Accuracies with Cross Validation:  [0.7333333333333333, 0.8333333333333334, 0.7666666666666667, 0.75, 0.7166666666666667]
Accuracy with Cross Validation:  0.76
