In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# import warnings filter
from warnings import simplefilter
# ignore all future warnings
simplefilter(action='ignore', category=FutureWarning)


data = pd.read_csv('horse-colic.data', delimiter='\s+', header=None)
test = pd.read_csv('horse-colic.test', delimiter='\s+', header=None)

In [2]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,18,19,20,21,22,23,24,25,26,27
0,2,1,530101,38.5,66,28,3,3,?,2,...,45.0,8.4,?,?,2,2,11300,0,0,2
1,1,1,534817,39.2,88,20,?,?,4,1,...,50.0,85.0,2,2,3,2,2208,0,0,2
2,2,1,530334,38.3,40,24,1,1,3,1,...,33.0,6.7,?,?,1,2,0,0,0,1
3,1,9,5290409,39.1,164,84,4,1,6,2,...,48.0,7.2,3,5.30,2,1,2208,0,0,1
4,2,1,530255,37.3,104,35,?,?,6,2,...,74.0,7.4,?,?,2,2,4300,0,0,2


In [3]:
cols_to_drop = [2, 23, 24, 25, 26, 27]
data = data.drop(columns=cols_to_drop)
test = test.drop(columns=cols_to_drop)
data = data.replace('?', np.nan)
test = test.replace('?', np.nan)


In [4]:
data.head()

Unnamed: 0,0,1,3,4,5,6,7,8,9,10,...,13,14,15,16,17,18,19,20,21,22
0,2,1,38.5,66,28,3.0,3.0,,2,5.0,...,,,,3.0,5.0,45.0,8.4,,,2
1,1,1,39.2,88,20,,,4.0,1,3.0,...,,,,4.0,2.0,50.0,85.0,2.0,2.0,3
2,2,1,38.3,40,24,1.0,1.0,3.0,1,3.0,...,,,,1.0,1.0,33.0,6.7,,,1
3,1,9,39.1,164,84,4.0,1.0,6.0,2,2.0,...,1.0,2.0,5.0,3.0,,48.0,7.2,3.0,5.3,2
4,2,1,37.3,104,35,,,6.0,2,,...,,,,,,74.0,7.4,,,2


In [5]:
test.head()

Unnamed: 0,0,1,3,4,5,6,7,8,9,10,...,13,14,15,16,17,18,19,20,21,22
0,2,1,38.5,54,20,,1.0,2,2,3.0,...,2.0,2.0,5.9,,2.0,42.0,6.3,,,1
1,2,1,37.6,48,36,,,1,1,,...,,,,,,44.0,6.3,1.0,5.0,1
2,1,1,37.7,44,28,,4.0,3,2,5.0,...,1.0,1.0,,3.0,5.0,45.0,70.0,3.0,2.0,1
3,1,1,37.0,56,24,3.0,1.0,4,2,4.0,...,1.0,1.0,,,,35.0,61.0,3.0,2.0,3
4,2,1,38.0,42,12,3.0,,3,1,1.0,...,,,,,2.0,37.0,5.8,,,1


In [6]:
# grouping ”euthanized” with ”died”
data[22] = data[22].replace('3', '2')
test[22] = test[22].replace('3', '2')

In [42]:
# impute missing values using KNN imputation
imputer = KNNImputer(n_neighbors=3)
data = pd.DataFrame(imputer.fit_transform(data)).astype(int)
test = pd.DataFrame(imputer.fit_transform(test)).astype(int)

In [43]:
data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,2,1,38,66,28,3,3,2,2,5,...,2,1,5,3,5,45,8,1,4,2
1,1,1,39,88,20,2,2,4,1,3,...,1,1,2,4,2,50,85,2,2,2
2,2,1,38,40,24,1,1,3,1,3,...,1,1,5,1,1,33,6,1,3,1
3,1,9,39,164,84,4,1,6,2,2,...,1,2,5,3,4,48,7,3,5,2
4,2,1,37,104,35,3,2,6,2,3,...,1,2,5,3,4,74,7,2,3,2


In [44]:
test.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,12,13,14,15,16,17,18,19,20,21
0,2,1,38,54,20,2,1,2,2,3,...,2,2,5,3,2,42,6,1,3,1
1,2,1,37,48,36,2,1,1,1,2,...,1,1,6,3,3,44,6,1,5,1
2,1,1,37,44,28,2,4,3,2,5,...,1,1,5,3,5,45,70,3,2,1
3,1,1,37,56,24,3,1,4,2,4,...,1,1,5,2,4,35,61,3,2,2
4,2,1,38,42,12,3,1,3,1,1,...,1,1,5,3,2,37,5,1,2,1


In [45]:
# split the dataset into training and testing sets
X_train, X_test, y_train, y_test = data.loc[:, 0:20], test.loc[:, 0:20], data[21], test[21]

In [46]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
scaler.fit(X_test)

X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [47]:
from sklearn.model_selection import GridSearchCV

# Define the hyperparameter values to explore
param_grid = {'n_neighbors': [3, 5, 7, 9, 11, 13, 15, 17, 19]}

# Create the grid search object
grid_search = GridSearchCV(KNeighborsClassifier(), param_grid, cv=5)

# Fit the grid search object to the data
grid_search.fit(X_train, y_train)

# Print the best hyperparameters
print(grid_search.best_params_)

{'n_neighbors': 13}


In [48]:
# build the KNN classification model
knn = KNeighborsClassifier(n_neighbors=17)
knn.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=17)

In [49]:
# predict the outcome variable for the testing set
y_pred = knn.predict(X_test)

In [50]:
# evaluate the performance of the model
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.7794117647058824
