In [70]:
import pandas as pd

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn import metrics

**Read raw data from CSV. No column names are provided in the CSV, so they will be set manually**

In [43]:
# Column layout of wine.data: the class label comes first, followed by the
# 13 feature columns. The file has no header row, so names are supplied here.
label = ['classification']
features = [
    'alcohol',
    'malic_acid',
    'ash',
    'ash_alcalinity',
    'magnesium',
    'total_phenols',
    'flavanoids',
    'nonflavanoid_phenols',
    'proanthocyanis',
    'color_intensity',
    'hue',
    'diluted',
    'proline',
]

# header=None keeps pandas from consuming the first data row as column names.
df = pd.read_csv('wine.data', names=[*label, *features], header=None)

In [44]:
# Preview the first five rows to sanity-check the column assignment.
df.head(n=5)

Unnamed: 0,classification,alcohol,malic_acid,ash,ash_alcalinity,magnesium,total_phenols,flavanoids,nonflavanoid_phenols,proanthocyanis,color_intensity,hue,diluted,proline
0,1,14.23,1.71,2.43,15.6,127,2.8,3.06,0.28,2.29,5.64,1.04,3.92,1065
1,1,13.2,1.78,2.14,11.2,100,2.65,2.76,0.26,1.28,4.38,1.05,3.4,1050
2,1,13.16,2.36,2.67,18.6,101,2.8,3.24,0.3,2.81,5.68,1.03,3.17,1185
3,1,14.37,1.95,2.5,16.8,113,3.85,3.49,0.24,2.18,7.8,0.86,3.45,1480
4,1,13.24,2.59,2.87,21.0,118,2.8,2.69,0.39,1.82,4.32,1.04,2.93,735


**Check for null values**

In [45]:
# Count missing entries per column (isna is the same check as isnull).
df.isna().sum()

classification          0
alcohol                 0
malic_acid              0
ash                     0
ash_alcalinity          0
magnesium               0
total_phenols           0
flavanoids              0
nonflavanoid_phenols    0
proanthocyanis          0
color_intensity         0
hue                     0
diluted                 0
proline                 0
dtype: int64

**Create separate datasets for features and labels**

In [46]:
# Everything except the class label is a model input.
# `columns=` is used instead of a positional axis argument: passing the axis
# positionally (`df.drop('classification', 1)`) is rejected by pandas 2.x.
feature_df = df.drop(columns='classification')

# Double brackets keep the label as a one-column DataFrame rather than a Series.
label_df = df[['classification']]

**Normalize features so that they range from 0 to 1**

In [48]:
# Rescale every feature column so its values range from 0 to 1.
# NOTE(review): the scaler is fit on all rows, and the train/test split happens
# afterwards, so the held-out rows contribute to the min/max used for scaling.
# Consider fitting on the training rows only.
feature_values = feature_df.values
min_max_scaler = MinMaxScaler()
min_max_scaler.fit(feature_values)
x_scaled = min_max_scaler.transform(feature_values)

# Wrap the scaled array back into a DataFrame with the original feature names.
feature_df = pd.DataFrame(data=x_scaled, columns=features)

**Split the dataset into training and test sets**

In [65]:
# Hold out 30% of the rows for evaluation; random_state is pinned so the
# same split is produced on every run.
x_train, x_test, y_train, y_test = train_test_split(
    feature_df,
    label_df,
    test_size=0.30,
    random_state=5,
)

**Create and train the KNN classifier**

In [66]:
# k-nearest-neighbours classifier with k = 3.
knn = KNeighborsClassifier(
    n_neighbors=3,
)

In [68]:
# y_train is a one-column DataFrame; flatten it to a 1-D array of labels
# before handing it to the classifier.
train_labels = y_train.values.ravel()
knn.fit(x_train, train_labels)

KNeighborsClassifier(n_neighbors=3)

**Evaluate the model on the test dataset**

In [72]:
# Predict a class for every held-out row.
y_pred = knn.predict(
    x_test,
)

In [73]:
# Fraction of held-out rows whose predicted class matches the true label.
# NOTE(review): the cell execution counts in this notebook are not sequential;
# re-run from a fresh kernel to confirm the recorded accuracy is current.
accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9758064516129032
