In [45]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from time import time
from sklearn.neighbors import KNeighborsClassifier

In [3]:
db = pd.read_csv("winequality.csv")

In [4]:
db.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,white,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,white,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,white,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,white,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [5]:
db.loc[:, "type"] = (db["type"] == "red").astype(int)

  db.loc[:, "type"] = (db["type"] == "red").astype(int)


In [6]:
db.head()

Unnamed: 0,type,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,0,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,0,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,0,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [21]:
X, y_quality, y_red = db.iloc[:,1:-1], db.iloc[:,-1], db.iloc[:,0]
X = X.fillna(0)

In [22]:
X.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9


In [23]:
y_quality.head()

0    6
1    6
2    6
3    6
4    6
Name: quality, dtype: int64

In [24]:
y_red.head()

0    0
1    0
2    0
3    0
4    0
Name: type, dtype: int64

In [25]:
scalar = StandardScaler()
X = scalar.fit_transform(X)

In [26]:
y_quality = np.asarray(y_quality)
y_red = np.asarray(y_red)

In [27]:
X_train, X_test, y_red_train, y_red_test = train_test_split(X, y_red, test_size=0.25, random_state=42)
X_train, X_test, y_quality_train, y_quality_test = train_test_split(X, y_quality, test_size=0.25, random_state=42)

First implementation

In [28]:
def KNN_old(X_train, Y_train, X_test, k=3):
    previsoes = np.array(list())
    X_test = np.array(X_test)
    Y_train = np.array(Y_train)
    X_train = np.array(X_train)
    for i in X_test:
        labels = np.copy(Y_train)
        distancias = np.array(list())
        quadrado = np.square(X_train - i)
        soma = np.sum(quadrado, axis=1)
        raiz = np.sqrt(soma)
        for i in range(k):
            index = np.argmin(raiz)
            distancias = np.append(distancias, labels[index])
            raiz = np.delete(raiz, index)
            labels = np.delete(labels, index)
        (values,counts) = np.unique(distancias,return_counts=True)
        ind=np.argmax(counts)
        previsoes = np.append(previsoes, values[ind])    
    return previsoes

Second implementation

In [29]:
def KNN(X_test, X_train, y_train, k=3):
    y_pred = []
    for i in X_test:
        distances = np.sqrt(np.sum(np.square(X_train - i), axis=1))
        indexs = np.argpartition(distances, k)[:k]
        labels = y_train[indexs]
        pred = np.argmax(np.bincount(labels))
        y_pred = np.append(y_pred, pred)
    return y_pred

Runtime tests

In [44]:
init = time()
pred_1 = KNN_old(X_train, y_red_train, X_test, k=3)
end = time()
print("Tempo 1: ",end - init)
init2 = time()
pred_2 = KNN(X_test, X_train,y_red_train, k=3)
end2 = time()
print("Tempo 2: ", end2 - init2)

Tempo 1:  0.3188164234161377
Tempo 2:  0.241227388381958


Comparing to sklearn

In [31]:
neigh = KNeighborsClassifier(n_neighbors=3)

In [32]:
neigh = neigh.fit(X_train, y_red_train)

In [46]:
init = time()
y_pred_3 = neigh.predict(X_test)
end = time()
end - init

0.07077455520629883

Testing if they yield the same result

In [39]:
(y_pred_3 == pred_2).all() and (pred_2 == pred_1).all()

True