<a href="https://colab.research.google.com/github/hashan789/wine-prediction/blob/main/wine_predict.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from collections import Counter

def euclidean_distance(x1, x2):
   distance = np.sqrt(np.sum((x1-x2)**2))
   return distance


class KNN:
  def __init__(self,k=3):
    self.k = k

  def fit(self,X,Y):
    self.X_train = X
    self.Y_train = Y

  def predict(self, X):
    predictions = [self._predict(x) for x in X]
    return predictions
  
  def _predict(self, x):
    #compute the distance
    distances = [euclidean_distance(x,x_train) for x_train in self.X_train]

    #get the closet x
    k_indices = np.argsort(distances)[:self.k]
    k_nearset_labels = [self.Y_train[i] for i in k_indices]

    #majority vote - get the label with majority vote
    most_common = Counter(k_nearset_labels).most_common()
    return most_common[0][0]



  
  

In [66]:
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import pandas as pd
import warnings
warnings.simplefilter(action='ignore',category=FutureWarning)

url = "https://archive.ics.uci.edu/ml/machine-learning-databases/wine/wine.data"


#Assign names to the columns of the dataset
names = ['Alcohol','Malic acid','Ash','Alcalinity of ash','Magnesium','Total phenols','Flavanoids','Nonflavanoid phenols','Proanthocyanins','Color intensity','Hue','OD280/OD315 of diluted wines','Proline']

#Read dataset
dataset = pd.read_csv(url , names=names)

x = dataset.iloc[: , 0:13].values  #columns of wine dataset - X-axis
y = dataset.iloc[: , -3].values    #values of Hue - Y-axis [13-3 = 10 -> 10th column]

#split the dataset into test and train
X_train,X_test,Y_train,Y_test=train_test_split(x,y,test_size=.2,random_state=1234)


In [63]:
y.head()

1    1.04
1    1.05
1    1.03
1    0.86
1    1.04
Name: Hue, dtype: float64

In [13]:
#---- Data Cleaning -----

# Are there any records with NaN data?
NaN_data = dataset.isnull().any()
if NaN_data.any():
    print("Some records have NaN values. These will be removed...\n")
    before_rows, before_cols = dataset.shape
    wine_df = dataset.dropna()
    after_rows, after_cols = wine_df.shape
    print("Dropped", after_rows - before_rows, "records. Cleaned dataframe has", after_rows, "records.\n")
else:
    print("There are no records with NaN values. Dataframe is already clean.\n")

There are no records with NaN values. Dataframe is already clean.



In [67]:
clf = KNN(k=3)
clf.fit(X_train, Y_train)
predictions = clf.predict(X_test)

print("predictions are ", predictions)

result = (predictions == Y_test)

accuracy_score = np.sum(result) / len(Y_test)
print("accuracy_score is",accuracy_score)

predictions are  [0.86, 0.86, 0.94, 1.04, 1.03, 1.06, 0.57, 1.25, 1.07, 0.57, 0.7, 1.23, 1.25, 0.97, 0.86, 1.06, 1.08, 0.6, 0.75, 0.89, 1.04, 0.79, 1.45, 0.89, 1.36, 0.56, 0.6, 1.19, 1.02, 1.36, 1.11, 0.56, 1.12, 1.36, 0.48, 1.23]
accuracy_score is 0.08333333333333333
