# Wine Quality Prediction

Importing libraries for pandas


In [1]:
import pandas as pd
import numpy as np

[UCI Data source](https://archive.ics.uci.edu/ml/datasets/wine+quality)

In [2]:
wine_df = pd.read_csv("./ML Datasets/winequality-red.csv",sep=";")
wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [3]:
wine_df.shape

(1599, 12)

check for null values

In [4]:
wine_df.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

## Using ML for Wine Quality prediction

### Using KNN for predicting quality.

In [5]:
# putting df values into array for training the model

In [6]:
array = wine_df.values
X = array[:,0:10]
Y = array[:,11]

### Split the dataset into training and testing dataset by ratio of 90:10

In [7]:
from sklearn.model_selection import train_test_split

X_train,X_test,Y_train,Y_test = train_test_split(X, Y, test_size=0.1, random_state=123)

Apply normalization on both train and testing dataset

In [8]:
from sklearn.preprocessing import MinMaxScaler

# fit scaler on training data
norm = MinMaxScaler().fit(X_train)

# transform training data
X_train_norm = norm.transform(X_train)

# transform testing data
X_test_norm = norm.transform(X_test)

In [9]:
from sklearn.model_selection import KFold

kfold = KFold(n_splits = 10, shuffle = True,random_state = 1111)

In [10]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor


# define a knn model with default parameter setting
knn = KNeighborsRegressor()
# run the previously defined 10-fold validation on the dataset
results = cross_val_score(knn, X_train_norm, Y_train, cv=kfold)
# print the averae r squared scores
print("Average R2 of KNN:", results.mean())

Average R2 of KNN: 0.21699232864863816


In [11]:
# fine tune parameters for knn model
from sklearn.model_selection import GridSearchCV

grid_params_knn = {
    'n_neighbors': [3, 5, 7, 9, 11, 19],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan','chebyshev','minkowski'],
    'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute']
}

knn = KNeighborsRegressor()
gs_knn_result = GridSearchCV(knn, grid_params_knn, cv=kfold).fit(X_train_norm, Y_train)
print(gs_knn_result.best_score_)

0.4323055139014639


In [12]:
gs_knn_result.best_estimator_.score(X_test_norm, Y_test)

0.47204988515661417

In [13]:
gs_knn_result.best_params_

{'algorithm': 'auto',
 'metric': 'manhattan',
 'n_neighbors': 19,
 'weights': 'distance'}

## For white wine

In [14]:
white_wine_df = pd.read_csv("./ML Datasets/winequality-white.csv",sep=";")
white_wine_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [15]:
print(white_wine_df.shape)
print(white_wine_df.isna().sum())

(4898, 12)
fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64


In [16]:
array = white_wine_df.values
X = array[:,0:10]
Y = array[:,11]

In [17]:
from sklearn.model_selection import train_test_split

X_train,X_test,Y_train,Y_test = train_test_split(X, Y, test_size=0.1, random_state=1111)

In [18]:
from sklearn.preprocessing import MinMaxScaler

# fit scaler on training data
norm = MinMaxScaler().fit(X_train)

# transform training data
X_train_norm = norm.transform(X_train)

# transform testing data
X_test_norm = norm.transform(X_test)

In [19]:
from sklearn.model_selection import KFold

kfold = KFold(n_splits = 10, shuffle = True,random_state = 1111)

In [20]:
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor


# define a knn model with default parameter setting
knn = KNeighborsRegressor()
# run the previously defined 10-fold validation on the dataset
results = cross_val_score(knn, X_train_norm, Y_train, cv=kfold)
# print the averae r squared scores
print("Average R2 of KNN:", results.mean())

Average R2 of KNN: 0.30823058646236057


In [21]:
# fine tune parameters for knn model
from sklearn.model_selection import GridSearchCV

grid_params_knn = {
    'n_neighbors': [3, 5, 7, 9, 11, 19],
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan','chebyshev','minkowski'],
    'algorithm' : ['auto', 'ball_tree', 'kd_tree', 'brute']
}

knn = KNeighborsRegressor()
gs_knn_result = GridSearchCV(knn, grid_params_knn, cv=kfold).fit(X_train_norm, Y_train)
print(gs_knn_result.best_score_)

0.48854646245155536


In [22]:
gs_knn_result.best_estimator_.score(X_test_norm, Y_test)

0.562862879425591

In [23]:
gs_knn_result.best_params_

{'algorithm': 'auto',
 'metric': 'manhattan',
 'n_neighbors': 11,
 'weights': 'distance'}