In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns


In [2]:
real_df = pd.read_csv("Data set 3 (99 KB) - winequality.csv")
df = real_df.copy(deep=True)

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1599 entries, 0 to 1598
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 150.0 KB


In [4]:
df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
#categorize wine quality
bins = (2,6.5,8)
group_names = ['bad','good']
categories = pd.cut(df['quality'], bins, labels = group_names)
df['quality'] = categories

In [6]:
#splitting data to X ve y
x = df.drop(['quality'], axis = 1)
y = df['quality']

In [7]:
# Encoding our dependent variable:Quality column
from sklearn.preprocessing import LabelEncoder
labelencoder_y = LabelEncoder()
y = labelencoder_y.fit_transform(y)
y

array([0, 0, 0, ..., 0, 0, 0])

In [8]:
#split into train set and test set
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=40)

In [9]:
# Feature Scaling to x_train and x_test to classify better.
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
x_train = sc.fit_transform(x_train)
x_test = sc.transform(x_test)

In [10]:
from sklearn.svm import SVC
model = SVC()

In [11]:
model.fit(df[['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol']],df['quality'])

In [12]:
predictions = model.predict(df[['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol']])
predictions

array(['bad', 'bad', 'bad', ..., 'bad', 'bad', 'bad'], dtype=object)

In [13]:
df=df.assign(Prediction = predictions)

In [14]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score

print ('Accuracy:', accuracy_score(df['quality'], predictions))
print ('Recall:', recall_score(df['quality'], predictions, average="weighted"))
print ('Precision:', precision_score(df['quality'], predictions, average="weighted"))
confusion = confusion_matrix(df['quality'], predictions)
print('Confusion matrix:')
print(confusion)

Accuracy: 0.8655409631019387
Recall: 0.8655409631019387
Precision: 0.8836428371990478
Confusion matrix:
[[1382    0]
 [ 215    2]]


In [15]:
from sklearn.model_selection import GridSearchCV
parameters = {
    'C': [0.1, 1, 10, 100, 1000],
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001]
}

In [16]:
svm_grid=GridSearchCV(estimator=model,
                      param_grid=parameters,
                      cv=3,
                      n_jobs=-1, verbose=2)
svm_grid.fit(x_train,y_train)
best_accuracy = svm_grid.best_score_
best_parameters = svm_grid.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


In [17]:
best_accuracy = svm_grid.best_score_
best_parameters = svm_grid.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 90.15 %
Best Parameters: {'C': 1, 'gamma': 1, 'kernel': 'rbf'}


In [18]:
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)
classifier.fit(df[['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol']],df['quality'])

In [19]:
predictions = classifier.predict(df[['fixed acidity','volatile acidity','citric acid','residual sugar','chlorides','free sulfur dioxide','total sulfur dioxide','density','pH','sulphates','alcohol']])
predictions

array(['bad', 'bad', 'bad', ..., 'bad', 'bad', 'bad'], dtype=object)

In [20]:
df=df.assign(Prediction = predictions)

In [21]:
from sklearn.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score, f1_score

print ('Accuracy:', accuracy_score(df['quality'], predictions))
print ('Recall:', recall_score(df['quality'], predictions, average="weighted"))
print ('Precision:', precision_score(df['quality'], predictions, average="weighted"))
confusion = confusion_matrix(df['quality'], predictions)
print('Confusion matrix:')
print(confusion)

Accuracy: 0.9143214509068167
Recall: 0.9143214509068167
Precision: 0.9074940730295981
Confusion matrix:
[[1347   35]
 [ 102  115]]


In [22]:
from sklearn.model_selection import GridSearchCV

parameters = [{'n_neighbors': [3, 5, 7, 9, 11, 13, 15], 'leaf_size': [10, 20, 30, 40, 50], 'metric': ['euclidean']},
              {'n_neighbors': [3, 5, 7, 9, 11, 13, 15], 'leaf_size': [10, 20, 30, 40, 50], 'metric': ['manhattan']},
              {'n_neighbors': [3, 5, 7, 9, 11, 13, 15], 'leaf_size': [10, 20, 30, 40, 50], 'metric': ['chebyshev']},
              {'n_neighbors': [3, 5, 7, 9, 11, 13, 15], 'leaf_size': [10, 20, 30, 40, 50], 'p': [1, 2], 'metric': ['minkowski']}]
grid_search = GridSearchCV(estimator = classifier,
                           param_grid = parameters,
                           scoring = 'accuracy',
                           cv = 10,
                           n_jobs = -1)
grid_search.fit(x_train, y_train)
best_accuracy = grid_search.best_score_ # return mean accuracy on training data
best_parameters = grid_search.best_params_
print("Best Accuracy: {:.2f} %".format(best_accuracy*100))
print("Best Parameters:", best_parameters)

Best Accuracy: 87.96 %
Best Parameters: {'leaf_size': 10, 'metric': 'manhattan', 'n_neighbors': 11}
