# Voting Classifier
Data : white_wine.csv
- Apply Voting Classifier
    - Target : quality (quality > 6 -> Y = 1)
    - Features : density, alcohol
- Validate the model using precision, recall, and F1 score on a 20% test set
- Apply a soft voting classifier using the following methods:
    - Logistic Regression
    - Decision Tree : max depth 5
    - KNN : nearest neighbor 3
- Apply a second soft voting classifier using the following methods:
    - 3rd degree polynomial features + logistic regression
    - Decision Tree: max depth 5
    - Standard scaler + KNN : nearest neighbor 3

> ## Library and Data

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import classification_report

In [2]:
# Load the white wine data set.
# The path is relative to the notebook so the analysis can be re-run on any
# machine; adjust DATA_PATH if the CSV is stored somewhere else.
DATA_PATH = 'white_wine.csv'
white_wine = pd.read_csv(DATA_PATH)

# NOTE(review): the file carries a saved index as the column "Unnamed: 0";
# it is not used by the analysis below.
white_wine

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.0010,3.00,0.45,8.8,6.0
1,6.3,0.30,0.34,1.6,0.049,14.0,132.0,0.9940,3.30,0.49,9.5,6.0
2,8.1,0.28,0.40,6.9,0.050,30.0,97.0,0.9951,3.26,0.44,10.1,6.0
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.40,9.9,6.0
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.40,9.9,6.0
...,...,...,...,...,...,...,...,...,...,...,...,...
515,6.1,0.31,0.26,2.2,0.051,28.0,167.0,0.9926,3.37,0.47,10.4,6.0
516,6.8,0.18,0.37,1.6,0.055,47.0,154.0,0.9934,3.08,0.45,9.1,5.0
517,7.4,0.15,0.42,1.7,0.045,49.0,154.0,0.9920,3.00,0.60,10.4,6.0
518,5.9,0.13,0.28,1.9,0.050,20.0,78.0,0.9918,3.43,0.64,10.8,6.0


In [3]:
# Fill the missing values in `alcohol` with the column mean.
# The result is assigned back to the column instead of calling
# fillna(..., inplace=True) on the selected Series: the chained in-place form
# raises a FutureWarning in pandas 2.x and leaves the DataFrame untouched once
# Copy-on-Write is active, whereas explicit assignment always updates it.
# NOTE(review): the mean is computed on the full data set, i.e. before the
# train/test split, so test rows influence the imputed value — consider
# imputing inside a pipeline fitted on the training data only.
white_wine['alcohol'] = white_wine['alcohol'].fillna(white_wine['alcohol'].mean())

> ## Data Splitting

In [4]:
# Binary target: 1 when the wine is rated above 6, otherwise 0.
is_good_wine = white_wine['quality'] > 6
y = np.where(is_good_wine, 1, 0)

# Predictors used by both ensembles.
feature_cols = ['alcohol', 'density']
x = white_wine[feature_cols]

In [5]:
# Hold out 20% of the rows for testing. Stratifying on y keeps the class ratio
# the same in both parts, and the fixed seed makes the split repeatable.
split_parts = train_test_split(
    x, y,
    test_size = 0.2,
    stratify = y,
    random_state = 2020,
)
x_train, x_test, y_train, y_test = split_parts

> ## Ensemble 1 (without Feature Engineering)

In [6]:
# Base learners for the first ensemble; all of them see the raw features.
# random_state is pinned on the estimators that use randomness internally
# (liblinear shuffles the data, the tree permutes features at each split) so
# that Restart & Run All reproduces the same fitted models. The value matches
# the seed used for the train/test split.
logreg = LogisticRegression(solver = 'liblinear', C = 0.01, random_state = 2020)
dt = DecisionTreeClassifier(max_depth = 5, random_state = 2020)
knn = KNeighborsClassifier(n_neighbors = 3)

# Soft voting: the predicted class is the one with the highest summed
# class probability across the three learners.
vc = VotingClassifier([
    ('logreg', logreg),
    ('tree', dt),
    ('KNN', knn)
], voting = 'soft')

In [7]:
# Train the first ensemble on the training split only.
vc.fit(X = x_train, y = y_train)

VotingClassifier(estimators=[('logreg',
                              LogisticRegression(C=0.01, solver='liblinear')),
                             ('tree', DecisionTreeClassifier(max_depth=5)),
                             ('KNN', KNeighborsClassifier(n_neighbors=3))],
                 voting='soft')

In [8]:
# Score the first ensemble on the held-out rows and show
# precision, recall and F1 per class.
y_pred = vc.predict(x_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.97      1.00      0.98        84
           1       1.00      0.85      0.92        20

    accuracy                           0.97       104
   macro avg       0.98      0.93      0.95       104
weighted avg       0.97      0.97      0.97       104



> ## Ensemble 2 (with Feature Engineering)

In [11]:
# Learner 1: 3rd-degree polynomial features feeding a logistic regression.
# random_state is pinned (same seed as the train/test split) so the fit is
# reproducible on a fresh kernel.
poly = PolynomialFeatures(degree = 3, interaction_only = False, include_bias = False)
logreg = LogisticRegression(solver = 'liblinear', C = 0.01, random_state = 2020)
pipe_lr = Pipeline([
    ('poly', poly),
    ('clf', logreg)
])

# Learner 2: decision tree on the raw features, seeded for reproducibility.
dt = DecisionTreeClassifier(max_depth = 5, random_state = 2020)

# Learner 3: KNN on standardized features, so that both columns contribute
# to the distance on a comparable scale.
scaler = StandardScaler()
knn = KNeighborsClassifier(n_neighbors = 3)
pipe_knn = Pipeline([
    ('scaler', scaler),  # step name fixed (was misspelled 'sclaer')
    ('knn', knn)
])

# Soft voting over the two pipelines and the tree.
vc2 = VotingClassifier([
    ('logreg', pipe_lr),
    ('tree', dt),
    ('KNN', pipe_knn)
], voting = 'soft')

In [12]:
# Train the second (feature-engineered) ensemble on the training split only.
vc2.fit(X = x_train, y = y_train)

VotingClassifier(estimators=[('logreg',
                              Pipeline(steps=[('poly',
                                               PolynomialFeatures(degree=3,
                                                                  include_bias=False)),
                                              ('clf',
                                               LogisticRegression(C=0.01,
                                                                  solver='liblinear'))])),
                             ('tree', DecisionTreeClassifier(max_depth=5)),
                             ('KNN',
                              Pipeline(steps=[('sclaer', StandardScaler()),
                                              ('knn',
                                               KNeighborsClassifier(n_neighbors=3))]))],
                 voting='soft')

In [13]:
# Score the second ensemble on the same held-out rows so the report is
# directly comparable with the first ensemble's.
y_pred = vc2.predict(x_test)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       0.98      1.00      0.99        84
           1       1.00      0.90      0.95        20

    accuracy                           0.98       104
   macro avg       0.99      0.95      0.97       104
weighted avg       0.98      0.98      0.98       104

