# Wine Classification

## Step 1. Load the dataset

In [1]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.linear_model import SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.decomposition import KernelPCA
import numpy as np
import pandas as pd
wine = pd.read_csv('./datasets/wine/wine.csv')

## Step 2. Pre-process the dataset using PCA algorithms

In [2]:
#pre_processing
def pre(dataset):
    from sklearn.preprocessing import LabelEncoder
    labelencoder = LabelEncoder()
    X=dataset.drop(['style'],axis=1)
    y=dataset['style']
    X=X.values
    y = labelencoder.fit_transform(y)
    return X,y
def split(X,y):
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    return X_train , X_test , y_train, y_test

In [3]:
#ready dataset
X,y =pre(wine)

In [4]:
from sklearn.decomposition import PCA
from sklearn.decomposition import IncrementalPCA
import numpy as np
import time
class PCA_preprocessing:
    def __init__(self ,X, n):
        self.n=n
        self.X=X
        # PCA
        self.pca = PCA(n_components=n)
        self.PCA_time = time.time()
        self.X_PCA = self.pca.fit_transform(X)
        self.PCA_time = time.time() - self.PCA_time

        # IncrementalPCA
        self.n_batches = 2
        self.inc_pca = IncrementalPCA(n_components=n)
        self.IPCA_time = time.time()
        for X_batch in np.array_split(X, self.n_batches):
            self.inc_pca.partial_fit(X_batch)
        self.X_IPCA = self.inc_pca.transform(X)
        self.IPCA_time = time.time() - self.IPCA_time

        # Randomized PCA
        self.rnd_pca = PCA(n_components=n, svd_solver='randomized')
        self.RPCA_time = time.time()
        self.X_RPCA = self.rnd_pca.fit_transform(X)
        self.RPCA_time = time.time() - self.RPCA_time

    def getTime(self):
        print('PCA fit_transform time : ', self.PCA_time)
        print('IPCA fit_transform time : ', self.IPCA_time)
        print('IPCA fit_transform time : ', self.RPCA_time)

    def getX(self):
        return self.X_PCA , self.X_IPCA , self.X_RPCA

## Step 3. Train a model using each reduced dimensionality dataset.

In [5]:
#make models : SGD, KNN, DT, SVC
class model:
    def __init__(self,X_train,X_test,y_train,y_test):
        import time
        self.X = X_train
        self.X_test = X_test
        self.y = y_train
        self.y_test = y_test

        self.sgd_clf = SGDClassifier(max_iter=5, tol=-np.infty, random_state=42, loss='log')
        self.sgd_time = time.time()
        self.sgd_clf.fit(self.X, self.y)
        self.sgd_time = time.time() - self.sgd_time


        self.knn_clf = KNeighborsClassifier(n_neighbors=2)
        self.knn_time = time.time()
        self.knn_clf.fit(self.X, self.y)
        self.knn_time = time.time() - self.knn_time


        self.tree_clf = DecisionTreeClassifier(max_depth=2, random_state=42)
        self.tree_time = time.time()
        self.tree_clf.fit(self.X, self.y)
        self.tree_time = time.time() - self.tree_time


        self.svm_clf = SVC(gamma='auto', C=2, random_state=42, probability=True)
        self.svc_time = time.time()
        self.svm_clf.fit(self.X, self.y)
        self.svc_time = time.time() - self.svc_time

    def getScore(self,model):
        from sklearn.metrics import accuracy_score
        y_score = model.predict(self.X_test)
        accuracy = accuracy_score(self.y_test, y_score)
        return accuracy

    def printScore(self):
        print('SGD classifier Accuracy : {}'.format(self.getScore(self.sgd_clf)))
        print('KNN classifier Accuracy : {}'.format(self.getScore(self.knn_clf)))
        print('Decision Tree classifier Accuracy : {}'.format(self.getScore(self.tree_clf)))
        print('SVM classifier Accuracy : {}'.format(self.getScore(self.svm_clf)))
        print()

    def letTest(self):
        print('<Test data>')
        print('input data : ', self.X_test[13])
        print('output data : ', self.y_test[13])
        print('<Prediction>')
        print('SGD Prediction : ',self.sgd_clf.predict([self.X_test[13]]))
        print('KNN Prediction : ',self.knn_clf.predict([self.X_test[13]]))
        print('Decision Tree Prediction : ',self.tree_clf.predict([self.X_test[13]]))
        print('SVC Prediction : ',self.svm_clf.predict([self.X_test[13]]))
        print()

    def getTime(self):
        print('<Training Time>')
        print('SGD Classifier : {}'.format(self.sgd_time))
        print('knn Classifier : {}'.format(self.knn_time))
        print('tree Classifier : {}'.format(self.tree_time))
        print('SVC Classifier : {}'.format(self.svc_time))
        print()

## Step 4. Compare each model's perfomance

In [6]:
def printAll(X,y):
    X_train, X_test, y_train, y_test = split(X, y)
    p_model = model(X_train, X_test, y_train, y_test)
    p_model.printScore()
    p_model.letTest()
    p_model.getTime()
    print('=====================================================================')
    print()

In [7]:
def PCA_print(X,y,n):
    pca = PCA_preprocessing(X, n)
    X_PCA, X_IPCA, X_RPCA = pca.getX()
    print('<PCA n_components = {}>'.format(n))
    printAll(X_PCA,y)
    print('<IPCA n_components = {}>'.format(n))
    printAll(X_IPCA, y)
    print('<RPCA n_components = {}>'.format(n))
    printAll(X_RPCA, y)

In [8]:
#To use grid search
def grid_kernelPCA(model,n):
    clf = Pipeline([
        ('kpca', KernelPCA(n_components=n)),
        ('reg',model)
    ])

    param = [{
        'kpca__gamma' : np.linspace(0.03,0.05,10),
        'kpca__kernel' : ['rbf','sigmoid']
    }]
    grid_search = GridSearchCV(clf, param_grid=param, cv=5)
    grid_search.fit(X,y)
    return grid_search

In [9]:
#original data
print('<< model : Original data >>')
printAll(X,y)

#n_components = 2
PCA_print(X,y,2)
#n_components = 4
PCA_print(X,y,4)
#n_components = 6
PCA_print(X,y,6)

<< model : Original data >>
SGD classifier Accuracy : 0.9253846153846154
KNN classifier Accuracy : 0.94
Decision Tree classifier Accuracy : 0.9476923076923077
SVM classifier Accuracy : 0.9453846153846154

<Test data>
input data :  [ 8.1     0.725   0.22    2.2     0.072  11.     41.      0.9967  3.36
  0.55    9.1     5.    ]
output data :  0
<Prediction>
SGD Prediction :  [0]
KNN Prediction :  [0]
Decision Tree Prediction :  [0]
SVC Prediction :  [0]

<Training Time>
SGD Classifier : 0.07489275932312012
knn Classifier : 0.005888462066650391
tree Classifier : 0.008011579513549805
SVC Classifier : 9.5533607006073


<PCA n_components = 2>
SGD classifier Accuracy : 0.8938461538461538
KNN classifier Accuracy : 0.9
Decision Tree classifier Accuracy : 0.9115384615384615
SVM classifier Accuracy : 0.9069230769230769

<Test data>
input data :  [-77.29659432  -1.77204454]
output data :  0
<Prediction>
SGD Prediction :  [0]
KNN Prediction :  [0]
Decision Tree Prediction :  [0]
SVC Prediction :  [

In [10]:
#n_components = 6
print('<SGD>')
grid_sgd_6 = grid_kernelPCA(SGDClassifier(max_iter=5, tol=-np.infty, random_state=42, loss='log'),6)
print('Best hyperparameter: {}'.format(grid_sgd_6.best_params_))

print('<KNN>')
grid_knn_6 = grid_kernelPCA(KNeighborsClassifier(n_neighbors=2),6)
print('Best hyperparameter: {}'.format(grid_knn_6.best_params_))

print('<Decision Tree>')
grid_tree_6 = grid_kernelPCA(DecisionTreeClassifier(max_depth=2, random_state=42),6)
print('Best hyperparameter: {}'.format(grid_tree_6.best_params_))

print('<SVC>')
grid_svc_6 = grid_kernelPCA(SVC(gamma='auto', C=2, random_state=42, probability=True),6)
print('Best hyperparameter: {}'.format(grid_svc_6.best_params_))

<SGD>
Best hyperparameter: {'kpca__gamma': 0.03222222222222222, 'kpca__kernel': 'rbf'}
<KNN>
Best hyperparameter: {'kpca__gamma': 0.03, 'kpca__kernel': 'sigmoid'}
<Decision Tree>
Best hyperparameter: {'kpca__gamma': 0.03222222222222222, 'kpca__kernel': 'rbf'}
<SVC>
Best hyperparameter: {'kpca__gamma': 0.03, 'kpca__kernel': 'rbf'}


In [12]:
best_rbf_pca = KernelPCA(n_components = 2 , kernel ='rbf' , gamma=0.322, fit_inverse_transform=True)
best_sigmoid_pca = KernelPCA(n_components = 2 , kernel ='sigmoid' , gamma=0.3, fit_inverse_transform=True)
from sklearn.metrics import mean_squared_error
def getMSE(pca):
    X_reduced = pca.fit_transform(X)
    X_preimage = pca.inverse_transform(X_reduced)
    print(mean_squared_error(X,X_preimage))

print('<Kernel rbf>')
getMSE(best_rbf_pca)
print('<Kernel sigmoid>')
getMSE(best_sigmoid_pca)

<Kernel rbf>
267.0327978983164
<Kernel sigmoid>
294.65146219846304
