## Step 1. Find Dataset that has a number of features.

- Red & White wine Dataset( https://www.kaggle.com/numberswithkartik/red-white-wine-dataset )

In [1]:
#dataset : 
import pandas as pd
wine = pd.read_csv('./datasets/wine/wine.csv')
wine.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality,style
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,red
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,red
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,red
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,red


In [2]:
wine.shape

(6497, 13)

In [3]:
wine.isnull().sum()

fixed_acidity           0
volatile_acidity        0
citric_acid             0
residual_sugar          0
chlorides               0
free_sulfur_dioxide     0
total_sulfur_dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
style                   0
dtype: int64

In [4]:
X=wine.drop(['style'],axis=1)
y=wine['style']
X.head()

Unnamed: 0,fixed_acidity,volatile_acidity,citric_acid,residual_sugar,chlorides,free_sulfur_dioxide,total_sulfur_dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [5]:
#preprocessing
from sklearn.preprocessing import LabelEncoder
labelencoder=LabelEncoder()
y = labelencoder.fit_transform(y)
y

array([0, 0, 0, ..., 1, 1, 1])

In [6]:
X=X.values

- Red = 0 & White = 1

In [7]:
#training set & test set 나누기
from sklearn.model_selection import train_test_split
import numpy as np
X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.2,random_state=42)

## Step 2. Train and validate model using dataset

In [8]:
import time
import numpy as np
from sklearn.linear_model import SGDClassifier
sgd_clf = SGDClassifier(max_iter=5, tol=-np.infty,random_state=42,loss='log')
sgd_time = time.time() 
sgd_clf.fit(X_train,y_train)
sgd_time = time.time() -sgd_time

from sklearn.neighbors import KNeighborsClassifier
knn_clf = KNeighborsClassifier(n_neighbors=2)
knn_time = time.time() 
knn_clf.fit(X_train,y_train)
knn_time = time.time() -knn_time

from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(max_depth=2,random_state=42)
tree_time = time.time() 
tree_clf.fit(X_train,y_train)
tree_time = time.time() -tree_time

from sklearn.svm import SVC
svm_clf = SVC(gamma='auto',C=2,random_state=42,probability=True)
svc_time = time.time() 
svm_clf.fit(X_train,y_train)
svc_time = time.time() -svc_time 

In [9]:
from sklearn.metrics import accuracy_score
def getScore(model):
    y_score = model.predict(X_test)
    accuracy = accuracy_score(y_test,y_score)
    return accuracy
print('SGD classifier Accuracy : {}'.format(getScore(sgd_clf)))
print('KNN classifier Accuracy : {}'.format(getScore(knn_clf)))
print('Decision Tree classifier Accuracy : {}'.format(getScore(tree_clf)))
print('SVM classifier Accuracy : {}'.format(getScore(svm_clf)))

SGD classifier Accuracy : 0.9253846153846154
KNN classifier Accuracy : 0.94
Decision Tree classifier Accuracy : 0.9476923076923077
SVM classifier Accuracy : 0.9453846153846154


In [10]:
def letTest(model1,model2,model3,model4):
    print('input data : ',X_test[13])
    print('output data : ',y_test[13])
    print('<Prediction>')
    print(model1.predict([X_test[13]]))
    print(model2.predict([X_test[13]]))
    print(model3.predict([X_test[13]]))
    print(model4.predict([X_test[13]]))

In [11]:
letTest(sgd_clf,knn_clf,tree_clf,svm_clf)

input data :  [ 8.1     0.725   0.22    2.2     0.072  11.     41.      0.9967  3.36
  0.55    9.1     5.    ]
output data :  0
<Prediction>
[0]
[0]
[0]
[0]


## Step 3. Reduce features using PCA

In [12]:
from sklearn.decomposition import PCA
pca_2 = PCA(n_components=2)
pca_3 = PCA(n_components=3)
pca_5 = PCA(n_components=5)
X_2 = pca_2.fit_transform(X)
X_3 = pca_3.fit_transform(X)
X_5 = pca_5.fit_transform(X)

## Step 4. Train and validate model using step3's dataset

In [13]:
X_train , X_test= train_test_split(X_2,test_size=0.2,random_state=42)
sgd_clf_PCA2 = SGDClassifier(max_iter=5, tol=-np.infty,random_state=42,loss='log')
sgd_time_2 = time.time() 
sgd_clf_PCA2.fit(X_train,y_train)
sgd_time_2 = time.time() -sgd_time_2

knn_clf_PCA2 = KNeighborsClassifier(n_neighbors=2)
knn_time_2 = time.time() 
knn_clf_PCA2.fit(X_train,y_train)
knn_time_2 = time.time() -knn_time_2

tree_clf_PCA2 = DecisionTreeClassifier(max_depth=2,random_state=42)
tree_time_2 = time.time() 
tree_clf_PCA2.fit(X_train,y_train)
tree_time_2 = time.time() -tree_time_2

svm_clf_PCA2 = SVC(gamma='auto',C=2,random_state=42,probability=True)
svc_time_2 = time.time() 
svm_clf_PCA2.fit(X_train,y_train)
svc_time_2 = time.time() -svc_time_2

In [15]:
print('<Using PCA : feature 2>')
print('SGD classifier Accuracy : {}'.format(getScore(sgd_clf_PCA2)))
print('KNN classifier Accuracy : {}'.format(getScore(knn_clf_PCA2)))
print('Decision Tree classifier Accuracy : {}'.format(getScore(tree_clf_PCA2)))
print('SVM classifier Accuracy : {}'.format(getScore(svm_clf_PCA2)))

<Using PCA : feature 2>
SGD classifier Accuracy : 0.92
KNN classifier Accuracy : 0.9
Decision Tree classifier Accuracy : 0.9115384615384615
SVM classifier Accuracy : 0.9069230769230769


In [14]:
letTest(sgd_clf_PCA2,knn_clf_PCA2,tree_clf_PCA2,svm_clf_PCA2)

input data :  [-77.29659432  -1.77204454]
output data :  0
<Prediction>
[0]
[0]
[0]
[0]


In [16]:
X_train , X_test= train_test_split(X_3,test_size=0.2,random_state=42)
sgd_clf_PCA3 = SGDClassifier(max_iter=5, tol=-np.infty,random_state=42,loss='log')
sgd_time_3 = time.time() 
sgd_clf_PCA3.fit(X_train,y_train)
sgd_time_3 = time.time() -sgd_time_3

knn_clf_PCA3 = KNeighborsClassifier(n_neighbors=2)
knn_time_3 = time.time() 
knn_clf_PCA3.fit(X_train,y_train)
knn_time_3 = time.time() -knn_time_3

tree_clf_PCA3 = DecisionTreeClassifier(max_depth=2,random_state=42)
tree_time_3 = time.time() 
tree_clf_PCA3.fit(X_train,y_train)
tree_time_3 = time.time() -tree_time_3

svm_clf_PCA3 = SVC(gamma='auto',C=2,random_state=42,probability=True)
svc_time_3 = time.time() 
svm_clf_PCA3.fit(X_train,y_train)
svc_time_3 = time.time() -svc_time_3

In [18]:
print('<Using PCA : feature 3>')
print('SGD classifier Accuracy : {}'.format(getScore(sgd_clf_PCA3)))
print('KNN classifier Accuracy : {}'.format(getScore(knn_clf_PCA3)))
print('Decision Tree classifier Accuracy : {}'.format(getScore(tree_clf_PCA3)))
print('SVM classifier Accuracy : {}'.format(getScore(svm_clf_PCA3)))

<Using PCA : feature 3>
SGD classifier Accuracy : 0.8915384615384615
KNN classifier Accuracy : 0.9084615384615384
Decision Tree classifier Accuracy : 0.9115384615384615
SVM classifier Accuracy : 0.9261538461538461


In [17]:
letTest(sgd_clf_PCA3,knn_clf_PCA3,tree_clf_PCA3,svm_clf_PCA3)

input data :  [-77.29659432  -1.77204454   0.12417026]
output data :  0
<Prediction>
[0]
[0]
[0]
[0]


In [19]:
X_train , X_test= train_test_split(X_5,test_size=0.2,random_state=42)
sgd_clf_PCA5 = SGDClassifier(max_iter=5, tol=-np.infty,random_state=42,loss='log')
sgd_time_5 = time.time() 
sgd_clf_PCA5.fit(X_train,y_train)
sgd_time_5 = time.time() -sgd_time_5

knn_clf_PCA5 = KNeighborsClassifier(n_neighbors=2)
knn_time_5 = time.time() 
knn_clf_PCA5.fit(X_train,y_train)
knn_time_5 = time.time() -knn_time_5

tree_clf_PCA5 = DecisionTreeClassifier(max_depth=2,random_state=42)
tree_time_5 = time.time() 
tree_clf_PCA5.fit(X_train,y_train)
tree_time_5 = time.time() -tree_time_5

svm_clf_PCA5 = SVC(gamma='auto',C=2,random_state=42,probability=True)
svc_time_5 = time.time() 
svm_clf_PCA5.fit(X_train,y_train)
svc_time_5 = time.time() -svc_time_5

In [21]:
print('<Using PCA : feature 5>')
print('SGD classifier Accuracy : {}'.format(getScore(sgd_clf_PCA5)))
print('KNN classifier Accuracy : {}'.format(getScore(knn_clf_PCA5)))
print('Decision Tree classifier Accuracy : {}'.format(getScore(tree_clf_PCA5)))
print('SVM classifier Accuracy : {}'.format(getScore(svm_clf_PCA5)))

<Using PCA : feature 5>
SGD classifier Accuracy : 0.9315384615384615
KNN classifier Accuracy : 0.9384615384615385
Decision Tree classifier Accuracy : 0.9146153846153846
SVM classifier Accuracy : 0.9292307692307692


In [20]:
letTest(sgd_clf_PCA5,knn_clf_PCA5,tree_clf_PCA5,svm_clf_PCA5)

input data :  [-77.29659432  -1.77204454   0.12417026   1.60533549  -1.21018723]
output data :  0
<Prediction>
[0]
[0]
[0]
[0]


## Step 5. Compare the step 2 and step 4 performances

In [22]:
print('<Training Time Compare>')
print('Without PCA : feature 12')
print('SGD Classifier : {}'.format(sgd_time))
print('knn Classifier : {}'.format(knn_time))
print('tree Classifier : {}'.format(tree_time))
print('SVC Classifier : {}'.format(svc_time))
print('Using PCA : feature 2')
print('SGD Classifier : {}'.format(sgd_time_2))
print('knn Classifier : {}'.format(knn_time_2))
print('tree Classifier : {}'.format(tree_time_2))
print('SVC Classifier : {}'.format(svc_time_2))
print('Using PCA : feature 3')
print('SGD Classifier : {}'.format(sgd_time_3))
print('knn Classifier : {}'.format(knn_time_3))
print('tree Classifier : {}'.format(tree_time_3))
print('SVC Classifier : {}'.format(svc_time_3))
print('Using PCA : feature 5')
print('SGD Classifier : {}'.format(sgd_time_5))
print('knn Classifier : {}'.format(knn_time_5))
print('tree Classifier : {}'.format(tree_time_5))
print('SVC Classifier : {}'.format(svc_time_5))

<Training Time Compare>
Without PCA : feature 12
SGD Classifier : 0.0
knn Classifier : 0.015627622604370117
tree Classifier : 0.0
SVC Classifier : 6.109557867050171
Using PCA : feature 2
SGD Classifier : 0.003999471664428711
knn Classifier : 0.002000570297241211
tree Classifier : 0.0029997825622558594
SVC Classifier : 7.517149448394775
Using PCA : feature 3
SGD Classifier : 0.004998445510864258
knn Classifier : 0.001999378204345703
tree Classifier : 0.004000425338745117
SVC Classifier : 7.4215476512908936
Using PCA : feature 5
SGD Classifier : 0.004998922348022461
knn Classifier : 0.0019996166229248047
tree Classifier : 0.006001710891723633
SVC Classifier : 7.190220355987549


In [24]:
#Explained Variance Ratio
print(pca_2.explained_variance_ratio_)
print(pca_3.explained_variance_ratio_)
print(pca_5.explained_variance_ratio_)

[0.95355286 0.04062238]
[0.95355286 0.04062238 0.00482557]
[9.53552856e-01 4.06223786e-02 4.82556665e-03 4.94432602e-04
 3.46689059e-04]
