# 필요한 모듈 설정

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib import patches
%matplotlib inline

# 데이터 로드

In [2]:
# Read the wine table from a CSV file located relative to the working directory.
wine = pd.read_csv('WineQT_add.csv')
# Preview the first rows (head() defaults to 5) to sanity-check the columns.
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


## 데이터 크기, 결측치 확인

In [3]:
# (rows, columns) of the loaded table.
wine.shape

(2742, 12)

In [4]:
# Count missing values per column (isna is the same check as isnull).
wine.isna().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

# Modeling 

## Scaling and Normalization

In [5]:
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE 

# Oversample the quality classes with SMOTE. The seed is fixed so that the
# synthetic rows (and therefore every downstream score) are the same on each
# Restart & Run All; without it the resampled set changes from run to run.
# NOTE(review): resampling and scaling are done on the full table, before the
# train/test split further down. Synthetic rows and the scaler's min/max are
# therefore derived from rows that later end up in the test set, so the
# reported scores are probably optimistic. Consider splitting first and
# fitting SMOTE / the scaler on the training part only.
oversample = SMOTE(random_state=42)
features, labels = oversample.fit_resample(
    wine.drop(["quality"], axis=1),
    wine["quality"],
)

# Rescale every feature column with MinMaxScaler (default range [0, 1]) and
# keep the original column names.
scaler = preprocessing.MinMaxScaler()
scaled_data = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)
scaled_data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.247788,0.397260,0.000000,0.068493,0.106845,0.140845,0.098940,0.567548,0.606299,0.137725,0.153846
1,0.283186,0.520548,0.000000,0.116438,0.143573,0.338028,0.215548,0.494126,0.362205,0.209581,0.215385
2,0.283186,0.438356,0.040000,0.095890,0.133556,0.197183,0.169611,0.508811,0.409449,0.191617,0.215385
3,0.584071,0.109589,0.560000,0.068493,0.105175,0.225352,0.190813,0.582232,0.330709,0.149701,0.215385
4,0.247788,0.397260,0.000000,0.068493,0.106845,0.140845,0.098940,0.567548,0.606299,0.137725,0.153846
...,...,...,...,...,...,...,...,...,...,...,...
6979,0.190092,0.269620,0.146355,0.088171,0.085345,0.385415,0.144447,0.232894,0.643844,0.245509,0.736679
6980,0.437700,0.127134,0.556631,0.128598,0.111648,0.068841,0.038472,0.471355,0.322835,0.335135,0.495597
6981,0.470912,0.137021,0.547970,0.124644,0.106828,0.064775,0.037452,0.488312,0.322835,0.288460,0.464507
6982,0.242944,0.173818,0.341129,0.098085,0.081078,0.197183,0.087139,0.380721,0.405435,0.269912,0.494044


In [6]:
# Rescale each row (sample) to unit L2 norm; norm="l2" and axis=1 are the
# library defaults, written out here for readability.
normalized_arr = preprocessing.normalize(scaled_data, norm="l2", axis=1)
normalized_data = pd.DataFrame(data=normalized_arr, columns=features.columns)
normalized_data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.248124,0.397799,0.000000,0.068586,0.106990,0.141036,0.099074,0.568317,0.607121,0.137911,0.154055
1,0.281540,0.517522,0.000000,0.115762,0.142738,0.336063,0.214295,0.491254,0.360099,0.208363,0.214133
2,0.302173,0.467748,0.042682,0.102320,0.142511,0.210404,0.180984,0.542926,0.436902,0.204465,0.229826
3,0.514873,0.096605,0.493654,0.060378,0.092715,0.198654,0.168206,0.513252,0.291528,0.131965,0.189867
4,0.248124,0.397799,0.000000,0.068586,0.106990,0.141036,0.099074,0.568317,0.607121,0.137911,0.154055
...,...,...,...,...,...,...,...,...,...,...,...
6979,0.161439,0.228980,0.124295,0.074881,0.072481,0.327322,0.122675,0.197790,0.546798,0.208503,0.625639
6980,0.393521,0.114302,0.500448,0.115619,0.100379,0.061892,0.034589,0.423779,0.290250,0.301309,0.445575
6981,0.427221,0.124308,0.497129,0.113079,0.096916,0.058765,0.033977,0.443006,0.292882,0.261697,0.421410
6982,0.256867,0.183780,0.360679,0.103706,0.085724,0.208484,0.092132,0.402540,0.428671,0.285380,0.522358


In [7]:
from numpy import log
import warnings

# Silence every warning for the rest of the session.
# NOTE(review): this also hides solver/convergence warnings from the models
# fitted below - consider narrowing the filter.
warnings.filterwarnings('ignore')

# Independent (deep) copy of the normalized features. Despite the name, no
# skew correction is applied in this cell; `log` is imported but not used here.
unskew_data = normalized_data.copy()

## Classifier Models

In [8]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn import metrics

In [9]:
# Candidate classifiers, built in the same order as they appear in `models`.
# Every estimator that accepts a seed gets random_state=42;
# KNeighborsClassifier is created with its defaults.
rnd_clf = RandomForestClassifier(random_state=42)
dt_clf = DecisionTreeClassifier(random_state=42)
kn_clf = KNeighborsClassifier()
svm_clf = SVC(random_state=42)
log_clf = LogisticRegression(random_state=42)

# The list order determines the order of the per-model scores collected later.
models = [
    rnd_clf,
    dt_clf,
    kn_clf,
    svm_clf,
    log_clf,
]

## train / test split

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical across runs.
X_train, X_test, y_train, y_test = train_test_split(
    unskew_data,
    labels,
    test_size=0.2,
    random_state=42,
)

## fit

In [11]:
# Train every candidate classifier on the same training split.
for clf in models:
    clf.fit(X_train, y_train)

## predict

In [12]:
# Per-model scores, appended in the same order as `models`.
accuracy = []
precision = []
recall = []
F1 = []

for m in models:
    y_pred = m.predict(X_test)
    model_name = m.__class__.__name__

    # Compute each score exactly once and reuse it for both the printed
    # report and the result lists. Precision/recall/F1 are macro-averaged,
    # i.e. the unweighted mean over the classes.
    acc = accuracy_score(y_test, y_pred)
    pcs = precision_score(y_test, y_pred, average="macro")
    rec = recall_score(y_test, y_pred, average="macro")
    f1 = f1_score(y_test, y_pred, average="macro")

    print(model_name, 'Acc: ', acc)
    print(model_name, 'Precision: ', pcs)
    print(model_name, 'Recall: ', rec)
    print(model_name, 'F1: ', f1)

    print("")

    accuracy.append(acc)
    precision.append(pcs)
    recall.append(rec)
    F1.append(f1)

    print("")

RandomForestClassifier Acc:  0.9670722977809592
RandomForestClassifier Precision:  0.9672869407101296
RandomForestClassifier Recall:  0.9690131489345513
RandomForestClassifier F1:  0.9678051455468987


DecisionTreeClassifier Acc:  0.9284180386542591
DecisionTreeClassifier Precision:  0.9289212458565358
DecisionTreeClassifier Recall:  0.9297694887222926
DecisionTreeClassifier F1:  0.9293131475520479


KNeighborsClassifier Acc:  0.8482462419470294
KNeighborsClassifier Precision:  0.8442578546909556
KNeighborsClassifier Recall:  0.8530405075614093
KNeighborsClassifier F1:  0.8447993033625393


SVC Acc:  0.7093772369362921
SVC Precision:  0.6944225182525051
SVC Recall:  0.7185000721659085
SVC F1:  0.6982284806161864


LogisticRegression Acc:  0.5504652827487473
LogisticRegression Precision:  0.5378204035359688
LogisticRegression Recall:  0.5618437387045698
LogisticRegression F1:  0.5377964546853241




# Metrics Score

In [18]:
import math

# One list per metric, each holding one score per model (same order as
# `models`). Named `metric_scores` so that the `metrics` module imported from
# sklearn is not overwritten by a plain list.
metric_scores = [accuracy, precision, recall, F1]
total = []

# zip(*...) walks the models one by one, so the loop adapts to however many
# models were evaluated instead of assuming exactly five.
for per_model in zip(*metric_scores):
    # Unweighted mean of the four metrics for this model.
    avg = sum(per_model) / len(per_model)
    # Convert to a whole percentage by scaling first and rounding last.
    # Rounding to two decimals and then multiplying by 100 can land just
    # below the integer (0.57 * 100 == 56.99999999999999), which truncation
    # would turn into 56 instead of 57.
    total.append(int(round(float(avg) * 100)))

print(total)

[97, 93, 85, 71, 55]
