# 필요한 모듈 설정

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from matplotlib import patches
%matplotlib inline

# 데이터 로드

In [2]:
wine = pd.read_csv('WineQT.csv')
wine = wine.drop(['Id'], axis = 1) 
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


## 데이터 크기, 결측치 확인

In [3]:
wine.shape

(1143, 12)

In [4]:
wine.isnull().sum()

fixed acidity           0
volatile acidity        0
citric acid             0
residual sugar          0
chlorides               0
free sulfur dioxide     0
total sulfur dioxide    0
density                 0
pH                      0
sulphates               0
alcohol                 0
quality                 0
dtype: int64

## 데이터 전처리 : Data x |Correlation|

In [5]:
fixed_acidity = wine.loc[:, "fixed acidity"]
volatile_acidity = wine.loc[:, "volatile acidity"]
citric_acid = wine.loc[:, "citric acid"]
residual_sugar = wine.loc[:, "residual sugar"]
chlorides = wine.loc[:, "chlorides"]
free_sulfur_dioxide = wine.loc[:, "free sulfur dioxide"]
total_sulfur_dioxide = wine.loc[:, "total sulfur dioxide"]
density = wine.loc[:, "density"]
pH = wine.loc[:, "pH"]
sulphates = wine.loc[:, "sulphates"]
alcohol = wine.loc[:, "alcohol"]
quality = wine.loc[:, "quality"]

corr_list = []

import scipy.stats as stats

corr_list.append(abs(stats.spearmanr(fixed_acidity, quality).correlation))
corr_list.append(abs(stats.spearmanr(volatile_acidity, quality).correlation))
corr_list.append(abs(stats.spearmanr(citric_acid, quality).correlation))
corr_list.append(abs(stats.spearmanr(residual_sugar, quality).correlation))
corr_list.append(abs(stats.spearmanr(chlorides, quality).correlation))
corr_list.append(abs(stats.spearmanr(free_sulfur_dioxide, quality).correlation))
corr_list.append(abs(stats.spearmanr(total_sulfur_dioxide, quality).correlation))
corr_list.append(abs(stats.spearmanr(density, quality).correlation))
corr_list.append(abs(stats.spearmanr(pH, quality).correlation))
corr_list.append(abs(stats.spearmanr(sulphates, quality).correlation))
corr_list.append(abs(stats.spearmanr(alcohol, quality).correlation))
corr_list.append(abs(stats.spearmanr(quality, quality).correlation))

wine = wine * corr_list


# Modeling 

## Scaled, Normalization

In [6]:
from sklearn import preprocessing
from imblearn.over_sampling import SMOTE 

oversample = SMOTE()
features, labels =  oversample.fit_resample(wine.drop(["quality"],axis=1), wine["quality"])
scaler = preprocessing.MinMaxScaler()
scaled_data = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)
scaled_data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.247788,0.397260,0.000000,0.068493,0.106845,0.149254,0.098940,0.567548,0.606299,0.137725,0.153846
1,0.283186,0.520548,0.000000,0.116438,0.143573,0.358209,0.215548,0.494126,0.362205,0.209581,0.215385
2,0.283186,0.438356,0.040000,0.095890,0.133556,0.208955,0.169611,0.508811,0.409449,0.191617,0.215385
3,0.584071,0.109589,0.560000,0.068493,0.105175,0.238806,0.190813,0.582232,0.330709,0.149701,0.215385
4,0.247788,0.397260,0.000000,0.068493,0.106845,0.149254,0.098940,0.567548,0.606299,0.137725,0.153846
...,...,...,...,...,...,...,...,...,...,...,...
2893,0.123905,0.193025,0.271822,0.075342,0.077098,0.241516,0.121743,0.220477,0.596256,0.250953,0.672704
2894,0.455759,0.153816,0.475719,0.099708,0.092750,0.051600,0.032459,0.499582,0.339931,0.264751,0.358245
2895,0.297670,0.142148,0.359562,0.134098,0.086738,0.037204,0.024735,0.464328,0.385827,0.419774,0.344692
2896,0.646476,0.136306,0.667245,0.087278,0.096730,0.091911,0.081272,0.596027,0.145697,0.288790,0.245078


In [7]:
normalized_arr = preprocessing.normalize(scaled_data)
normalized_data = pd.DataFrame(normalized_arr, columns=features.columns)
normalized_data

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,0.247821,0.397313,0.000000,0.068502,0.106859,0.149274,0.098953,0.567623,0.606380,0.137743,0.153867
1,0.279605,0.513966,0.000000,0.114966,0.141757,0.353679,0.212822,0.487878,0.357625,0.206931,0.212661
2,0.301354,0.466480,0.042566,0.102042,0.142124,0.222361,0.180493,0.541454,0.435718,0.203910,0.229203
3,0.513628,0.096372,0.492461,0.060232,0.092491,0.210005,0.167800,0.512011,0.290823,0.131646,0.189408
4,0.247821,0.397313,0.000000,0.068502,0.106859,0.149274,0.098953,0.567623,0.606380,0.137743,0.153867
...,...,...,...,...,...,...,...,...,...,...,...
2893,0.116513,0.181509,0.255606,0.070848,0.072498,0.227108,0.114480,0.207324,0.560685,0.235982,0.632572
2894,0.446156,0.150575,0.465695,0.097607,0.090796,0.050513,0.031775,0.489056,0.332769,0.259172,0.350697
2895,0.309488,0.147792,0.373837,0.139422,0.090182,0.038681,0.025717,0.482763,0.401145,0.436440,0.358377
2896,0.539914,0.113838,0.557259,0.072892,0.080785,0.076761,0.067876,0.497781,0.121681,0.241187,0.204680


In [8]:
from numpy import log
import warnings
warnings.filterwarnings(action='ignore')
unskew_data = normalized_data.copy(deep=True)

## train / test split

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(unskew_data, labels, test_size=0.2, random_state=42)

## Classifier Models

In [10]:
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report
from sklearn import metrics

In [11]:
rnd_clf = RandomForestClassifier(random_state=42)
log_clf = LogisticRegression(random_state=42)
dt_clf = DecisionTreeClassifier(random_state=42)
kn_clf = KNeighborsClassifier()
svm_clf = SVC(random_state=42)

models = [rnd_clf, dt_clf, kn_clf, svm_clf, log_clf]

## fit

In [12]:
for m in models:
    m.fit(X_train, y_train)
    

## predict

In [13]:
accuracy = []
precision = []
recall = []
F1 = []

for m in models:
    y_pred = m.predict(X_test)
    print(m.__class__.__name__,'Acc: ', accuracy_score(y_test,y_pred))
    print(m.__class__.__name__,'Precision: ', precision_score(y_test,y_pred, average = "macro"))
    print(m.__class__.__name__,'Recall: ', recall_score(y_test,y_pred, average = "macro"))
    print(m.__class__.__name__,'F1: ', f1_score(y_test,y_pred, average = "macro"))
    
    print("")
    
    acc = accuracy_score(y_test,y_pred)
    pcs = precision_score(y_test,y_pred, average = "macro")
    rec = recall_score(y_test,y_pred, average = "macro")
    f1 = f1_score(y_test,y_pred, average = "macro")
    
    accuracy.append(acc)
    precision.append(pcs)
    recall.append(rec)
    F1.append(f1)
    
    print("")

RandomForestClassifier Acc:  0.8655172413793103
RandomForestClassifier Precision:  0.8658302909575254
RandomForestClassifier Recall:  0.8706356647455998
RandomForestClassifier F1:  0.866377859796415


DecisionTreeClassifier Acc:  0.7879310344827586
DecisionTreeClassifier Precision:  0.7896546946049657
DecisionTreeClassifier Recall:  0.7929153773515666
DecisionTreeClassifier F1:  0.7909420483004094


KNeighborsClassifier Acc:  0.756896551724138
KNeighborsClassifier Precision:  0.7418955665789978
KNeighborsClassifier Recall:  0.7665572823968133
KNeighborsClassifier F1:  0.7415440083443238


SVC Acc:  0.7206896551724138
SVC Precision:  0.7047723040036314
SVC Recall:  0.7259527643585614
SVC F1:  0.7097193376166292


LogisticRegression Acc:  0.5793103448275863
LogisticRegression Precision:  0.5624798773261995
LogisticRegression Recall:  0.5821540896645386
LogisticRegression F1:  0.5636534640568399




# Metrics Score

In [14]:
import math

metrics = [accuracy, precision, recall , F1]
total = []

for i in range(5):
    avg = 0
    for mt in metrics:
        avg += mt[i]*0.25
    avg = round(avg,2)*100
    avg = math.trunc(avg)
    total.append(avg)
        
print(total)

[87, 79, 75, 72, 56]


In [15]:
for i in accuracy:
    print(i)
print("")
for i in precision:
    print(i)
print("")
for i in recall:
    print(i)
print("")
for i in F1:
    print(i)
print("")
for i in total:
    print(i)
print("")

0.8655172413793103
0.7879310344827586
0.756896551724138
0.7206896551724138
0.5793103448275863

0.8658302909575254
0.7896546946049657
0.7418955665789978
0.7047723040036314
0.5624798773261995

0.8706356647455998
0.7929153773515666
0.7665572823968133
0.7259527643585614
0.5821540896645386

0.866377859796415
0.7909420483004094
0.7415440083443238
0.7097193376166292
0.5636534640568399

87
79
75
72
56

