In [2]:
import pandas as pd
import seaborn as sns
import numpy as np
from pathlib import Path
import os
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
%matplotlib inline
wine = pd.read_csv('winequality-red.csv') 

In [3]:
from sklearn.utils import shuffle
wine = shuffle(wine, random_state=42)

In [4]:
wine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1599 entries, 803 to 1126
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 162.4 KB


In [5]:
wine['1']=wine['fixed acidity']-wine['total sulfur dioxide']
wine['5']=wine['sulphates']-wine['chlorides']#황산염-염화물

In [6]:
X_train = wine.drop((["quality","density","fixed acidity"]),axis = 1)
y_train = wine["quality"].copy()

In [7]:
corr_matrix=wine.corr()
wine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1599 entries, 803 to 1126
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
 12  1                     1599 non-null   float64
 13  5                     1599 non-null   float64
dtypes: float64(13), int64(1)
memory usage: 187.4 KB


In [8]:
corr_matrix["quality"].sort_values(ascending=False)

quality                 1.000000
alcohol                 0.476166
5                       0.307735
sulphates               0.251397
citric acid             0.226373
1                       0.190265
fixed acidity           0.124052
residual sugar          0.013732
free sulfur dioxide    -0.050656
pH                     -0.057731
chlorides              -0.128907
density                -0.174919
total sulfur dioxide   -0.185100
volatile acidity       -0.390558
Name: quality, dtype: float64

In [9]:
# 스탠다드 스케일러 적용

sc = StandardScaler()

X_train = sc.fit_transform(X_train)


In [10]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# voting

In [13]:
from sklearn.ensemble import VotingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor

ada_reg= AdaBoostRegressor(n_estimators=500, random_state=42, learning_rate=2)
ada_reg.fit(X_train, y_train)

ex1_reg = ExtraTreesRegressor(n_estimators=400, random_state=42,max_depth=20)
ex1_reg.fit(X_train, y_train)
ex2_reg = ExtraTreesRegressor(n_estimators=500, random_state=42,max_depth=50)
ex2_reg.fit(X_train, y_train)
model1 = AdaBoostRegressor(base_estimator=ExtraTreesRegressor(n_estimators=500, random_state=42,max_depth=50))
model1.fit(X_train, y_train)

voting_reg = VotingRegressor(
    estimators=[('ex1',ex1_reg), ('ex2',ex2_reg),('ada',model1)])

voting_reg.fit(X_train, y_train)



VotingRegressor(estimators=[('ex1',
                             ExtraTreesRegressor(max_depth=20, n_estimators=400,
                                                 random_state=42)),
                            ('ex2',
                             ExtraTreesRegressor(max_depth=50, n_estimators=500,
                                                 random_state=42)),
                            ('ada',
                             AdaBoostRegressor(base_estimator=ExtraTreesRegressor(max_depth=50,
                                                                                  n_estimators=500,
                                                                                  random_state=42)))])

In [28]:
from sklearn.model_selection import cross_val_score

temp = []
for reg in [ex1_reg, ex2_reg,model1, voting_reg]:
    reg_score=cross_val_score(voting_reg,X_train, y_train,scoring="neg_mean_squared_error", cv=10,n_jobs=-1)
    score = np.sqrt(-reg_score)
    temp.append(score)
    display_scores(score)
# print("Mean: ",(score(),label))
 

Scores: [0.53365271 0.5276345  0.57580465 0.53481577 0.5551453  0.58030228
 0.53365711 0.57162171 0.52058185 0.51280071]
Mean: 0.5446016581033308
Standard deviation: 0.023023846118914112
Scores: [0.53463918 0.52791732 0.57617498 0.53442957 0.55564168 0.58066503
 0.53333138 0.57038707 0.51969221 0.51211717]
Mean: 0.5444995574697368
Standard deviation: 0.023168811597168264
Scores: [0.53478304 0.52778904 0.57650234 0.53488608 0.55531338 0.58037347
 0.53240381 0.57054094 0.5195324  0.5130194 ]
Mean: 0.5445143894942798
Standard deviation: 0.023093166418843975
Scores: [0.534836   0.52700042 0.57610576 0.53559687 0.55601054 0.58166119
 0.533402   0.57032942 0.51811527 0.51221509]
Mean: 0.5445272568010208
Standard deviation: 0.023494788170826692


### ex1 : Mean: 0.5446016581033308
### ex2 : Mean: 0.5444995574697368
### ada : Mean: 0.5445143894942798
### voting : Mean: 0.5445272568010208

# Stacking

In [38]:
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

ada_reg= AdaBoostRegressor(n_estimators=500, random_state=42, learning_rate=2)
ada_reg.fit(X_train, y_train)

estimators = [
     ('ex', ExtraTreesRegressor(n_estimators=400, random_state=42)),
     ('model1', AdaBoostRegressor(base_estimator=ExtraTreesRegressor(n_estimators=500, random_state=42,max_depth=50))),

     ('svr',make_pipeline(StandardScaler(),
                           SVR(kernel="rbf",gamma="scale",degree=18,coef0=100,C=50)))
 ]
reg = StackingRegressor(
     estimators=estimators, final_estimator= ExtraTreesRegressor(n_estimators=600,max_depth=30, random_state=42)
)
reg.fit(X_train, y_train)


StackingRegressor(estimators=[('ex',
                               ExtraTreesRegressor(n_estimators=400,
                                                   random_state=42)),
                              ('model1',
                               AdaBoostRegressor(base_estimator=ExtraTreesRegressor(max_depth=50,
                                                                                    n_estimators=500,
                                                                                    random_state=42))),
                              ('svr',
                               Pipeline(steps=[('standardscaler',
                                                StandardScaler()),
                                               ('svr',
                                                SVR(C=50, coef0=100,
                                                    degree=18))]))],
                  final_estimator=ExtraTreesRegressor(max_depth=30,
                                               

In [40]:
from sklearn.model_selection import cross_val_score

reg_scores = cross_val_score(reg,X_train, y_train, scoring="neg_mean_squared_error", cv=10)
score = np.sqrt(-reg_scores)
display_scores(score)


Scores: [0.6280354  0.6042202  0.63926212 0.58318207 0.59785261 0.58219114
 0.5887105  0.60467956 0.56172324 0.54369225]
Mean: 0.5933549082763601
Standard deviation: 0.02700623689244713


# Stacking

In [43]:
from sklearn.ensemble import StackingRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

ada_reg= AdaBoostRegressor(n_estimators=500, random_state=42, learning_rate=2)
ada_reg.fit(X_train, y_train)

estimators = [
     ('ex', ExtraTreesRegressor(n_estimators=400, random_state=42)),
     ('model1', AdaBoostRegressor(base_estimator=ExtraTreesRegressor(n_estimators=500, random_state=42,max_depth=50))),

     #('svr',make_pipeline(StandardScaler(),
                           SVR(kernel="rbf",gamma="scale",degree=18,coef0=100,C=50)))
 ]
reg = StackingRegressor(
     estimators=estimators, final_estimator= ExtraTreesRegressor(n_estimators=600,max_depth=30, random_state=42)
)
reg.fit(X_train, y_train)


StackingRegressor(estimators=[('ex',
                               ExtraTreesRegressor(n_estimators=400,
                                                   random_state=42)),
                              ('model1',
                               AdaBoostRegressor(base_estimator=ExtraTreesRegressor(max_depth=50,
                                                                                    n_estimators=500,
                                                                                    random_state=42)))],
                  final_estimator=ExtraTreesRegressor(max_depth=30,
                                                      n_estimators=600,
                                                      random_state=42))

In [44]:
from sklearn.model_selection import cross_val_score

reg_scores = cross_val_score(reg,X_train, y_train, scoring="neg_mean_squared_error", cv=10)
score = np.sqrt(-reg_scores)
display_scores(score)


Scores: [0.60901424 0.6774765  0.63927058 0.6233675  0.62162022 0.61905543
 0.59884424 0.62664782 0.58347486 0.60870258]
Mean: 0.6207473954129392
Standard deviation: 0.02399653884428279


# AdaBoost(ExtraTrees)

In [47]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import cross_val_score

ada_reg= AdaBoostRegressor(n_estimators=500, random_state=42, learning_rate=2)
ada_reg.fit(X_train, y_train)
model1 = AdaBoostRegressor(base_estimator=ExtraTreesRegressor(n_estimators=500, random_state=42,max_depth=50))
model1.fit(X_train, y_train)

AdaBoostRegressor(base_estimator=ExtraTreesRegressor(max_depth=50,
                                                     n_estimators=500,
                                                     random_state=42))

In [48]:
model1_scores = cross_val_score(model1, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
model1_rmse_scores = np.sqrt(-model1_scores)
display_scores(model1_rmse_scores)

Scores: [0.53429956 0.52094256 0.57275529 0.53706857 0.56432807 0.56774792
 0.54300589 0.572844   0.52016012 0.51619866]
Mean: 0.5449350655054588
Standard deviation: 0.021548376805195158


# Adaboost

In [50]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import cross_val_score

ada_reg= AdaBoostRegressor(n_estimators=500, random_state=42, learning_rate=1.5)
ada_reg.fit(X_train, y_train)
model1 = AdaBoostRegressor(base_estimator=ExtraTreesRegressor(n_estimators=600, random_state=42,max_depth=50))
model1.fit(X_train, y_train)

model1_scores = cross_val_score(model1, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
model1_rmse_scores = np.sqrt(-model1_scores)
display_scores(model1_rmse_scores)

Scores: [0.52988065 0.51689483 0.5720813  0.536759   0.56572415 0.56750595
 0.54291333 0.57166404 0.5179278  0.51702579]
Mean: 0.5438376838915974
Standard deviation: 0.02230042753300183
