In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
from pathlib import Path
import os
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.linear_model import SGDRegressor
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
%matplotlib inline
wine = pd.read_csv('winequality-red.csv') 

In [2]:
from sklearn.utils import shuffle
wine = shuffle(wine, random_state=42)

In [3]:
wine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1599 entries, 803 to 1126
Data columns (total 12 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
dtypes: float64(11), int64(1)
memory usage: 162.4 KB


In [4]:
wine['1']=wine['fixed acidity']-wine['total sulfur dioxide']
wine['5']=wine['sulphates']-wine['chlorides']#황산염-염화물

In [5]:
X_train = wine.drop((["quality","density","fixed acidity"]),axis = 1)
y_train = wine["quality"].copy()

In [6]:
corr_matrix=wine.corr()
wine.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1599 entries, 803 to 1126
Data columns (total 14 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   fixed acidity         1599 non-null   float64
 1   volatile acidity      1599 non-null   float64
 2   citric acid           1599 non-null   float64
 3   residual sugar        1599 non-null   float64
 4   chlorides             1599 non-null   float64
 5   free sulfur dioxide   1599 non-null   float64
 6   total sulfur dioxide  1599 non-null   float64
 7   density               1599 non-null   float64
 8   pH                    1599 non-null   float64
 9   sulphates             1599 non-null   float64
 10  alcohol               1599 non-null   float64
 11  quality               1599 non-null   int64  
 12  1                     1599 non-null   float64
 13  5                     1599 non-null   float64
dtypes: float64(13), int64(1)
memory usage: 187.4 KB


In [7]:
corr_matrix["quality"].sort_values(ascending=False)

quality                 1.000000
alcohol                 0.476166
5                       0.307735
sulphates               0.251397
citric acid             0.226373
1                       0.190265
fixed acidity           0.124052
residual sugar          0.013732
free sulfur dioxide    -0.050656
pH                     -0.057731
chlorides              -0.128907
density                -0.174919
total sulfur dioxide   -0.185100
volatile acidity       -0.390558
Name: quality, dtype: float64

In [8]:
# 스탠다드 스케일러 적용

sc = StandardScaler()

X_train = sc.fit_transform(X_train)


In [9]:
def display_scores(scores):
    print("Scores:", scores)
    print("Mean:", scores.mean())
    print("Standard deviation:", scores.std())

# Adaboost

In [50]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.model_selection import cross_val_score

ada_reg= AdaBoostRegressor(n_estimators=500, random_state=42, learning_rate=1.5)
ada_reg.fit(X_train, y_train)
model1 = AdaBoostRegressor(base_estimator=ExtraTreesRegressor(n_estimators=600, random_state=42,max_depth=50))
model1.fit(X_train, y_train)

model1_scores = cross_val_score(model1, X_train, y_train, scoring="neg_mean_squared_error", cv=10)
model1_rmse_scores = np.sqrt(-model1_scores)
display_scores(model1_rmse_scores)

Scores: [0.52988065 0.51689483 0.5720813  0.536759   0.56572415 0.56750595
 0.54291333 0.57166404 0.5179278  0.51702579]
Mean: 0.5438376838915974
Standard deviation: 0.02230042753300183


In [17]:
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import cross_val_score

reg = GradientBoostingRegressor(random_state=42,n_estimators=2000)
reg.fit(X_train, y_train)

reg_score=cross_val_score(reg,X_train, y_train,scoring="neg_mean_squared_error", cv=10)
score = np.sqrt(-reg_score)
display_scores(score)

Scores: [0.58435808 0.61432691 0.60255311 0.60361811 0.60872942 0.62492511
 0.62076928 0.64088422 0.55373729 0.6118533 ]
Mean: 0.6065754835646732
Standard deviation: 0.02262355948380248


In [None]:
Mean: 0.5986670572492433

In [20]:
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import ExtraTreesRegressor


ada_reg= AdaBoostRegressor(n_estimators=500, random_state=42, learning_rate=1.5)
ada_reg.fit(X_train, y_train)
model1 = AdaBoostRegressor(base_estimator=ExtraTreesRegressor(n_estimators=600, random_state=42,max_depth=50))
model1.fit(X_train, y_train)

ex_reg = ExtraTreesRegressor(n_estimators=1000, random_state=42)
ex_reg.fit(X_train, y_train)

gb_reg = GradientBoostingRegressor(random_state=42,n_estimators=300)
gb_reg.fit(X_train, y_train)

voting_reg = VotingRegressor(
    estimators=[('ex',ex_reg), ('gb',gb_reg),('ada_ex',model1)])

voting_reg.fit(X_train, y_train)



VotingRegressor(estimators=[('ex',
                             ExtraTreesRegressor(n_estimators=1000,
                                                 random_state=42)),
                            ('gb',
                             GradientBoostingRegressor(n_estimators=300,
                                                       random_state=42)),
                            ('ada_ex',
                             AdaBoostRegressor(base_estimator=ExtraTreesRegressor(max_depth=50,
                                                                                  n_estimators=600,
                                                                                  random_state=42)))])

In [22]:
from sklearn.metrics import accuracy_score

for reg in (ex_reg, gb_reg, model1,voting_reg):
    clf_scores = cross_val_score(reg, X_train, y_train,scoring="neg_mean_squared_error", cv=10)
    score = np.sqrt(-reg_score)
display_scores(score)

Scores: [0.58435808 0.61432691 0.60255311 0.60361811 0.60872942 0.62492511
 0.62076928 0.64088422 0.55373729 0.6118533 ]
Mean: 0.6065754835646732
Standard deviation: 0.02262355948380248
