In [1]:
# Silence all warning messages (warnings, not errors) so they do not clutter
# the notebook output.
import warnings
warnings.simplefilter('ignore')

In [2]:
import pandas as pd

In [4]:
# Read the comma-separated file winequality-red.csv from the working
# directory into a DataFrame, then preview its first five rows.
wine_csv = './winequality-red.csv'
data = pd.read_csv(wine_csv, sep=',')
data.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


# 통계적 방법 : 선형회귀분석

!pip install statsmodels

In [9]:
# Import the statsmodels API (statistical modelling library)
import statsmodels.api as sm

In [14]:
# Independent variables: every column except the last one goes into X.
feature_names = data.columns[:-1]
X = data[feature_names]

# Dependent variable: the 'quality' column, kept as a one-column DataFrame in y.
y = data.loc[:, ['quality']]

In [15]:
# Preview the first five rows of the feature table.
X.head(n=5)

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [16]:
# Preview the first five rows of the target table.
y.head(n=5)

Unnamed: 0,quality
0,5
1,5
2,5
3,6
4,5


In [19]:
# Add a constant column (the y-intercept term) in front of the features,
# then preview the result.
X_add = sm.add_constant(X, prepend=True)
X_add.head()

Unnamed: 0,const,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol
0,1.0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4
1,1.0,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8
2,1.0,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8
3,1.0,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8
4,1.0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4


In [20]:
# Fit an ordinary least squares (OLS) linear regression of y on X_add.
ols_spec = sm.OLS(endog=y, exog=X_add)
multi_model = ols_spec.fit()

# Print the summary table of the fitted model.
print(multi_model.summary())

                            OLS Regression Results                            
Dep. Variable:                quality   R-squared:                       0.361
Model:                            OLS   Adj. R-squared:                  0.356
Method:                 Least Squares   F-statistic:                     81.35
Date:                Mon, 28 Feb 2022   Prob (F-statistic):          1.79e-145
Time:                        18:25:41   Log-Likelihood:                -1569.1
No. Observations:                1599   AIC:                             3162.
Df Residuals:                    1587   BIC:                             3227.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
const                   21.9652 

# 머신러닝 방법 : 선형회귀분석

In [21]:
# Split the data into features X and target y.
# The features are selected by dropping the target column by name instead of
# slicing a hard-coded number of columns (previously 0:11), so the split does
# not silently pick the wrong columns if the column layout changes.
X = data.drop(columns=['quality'])
y = data[['quality']]

In [22]:
# Import the helper that splits data into train and test sets (train_test_split)
from sklearn.model_selection import train_test_split

In [23]:
# Split into train and test sets at a 7:3 ratio, with a fixed random seed so
# the split is reproducible.
split_parts = train_test_split(X, y, train_size=0.7, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [24]:
# Import the linear regression model from scikit-learn
from sklearn.linear_model import LinearRegression

In [25]:
# Create a LinearRegression estimator named `model`
# (fit_intercept=True is the library default, spelled out here).
model = LinearRegression(fit_intercept=True)

In [26]:
# Fit the model on the training data only.
model.fit(X=X_train, y=y_train)

LinearRegression()

In [27]:
# Performance metric on the training data: R-squared (coefficient of
# determination), about 0.36 here. This is the share of variance explained,
# not a classification accuracy.
model.score(X=X_train, y=y_train)

0.36119824413213175

In [28]:
# Apply the trained model to the held-out test data: R-squared is about 0.35
# here (share of variance explained, not a classification accuracy).
model.score(X=X_test, y=y_test)

0.351388533250524

### train과 test의 설명력(R²)이 비슷할수록 안정적인 모델!