In [1]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt

from matplotlib import rc 
# Set matplotlib's default font family to AppleGothic.
# NOTE(review): presumably chosen so Korean text renders in figures; the font
# looks macOS-specific — confirm a fallback is available on other platforms.
rc('font',family='AppleGothic')

### wine 데이터셋
#### [데이터 설명]
- 데이터 링크(출처) : http://archive.ics.uci.edu/ml/machine-learning-databases/wine-quality
- winequality-red.csv / winequality-white.csv 두 파일을 다운로드
- 관측치 개수: 6497개
- 변수 개수: 독립변수 11개 / 종속변수 1개

#### 변수
- fixed acidity : 고정산
- volatile acidity : 휘발산
- citric acid : 구연산
- residual sugar : 잔당
- chlorides : 염화물
- free sulfur dioxide : 유리 이산화황
- total sulfur dioxide : 총 이산화황
- density : 밀도
- pH
- sulphates : 황산염
- alcohol : 알코올
- quality : 등급

In [2]:
# Load the red and white wine CSV files; both are parsed as ';'-delimited
# with the first row used as the header.
# NOTE(review): the paths are absolute and machine-specific — consider a
# configurable data directory.
csv_options = dict(sep=';', header=0, engine='python')
red_df = pd.read_csv('/Users/zoe/Downloads/winequality-red.csv', **csv_options)
white_df = pd.read_csv('/Users/zoe/Downloads/winequality-white.csv', **csv_options)

In [3]:
red_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5


In [4]:
red_df.shape

(1599, 12)

In [5]:
white_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6


In [6]:
white_df.shape

(4898, 12)

In [7]:
# Tag each frame with a `class` column so red (0) and white (1) wines stay
# distinguishable once the two frames are combined.
for frame, label in ((red_df, 0), (white_df, 1)):
    frame['class'] = label

In [8]:
red_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,class
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0


In [9]:
white_df.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,class
0,7.0,0.27,0.36,20.7,0.045,45.0,170.0,1.001,3.0,0.45,8.8,6,1
1,6.3,0.3,0.34,1.6,0.049,14.0,132.0,0.994,3.3,0.49,9.5,6,1
2,8.1,0.28,0.4,6.9,0.05,30.0,97.0,0.9951,3.26,0.44,10.1,6,1
3,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,1
4,7.2,0.23,0.32,8.5,0.058,47.0,186.0,0.9956,3.19,0.4,9.9,6,1


In [10]:
# Stack the red rows on top of the white rows into a single frame.
# NOTE(review): each frame's original row labels are kept, so index values
# repeat in the combined frame; pass ignore_index=True if unique labels are needed.
wine = pd.concat(objs=[red_df, white_df], axis=0)
wine.shape

(6497, 13)

In [11]:
wine.head()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,class
0,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0
1,7.8,0.88,0.0,2.6,0.098,25.0,67.0,0.9968,3.2,0.68,9.8,5,0
2,7.8,0.76,0.04,2.3,0.092,15.0,54.0,0.997,3.26,0.65,9.8,5,0
3,11.2,0.28,0.56,1.9,0.075,17.0,60.0,0.998,3.16,0.58,9.8,6,0
4,7.4,0.7,0.0,1.9,0.076,11.0,34.0,0.9978,3.51,0.56,9.4,5,0


In [12]:
wine.tail()

Unnamed: 0,fixed acidity,volatile acidity,citric acid,residual sugar,chlorides,free sulfur dioxide,total sulfur dioxide,density,pH,sulphates,alcohol,quality,class
4893,6.2,0.21,0.29,1.6,0.039,24.0,92.0,0.99114,3.27,0.5,11.2,6,1
4894,6.6,0.32,0.36,8.0,0.047,57.0,168.0,0.9949,3.15,0.46,9.6,5,1
4895,6.5,0.24,0.19,1.2,0.041,30.0,111.0,0.99254,2.99,0.46,9.4,6,1
4896,5.5,0.29,0.3,1.1,0.022,20.0,110.0,0.98869,3.34,0.38,12.8,7,1
4897,6.0,0.21,0.38,0.8,0.02,22.0,98.0,0.98941,3.26,0.32,11.8,6,1


In [13]:
# Replace the spaces in the column names with underscores.
wine.columns = [label.replace(' ', '_') for label in wine.columns]
wine.columns  # show the renamed columns

Index(['fixed_acidity', 'volatile_acidity', 'citric_acid', 'residual_sugar',
       'chlorides', 'free_sulfur_dioxide', 'total_sulfur_dioxide', 'density',
       'pH', 'sulphates', 'alcohol', 'quality', 'class'],
      dtype='object')

In [14]:
# Export the combined frame to CSV, leaving the row index out of the file.
# NOTE(review): absolute, machine-specific output path — consider making it configurable.
wine.to_csv(path_or_buf='/Users/zoe/Downloads/wine.csv', index=False)

In [15]:
# Separate the predictors (X) from the target (Y).
# Y is kept as a one-column DataFrame holding the `class` label.
X = wine.loc[:, ['alcohol', 'residual_sugar', 'pH']]
Y = wine['class'].to_frame()

In [16]:
import statsmodels.api as sm

In [17]:
# Prepend an intercept column (`const`, all 1.0) to the design matrix.
# has_constant='skip' leaves X untouched when a constant column already exists,
# so re-running this cell cannot stack a second `const` column; 'add' would
# append another one on every run and make the design matrix singular.
X = sm.add_constant(X, has_constant='skip')

In [18]:
X.head()

Unnamed: 0,const,alcohol,residual_sugar,pH
0,1.0,9.4,1.9,3.51
1,1.0,9.8,2.6,3.2
2,1.0,9.8,2.3,3.26
3,1.0,9.8,1.9,3.16
4,1.0,9.4,1.9,3.51


In [19]:
X.shape

(6497, 4)

In [20]:
Y.head()

Unnamed: 0,class
0,0
1,0
2,0
3,0
4,0


In [21]:
Y.tail()

Unnamed: 0,class
4893,1
4894,1
4895,1
4896,1
4897,1


In [22]:
Y.shape

(6497, 1)

In [24]:
from sklearn.model_selection import train_test_split

In [25]:
# Hold out 30% of the rows for testing, stratifying on Y so both splits keep
# the same class proportions; the random seed is fixed so the split is repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    train_size=0.7,
    test_size=0.3,
    random_state=1234,
    stratify=Y,
)
print(*(part.shape for part in (X_train, X_test, Y_train, Y_test)))

(4547, 4) (1950, 4) (4547, 1) (1950, 1)


In [26]:
# Share of rows labelled class 1 (white wine) in the full data set.
# The column-wise mean of the boolean mask equals np.sum(mask)/len(mask) but
# avoids calling np.sum on a DataFrame, whose axis=None reduction is
# deprecated in recent pandas releases.
print((Y == 1).mean())

class    0.753886
dtype: float64


In [27]:
# Share of rows labelled class 1 (white wine) in the training split.
# The column-wise mean of the boolean mask equals np.sum(mask)/len(mask) but
# avoids calling np.sum on a DataFrame, whose axis=None reduction is
# deprecated in recent pandas releases.
print((Y_train == 1).mean())

class    0.753904
dtype: float64


In [28]:
# Share of rows labelled class 1 (white wine) in the test split.
# The column-wise mean of the boolean mask equals np.sum(mask)/len(mask) but
# avoids calling np.sum on a DataFrame, whose axis=None reduction is
# deprecated in recent pandas releases.
print((Y_test == 1).mean())

class    0.753846
dtype: float64


In [29]:
# Build a logistic-regression model on the training split and estimate its
# coefficients with the Powell optimizer.
logit_m = sm.Logit(endog=Y_train, exog=X_train)
logit_m_train = logit_m.fit(method='powell')

Optimization terminated successfully.
         Current function value: 0.423600
         Iterations: 9
         Function evaluations: 447


In [30]:
# Show the fitted model's summary table (fit statistics and per-term coefficient estimates).
logit_m_train.summary()

0,1,2,3
Dep. Variable:,class,No. Observations:,4547.0
Model:,Logit,Df Residuals:,4543.0
Method:,MLE,Df Model:,3.0
Date:,"Fri, 01 Oct 2021",Pseudo R-squ.:,0.2409
Time:,19:00:27,Log-Likelihood:,-1926.1
converged:,True,LL-Null:,-2537.3
Covariance Type:,nonrobust,LLR p-value:,1.071e-264

0,1,2,3,4,5,6
,coef,std err,z,P>|z|,[0.025,0.975]
const,9.2089,0.906,10.164,0.000,7.433,10.985
alcohol,0.4436,0.038,11.796,0.000,0.370,0.517
residual_sugar,0.3479,0.018,19.785,0.000,0.313,0.382
pH,-4.3414,0.265,-16.371,0.000,-4.861,-3.822
