In [26]:
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn import linear_model
from sklearn.datasets import load_boston
from sklearn.datasets import make_classification
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import classification_report
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Re-import edited modules automatically before each cell executes.
%load_ext autoreload
%autoreload 2

# Print NumPy arrays with 5 decimals and without scientific notation so the
# coefficient vectors printed below are easy to compare across cells.
np.set_printoptions(precision=5, suppress=True)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
# Load the Boston housing data into one DataFrame: one column per feature,
# with the response appended as the trailing 'target' column.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell requires scikit-learn < 1.2.
boston = load_boston()
dataset = pd.DataFrame(boston.data, columns=boston.feature_names)
dataset['target'] = boston.target

# 'target' is the last column, so "everything but the last column" selects
# exactly the predictors.
observations = len(dataset)
variables = dataset.columns[:-1]
X = dataset.iloc[:, :-1]
y = dataset['target'].values

In [9]:
# Binary response for the logistic-regression cells: 1 where the target
# exceeds 25, 0 otherwise.
yq = (y > 25).astype(int)

### 평균 중심화

In [11]:
# Ordinary least squares with an intercept term.
# `normalize` is intentionally not passed: False was its default, and the
# keyword was deprecated in scikit-learn 1.0 and removed in 1.2, so omitting
# it keeps the same behaviour on old releases and avoids a TypeError on new ones.
linear_regression = linear_model.LinearRegression(fit_intercept=True)

In [12]:
# Fit on the raw, unscaled predictors; the later cells refit this same
# estimator on rescaled copies of X for comparison.
linear_regression.fit(X, y)

LinearRegression()

In [27]:
# Report the fitted slopes and intercept of the model on the raw predictors.
print("coefficients: {}\nintercept: {}".format(linear_regression.coef_, linear_regression.intercept_))

coefficeints: [ -0.10801   0.04642   0.02056   2.68673 -17.76661   3.80987   0.00069
  -1.47557   0.30605  -0.01233  -0.95275   0.00931  -0.52476]
 intercept: 36.459488385090125


In [28]:
# Column-wise minimum of every feature and of the target.
dataset.min()

CRIM         0.00632
ZN           0.00000
INDUS        0.46000
CHAS         0.00000
NOX          0.38500
RM           3.56100
AGE          2.90000
DIS          1.12960
RAD          1.00000
TAX        187.00000
PTRATIO     12.60000
B            0.32000
LSTAT        1.73000
target       5.00000
dtype: float64

In [35]:
# Mean-centering only: subtract each column's mean but leave its scale alone
# (with_std=False). Refits the shared `linear_regression` estimator on the
# centered predictors and prints the resulting slopes and intercept.
centering = StandardScaler(with_mean=True, with_std=False)
linear_regression.fit(centering.fit_transform(X), y)
print("coefficients: {}\nintercept: {}".format(linear_regression.coef_, linear_regression.intercept_))

coefficeints: [ -0.10801   0.04642   0.02056   2.68673 -17.76661   3.80987   0.00069
  -1.47557   0.30605  -0.01233  -0.95275   0.00931  -0.52476]
intercetp: 22.532806324110688


### 표준화

In [36]:
# Standardization: subtract each column's mean and divide by its standard
# deviation, so every slope is expressed per one standard deviation of its
# predictor. Refits the shared `linear_regression` estimator.
standardization = StandardScaler(with_mean=True, with_std=True)
linear_regression.fit(standardization.fit_transform(X), y)
print("coefficients: {}\nintercept: {}".format(linear_regression.coef_, linear_regression.intercept_))

coefficeints: [-0.92815  1.08157  0.1409   0.68174 -2.05672  2.67423  0.01947 -3.10404
  2.66222 -2.07678 -2.06061  0.84927 -3.74363]
intercetp: 22.532806324110684


### 정규화

In [40]:
# Min-max normalization: rescale every predictor to the [0, 1] range, so each
# slope is the change in the target across that predictor's full observed
# range. Refits the shared `linear_regression` estimator.
scaling = MinMaxScaler(feature_range=(0, 1))
linear_regression.fit(scaling.fit_transform(X), y)
print("coefficients: {}\nintercept: {}".format(linear_regression.coef_, linear_regression.intercept_))

coefficeints: [ -9.60976   4.64205   0.56084   2.68673  -8.63457  19.88369   0.06722
 -16.22666   7.03914  -6.46333  -8.95582   3.69283 -19.01724]
intercetp: 26.6202675846878


### 로지스틱 회귀 사례

In [42]:
# Logistic regression of the binary response on the standardized predictors.
# A constant column is prepended explicitly so the model has an intercept,
# which shows up as `const` in the summary.
Xq = sm.add_constant(standardization.fit_transform(X), prepend=True)
logit = sm.Logit(endog=yq, exog=Xq)
result = logit.fit()
print(result.summary())

Optimization terminated successfully.
         Current function value: 0.206631
         Iterations 9
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  506
Model:                          Logit   Df Residuals:                      492
Method:                           MLE   Df Model:                           13
Date:                Tue, 23 Jun 2020   Pseudo R-squ.:                  0.6289
Time:                        10:11:37   Log-Likelihood:                -104.56
converged:                       True   LL-Null:                       -281.76
Covariance Type:            nonrobust   LLR p-value:                 9.145e-68
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -3.0542      0.356     -8.571      0.000      -3.753      -2.356
x1            -0.0954      0.

In [49]:
def sigmoid(p):
    """Logistic function 1 / (1 + exp(-p)), evaluated without overflow.

    The naive form calls exp(-p), which overflows (and emits a RuntimeWarning)
    for large negative p. Here the exponential is only ever taken of a
    non-positive number, so the result is the same but every input is safe.

    Parameters
    ----------
    p : float or array-like
        Log-odds value(s).

    Returns
    -------
    A NumPy float for scalar input, or an ndarray of the same shape as `p`,
    holding the corresponding probabilities in [0, 1].
    """
    p = np.asarray(p, dtype=float)
    decay = np.exp(-np.abs(p))  # argument is <= 0, so this cannot overflow
    # p >= 0: 1 / (1 + exp(-p));  p < 0: exp(p) / (1 + exp(p)), the same value.
    out = np.where(p >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))
    return out[()]  # unwrap a 0-d result to a scalar; arrays pass through
# result.params[0] is the coefficient of the constant column, which
# sm.add_constant placed first in Xq. The predictors were standardized, so an
# all-zero input row means "every predictor at its mean", and the sigmoid of
# the intercept is the fitted probability at that point.
print('intercept: {:.3f}'.format(result.params[0]))
print('probability of value above 25 when all predictors are average: {:.3f}'.format(sigmoid(result.params[0])))

intercept: -3.054
probability of value above 25 when all predictors are average: 0.045


In [51]:
# yq holds only 0s and 1s, so its mean is the share of observations whose
# target is above 25.
print('average likelihood of positive response: {:.3f}'.format(yq.mean()))

average likelihood of positive response: 0.245


In [52]:
# Intercept-only model: the single regressor is a column of ones, so the
# fitted probability is the same for every observation.
# NOTE: this rebinds `logit` and `result`, replacing the full-model fit made
# in the earlier cell.
C = np.ones(len(X))
logit = sm.Logit(yq, C)
result = logit.fit()
print(result.summary())
print('\nprobability of value above 25 using just a constant: {:.3f}'.format(sigmoid(result.params[0])))

Optimization terminated successfully.
         Current function value: 0.556842
         Iterations 5
                           Logit Regression Results                           
Dep. Variable:                      y   No. Observations:                  506
Model:                          Logit   Df Residuals:                      505
Method:                           MLE   Df Model:                            0
Date:                Tue, 23 Jun 2020   Pseudo R-squ.:               3.276e-11
Time:                        10:21:13   Log-Likelihood:                -281.76
converged:                       True   LL-Null:                       -281.76
Covariance Type:            nonrobust   LLR p-value:                       nan
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const         -1.1251      0.103    -10.886      0.000      -1.328      -0.923

probability of alue above 25