In [None]:
# Stretch the notebook container to the full browser width.
# `display` and `HTML` come from the public `IPython.display` module; importing
# `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML("<style>.container { width:100% !important; }</style>"))

In [None]:
import warnings

# Install a blanket "ignore" filter so no warning is shown for the rest of the session.
warnings.simplefilter('ignore')

## Logistic Regression
#### 파이썬 코드 (하이퍼파라미터 튜닝 및 모델 저장 활용예 1/2)

#### 클래스와 알고리즘 선택

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV
# Base logistic-regression estimator (fixed seed) that the randomized search tunes later.
clf = LogisticRegression(random_state=0)

In [None]:
# Read the body-measurement table from a path relative to the notebook's directory.
size_korea = pd.read_csv('../../data/size_korea_small.csv')

In [None]:
# Features: the height ('키'), weight ('몸무게') and body-fat ('체지방율') columns as a NumPy array.
feature_columns = ['키', '몸무게', '체지방율']
X = size_korea[feature_columns].values

# Target: the sex column ('성별') recoded as 0 for '남' and 1 for '여'.
sex_codes = {'남': 0, '여': 1}
y = size_korea['성별'].replace(sex_codes)

In [None]:
from sklearn.model_selection import train_test_split
# Stratified split (class ratio of y preserved in both parts), seeded for repeatability.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

#### Hyperparameters 설정과 적합

In [None]:
# Show the names of all parameters the estimator exposes (candidates for tuning).
clf.get_params().keys()

In [None]:
# Search space: ten log-spaced values of C with an L2 penalty, plus one
# unpenalised candidate.
# NOTE(review): the string 'none' is only accepted by older scikit-learn
# releases; newer ones expect `penalty=None` — confirm against the installed version.
hyperparameters = [{'C': np.logspace(-5, 5, 10),
                    'penalty': ['l2']},
                   {'penalty': ['none']}]

# random_state pins which candidates are sampled, so a re-run selects from the
# same set of configurations.
randomized_search = RandomizedSearchCV(clf, hyperparameters, cv=5,
                                       scoring='accuracy', random_state=0)
randomized_search.fit(X_train, y_train)

In [None]:
# Inspect the per-candidate cross-validation results recorded by the search.
randomized_search.cv_results_

In [None]:
# Fitted feature coefficients of the best model found by the search.
randomized_search.best_estimator_.coef_

In [None]:
# Fitted intercept of the best model found by the search.
randomized_search.best_estimator_.intercept_

#### 모델 저장

In [None]:
import os
from joblib import dump, load

# Create the output directory first; dump() fails if the target folder is missing.
os.makedirs('../working', exist_ok=True)
# Persist the whole fitted search object (not only the best estimator).
dump(randomized_search, '../working/randomized_search.joblib')

## Logistic Regression
#### 파이썬 코드 (하이퍼파라미터 튜닝 및 모델 저장 활용예 2/2)

#### 저장한 모델 불러오기

In [None]:
# List the saved artefacts with the standard library instead of the
# Windows-only `!dir` shell command, so the cell runs on any platform.
import os
sorted(os.listdir('../working'))

In [None]:
# Restore the search object saved earlier. joblib.load unpickles its input,
# so only load files from a trusted source.
model = load('../working/randomized_search.joblib')
model.get_params()

In [None]:
# Predict labels for the held-out rows with the restored object.
model.predict(X_test)

<font color='Blue'>

# 좀 더 알아 보기

## 단순 회귀 생성(Simple Linear Regression)


$$
y = ax + b
$$


In [None]:
from sklearn.linear_model import LinearRegression

# Synthetic data: y = 2x - 5 plus standard-normal noise, 50 points with x in [0, 10).
np.random.seed(1)
x = 10 * np.random.rand(50)
y = 2 * x - 5 + np.random.randn(50)

# The estimator is given a 2-D feature matrix, hence the single-column reshape.
x_column = x.reshape(-1, 1)
model = LinearRegression(fit_intercept=True)
model.fit(x_column, y)
y_pred = model.predict(x_column)

In [None]:
from sklearn.metrics import mean_squared_error
# Report the fitted parameters and the in-sample error (predictions were made on the training x).
print(f'Model intercept   : {model.intercept_}')
print(f'Coefficients      : {model.coef_}')
print(f'Mean squared error: {mean_squared_error(y, y_pred)}')

In [None]:
# Evaluate the fitted line on a dense grid over [0, 10] and overlay it on the data.
x_line = np.linspace(0, 10, 1000)
y_line_pred = model.predict(x_line[:, np.newaxis])
plt.scatter(x, y)
plt.plot(x_line, y_line_pred)

## 다중 회귀 생성(Multiple Linear Regression)


$$
y = a_0 + a_1 x_1 + a_2 x_2 + \cdots
$$


In [None]:
# Synthetic data: three features on [0, 10) combined with known weights plus Gaussian noise.
np.random.seed(1)
X = 10 * np.random.rand(1000, 3)
true_weights = [1.5, -2., 1.]
noise = 0.5 * np.random.randn(1000)
y = np.dot(X, true_weights) + noise

# Fit on the full sample and predict the same rows (in-sample evaluation).
model = LinearRegression(fit_intercept=True)
model.fit(X, y)
y_pred = model.predict(X)

print(f'Model intercept   : {model.intercept_}')
print(f'Coefficients      : {model.coef_}')
print(f'Mean squared error: {mean_squared_error(y, y_pred)}')

## 로지스틱 회귀분석(Logistic Regression)

### sklearn과 statsmodels api의 비교

In [None]:
from sklearn.datasets import make_blobs

# 100 one-dimensional points drawn around two centres; y holds the centre index.
X, y = make_blobs(n_samples=100, n_features=1, centers=2,
                  random_state=14, cluster_std=5)

In [None]:
# Straight-line fit through the origin (no intercept term) on the 0/1 labels.
model = LinearRegression(fit_intercept=False)
model.fit(X, y)
y_pred = model.predict(X)

In [None]:
# Labels coloured by class, with the linear prediction drawn on top.
plt.scatter(X[:, 0], y, c=y, cmap='coolwarm')
plt.plot(X, y_pred)
plt.ylabel('y')
plt.xlabel('X');

In [None]:
from sklearn.metrics import mean_squared_error
# Report the fitted parameters and in-sample error of the no-intercept linear fit.
print(f'Model intercept   : {model.intercept_}')
print(f'Coefficients      : {model.coef_}')
print(f'Mean squared error: {mean_squared_error(y, y_pred)}')

In [None]:
import statsmodels.api as sm

# Same least-squares fit through statsmodels; X is passed as-is, with no constant column added.
ols_results = sm.OLS(y, X).fit()
y_pred = ols_results.predict(X)

plt.scatter(X[:, 0], y, c=y, cmap='coolwarm')
plt.plot(X, y_pred)
plt.ylabel('y')
plt.xlabel('X');

In [None]:
from sklearn.linear_model import LogisticRegression
# Logistic regression with all default settings.
model = LogisticRegression()

In [None]:
# Fit, then take the per-class probabilities; column 1 is plotted against the raw labels.
model.fit(X, y)
y_pred = model.predict_proba(X)
second_class_proba = y_pred[:, 1]

plt.scatter(X[:, 0], y, c=y, cmap='coolwarm')
plt.scatter(X[:, 0], second_class_proba)
plt.ylabel('g(y)')
plt.xlabel('X')

In [None]:
# statsmodels counterpart: Logit fitted on X as-is (no constant column added).
logit_results = sm.Logit(y, X).fit()
y_predict = logit_results.predict(X)

plt.scatter(X[:, 0], y, c=y, cmap='coolwarm')
plt.scatter(X, y_predict)
plt.ylabel('g(y)')
plt.xlabel('X');

## scikit-learn API

In [None]:
# Noisy cosine sample: 100 x-values on [0, 10), y = cos(x) + 0.25 * N(0, 1).
np.random.seed(1234)
X = 10 * np.random.rand(100)
y = np.cos(X) + 0.25 * np.random.randn(100)

# Turn X into a single-column 2-D array (same result as X.reshape(-1, 1)) and show its shape.
X = X[:, np.newaxis]
X.shape

In [None]:
from sklearn.linear_model import LinearRegression

# Plain linear fit as a baseline; predictions are made on the training X.
linear_fit = LinearRegression().fit(X, y)
y_predict = linear_fit.predict(X)

## Basis Function Regression

In [None]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline

# Degree-20 polynomial expansion followed by ordinary least squares.
poly_model = make_pipeline(PolynomialFeatures(degree=20), LinearRegression())
poly_model.fit(X, y)

# Evaluate the fitted curve on an evenly spaced grid over [0, 10].
Xfit = np.linspace(0, 10, 100)
yfit = poly_model.predict(Xfit[:, np.newaxis])

In [None]:
# Display the raw single-column feature array.
X

In [None]:
# Display the degree-20 polynomial expansion of X that the pipeline feeds to the regressor.
PolynomialFeatures(degree=20).fit_transform(X)

In [None]:
# Compare the linear baseline with the degree-20 polynomial fit on the same axes.
plt.figure(figsize=(20, 5)); plt.rc('legend', fontsize=16)
plt.scatter(X, y)
plt.plot(X, y_predict,        'r-', label='Linear regression')
# Legend label spelling corrected ("regession" -> "regression").
plt.plot(Xfit, yfit,          'g:',  label='Polynomial regression')

plt.legend()
# Clip the y-axis to [-4, 4] so the plot stays focused on the data range.
plt.ylim(-4, 4)

## Regularization(Lasso, Ridge)    (L1, L2)

In [None]:
from sklearn.linear_model import Ridge, Lasso

# Both models share the degree-20 expansion and are evaluated on the same grid.
grid_column = Xfit.reshape(-1, 1)

# L1-regularised fit.
lasso_model = make_pipeline(PolynomialFeatures(degree=20), Lasso(alpha=1))
lasso_model.fit(X, y)
predict_lasso = lasso_model.predict(grid_column)

# L2-regularised fit.
ridge_model = make_pipeline(PolynomialFeatures(degree=20), Ridge(alpha=30))
ridge_model.fit(X, y)
predict_ridge = ridge_model.predict(grid_column)

In [None]:
# Overlay the two regularised fits on the data.
plt.figure(figsize=(20, 5)); plt.rc('legend', fontsize=16)
plt.scatter(X, y)
# Legend label spelling corrected ("regession" -> "regression").
plt.plot(Xfit, predict_ridge, 'b-.', label='Ridge regression')
plt.plot(Xfit, predict_lasso, 'k-',  label='Lasso regression')
plt.legend()
plt.ylim(-4, 4)

##  Digits Classification Exercise

In [None]:
from sklearn.datasets import load_digits
data = load_digits()

# One row per image, one column per feature name supplied by the dataset.
df = pd.DataFrame(data.data, columns=data.feature_names)
# Binary target: 1 where the original class is 1, 0 for every other class.
# A vectorized comparison replaces the row-by-row apply(lambda ...).
df['label'] = (data.target == 1).astype(int)
df

In [None]:
# Total number of missing cells in the whole frame.
df.isna().sum().sum()

In [None]:
# Box plot of every column to compare their value ranges before scaling.
df.boxplot(figsize=(20,4))

In [None]:
# Class balance of the binary label, shown as relative frequencies.
df['label'].value_counts(normalize=True).plot.bar()

In [None]:
# Separate the predictors from the target column.
pixels = df.drop(columns='label')
# Max-scaling (not standardisation): each column is divided by its own maximum.
# An all-zero column yields NaN here (0/0).
X = pixels.div(pixels.max())
y = df['label']

In [None]:
# Box plot of the scaled features to check the effect of the rescaling.
X.boxplot(figsize=(20,4))

In [None]:
# Missing values per column after scaling.
X.isna().sum()

In [None]:
# Replace the NaNs left by the scaling step with 0. Reassigning instead of
# mutating in place keeps the cell safe to re-run and avoids pandas copy warnings.
X = X.fillna(0)

In [None]:
# Hold out 20% of the rows for testing; the split is stratified on y and seeded.
from sklearn.model_selection import train_test_split

split_parts = train_test_split(X, y, test_size=0.2, stratify=y, random_state=11)
X_train, X_test, y_train, y_test = split_parts
X_train.shape, X_test.shape, y_train.shape, y_test.shape

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, RandomizedSearchCV
# Seed NumPy's global RNG — presumably so the randomized search below, which is
# given no random_state of its own, is repeatable; confirm.
np.random.seed(1357)
# Three consecutive (unshuffled) folds for cross-validation.
kf = KFold(n_splits=3)
# The search space below includes an L1 penalty, so a solver that supports it is chosen.
model = LogisticRegression(solver='saga', random_state=11)

In [None]:
# Candidate settings: 20 log-spaced C values combined with an L1 or L2 penalty.
params = {'C': np.logspace(-4, 4, 20), 'penalty': ['l1', 'l2']}

# Randomized search scored by macro-averaged recall on the folds defined in kf.
randomized = RandomizedSearchCV(
    estimator=model,
    param_distributions=params,
    cv=kf,
    scoring='recall_macro',
)
randomized.fit(X_train, y_train)

In [None]:
# best_score_ is the cross-validated score of the selected candidate (the label calls it
# "train score"); score() evaluates the refitted best model on the held-out test set.
print('optimal train score: {:.3f}'.format(randomized.best_score_))
print('test score         : {:.3f}'.format(randomized.score(X_test, y_test)))
print('optimal parameters : {}'.format(randomized.best_params_))

#### solver를 정하지 않으면 'solver': 'lbfgs'

In [None]:
# Apply the tuned settings to a default-constructed estimator and inspect the
# result: the solver stays at its default because only C and penalty were tuned.
tuned_params = randomized.best_params_
model = LogisticRegression()
model.set_params(**tuned_params)
model.get_params()

In [None]:
# Show the hyperparameter values selected by the search.
tuned_params

#### 처음 모델 정의할 때와 동일하게 정의

In [None]:
# Rebuild the estimator with the same constructor arguments used before the
# search, then apply the tuned values on top.
model = LogisticRegression(random_state=11, solver='saga')
model.set_params(**tuned_params)
model.get_params()

#### refit

In [None]:
# Refit the tuned estimator on the full training split and print its parameters.
model.fit(X_train, y_train)
print(model.coef_)
print(model.intercept_)

In [None]:
# Display the training features.
X_train

#### 같은 모델? --> 같은 모델!

In [None]:
# Compare the manually refitted model with the search's own best estimator.
# np.allclose gives one boolean and tolerates floating-point round-off, unlike
# an exact elementwise `==` on float arrays.
np.allclose(model.coef_, randomized.best_estimator_.coef_)

### Coefficients

In [None]:
# Pair each feature name with its fitted coefficient and draw a bar chart.
data = zip(X_train.columns, model.coef_.reshape(-1,))
(pd
 .DataFrame(data, columns =['variable', 'coefficient'])
 .set_index('variable')
 # Tick rotation must be numeric; the string '30' is rejected by newer matplotlib.
 .plot.bar(rot=30, figsize=(25,4)))

In [None]:
# plot_confusion_matrix was removed from scikit-learn; the supported replacement
# is ConfusionMatrixDisplay.from_estimator, which takes the same arguments.
from sklearn.metrics import ConfusionMatrixDisplay, classification_report

# Per-class precision / recall / F1 on the held-out test set.
y_predict = model.predict(X_test)
print(classification_report(y_test, y_predict))
ConfusionMatrixDisplay.from_estimator(model, X_test, y_test)

In [None]:
# Histogram of the predicted probability in column 1 for the test rows.
y_proba = model.predict_proba(X_test)
plt.hist(y_proba[:, 1], color ='blue', label='positive probability')
plt.legend()

In [None]:
# Number of training rows and features.
X_train.shape

In [None]:
import scikitplot as skplt
# Learning curves for the tuned model on the training split, once per scoring metric.
skplt.estimators.plot_learning_curve(model, X_train, y_train, scoring='accuracy')
skplt.estimators.plot_learning_curve(model, X_train, y_train, scoring='recall_macro')