## 유방암 데이터에 대한 로지스틱 회귀

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides scikit-learn deprecation and convergence
# warnings raised by the cells below - confirm that is intended.
warnings.filterwarnings(action="ignore")

In [None]:
x, y = load_breast_cancer(return_X_y=True) 
# Setting the loader's return_X_y parameter to True returns the features X and
# the target y directly instead of a Bunch object.
# Split into train and test sets; no random_state is passed, so the split is
# not fixed between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [None]:
# Logistic regression on the unscaled features, with the solver capped at 300
# iterations.
model = LogisticRegression(max_iter=300)
# Fit on the training split only; the test split is held out for scoring.
model.fit(x_train, y_train)

In [None]:
# Report the estimator's score on the training split and on the held-out
# test split, one per line.
print(
    f'Train Data Score: {model.score(x_train, y_train)}',
    f'Test Data Score: {model.score(x_test, y_test)}',
    sep='\n',
)

Train Data Score: 0.960093896713615
Test Data Score: 0.9440559440559441


# 확률적 경사 하강법 (Stochastic Gradient Descent)

- 모델을 학습 시키기 위한 간단한 방법
- 학습 파라미터에 대한 손실 함수의 기울기를 구해, 손실이 줄어드는 방향(기울기의 반대 방향)으로 파라미터를 갱신하며 학습

\begin{equation}
\frac{\partial L}{\partial w} = \lim_{h \rightarrow 0} \frac{L(w+h) - L(w)}{h}
\end{equation}

\begin{equation}
w^{'} = w - \alpha \frac{\partial L}{\partial w}
\end{equation}

* scikit-learn에서는 선형 SGD 회귀와 SGD 분류를 지원

## SGD를 사용한 선형 회귀 분석
- https://www.baeldung.com/cs/gradient-stochastic-and-mini-batch

In [None]:
from sklearn.linear_model import SGDRegressor
# from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [None]:
# load_boston is not imported in this notebook (its import line is commented
# out) and it was removed from scikit-learn in 1.2, so calling it raises
# NameError. Use the bundled diabetes regression dataset instead; it needs no
# download and also supports return_X_y=True.
from sklearn.datasets import load_diabetes

x, y = load_diabetes(return_X_y=True)
# Split into train and test sets; no random_state is passed, so the split is
# not fixed between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [None]:
# Standardize the features, then fit a linear model with SGD.
# Squared error is the usual loss for linear regression. The old spelling
# 'squared_loss' was renamed to 'squared_error' in scikit-learn 1.0 and removed
# in 1.2, where it raises an error.
model = make_pipeline(
    StandardScaler(),
    SGDRegressor(loss='squared_error'),
)
model.fit(x_train, y_train)

NameError: name 'make_pipeline' is not defined

In [None]:
# Report the pipeline's score on the training split and on the held-out
# test split, one per line.
print(
    f'Train Data Score: {model.score(x_train, y_train)}',
    f'Test Data Score: {model.score(x_test, y_test)}',
    sep='\n',
)

Train Data Score: 0.7451335116593927
Test Data Score: 0.7061252559668184


## 붓꽃 데이터에 대한 SGD 분류

In [None]:
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import load_iris, load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [None]:
# Load the iris data as a (features, target) pair rather than a Bunch object.
x, y = load_iris(return_X_y=True)
# Split into train and test sets; no random_state is passed, so the split is
# not fixed between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [None]:
# Standardize the features, then fit a linear classifier with SGD.
# The logistic (cross-entropy) loss makes this a logistic regression trained
# by SGD. The old spelling 'log' was renamed to 'log_loss' in scikit-learn 1.1
# and removed in 1.3, where it raises an error.
model = make_pipeline(StandardScaler(), SGDClassifier(loss='log_loss'))
model.fit(x_train, y_train)

In [None]:
# Report the pipeline's score on the training split and on the held-out
# test split, one per line.
print(
    f'Train Data Score: {model.score(x_train, y_train)}',
    f'Test Data Score: {model.score(x_test, y_test)}',
    sep='\n',
)

Train Data Score: 0.9642857142857143
Test Data Score: 0.9473684210526315


## 유방암 데이터에 대한 SGD 분류

In [None]:
# Load the breast cancer data as a (features, target) pair rather than a Bunch
# object.
x, y = load_breast_cancer(return_X_y=True)
# Split into train and test sets; no random_state is passed, so the split is
# not fixed between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [None]:
# NOTE(review): this repeats the split already done in the previous cell and
# overwrites its result with a fresh random split - looks redundant; confirm
# before removing.
x_train, x_test, y_train, y_test = train_test_split(x, y)

In [None]:
# Standardize the features, then fit a linear classifier with SGD.
# The logistic (cross-entropy) loss is spelled 'log_loss'. The old spelling
# 'log' was deprecated in scikit-learn 1.1 and removed in 1.3, where it raises
# an error.
model = make_pipeline(StandardScaler(), SGDClassifier(loss='log_loss'))
model.fit(x_train, y_train)

In [None]:
# Report the pipeline's score on the training split and on the held-out
# test split, one per line.
print(
    f'Train Data Score: {model.score(x_train, y_train)}',
    f'Test Data Score: {model.score(x_test, y_test)}',
    sep='\n',
)

Train Data Score: 0.9835680751173709
Test Data Score: 0.986013986013986
