<a href="https://colab.research.google.com/github/jiukeem/deeplearning_from_scratch/blob/master/cross_validation_k_fold.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

In [2]:
# Load the breast-cancer dataset and pull out the feature matrix and labels.
cancer = load_breast_cancer()
x, y = cancer.data, cancer.target

# Hold out 20% as the final test set; stratify on y so both splits keep the
# same class ratio, and fix the seed so the split is reproducible.
x_train_all, x_test, y_train_all, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [3]:
class SingleLayer:
  """Single-neuron logistic-regression classifier trained with per-sample SGD.

  Attributes:
    w: 1-D weight vector (set by `initialization` / `fit`), or None before fitting.
    b: scalar bias, or None before fitting.
    lr: learning rate applied to every gradient step.
    l2_lambda: strength of the L2 penalty on the weights.
    cost_history: mean training loss recorded once per epoch.
    val_cost_history: mean validation loss recorded once per epoch
      (only when validation data is passed to `fit`).
  """

  def __init__(self, learning_rate=0.1, l2=0):
    self.w = None
    self.b = None
    self.cost_history = []
    self.lr = learning_rate
    self.val_cost_history = []  # kept so a learning curve can be plotted
    self.l2_lambda = l2

  def initialization(self, x):
    """Set the weights to ones (one per column of `x`) and the bias to 0.

    Returns:
      The tuple (w, b) that was just stored on the instance.
    """
    # np.ones(n) builds a 1-D array of shape (n,), i.e. neither a (1, n) row
    # nor an (n, 1) column vector.
    self.w = np.ones(x.shape[1])
    self.b = 0
    return self.w, self.b

  def sigmoid(self, z):
    """Return the logistic function 1 / (1 + exp(-z))."""
    # Clamp z first: np.exp(-z) overflows to inf (with a RuntimeWarning) once
    # -z exceeds roughly 709. Within [-500, 500] the result is unchanged.
    z = np.clip(z, -500, 500)
    return 1 / (1 + np.exp(-z))

  def forward_prop(self, x):
    """Return the activation for a single sample `x` (a 1-D feature vector)."""
    z = np.sum(x * self.w) + self.b
    return self.sigmoid(z)

  def calcul_loss(self, a, y):
    """Return the cross-entropy loss for one sample plus the L2 penalty.

    Args:
      a: predicted probability for the sample.
      y: true label (0 or 1).
    """
    # Keep `a` strictly inside (0, 1) so neither log() receives 0.
    a = np.clip(a, 1e-10, 1 - 1e-10)
    cross_entropy = -(y * np.log(a) + (1 - y) * np.log(1 - a))
    l2_penalty = self.l2_lambda * np.sum(self.w ** 2) / 2
    return cross_entropy + l2_penalty

  def backward_prop(self, x, y, a):
    """Return the gradients (dw, db) of the regularized loss for one sample."""
    dw = (a - y) * x + self.l2_lambda * self.w
    db = (a - y) * 1
    return dw, db

  def fit(self, x, y, epoches=100, x_val=None, y_val=None):
    """Train with stochastic gradient descent, one sample per update.

    The weights are re-initialized on every call; the loss histories are
    appended to, not reset.

    Args:
      x: 2-D array of training samples (rows) by features (columns).
      y: training labels, indexable in step with the rows of `x`.
      epoches: number of passes over the training data.
      x_val: optional validation samples; when given, the mean validation
        loss is recorded after every epoch.
      y_val: labels for `x_val`.
    """
    self.w, self.b = self.initialization(x)
    for _ in range(epoches):
      loss = 0
      # Visit the samples in a fresh random order each epoch.
      shuffled = np.random.permutation(np.arange(len(x)))
      for idx in shuffled:
        a = self.forward_prop(x[idx])
        dw, db = self.backward_prop(x[idx], y[idx], a)
        self.w -= dw * self.lr
        self.b -= db * self.lr
        loss += self.calcul_loss(a, y[idx])
      self.cost_history.append(loss / x.shape[0])
      self.update_val_loss(x_val, y_val)

  def update_val_loss(self, x_val, y_val):
    """Append the mean loss over the validation set; no-op when `x_val` is None."""
    if x_val is None:
      return
    val_loss = 0
    for row, target in zip(x_val, y_val):
      val_loss += self.calcul_loss(self.forward_prop(row), target)
    self.val_cost_history.append(val_loss / x_val.shape[0])

  def predict(self, x):
    """Return a list of booleans, one per row of the 2-D input `x`.

    Iterating over a 2-D array yields its rows, so each row is pushed through
    `forward_prop` and thresholded at 0.5.
    """
    return [self.forward_prop(row) > 0.5 for row in x]

  def score(self, x, y):
    """Return the fraction of rows in `x` whose prediction equals `y`."""
    # Convert both sides to arrays so the comparison is element-wise even when
    # `y` is a plain list (list == list would collapse to a single bool).
    y_hat = np.asarray(self.predict(x))
    return np.mean(y_hat == np.asarray(y))

In [11]:
# Manual k-fold cross-validation over the training set.
k = 10
# Fold size by integer division: the trailing `shape[0] % k` samples never land
# in a validation fold (they are always part of the training portion).
bins = x_train_all.shape[0] // k
validation_scores = []

for i in range(k):
  start = bins * i
  end = bins * (i + 1)
  val_fold = x_train_all[start:end]
  val_target = y_train_all[start:end]

  # Everything outside [start, end) is training data. A single index list is
  # used because (y_train_all[:start], y_train_all[end:]) would be a tuple of
  # two arrays rather than one array.
  train_index = list(range(0, start)) + list(range(end, x_train_all.shape[0]))
  train_fold = x_train_all[train_index]
  train_target = y_train_all[train_index]

  # Normalize only after splitting, using statistics of the training fold
  # alone, so that no information from the validation fold leaks into training.
  train_mean = np.mean(train_fold, axis=0)
  train_std = np.std(train_fold, axis=0)
  train_fold_scaled = (train_fold - train_mean) / train_std
  val_fold_scaled = (val_fold - train_mean) / train_std

  layer = SingleLayer(l2=0.01)
  layer.fit(train_fold_scaled, train_target, epoches=50)
  score = layer.score(val_fold_scaled, val_target)
  validation_scores.append(score)

np.mean(validation_scores)

0.968888888888889

In [12]:
# The same experiment using scikit-learn
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_validate
# Logistic-regression loss. scikit-learn 1.1 renamed 'log' to 'log_loss' and
# 1.3 removed the old spelling; on releases older than 1.1 use loss='log'.
sgd = SGDClassifier(loss='log_loss', penalty='l2', alpha=0.001, random_state=42)
# 10-fold cross-validation on the raw (unscaled) training data.
scores = cross_validate(sgd, x_train_all, y_train_all, cv=10)
np.mean(scores['test_score'])

0.850096618357488

In [13]:
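# The score is low because the features were not normalized. Passing pre-scaled data to cross_validate in place of x_train_all would fix that, but then the scaling statistics would be computed before the folds are split, leaking validation-fold information into training.
# Let's solve this with a pipeline.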

In [14]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [15]:
# Bundle scaling and the classifier into one estimator so that cross_validate
# receives the raw data and the scaling step runs inside each split.
pipe = make_pipeline(StandardScaler(), sgd)
scores = cross_validate(estimator=pipe, X=x_train_all, y=y_train_all, cv=10)
# Mean validation accuracy across the 10 folds.
scores['test_score'].mean()

0.9694202898550724

In [None]:
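# Note that `pipe` is passed to cross_validate instead of `sgd` (the pipeline already contains `sgd`); in other words, the pipeline is inserted as an intermediate layer.
# StandardScaler performs the same normalization we did by hand.
# So cross_validate only splits the data into folds, and the normalization is run afterwards, inside each split.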