# 第2回　ロジスティック回帰
3月17日　石村

## データセット
乳がんデータ（二値分類）

In [4]:
from sklearn.datasets import load_breast_cancer
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Load the breast-cancer dataset that ships with scikit-learn.
cancer = load_breast_cancer()

# Feature matrix as a DataFrame with named columns; `y` holds the class labels.
# (pandas is already imported at the top of this cell, so it is not re-imported here.)
X = pd.DataFrame(cancer.data, columns=cancer.feature_names)
y = cancer.target

In [5]:
# Keep only the first two feature columns for this exercise.
X = X.loc[:, X.columns[:2]]

## sklearnのロジスティック回帰

In [103]:
from sklearn.linear_model import LogisticRegression

# Fit scikit-learn's logistic regression with the penalty set to 'none'
# (no regularisation term) and a fixed random_state.
# NOTE(review): recent scikit-learn releases appear to expect `penalty=None`
# instead of the string 'none' — confirm against the installed version.
lr = LogisticRegression(penalty='none', random_state=42)
lr.fit(X, y)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='none',
                   random_state=42, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [104]:
# Display the fitted feature coefficients.
lr.coef_

array([[-1.05710374, -0.21814035]])

In [105]:
# Display the fitted intercept term.
lr.intercept_

array([19.84943528])

In [175]:
# Predicted class probabilities on the training data.
# NOTE(review): `y_proba` and the `log_loss` import below are not used in this
# cell — presumably a log-loss evaluation was intended; confirm.
y_proba = lr.predict_proba(X)
from sklearn.metrics import log_loss

# Score of the fitted model on the same data it was trained on.
lr.score(X, y)

0.8910369068541301

## 自力で実装

In [30]:
import numpy as np

def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``, element-wise."""
    return 1.0 / (1.0 + np.exp(-x))


def calc_loss(y_true, y_pred):
    """Return the mean binary cross-entropy (log loss).

    Parameters
    ----------
    y_true : array-like of 0/1 labels.
    y_pred : array-like of predicted probabilities for class 1.

    Returns
    -------
    numpy.float64
        ``-mean(y * log(p) + (1 - y) * log(1 - p))``.
    """
    eps = 1e-15
    y_true = np.asarray(y_true, dtype=float)
    # Clip instead of shifting by eps: keeps both logarithms finite at p == 0
    # and p == 1 without ever producing a negative argument.
    p = np.clip(np.asarray(y_pred, dtype=float), eps, 1.0 - eps)
    return -np.mean(y_true * np.log(p) + (1.0 - y_true) * np.log(1.0 - p))


def update_params(w, X, proba, y, lr):
    """Take one gradient-descent step on the mean log loss and return the new weights.

    Parameters
    ----------
    w : ndarray of current weights, one per column of ``X``.
    X : DataFrame or 2-D ndarray of features (rows are samples).
    proba : current predicted probabilities, one per row of ``X``.
    y : 0/1 labels, one per row of ``X``.
    lr : learning rate (step size).
    """
    # np.asarray (rather than ``.values``) lets plain NumPy inputs work as well.
    gradient = np.asarray(X.T @ (proba - y))
    return w - lr / len(X) * gradient


class MyLogisticRegression():
    """Binary logistic regression trained by batch gradient descent.

    Parameters
    ----------
    fit_intercept : bool, default True
        Append a constant column so that the last weight acts as the intercept.
    learning_rate : float, default 0.01
        Gradient-descent step size.
    max_iter : int, default 10000
        Upper bound on the number of gradient steps.
    random_state : int, default 32
        Seed used to draw the initial weights.
    """

    def __init__(self, fit_intercept=True, learning_rate=0.01, max_iter=10000,
                 random_state=32):
        self.w = None        # weights; the intercept is last when fit_intercept is True
        self.X = None        # training design matrix (including the constant column, if any)
        self.y_true = None   # training labels
        self.fit_intercept = fit_intercept
        self.learning_rate = learning_rate
        self.max_iter = max_iter
        self.random_state = random_state

    def initialize_params(self, size):
        """Return ``size`` initial weights drawn uniformly from [0, 1).

        A private ``RandomState`` is used so the global NumPy random state is
        left untouched.
        """
        return np.random.RandomState(self.random_state).rand(size)

    def _add_intercept(self, X):
        """Return a copy of ``X`` with a trailing column of ones."""
        if hasattr(X, "columns"):
            X = X.copy()
            X['const'] = 1
            return X
        X = np.asarray(X, dtype=float)
        return np.column_stack([X, np.ones(len(X))])

    def fit(self, X, y):
        """Fit the weights on ``X`` (DataFrame or 2-D ndarray) and 0/1 labels ``y``.

        Training stops early, and the previous weights are restored, as soon as
        the loss increases from one iteration to the next.
        """
        # Always set self.X, also when no intercept is requested.
        self.X = self._add_intercept(X) if self.fit_intercept else X.copy()
        self.y_true = y
        self.w = self.initialize_params(self.X.shape[1])

        prev_loss = float('inf')
        prev_w = self.w
        for i in range(self.max_iter):
            y_proba = self.predict_proba(self.X)
            loss = calc_loss(self.y_true, y_proba)
            if loss > prev_loss:
                # The last step made things worse: report the iteration and roll back.
                print(i)
                self.w = prev_w
                break
            prev_loss = loss
            prev_w = self.w
            self.w = update_params(self.w, self.X, y_proba, self.y_true,
                                   self.learning_rate)

    def predict_proba(self, X):
        """Return the predicted probability of class 1 for each row of ``X``.

        ``X`` may be passed with or without the constant column: when the model
        was built with ``fit_intercept=True`` and ``X`` has exactly one column
        fewer than there are weights, the constant column is appended here.
        """
        if self.w is None:
            raise RuntimeError("fit() must be called before predict_proba()")
        if (self.fit_intercept and np.ndim(X) == 2
                and np.shape(X)[1] == len(self.w) - 1):
            X = self._add_intercept(X)
        return sigmoid(X @ self.w)

In [31]:
# Instantiate the hand-written model. Note that this reuses the name `lr`,
# which the scikit-learn section above also assigns.
lr = MyLogisticRegression()

In [32]:
# Train the hand-written model on the same X and y used above.
lr.fit(X, y)

In [33]:
# Append the constant column by hand, then compute training accuracy
# with a 0.5 decision threshold.
t = X.assign(c=1)
((lr.predict_proba(t) > 0.5).astype(int) == y).sum() / len(y)

0.8717047451669596

In [34]:
lr.w

array([-0.34252985,  0.02223078,  4.83420403])