## Логистическая регрессия

In [2]:
import pandas as pd
import numpy as np
from sklearn.metrics import roc_auc_score
from sklearn.linear_model import LogisticRegression

In [9]:
df = pd.read_csv("data-logistic.csv", names=["target", "feature1", "feature2"])
df.head()
y = df["target"].as_matrix()
X = df[["feature1", "feature2"]].as_matrix()
df.head()

Unnamed: 0,target,feature1,feature2
0,-1,-0.663827,-0.138526
1,1,1.994596,2.468025
2,-1,-1.247395,0.749425
3,1,2.309374,1.899836
4,1,0.849143,2.40775


### Задача логистической регрессии

Здесь пишем функцию, которая по заданной обучающей выборке и векторе ответов генерирует функцию от вектора весов, которую мы и будем минимизировать в градиентном спуске.

In [11]:
def check_args(X, y, C, w=y, k=0):
    C = np.float64(C)
    k = np.float64(k)
    assert isinstance(y, np.ndarray)
    assert len(y.shape) == 1
    assert isinstance(X, np.ndarray)
    assert len(X.shape) == 2
    assert X.shape[0] == y.shape[0]
    assert isinstance(w, np.ndarray)
    assert len(w.shape) == 1
    assert w.shape[0] == X.shape[1]

def approx(X, y, w, i):
    return 1 + np.exp(-y[i] * np.inner(w, X[i, :]))

def predict(X, w):
    return 1.0 / (1.0 + np.exp(-np.inner(w, X)))

# def get_logregr_functional(X, y, C):
#     """
#             Returns function generated by answer vector and feature matrix.
            
#             y     (l, 1)    answer vector
#             X     (l, k)    feature matrix
#             C     float     regularization factor
            
#             Return          function of (l, 1) vector (which are weights)
#     """
#     # checking arguments
#     l = x.shape[1]
#     check_args(X, y, C)
#     def logregr_functional(w): 
#         assert isinstance(w, np.ndarray)
#         assert len(w.shape) == 1
#         assert w.shape[0] == l
#         sm = 0
#         for i in range(l):
#             sm += np.log(approx(X, y, w, i))
#         return sm/l + 1.0/2.0 * C * np.inner(w, w)
#     return logregr_functional

### Пересчет весов

In [5]:
def get_weights(X, y, C, w, k):
    check_args(X, y, C, w, k)
    sm = 0
    l = X.shape[1]
    for i in range(l):
        sm += y[i] * X[i, :]*(1 - 1.0/approx(X, y, w, i))
    return (1 - k*C)*w + k/l * sm 

### Реализуем градиентный спуск здесь

In [15]:
MAX_ITERS = 10000
DIFF_MIN = 10e-5
def full_gradient(X, y, w0, k, C):
    """
            Full Gradient method implementation
            
    func    function to optimize, its argument must be compatible with x0
    x0      first guess
    """
    iters = 0
    w_prev = None
    w_next = w0
    while iters < MAX_ITERS and (w_prev is None or np.linalg.norm(w_next - w_prev) > DIFF_MIN):
        w_prev = w_next
        w_next = get_weights(X, y, C, w_prev, k)
    return w_next

w0 = np.array([0, 0])
no_reg_w = full_gradient(X, y, w0, 0.1, 0)
yes_reg_w = full_gradient(X, y, w0, 0.1, 10)
print(no_reg_w, yes_reg_w)

[ 8.28995574  2.31176058] [ 0.05962532  0.05711134]


In [18]:
no_reg_ans = predict(X, no_reg_w)
yes_reg_ans = predict(X, yes_reg_w)
score_no = roc_auc_score(y, no_reg_ans)
score_yes = roc_auc_score(y, yes_reg_ans)
print(score_no, score_yes)

0.921619047619 0.935714285714


(205, 2)