# Домашнее задание 3
## Линейная регрессия

In [1]:
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")

### Ridge-регрессия

Рассматриваем модель Ridge-регрессии (с $L_2$-регуляризацией):

$$Q = \| Y - X\omega \|^2 + \lambda \| \omega \|^2$$
$$dQ\big|_{\omega_{*}} = d(\| Y - X\omega \|^2) + \lambda d(\|\omega \|^2)$$
Отдельно посчитаем $d(\|\omega \|^2)\big|_{\omega_{*}} = d(<\omega, \omega>)\big|_{\omega_{*}} = 2\omega_{*}$. Тогда по известной с лекции формулы для ошибки:
$$dQ\big|_{\omega_{*}} = -2\mathbb{X}^T<\mathbb{Y} - \mathbb{X}\omega, \omega_{*}> + 2\lambda \omega_{*} = <-2\mathbb{X}^T(\mathbb{Y} - \mathbb{X}\omega_{*}) + 2\lambda E, \omega_{*}>$$
Тогда $$\nabla_{\omega}Q = 2\lambda E - 2\mathbb{X}^T(\mathbb{Y} - \mathbb{X}\omega) = 2\lambda E + 2\mathbb{X}^T(\mathbb{X}\omega - \mathbb{Y})$$

In [2]:
from sklearn.base import BaseEstimator, RegressorMixin

class RidgeLinReg(BaseEstimator, RegressorMixin):
    def __init__(self, batch_size=25, num_steps=350, lr=1e-2, lambd=1):
        self.batch_size = batch_size
        self.num_steps = num_steps
        self.lr = lr
        self.lambd = lambd

    def fit(self, X, Y):
        batch_size = self.batch_size
        w = np.random.randn(X.shape[1])[:, None]
        n_objects = len(X)

        for i in range(self.num_steps):
            sample_indices = np.random.randint(0, n_objects, size=batch_size)
            w -= 2 * self.lr * (self.lambd * w + np.dot(X[sample_indices].T, np.dot(X[sample_indices], w) - Y[sample_indices])) / self.batch_size

        self.w = w
        return self

    def predict(self, X):
        return X @ self.w

Сравним с моделью из Sklearn, воспользовавшись кодом с лекции:

In [3]:
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score

n_features = 700
n_objects = 100000

w_true = np.random.uniform(-2, 2, (n_features, 1))

X = np.random.uniform(-100, 100, (n_objects, n_features)) * np.arange(n_features)
Y = X.dot(w_true) + np.random.normal(0, 10, (n_objects, 1))

In [4]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(X, Y)

In [5]:
scaler = StandardScaler()
scaler.fit(x_train)
x_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [6]:
own_model = RidgeLinReg().fit(x_scaled, y_train)
y_pred = own_model.predict(x_test_scaled)
own_r2 = r2_score(y_test, y_pred)

sklearn_model = Ridge().fit(x_scaled, y_train)
y_pred = sklearn_model.predict(x_test_scaled)

sklearn_r2 = r2_score(y_test, y_pred)

print('R^2 in own model:', own_r2)
print('R^2 in sklearn loss:', sklearn_r2)

R^2 in own model: 0.9978136453703507
R^2 in sklearn loss: 0.9999999996210158


Видим, что реализованная регрессия мало уступает модели "из коробки"

## Логистическая регрессия

Вычислим градиент для функции log-loss: $$\nabla Q = X^T * (-Y * \frac{e^{X^T * (-Y) * \omega}}{1 + e^{X^T * (-Y) * \omega}})$$

In [537]:
from sklearn.base import BaseEstimator, ClassifierMixin

class LogReg(BaseEstimator, ClassifierMixin):
    def __init__(self, num_steps=350, lr=1e-2):
        self.num_steps = num_steps
        self.lr = lr

    def fit(self, X, Y):
        batch_size = self.batch_size
        w = np.random.randn(X.shape[1])
        n_objects = len(X)
        
        for i in range(self.num_steps):
            sample_indices = np.random.randint(0, n_objects, size=batch_size)
            w -= self.lr * (np.dot(X.T, -Y * (np.exp(np.dot(np.dot(X.T, -Y).T, w))) / (1 + np.exp(np.dot(np.dot(X.T, -Y).T, w)))))
        self.w = w
        return self

    def predict(self, X):
        y = 1 / (1 + np.exp(-X @ self.w))
        for i in range(0, len(y)):
            if y[i] > 0.5:
                y[i] = 1
            else:
                y[i] = 0
        return y

In [538]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
X, Y = load_breast_cancer(return_X_y=True)

In [539]:
x_train, x_test, y_train, y_test = train_test_split(X, Y)

In [540]:
scaler = StandardScaler()
scaler.fit(x_train)
x_scaled = scaler.transform(x_train)
x_test_scaled = scaler.transform(x_test)

In [588]:
from sklearn.metrics import accuracy_score

own_model = LogReg().fit(x_scaled, y_train)
y_pred = own_model.predict(x_test_scaled)
own_score = accuracy_score(y_test, y_pred)

sklearn_model = LogisticRegression().fit(x_scaled, y_train)
y_pred = sklearn_model.predict(x_test_scaled)

sklearn_score = accuracy_score(y_test, y_pred)

print('Score in own model:', own_score)
print('Score in sklearn loss:', sklearn_score)

Score in own model: 0.9440559440559441
Score in sklearn loss: 0.9790209790209791


Снова получили неплохой результат