# 逻辑回归

In [None]:
import numpy as np
import pandas as pd

# Feature scaling
def feature_scaling(X):
    """Standardize each column of ``X`` independently.

    Every column has its own mean subtracted and is then divided by its own
    standard deviation, both obtained from the column's ``mean()`` and
    ``std()`` methods.

    Args:
        X: Object exposing ``apply`` whose columns expose ``mean()`` and
            ``std()`` (presumably a pandas DataFrame -- confirm with callers).

    Returns:
        The result of ``X.apply`` with every column standardized.
    """
    def standardize(column):
        centered = column - column.mean()
        return centered / column.std()

    return X.apply(standardize)

加载数据

In [None]:
# Load the data
def load_and_preprocess_data():
    """Read ``dataFile/ex2data1.txt`` and build the matrices used for training.

    The file is read as a header-less CSV with three columns named
    ``text1``, ``text2`` and ``lable``. The first two columns become the
    features and the third column becomes the target.

    Returns:
        tuple: ``(X, y, df, theta)`` where

        * ``X`` is an ``np.matrix`` holding a leading column of ones followed
          by the two feature columns,
        * ``y`` is the target as a column ``np.matrix``,
        * ``df`` is the DataFrame exactly as read from disk,
        * ``theta`` is a row ``np.matrix`` of zeros with one entry per column
          of ``X``.
    """
    df = pd.read_csv('dataFile/ex2data1.txt', names=['text1', 'text2', 'lable'])

    features = df.iloc[:, :2]
    labels = df.iloc[:, 2]
    # Feature scaling is currently switched off; to enable it use:
    # features = feature_scaling(features)

    # Prepend the constant column that multiplies the intercept term.
    features.insert(0, 'ones', 1)

    design_matrix = np.matrix(features)
    target_column = np.matrix(labels).T
    initial_theta = np.matrix(np.zeros(design_matrix.shape[1]))
    return design_matrix, target_column, df, initial_theta

sigmoid 函数
$$
\sigma(x) = \frac{1}{1 + e^{-x}}\tag{1}
$$

In [None]:
def sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` of ``z``.

    Args:
        z: A scalar or anything ``np.exp`` accepts; arrays are handled
            element-wise.

    Returns:
        The logistic function evaluated at ``z``.
    """
    denominator = 1 + np.exp(-z)
    return 1 / denominator

代价函数
$$
J(\theta) = \frac{1}{m} \sum_{i=1}^m \left[ -y^{(i)} \log(h_\theta(x^{(i)})) - (1 - y^{(i)}) \log(1 - h_\theta(x^{(i)})) \right]
\tag{2}
$$

In [None]:
# Compute the cost function
def compute_cost(X, y, theta):
    """Return the logistic-regression cost J(theta) averaged over the samples.

    Args:
        X: Design matrix.
        y: Targets; ``len(y)`` is used as the number of samples.
        theta: Parameters; ``theta.T`` is multiplied with ``X``.

    NOTE(review): the ``.T`` / ``.dot`` calls look written for 2-D
    ``np.matrix`` inputs (X: m x n, y: m x 1, theta: 1 x n) -- confirm before
    calling with other shapes.

    Returns:
        The cost: the summed cross-entropy terms divided by ``len(y)``.
    """
    n_samples = len(y)
    hypothesis = sigmoid(np.dot(X, theta.T))
    # Cross-entropy contribution of the positive (y = 1) samples ...
    positive_term = -np.log(hypothesis).T.dot(y)
    # ... and of the negative (y = 0) samples.
    negative_term = np.log(1 - hypothesis).T.dot(1 - y)
    return (1 / n_samples) * np.sum(positive_term - negative_term)

逻辑回归模型的假设函数
$$
\begin{align}
h_\theta(x) &= \sigma(\theta^T x)\tag{3} \\
&= \frac{1}{1 + e^{-\theta^T x}}\tag{4}
\end{align}
$$

逻辑回归的梯度下降函数
$$
\begin{align}
\theta_j &= \theta_j - \frac{\alpha}{m} \sum_{i=1}^m \left( h_\theta(x^{(i)}) - y^{(i)} \right) x_j^{(i)}\tag{5}
\end{align}
$$

In [None]:
# Gradient descent
def gradient_descent(X, y, theta, alpha=1.0):
    """Perform ONE gradient-descent update of ``theta`` and return the result.

    Implements ``theta := theta - alpha / m * (h - y)^T X`` where
    ``h = sigmoid(X theta^T)``. The function does not loop; call it
    repeatedly to iterate.

    Args:
        X: Design matrix.
        y: Targets; ``len(y)`` is used as the number of samples ``m``.
        theta: Current parameters; ``theta.T`` is multiplied with ``X``.
        alpha: Learning rate. Defaults to ``1.0``, which reproduces the
            previous behaviour of this function (it had no learning-rate
            parameter and therefore always used a step size of 1).

    NOTE(review): the ``.T`` usage looks written for 2-D ``np.matrix``
    inputs (X: m x n, y: m x 1, theta: 1 x n) -- confirm before calling
    with other shapes.

    Returns:
        The updated parameters, shaped like ``theta``.
    """
    m = len(y)
    error = sigmoid(np.dot(X, theta.T)) - y
    gradient = (1 / m) * np.dot(error.T, X)
    return theta - alpha * gradient

注意：上面的函数每次调用只执行一次梯度下降更新，并返回更新后的参数。

下面使用 sklearn 里的逻辑回归模型

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import pandas as pd


# Fit scikit-learn's LogisticRegression on the two exam-score columns and
# draw the fitted decision boundary on top of the data.
df = pd.read_csv('dataFile/ex2data1.txt', names=['text1', 'text2', 'lable'])
X = np.array(df.iloc[:, 0:2])
y = df.iloc[:, 2]
y = np.array(y).reshape(y.shape[0],)

lr_model = LogisticRegression()
lr_model.fit(X, y)
print(lr_model.coef_)

# The boundary is the line coef_1 * x0 + coef_2 * x1 + intercept = 0,
# solved here for x1 over the observed range of the first feature.
coef_first, coef_second = lr_model.coef_[0]
x0 = np.linspace(df['text1'].min(), df['text1'].max(), 100)
x1 = (-1 / coef_second) * (coef_first * x0 + lr_model.intercept_)

# Plot the samples (coloured by label) and the boundary
fig, ax = plt.subplots(figsize=(12, 8))
ax.scatter(df['text1'], df['text2'], c=df['lable'], s=40, cmap=plt.cm.Spectral)
ax.plot(x0, x1, color='red', linewidth=3)
ax.set_xlabel('Exam 1 score')
ax.set_ylabel('Exam 2 score')
plt.show()

# 正则化逻辑回归
## 为什么要正则化？
针对于回归类算法或者分类算法，如线性回归和逻辑回归，它们能够有效地解决许多问题，但是当将它们应用到某些特定的机器学习应用时，会遇到过拟合(over-fitting)的问题，可能会导致它们效果很差。
## 正则化的目的
高次项特征的系数可能会变得非常大，这会导致模型过拟合。正则化采用惩罚的思想：让高次项的系数尽可能小，从而减少它们对模型的影响。

第一种方法：仍然使用像逻辑回归这样的线性技术，但把原始特征的多项式组合作为新的特征，degree=6
$$
\begin{align}
\phi(x) &= (x_1, x_2, x_1^2, x_1x_2, x_2^2, x_1^3, x_1^2x_2, \dots, x_1x_2^5, x_2^6)\tag{6}
\end{align}
$$

In [None]:
def load_and_preprocess_data(filepath='dataFile/ex2data2.txt', degree=6):
    """Load the two-test data set and expand it into polynomial features.

    The file is read as a header-less CSV with the columns ``Test 1``,
    ``Test 2`` and ``Accepted``. For every total power ``i`` in
    ``1..degree`` and every ``j`` in ``0..i`` a column named
    ``'F' + str(i - j) + str(j)`` is added holding
    ``Test 1 ** (i - j) * Test 2 ** j``. The two raw test columns are then
    dropped from the mapped frame.

    Args:
        filepath: CSV file to read. Defaults to the path that was previously
            hard-coded, so existing zero-argument calls behave as before.
        degree: Highest total polynomial degree to generate. Defaults to the
            previously hard-coded value 6. The two exponents are
            concatenated into the column name without a separator, so names
            are only guaranteed to be distinct while both stay single-digit.

    Returns:
        tuple: ``(data, df, X, y)`` where ``data`` is an untouched copy of
        the raw frame, ``df`` holds ``Accepted`` plus the mapped features,
        ``X`` is ``df`` without ``Accepted`` and ``y`` is the ``Accepted``
        column.
    """
    df = pd.read_csv(filepath, names=['Test 1', 'Test 2', 'Accepted'])
    # Keep the raw columns for callers; they are removed from df below.
    data = df.copy()

    # Feature mapping: all monomials of total degree 1..degree.
    for total_power in range(1, degree + 1):
        for power_2 in range(total_power + 1):
            power_1 = total_power - power_2
            column_name = 'F' + str(power_1) + str(power_2)
            df[column_name] = (df['Test 1'] ** power_1) * (df['Test 2'] ** power_2)

    df.drop(['Test 1', 'Test 2'], axis=1, inplace=True)
    X = df.drop('Accepted', axis=1)
    y = df['Accepted']
    return data, df, X, y

## regularized cost(正则化代价函数)
$$
J(\theta) = \frac{1}{m} \sum_{i=1}^m \left[ -y^{(i)} \log(h_\theta(x^{(i)})) - (1 - y^{(i)}) \log(1 - h_\theta(x^{(i)})) \right] + \frac{\lambda}{2m} \sum_{j=1}^n \theta_j^2\tag{7}
$$

In [None]:
def compute_cost_reg(w, b, X, y, learning_rate):
    """Return the L2-regularized logistic-regression cost.

    Computes the mean cross-entropy of ``sigmoid(X w + b)`` against ``y``
    plus ``learning_rate / (2 m) * sum(w ** 2)``.

    Two things differ from the previous implementation:

    * Every entry of ``w`` is penalized. The bias is the separate argument
      ``b``, so ``w`` holds feature weights only; the old ``w[1:]`` slice
      silently exempted the first feature weight and disagreed with the
      penalty applied in ``compute_gradient_reg``.
    * ``log(sigmoid(z))`` and ``log(1 - sigmoid(z))`` are evaluated as
      ``-logaddexp(0, -z)`` and ``-logaddexp(0, z)``. This is the same
      quantity mathematically but stays finite when the sigmoid saturates,
      where the direct form produced ``-inf`` / ``nan``.

    Args:
        w: Feature weights (NumPy array).
        b: Bias term.
        X: Feature array with one row per sample; ``X.shape[0]`` is used as
            the number of samples ``m``.
        y: Labels (presumably 0/1, one per row of ``X`` -- confirm with
            callers).
        learning_rate: Despite its name this is the regularization strength
            (lambda); ``gradient_descent_reg`` passes its ``lambda_`` here.

    Returns:
        The regularized cost.
    """
    m = X.shape[0]
    z = np.dot(X, w.T) + b
    log_h = -np.logaddexp(0, -z)            # log(sigmoid(z))
    log_one_minus_h = -np.logaddexp(0, z)   # log(1 - sigmoid(z))
    cost = (-1 / m) * (np.dot(y.T, log_h) + np.dot((1 - y), log_one_minus_h))
    reg = (learning_rate / (2 * m)) * np.sum(w ** 2)
    return cost + reg

## 需要使用正则化梯度函数：
$$
\begin{align}
\theta_0 &= \theta_0 - \frac{\alpha}{m} \sum_{i=1}^m \left( h_\theta(x^{(i)}) - y^{(i)} \right) x_0^{(i)}\tag{8}\\
\theta_j &= \theta_j - \left\{ \frac{\alpha}{m} \left( \sum_{i=1}^m \left( h_\theta(x^{(i)}) - y^{(i)} \right) x_j^{(i)} \right) + \frac{\alpha \lambda}{m} \theta_j\right\}\tag{9}\\
&=\theta_j (1-\alpha\frac{\lambda}{m}) -\frac{\alpha}{m} \sum_{i=1}^m \left( h_\theta(x^{(i)}) - y^{(i)} \right) x_j^{(i)}
\end{align}
$$

In [None]:
def compute_gradient(w, b, X, y):
    """Return the gradient of the unregularized logistic cost.

    Args:
        w: Feature weights.
        b: Bias term.
        X: Feature array; ``X.shape[0]`` is used as the number of samples.
        y: Labels, subtracted from the predictions ``sigmoid(X w + b)``.

    Returns:
        tuple: ``(dj_dw, dj_db)`` -- the averaged gradient with respect to
        ``w`` (``X.T`` times the residual) and with respect to ``b`` (the sum
        of the residual), both divided by the number of samples.
    """
    n_samples = X.shape[0]
    residual = sigmoid(np.dot(X, w.T) + b) - y
    grad_w = (1 / n_samples) * np.dot(X.T, residual)
    grad_b = (1 / n_samples) * np.sum(residual)
    return grad_w, grad_b


def compute_gradient_reg(w, b, X, y, learning_rate):
    """Return the gradient of the L2-regularized logistic cost.

    Takes the gradient from ``compute_gradient`` and adds
    ``learning_rate / m * w`` to its weight part; the bias gradient is
    returned unchanged.

    Args:
        w: Feature weights.
        b: Bias term.
        X: Feature array; ``X.shape[0]`` is used as the number of samples.
        y: Labels.
        learning_rate: Despite its name this is the regularization strength
            (lambda); ``gradient_descent_reg`` passes its ``lambda_`` here.

    Returns:
        tuple: ``(dj_dw, dj_db)``.
    """
    grad_w, grad_b = compute_gradient(w, b, X, y)
    penalty = (learning_rate / X.shape[0]) * w
    return grad_w + penalty, grad_b

梯度下降算法

In [None]:
def gradient_descent_reg(X, y, w_in, b_in, cost_function, gradient_function, alpha, iters, lambda_):
    """Run ``iters`` gradient-descent steps and record the cost after each one.

    Args:
        X: Features, forwarded untouched to the two callables.
        y: Labels, forwarded untouched to the two callables.
        w_in: Initial weights.
        b_in: Initial bias.
        cost_function: Called as ``cost_function(w, b, X, y, lambda_)``.
        gradient_function: Called as ``gradient_function(w, b, X, y, lambda_)``
            and expected to return ``(dj_dw, dj_db)``.
        alpha: Step size applied to both gradients.
        iters: Number of update steps.
        lambda_: Extra argument handed to both callables.

    Returns:
        tuple: ``(w, b, J_history, w_history)`` -- the final weights and bias,
        the cost after every step, and the weights captured on every
        100000th step (starting with step 0), which is also when progress is
        printed.
    """
    weights, bias = w_in, b_in
    J_history, w_history = [], []

    for step in range(iters):
        grad_w, grad_b = gradient_function(weights, bias, X, y, lambda_)
        weights = weights - alpha * grad_w
        bias = bias - alpha * grad_b

        current_cost = cost_function(weights, bias, X, y, lambda_)
        J_history.append(current_cost)

        if step % 100000 != 0:
            continue
        # Snapshot and progress report on every 100000th step only.
        w_history.append(weights)
        print(f'Iteration {step:4d}: Cost {current_cost:8.2f}')

    return weights, bias, J_history, w_history

执行算法，实现拟合

In [None]:
# Load the data
data, df, X, y = load_and_preprocess_data()

# Training settings
iterations = 300000
alpha = 0.1
lambda_ = 0.01

# Starting point: weights drawn uniformly from [-0.5, 0.5), fixed bias
initial_w = np.random.rand(X.shape[1]) - 0.5
initial_b = 0.5

# Optional sanity checks of the cost and gradient at the starting point:
# print(compute_cost_reg(initial_w, initial_b, X, y, 1))
# print(compute_gradient_reg(initial_w, initial_b, X, y, 1))

w, b, J_history, w_history = gradient_descent_reg(
    X.values, y.values, initial_w, initial_b,
    compute_cost_reg, compute_gradient_reg,
    alpha, iterations, lambda_)

# Predict: threshold the fitted probabilities at 0.5 to obtain 1.0 / 0.0 labels
predictions = np.where(sigmoid(np.dot(X, w.T) + b) >= 0.5, 1.0, 0.0)

# Print the accuracy on the training data (the original note reports about 0.83)
print(np.sum((y - predictions) == 0) / len(predictions))