In [1]:
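# Logistic regression can be used for binary or multi-class classification. In the binary case,
# p(y=1|x) = sigmoid(w·x) = 1/(1+exp(-w·x)) and p(y=0|x) = 1 - p(y=1|x).
# The weight coefficients w can be estimated by maximum likelihood.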

In [2]:
# 引入IMDB电影评论数据做二分类

import numpy as np
from tensorflow.keras.datasets import imdb

# Load the IMDB movie-review splits from Keras as (data, labels) pairs.
(train_data, train_labels), (test_data, test_labels) = imdb.load_data(num_words=1000)     # keep only the 1000 most frequent words of the training data

def vectorize_sequences(sequences, dimension=1000):
    """Encode index sequences as fixed-width multi-hot rows.

    Every sequence becomes one row of length ``dimension``; the columns whose
    indices appear in the sequence are set to 1.0 and all others stay 0.0.
    Repeated indices and their order within a sequence have no effect.

    Args:
        sequences: Sized iterable of integer index sequences.
        dimension: Number of columns in each encoded row.

    Returns:
        NumPy array of shape (len(sequences), dimension) holding 0.0 / 1.0.
    """
    n_rows = len(sequences)
    multi_hot = np.zeros(shape=(n_rows, dimension))
    for row_idx, word_ids in enumerate(sequences):
        # Fancy indexing switches on every listed column of this row at once.
        multi_hot[row_idx, word_ids] = 1.0
    return multi_hot

# Multi-hot encode both splits using the default width of 1000 columns.
x_train = vectorize_sequences(train_data)
x_test = vectorize_sequences(test_data)

In [3]:
# Sanity-check the shapes of the encoded features and their labels (one shape per line).
print(x_train.shape, train_labels.shape, x_test.shape, test_labels.shape, sep="\n")

(25000, 1000)
(25000,)
(25000, 1000)
(25000,)


In [4]:
# The raw index sequences are not used again once x_train / x_test exist; drop the
# names so the underlying objects can be reclaimed.
del train_data, test_data

In [5]:
# LR模型构建

In [6]:
# 定义逻辑斯蒂分布的分布函数,即sigmoid函数
from math import exp

def sigmoid(x):
    """Return the logistic function 1 / (1 + e^(-x)) without overflowing.

    The textbook form evaluates exp(-x), and math.exp raises OverflowError
    once its argument exceeds roughly 709, i.e. for strongly negative x.
    For negative x the algebraically equal form e^x / (1 + e^x) is used
    instead, so exp is only ever called with a non-positive argument.

    Args:
        x: A real number (anything math.exp accepts).

    Returns:
        The sigmoid of x as a float between 0.0 and 1.0 inclusive; the
        extremes are reached when exp underflows to 0.0.
    """
    if x >= 0:
        return 1 / (1 + exp(-x))
    # x < 0 here, so exp(x) lies in [0, 1): it may underflow to 0.0 but cannot overflow.
    pos = exp(x)
    return pos / (1 + pos)

In [7]:
# 给各样本加上特征1，以使截距b包含在w中
from tqdm import tqdm

def data_matrix(X):
    """Prepend a constant 1.0 feature to every sample.

    With the extra leading 1.0, the intercept b can be learned as the first
    component of the weight vector w instead of as a separate term.

    Args:
        X: Iterable of samples, each itself an iterable of feature values.

    Returns:
        A list of lists; each inner list is 1.0 followed by the sample's
        values. A tqdm progress bar is displayed while the rows are built.
    """
    return [[1.0, *sample] for sample in tqdm(X)]

In [8]:
# Augment the training matrix with the leading 1.0 column (shows a tqdm progress bar).
data_mat = data_matrix(x_train)

100%|█████████████████████████████████████████████████████████████████████████| 25000/25000 [00:01<00:00, 22750.13it/s]


In [9]:
# Initialise the weights w as a float32 zero column vector with one row per entry of an
# augmented sample (length taken from the first row of data_mat, so the intercept is included).
weights = np.zeros((len(data_mat[0]), 1), dtype=np.float32)

In [10]:
# Learning rate: step size applied to each per-sample weight update.
learning_rate = 0.01

In [11]:
# Run max_epochs passes over the training data, applying one gradient-ascent step on the
# log-likelihood per sample.
max_epochs = 10
# Convert the list-of-lists matrix to a float64 array once (one extra dense copy), instead of
# letting np.dot / np.transpose rebuild an array from a Python list for every sample.
train_mat = np.asarray(data_mat, dtype=np.float64)
for epoch in tqdm(range(max_epochs)):
    for i in range(len(train_mat)):
        sample = train_mat[i]
        # np.dot yields a shape-(1,) array here; .item() extracts the scalar explicitly, because
        # implicitly converting an ndim > 0 array to a float (as math.exp would) is deprecated
        # since NumPy 1.25.
        result = sigmoid(np.dot(sample, weights).item())
        error = train_labels[i] - result
        # Per-sample gradient of the log-likelihood: (y - p) * x, shaped as a column like weights.
        weights += learning_rate * error * sample.reshape(-1, 1)

100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [00:46<00:00,  4.68s/it]


In [12]:
# 模型预测
def lr_predict(x_test):
    """Predict a 0/1 label for each row using the module-level ``weights``.

    A row is labelled 1 when its linear score np.dot(row, weights) is
    non-negative (equivalently, when the sigmoid of the score is at least 0.5)
    and 0 otherwise.

    Args:
        x_test: Sequence of rows that already carry the leading 1.0 feature,
            so each row has as many entries as ``weights`` has rows.

    Returns:
        A list of ints (0 or 1), one per input row, in input order.
    """
    return [1 if np.dot(row, weights) >= 0 else 0 for row in x_test]

In [13]:
# Give the test matrix the same leading 1.0 column so its rows line up with weights.
x_test1 = data_matrix(x_test)

100%|█████████████████████████████████████████████████████████████████████████| 25000/25000 [00:01<00:00, 17018.34it/s]


In [14]:
# Predict a 0/1 label for every augmented test row.
test_pred = lr_predict(x_test1)

In [15]:
# 模型评估
def accuracy(y, y_pred):
    """Return the fraction of positions where the prediction equals the label.

    Args:
        y: Indexable sequence of true labels; its length sets how many
            positions are compared.
        y_pred: Indexable sequence of predicted labels, at least as long as y.

    Returns:
        Number of matching positions divided by len(y), as a float.

    Raises:
        ZeroDivisionError: If y is empty.
        IndexError: If y_pred is shorter than y.
    """
    total = len(y)
    hits = sum(1 for idx in range(total) if y[idx] == y_pred[idx])
    return hits / total

In [16]:
# Fraction of test samples whose predicted label equals the true label.
score = accuracy(test_labels, test_pred)

In [17]:
# Display the test accuracy.
print(score)

0.84976
