In [110]:
import os
import time
import numpy as np
from utils import *
from softmax_regression import Model

In [25]:
if not os.path.exists('./dataset'):
    current_path = os.getcwd()           # 获得当前路径
    os.mkdir(current_path + '/dataset')  # 添加dataset路径
    path = './sentiment-analysis-on-movie-reviews/train.tsv'  # 原始数据所在路径
    generate_dataset(path)  # 生成数据集：训练集、开发集和测试集（train.txt、dev.txt和test.txt）

In [26]:
train_path = './dataset/train.txt'
vocab, data = build_vocab(train_path)

正在进行数据清洗！
数据清洗完成！
开始生成词典！
已生成好词典！


In [27]:
feature = feature_extraction(vocab, data)

In [103]:
model = Model()

In [104]:
def one_hot_encode(y, class_nums):
    '''
    将标签改为one-hot编码
    '''
    y_new = np.zeros((class_nums, len(y)))
    for idx, val in enumerate(y):
        y_new[int(val)][idx] = 1
    return y_new    
    
def cross_entropy_loss(y_, y):
    '''
    交叉熵损失函数
    y_：模型预测值
    y：真实标签值
    '''
    N = len(y)  # batch_size=128
    loss = 0.
    for i in range(N):
        for j in range(5):
            loss += -y[j][i] * np.log(y_[j][i])
    loss = loss / N
    return loss

def compute_accuracy(y_, y):
    N = len(y)
    correct_nums = 0
    for i in range(N):
        if y_[i] == y[i]:
            correct_nums += 1
    return correct_nums / N

def update_parameter(model, x, y, y_):
    lr = 1e-4
    N = x.shape[0]
    sigma = np.zeros((model.Weight.shape[0], model.Weight.shape[1]))
    for i in range(N):
        sigma += np.matmul((y.T[i]-y_.T[i]).reshape(-1, 1), x[i].reshape(1, -1))
    model.Weight = model.Weight + lr * sigma / N
    

In [112]:
for epoch in range(20):
    print('epoch: [%d/%d]' % (epoch+1, 20))
    batch_nums = 1
    for x, y in data_iter(feature):
        start_time = time.time()
        x = np.asarray(x)
        y = np.asarray(y)
        one = np.ones((x.shape[0], 1))
        x = np.concatenate((x, one), axis=1)          # 将x变为增广矩阵
        y_hat = model.forward(x)                      # 使用模型得到每个样本的标签（类别）的概率分布
        y_pred_label = np.argmax(y_hat, axis=0)       # 使用argmax得到模型预测的结果
        accuracy = compute_accuracy(y_pred_label, y)  # 计算mini-batch的准确率
        y = one_hot_encode(y, 5)                    # 将样本标签转化为one-hot向量，用于使用交叉熵损失函数计算mini-batch的loss
        loss = cross_entropy_loss(y_hat, y)         # 计算mini-batch loss
        if batch_nums % 10 == 0:
            end_time = time.time()
            print('train loss: %.4f\ttrain accuracy: %.4f\tcost time: %.4fs' % (loss, accuracy, end_time-start_time))
        update_parameter(model, x, y, y_hat)        # 更新参数
        batch_nums += 1


epoch: [1/20]
train loss: 2.0257	train accuracy: 0.1953	cost time: 0.2020s
train loss: 1.7799	train accuracy: 0.1641	cost time: 0.2034s
train loss: 3.0887	train accuracy: 0.1641	cost time: 0.1958s
train loss: 3.0836	train accuracy: 0.1719	cost time: 0.1977s
train loss: 2.9875	train accuracy: 0.1797	cost time: 0.1976s
train loss: 1.7190	train accuracy: 0.1562	cost time: 0.2123s
train loss: 2.0196	train accuracy: 0.2109	cost time: 0.2172s
train loss: 2.1785	train accuracy: 0.1406	cost time: 0.2156s
train loss: 2.1006	train accuracy: 0.1250	cost time: 0.2116s
train loss: 1.8294	train accuracy: 0.1953	cost time: 0.2147s
train loss: 3.4617	train accuracy: 0.1719	cost time: 0.2109s
train loss: 2.5428	train accuracy: 0.1797	cost time: 0.2174s
train loss: 4.6143	train accuracy: 0.2109	cost time: 0.2090s
train loss: 3.9505	train accuracy: 0.2422	cost time: 0.2055s
train loss: 4.5966	train accuracy: 0.2031	cost time: 0.2040s
train loss: 4.1309	train accuracy: 0.1953	cost time: 0.2045s
train loss

KeyboardInterrupt: 