In [110]:
import os
import time
import numpy as np
from utils import *
from softmax_regression import Model

In [25]:
# One-time setup: when ./dataset does not exist yet, create it and generate the
# data splits from the raw TSV file.
# NOTE(review): the guard only checks for the directory, so if generate_dataset
# fails after the mkdir, later runs skip this block and the splits stay missing.
if not os.path.exists('./dataset'):
    current_path = os.getcwd()           # current working directory
    os.mkdir(current_path + '/dataset')  # create the dataset directory under it
    path = './sentiment-analysis-on-movie-reviews/train.tsv'  # location of the raw data
    generate_dataset(path)  # generate the dataset: training, dev and test sets (train.txt, dev.txt and test.txt)

In [26]:
# Build the vocabulary from the training split. Per the recorded output,
# build_vocab runs data cleaning first and then generates the dictionary.
# `data` is presumably the cleaned training data - verify against utils.
train_path = './dataset/train.txt'
vocab, data = build_vocab(train_path)

正在进行数据清洗！
数据清洗完成！
开始生成词典！
已生成好词典！


In [27]:
# Extract training features from the training data using the vocabulary built
# from it. The result is what the training loop feeds to data_iter.
feature = feature_extraction(vocab, data)

In [115]:
# Clean the dev split and extract its features with the same vocabulary that
# was built from the training split (`vocab`), so both share one feature space.
dev_path = './dataset/dev.txt'
dev_data = data_clean(dev_path)
dev_feature = feature_extraction(vocab, dev_data)

正在进行数据清洗！
数据清洗完成！


In [137]:
# Sanity check: shape of the first field of the last dev entry. The recorded
# output is (10000,); presumably this is the feature vector length - confirm
# against feature_extraction.
np.array(dev_feature[-1][0]).shape

(10000,)

In [149]:
# Instantiate the Model class imported from softmax_regression with its default
# constructor arguments. Later cells read and rebind model.Weight and call
# model.forward.
model = Model()

In [150]:
def one_hot_encode(y, class_nums):
    '''
    Convert a sequence of class labels into a one-hot matrix.

    y: sequence of labels; each label is passed through int() to obtain the
       row that is set for its column
    class_nums: number of classes, i.e. number of rows of the result
    Returns a float array of shape (class_nums, len(y)) in which column idx
    holds a 1 in row int(y[idx]) and 0 everywhere else.
    '''
    rows = np.array([int(label) for label in y], dtype=int)
    cols = np.arange(len(y))
    encoded = np.zeros((class_nums, len(y)))
    # One indexed assignment sets encoded[rows[k], cols[k]] for every k.
    encoded[rows, cols] = 1
    return encoded
    
def cross_entropy_loss(y_, y):
    '''
    Cross-entropy loss between predicted distributions and one-hot labels.

    y_: model predictions, shape (num_classes, batch_size)
    y:  ground-truth labels in one-hot form, same shape as y_
    Returns a tuple (mean loss per sample, summed loss over the batch).
    Raises ZeroDivisionError when the batch is empty.

    The number of classes is taken from the inputs instead of being fixed to 5.
    '''
    y = np.asarray(y, dtype=float)
    y_ = np.asarray(y_, dtype=float)
    N = y.shape[1]  # batch_size
    # Only entries with a non-zero label weight contribute to the sum.
    # Skipping the others avoids 0 * log(0) = nan when the model assigns
    # probability 0 to a class that is not the true one. A zero probability
    # on the true class still gives an infinite loss.
    mask = y != 0
    loss = float(-np.sum(y[mask] * np.log(y_[mask])))
    return (loss / N, loss)

def compute_accuracy(y_, y):
    '''
    Compare predicted labels with reference labels position by position.

    y_: predicted labels
    y:  reference labels; its length defines how many positions are compared
    Returns a tuple (fraction of matching positions, number of matching positions).
    '''
    total = len(y)
    hits = sum(1 for pos in range(total) if y_[pos] == y[pos])
    return (hits / total, hits)

def update_parameter(model, x, y, y_, lr=1e-2):
    '''
    Apply one mini-batch gradient step to model.Weight.

    model: object exposing a Weight array of shape (num_classes, num_features)
    x:  input batch, shape (batch_size, num_features)
    y:  one-hot labels, shape (num_classes, batch_size)
    y_: model predictions, same shape as y
    lr: learning rate; defaults to the previously hard-coded 1e-2
    Rebinds model.Weight to the updated array and returns None.
    An empty batch leaves the weights unchanged.
    '''
    x = np.asarray(x)
    N = x.shape[0]
    if N == 0:
        # Nothing to learn from; dividing by N would fill the weights with NaN.
        return
    # (y - y_) @ x is the sum over samples of the outer products
    # (y[:, i] - y_[:, i]) * x[i]^T, computed as a single matrix product.
    sigma = np.matmul(np.asarray(y) - np.asarray(y_), x)
    model.Weight = model.Weight + lr * sigma / N
    
def evaluate(model, dev):
    '''
    Measure accuracy and mean cross-entropy loss of model on dev.

    model: object whose forward(x) returns per-class scores with classes along
           axis 0 and samples along axis 1
    dev:   dataset accepted by data_iter, which yields (x, y) mini-batches
    Returns a tuple (accuracy, mean loss per sample).
    Raises ZeroDivisionError when data_iter yields no samples.

    Both values are normalised by the number of samples actually iterated, so
    the result stays correct even if data_iter does not yield exactly len(dev)
    samples. The class count is read from the model output.
    '''
    sample_count = 0
    correct_total = 0.
    loss_total = 0.
    for x, y in data_iter(dev):
        x = np.asarray(x)
        y = np.asarray(y)
        one = np.ones((x.shape[0], 1))
        # Append a constant-1 column (augmented matrix); presumably this lets
        # the last weight column act as the bias - confirm against Model.
        x = np.concatenate((x, one), axis=1)
        y_hat = model.forward(x)                      # per-sample class probability distribution
        y_pred_label = np.argmax(y_hat, axis=0)       # predicted class = most probable row
        _, correct_nums = compute_accuracy(y_pred_label, y)
        correct_total += correct_nums
        # One-hot labels are needed by the cross-entropy loss.
        y = one_hot_encode(y, np.asarray(y_hat).shape[0])
        _, loss = cross_entropy_loss(y_hat, y)        # summed loss of this mini-batch
        loss_total += loss
        sample_count += x.shape[0]
    return (correct_total / sample_count, loss_total / sample_count)

In [151]:
# Training configuration (previously inline magic numbers).
NUM_EPOCHS = 20                 # maximum passes over the training features
NUM_CLASSES = 5                 # number of label classes for one-hot encoding
EVAL_EVERY = 100                # evaluate on the dev set every this many mini-batches
PATIENCE = 300                  # stop once dev loss has not improved for more than this many mini-batches
WEIGHTS_PATH = 'model/weights'  # np.save appends the .npy extension

# np.save does not create missing directories, so make sure the target exists
# before the first checkpoint is written.
os.makedirs(os.path.dirname(WEIGHTS_PATH), exist_ok=True)

total_batch = 1
best_dev_loss = float('inf')
flag = False       # set when early stopping triggers, to leave the epoch loop as well
last_improve = 0   # value of total_batch at the last dev-loss improvement
for epoch in range(NUM_EPOCHS):
    print('epoch: [%d/%d]' % (epoch+1, NUM_EPOCHS))
    for x, y in data_iter(feature):
        x = np.asarray(x)
        y = np.asarray(y)
        one = np.ones((x.shape[0], 1))
        x = np.concatenate((x, one), axis=1)             # turn x into an augmented matrix (extra constant-1 column)
        y_hat = model.forward(x)                         # per-sample class probability distribution
        y_pred_label = np.argmax(y_hat, axis=0)          # predicted class via argmax
        accuracy, _ = compute_accuracy(y_pred_label, y)  # mini-batch accuracy
        y = one_hot_encode(y, NUM_CLASSES)               # one-hot labels for the cross-entropy loss
        loss, _ = cross_entropy_loss(y_hat, y)           # mini-batch mean loss
        update_parameter(model, x, y, y_hat)             # gradient step
        if total_batch % EVAL_EVERY == 0:
            dev_accu, dev_loss = evaluate(model, dev_feature)
            if dev_loss < best_dev_loss:
                best_dev_loss = dev_loss
                improve =  '*'
                last_improve = total_batch
                # Checkpoint the best weights so far. The former allow_pickle /
                # fix_imports arguments matched the defaults and are omitted;
                # fix_imports is deprecated in recent NumPy releases.
                np.save(WEIGHTS_PATH, model.Weight)
            else:
                improve = ''
            print('train loss: {:.4f}\t train accuracy: {:.4f}\t dev accuracy: {:.4f}\t dev loss: {:.4f}\t{}'.format(loss, accuracy, dev_accu, dev_loss, improve))
        if total_batch - last_improve > PATIENCE:
            # Early stopping: the dev loss has stalled for too long.
            flag = True
            break
        total_batch += 1
    if flag:
        break


epoch: [1/20]
train loss: 2.6544	 train accuracy: 0.2969	 dev accuracy: 0.3070	 dev loss: 2.5619	*
train loss: 2.2801	 train accuracy: 0.3594	 dev accuracy: 0.3341	 dev loss: 2.4739	*
train loss: 2.7340	 train accuracy: 0.2656	 dev accuracy: 0.3517	 dev loss: 2.4110	*
train loss: 2.6176	 train accuracy: 0.3359	 dev accuracy: 0.3645	 dev loss: 2.3658	*
train loss: 2.3771	 train accuracy: 0.3594	 dev accuracy: 0.3745	 dev loss: 2.3315	*
epoch: [2/20]
train loss: 2.6977	 train accuracy: 0.3672	 dev accuracy: 0.3829	 dev loss: 2.3049	*
train loss: 2.2836	 train accuracy: 0.3516	 dev accuracy: 0.3902	 dev loss: 2.2833	*
train loss: 2.1025	 train accuracy: 0.4297	 dev accuracy: 0.3946	 dev loss: 2.2660	*
train loss: 1.9921	 train accuracy: 0.4141	 dev accuracy: 0.3977	 dev loss: 2.2514	*
train loss: 2.0614	 train accuracy: 0.3828	 dev accuracy: 0.4007	 dev loss: 2.2393	*
train loss: 2.1046	 train accuracy: 0.3750	 dev accuracy: 0.4030	 dev loss: 2.2292	*
epoch: [3/20]
train loss: 2.3956	 tra

KeyboardInterrupt: 