# import package

In [29]:
import torch
import numpy as np
import torch.optim as optim
import torch.nn.functional as F
from tqdm import tqdm
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import Optimizer
from torch.optim.lr_scheduler import LambdaLR
import pytreebank
import pandas as pd
import os
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader, Dataset
import csv
from torch.utils.data import DataLoader, Dataset
import math
import random

## hyperparameters

In [31]:
class ConfigTrans(object):
    def __init__(self):
        self.dropout = 0.11067975661405677                   
        self.num_classes = 5                # 类别数
        self.warm_epochs = 20
        self.num_epochs = 100              # epoch数
        self.batch_size = 128         # mini-batch大小              
        self.learning_rate =0.0011604442299768141           # 学习率
        self.weight_decay = 1e-4
        self.patience = 40
        self.num_layers = 1
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.PATH = './sst5_net.pth'
        self.best_acc = 0
        self.stale=0
        self.hidden_dim=128
        self.token_label_dim = 256
config = ConfigTrans()


## 数据处理（函数解释）
- get_english_tokenizer():这个函数返回一个英文文本分词器，使用PyTorch的get_tokenizer函数以"basic_english"配置创建。它将在后续的文本数据预处理中使用。
- data_transform():这个函数用于对数据集进行预处理。它接受一个数据集（如训练集、验证集或测试集），并对每个样本执行以下操作：
将文本转为小写。
将文本拆分成标签和句子，然后提取句子的标签和单词。
使用英文分词器分词句子。
构建数据、标签以及每个标记的标签列表。
这样操作是为了方便之后给LSTM模型输入
- build_vocab():用于构建词汇表。它接受数据和一个最小频率参数，然后使用torchtext库的build_vocab_from_iterator函数构建词汇表。词汇表中包括特殊令牌（如"<unk>"），并设置默认索引为"<unk>"，以处理未知单词。
- load_glove():用于加载与训练的Glove词嵌入模型，从指定路径加载包含词嵌入的文本文件。
- embedding_transform():将词汇表中的每个单词映射为词向量,未知单词会生成随机嵌入。
- sentence_to_idx():将文本数据和标签转化为模型可用的索引。它将文本转化为单词索引序列，用<pad>令牌进行填充，并创建适合训练的数据和标签张量。
- add_pad_embedding()：用于添加一个随机初始化的<pad>令牌的词嵌入到词嵌入矩阵中。
- MyDataset():用于包装数据。
- get_dataloader_vocab_embedding()：这是主要的数据加载和预处理流程。它首先加载数据集，然后通过data_transform函数对数据进行预处理。接着，它构建词汇表（如果词汇表文件不存在）并加载预训练词嵌入模型。最后，它将数据转化为索引表示，构建数据加载器，并返回词汇表、词嵌入、训练数据加载器、验证数据加载器和测试数据加载器。

In [32]:
def get_english_tokenizer():
    tokenizer = get_tokenizer("basic_english", language="en")
    return tokenizer


def data_transform(dataset):

    tokenizer = get_english_tokenizer()
    labels = []
    data = []
    label_for_each_token = []
    for sample in dataset:
        sample.lowercase()
        sample = sample.to_labeled_lines()
        token_label = [i[0] for i in sample]
        token = [i[1] for i in sample]

        sentence = tokenizer(sample[0][1])
        labels.append(sample[0][0])

        data_temp = []
        label_temp = []
        for i in range(len(token)):
            if token[i] in sentence:
                data_temp.append(token[i])
                label_temp.append(token_label[i])
        data.append(data_temp)
        label_for_each_token.append(label_temp)

    return data, labels, label_for_each_token


def build_vocab(data, min_freq=1):
    vocab = build_vocab_from_iterator(
        data,
        specials=["<unk>"],
        min_freq=min_freq
    )
    vocab.set_default_index(vocab["<unk>"])

    return vocab


def load_glove(path="glove.6B.300d.txt"):

    words = pd.read_table(path, sep=" ", index_col=0, header=None, quoting=csv.QUOTE_NONE)

    embedding = words.values
    words = words.index.to_numpy()

    print("The embedding size is {}, and the number of words is {}".format(embedding.shape[1], embedding.shape[0]))
    return words, embedding


def embedding_transform(vocab, words, embedding):
    word_to_idx = vocab.get_stoi()
    num_words = vocab.__len__()
    embedding_size = embedding.shape[1]
    transformed_embedding = np.zeros((num_words, embedding_size))

    # 创建一个字典来存储未知单词的嵌入
    unknown_word_embeddings = {}

    for token, idx in word_to_idx.items():
        if token in words:
            token_id = np.argwhere(words == token).item()
            transformed_embedding[idx,] = embedding[token_id,]
        else:
            if token not in unknown_word_embeddings:
                # 为未知单词生成随机嵌入
                unknown_word_embeddings[token] = np.random.rand(embedding_size)
            transformed_embedding[idx,] = unknown_word_embeddings[token]

    return transformed_embedding



def sentence_to_idx(data, vocab, token_label, MAX_SEQUENCE_LENGTH=25):

    pad_id = vocab.__len__() - 1
    data_size = len(data)
    num_class = 5
    transformed_data = np.zeros((data_size, MAX_SEQUENCE_LENGTH))
    transformed_label = np.zeros((data_size, MAX_SEQUENCE_LENGTH, num_class))

    for i in range(data_size):
        sentence = data[i]
        labels = token_label[i]
        len_sen = len(sentence)
        sentence = sentence[:MAX_SEQUENCE_LENGTH]
        labels = labels[:MAX_SEQUENCE_LENGTH]
        word_ids = [vocab[i] for i in sentence]

        if len_sen < MAX_SEQUENCE_LENGTH:
            padding_length = MAX_SEQUENCE_LENGTH - len_sen
            word_ids.extend([pad_id] * padding_length)

        transformed_data[i,] = np.array(word_ids, dtype=np.int64)
        for j in range(len(labels)):
            transformed_label[i, j, labels[j] - 1] = 1

    transformed_data = torch.tensor(transformed_data, dtype=torch.long)
    transformed_label = torch.tensor(transformed_label, dtype=torch.float)

    return transformed_data, transformed_label


def add_pad_embedding(embedding, seed=21):
   
    embedding_dim = embedding.shape[1]
    np.random.seed(seed)
    pad_embedding = np.random.rand(1, embedding_dim)

    return np.concatenate((embedding, pad_embedding), axis=0)


class MyDataset(Dataset):

    def __init__(self, x, y, label_each_token):
        self.data = x
        self.label = y
        self.label_each_token = label_each_token

    def __getitem__(self, idx):
        return self.data[idx], self.label_each_token[idx], self.label[idx]

    def __len__(self):
        return len(self.data)


def search_file(file_name, search_path, pathsep = os.pathsep):

    for path in search_path.split(pathsep):
        candidate = os.path.join(path, file_name)
        if os.path.isfile(candidate):
            return os.path.abspath(candidate)
        return None


def get_dataloader_vocab_embedding(batch_size=config.batch_size, dataset_path='.data/sst/trees', embedding_path='.vector_cache/glove.6B.300d.txt',
                                   MAX_SEQUENCE_LENGTH=25):
    # Load dataset
    dataset = pytreebank.load_sst(dataset_path)
    train = dataset['train']
    val = dataset['dev']
    test = dataset['test']

    # data transformation
    train_data, train_labels, train_token_labels = data_transform(train)
    val_data, val_labels, val_token_labels = data_transform(val)
    test_data, test_labels, test_token_labels = data_transform(test)

    # label transformation
    train_labels = torch.LongTensor(train_labels)
    val_labels = torch.LongTensor(val_labels)
    test_labels = torch.LongTensor(test_labels)

    if search_file('vocab.pt', dataset_path):
        vocab = torch.load(os.path.join(dataset_path, "vocab.pt"))
        print("Load vocab directly.")
    else:
        # build vocabulary
        vocab = build_vocab(train_data)
        # vocab add <pad>
        vocab.insert_token("<PAD>", vocab.__len__())
        print("Build vocab.")
    print("The vocabulary contains {} words.".format(vocab.__len__()))

    if search_file("transformed_embedding.pt", dataset_path):
        transformed_embedding = torch.load(os.path.join(dataset_path, "transformed_embedding.pt"))
        print("Load transformed embeddings directly.")
    else:
        # load pre-trained embedding
        words, embedding = load_glove(embedding_path)

        # get the embedding of our vocabulary
        transformed_embedding = embedding_transform(vocab, words, embedding)
        # add <pad> embedding
        transformed_embedding = add_pad_embedding(transformed_embedding)
        transformed_embedding = torch.tensor(transformed_embedding, dtype=torch.float)

    # transform the words into ids
    train_data, train_token_labels = sentence_to_idx(train_data, vocab, train_token_labels, MAX_SEQUENCE_LENGTH)
    val_data, val_token_labels = sentence_to_idx(val_data, vocab, val_token_labels, MAX_SEQUENCE_LENGTH)
    test_data, test_token_labels = sentence_to_idx(test_data, vocab, test_token_labels, MAX_SEQUENCE_LENGTH)

    # build datasets
    train_dataset = MyDataset(train_data, train_labels, train_token_labels)
    val_dataset = MyDataset(val_data, val_labels, val_token_labels)
    test_dataset = MyDataset(test_data, test_labels, test_token_labels)

    # build dataloaders
    train_loader = DataLoader(train_dataset, batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size, shuffle=True)
    test_loader = DataLoader(test_dataset, batch_size, shuffle=True)

    return vocab, transformed_embedding, train_loader, val_loader, test_loader 

In [12]:
vocab,transformed_embedding,trainloader,validloader,testloader = get_dataloader_vocab_embedding()
classes = ('positive', 'negative', 'neutral', 'very positive', 'very negative')

Load vocab directly.
The vocabulary contains 16517 words.
Load transformed embeddings directly.


# Construct my LSTM、RNN、Transformer network

### PositionEncoding 便于在Transformer中使用

In [13]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=512):
        super(PositionalEncoding, self).__init__()

        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        # sin and cos position encoding
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x):
        x = x + self.pe[:x.size(0), :]
        return x

### 三个模型，分别是LSTM，RNN，Transformer

为了方便，dropout、embedding、attention的影响都在LSTM模型上进行测试。
- embedding：在class LSTM_net中，if embedding 来判断是否使用预训练的词向量嵌入，若embedding==None，就使用随机初始化的方法。
- dropout：实验设计比较简单，设置dropout的不同值来测试即可。
- attention：class Attention中定义，分别在softmax前加和不加attention来测试影响。

In [24]:
class Attention(nn.Module):
    def __init__(self, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.projection = nn.Sequential(
            nn.Linear(hidden_dim, 64),
            nn.ReLU(True),
            nn.Linear(64, 1)
        )

    def forward(self, encoder_outputs):
        energy = self.projection(encoder_outputs)
        weights = F.softmax(energy.squeeze(-1), dim=1)
        outputs = (encoder_outputs * weights.unsqueeze(-1)).sum(dim=1)
        return outputs

class LSTM_net(nn.Module):
    def __init__(self, embedding, embed_size, vocab_size, hidden_dim, token_label_dim, num_layers, dropout=0):
        super(LSTM_net, self).__init__()
        if embedding is not None:
            self.embedding = nn.Embedding(vocab_size, embed_size)
            self.embedding.weight = nn.Parameter(embedding)
            self.embedding.weight.requires_grad = False
            self.pretrain = 1
        else:
            self.embedding = nn.Embedding(vocab_size, embed_size, max_norm=1)
            self.embedding.weight.requires_grad = True
            self.pretrain = 0

        self.embed_size = embed_size
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.token_label_dim = token_label_dim
        self.dropout = dropout
        self.linear = nn.Linear(5, token_label_dim)
        if num_layers == 1:
            self.lstm = nn.LSTM(embed_size + token_label_dim,
                                hidden_dim,
                                num_layers=num_layers,
                                batch_first=True,
                                dropout=0)
        else:
            self.lstm = nn.LSTM(embed_size + token_label_dim,
                                hidden_dim,
                                num_layers=num_layers,
                                batch_first=True,
                                dropout=dropout)
        self.attention = Attention(hidden_dim)
        self.classifier = nn.Sequential(nn.Dropout(dropout),
                                        nn.Linear(hidden_dim, 5),
                                        nn.Softmax(dim=1))
        self.model_name = "LSTM"
        # nn.init.orthogonal_(self.lstm.weight_ih_l0)
        # nn.init.orthogonal_(self.lstm.weight_hh_l0)
        # nn.init.zeros_(self.lstm.bias_ih_l0)
        # nn.init.zeros_(self.lstm.bias_hh_l0)

    def forward(self, inputs_0, inputs_1):
        x1 = self.embedding(inputs_0)
        x2 = self.linear(inputs_1)
        x = torch.cat((x1, x2), dim=2)
        x, _ = self.lstm(x, None)
        # x = [batch, seq_len, hidden_size]
        x = self.attention(x)
        # x = x[:, -1, :]
        x = self.classifier(x)
        return x


class RNN_net(nn.Module):
    def __init__(self, embedding, embed_size, vocab_size, hidden_dim, token_label_dim, num_layers, dropout=0):
        super(RNN_net, self).__init__()
        if embedding is not None:
            self.embedding = nn.Embedding(vocab_size, embed_size)
            self.embedding.weight = nn.Parameter(embedding)
            self.embedding.weight.requires_grad = False
            self.pretrain = 1
        else:
            self.embedding = nn.Embedding(vocab_size, embed_size, max_norm=1)
            self.embedding.weight.requires_grad = True
            self.pretrain = 0

        self.embed_size = embed_size
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.token_label_dim = token_label_dim
        self.dropout = dropout
        self.linear = nn.Linear(5, token_label_dim)
        if num_layers == 1:
            self.rnn = nn.RNN(embed_size + token_label_dim,
                                hidden_dim,
                                num_layers=num_layers,
                                batch_first=True,
                                dropout=0)
        else:
            self.rnn = nn.RNN(embed_size + token_label_dim,
                                hidden_dim,
                                num_layers=num_layers,
                                batch_first=True,
                                dropout=dropout)
        self.attention = Attention(hidden_dim)
        self.classifier = nn.Sequential(nn.Dropout(dropout),
                                        nn.Linear(hidden_dim, 5),
                                        nn.Softmax(dim=1))
        self.model_name = "RNN"
        # nn.init.orthogonal_(self.lstm.weight_ih_l0)
        # nn.init.orthogonal_(self.lstm.weight_hh_l0)
        # nn.init.zeros_(self.lstm.bias_ih_l0)
        # nn.init.zeros_(self.lstm.bias_hh_l0)

    def forward(self, inputs_0, inputs_1):
        x1 = self.embedding(inputs_0)
        x2 = self.linear(inputs_1)
        x = torch.cat((x1, x2), dim=2)
        x, _ = self.rnn(x, None)
        # x = [batch, seq_len, hidden_size]
        x = self.attention(x)
        # x = x[:, -1, :]
        x = self.classifier(x)
        return x


class Transformer_net(nn.Module):
    def __init__(self, embedding, embed_size, vocab_size,num_classes, num_layers=1, dropout=0, max_len=128,activation: str = "relu"):
        super(Transformer_net, self).__init__()
        if embedding is not None:
            self.num_class = 5
            self.embedding = nn.Embedding(vocab_size, embed_size)
            self.embedding.weight = nn.Parameter(embedding)
            self.position_embedding = PositionalEncoding(embed_size, dropout, max_len)
            self.embedding.weight.requires_grad = False
            self.pretrain = 1
        else:
            self.embedding = nn.Embedding(vocab_size, embed_size, max_norm=1)
            self.embedding.weight.requires_grad = True
            self.pretrain = 0

        encoder_layer = nn.TransformerEncoderLayer(embed_size ,nhead=2, dim_feedforward=512, dropout=dropout,activation=activation)
        self.transformer = nn.TransformerEncoder(encoder_layer, num_layers)
        # 输出层
        self.classifier = nn.Sequential(nn.Dropout(dropout),
                                        nn.Linear(embed_size, num_classes),
                                        nn.Softmax(dim=1))
        self.model_name="Transformer"

    def forward(self, inputs ,lengths):
        inputs = torch.transpose(inputs, 0, 1)
        hidden_states = self.embedding(inputs)
        hidden_states = self.position_embedding(hidden_states)
        hidden_states = self.transformer(hidden_states)
        hidden_states = hidden_states[0, :, :]
        x = self.classifier(hidden_states)
        return x

### 尝试了一些之前训练的小技巧
get_cosine_shedule_with_warmup():调度学习率（带有热身），每隔一段时间会修正学习率，整体上先快后慢。
LabelSmoothing():标签平滑化，防止模型过于自信，但由于标签不是onehot格式所以实际效果不佳。

In [15]:
# Learning rate schedule
def get_cosine_shedule_with_warmup(
        optimizer: Optimizer,
        num_warmup_steps: int,
        num_training_steps: int,
        num_cycles: float = 0.5,
        last_epoch: int = -1
):
    def lr_lambda(current_step):
        # warmup
        if current_step < num_warmup_steps:
            return float(current_step) / float(max(1, num_warmup_steps))
        # decadence
        progress = float(current_step - num_warmup_steps) / float(max(1, num_training_steps - num_warmup_steps))
        return max(0.0, 0.5 * (1.0 + math.cos(math.pi * float(num_cycles) * 2.0 * progress)))

    return LambdaLR(optimizer, lr_lambda, last_epoch)


class LabelSmoothing(nn.Module):
    def __init__(self, smoothing=0.0):
        super(LabelSmoothing, self).__init__()
        self.confidence = 1.0 - smoothing
        self.smoothing = smoothing

    def forward(self, x, target):
        logprobs = F.log_softmax(x, dim=-1)
        nll_loss = -logprobs.gather(dim=-1, index=target.unsqueeze(1))
        nll_loss = nll_loss.squeeze(1)
        smooth_loss = -logprobs.mean(dim=-1)
        loss = self.confidence * nll_loss + self.smoothing * smooth_loss
        return loss.mean()

# Train the networtk and save best model

优化器选择的是AdamW，lr，weight_decay都是通过网格或者随机搜索得到的较佳值。

In [82]:
embed_size = transformed_embedding.shape[1]
vocab_size = vocab.__len__()
hidden_dim = config.hidden_dim
num_layers = config.num_layers
dropout = config.dropout
num_classes = config.num_classes
token_label_dim = config.token_label_dim
net = Transformer_net(transformed_embedding,embed_size,vocab_size,num_classes,num_layers=num_layers,dropout=dropout)
#net = LSTM_net(transformed_embedding, embed_size, vocab_size, hidden_dim, token_label_dim, num_layers, dropout)
#net = RNN_net(transformed_embedding, embed_size, vocab_size, hidden_dim, token_label_dim, num_layers, dropout)
if(net.model_name=="Transformer"):
    Trans_flag = True
else:
    Trans_flag = False
print(Trans_flag)
criterion = nn.CrossEntropyLoss()
#criterion = LabelSmoothing(smoothing = 0.19649094880484694)
optimizer =optim.AdamW(net.parameters(), lr=config.learning_rate , weight_decay=config.weight_decay)
scheduler = get_cosine_shedule_with_warmup(optimizer, config.warm_epochs, config.num_epochs)
PATH = config.PATH
device = config.device
print(device)
model = net.to(device)

True
cuda:0


超参调参使用网格搜索和随机搜索寻找最佳超参数，寻找的范围定义在了hyperparameter_space中，打印超参数和准确率，同时将超参数保存到config.txt文件中，效果较好。以LSTM模型为例，最终寻找到的最佳值为：

    ```self.dropout = 0.11067975661405677                   
        self.num_classes = 5                # 类别数
        self.warm_epochs = 50
        self.num_epochs = 1000                # epoch数
        self.batch_size = 128         # mini-batch大小              
        self.learning_rate =0.0011604442299768141           # 学习率
        self.weight_decay = 1e-4
        self.patience = 150
        self.num_layers = 1
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.PATH = './sst5_net.pth'
        self.best_acc = 0
        self.stale=0
        self.hidden_dim=128
        self.token_label_dim = 256```
使用最佳搜索后LSTM提升了0.025的准确率

In [None]:
def train_and_evaluate_model(model, trainloader, validloader, num_epochs , lr_rate , weight_decay , warm_epochs , smoothing):
    
    device = "cuda:0" if torch.cuda.is_available() else "cpu"

    criterion = LabelSmoothing(smoothing=smoothing)
    optimizer = optim.AdamW(model.parameters(), lr= lr_rate , weight_decay = weight_decay)
    scheduler = get_cosine_shedule_with_warmup(optimizer,warm_epochs , num_epochs)
    model.to(device)
    stale = 0
    best_acc = 0.0

    for epoch in range(num_epochs):
        model.train()
        train_loss = 0.0

        for batch in trainloader:
            inputs_0 = batch[0].to(device)
            inputs_1 = batch[1].to(device)
            labels = batch[2].to(device)
            optimizer.zero_grad()
            outputs = model(inputs_0, inputs_1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

        scheduler.step()
        valid_loss = []
        valid_accs = []
        for batch in validloader:
            inputs_0 = batch[0].to(device)
            inputs_1 = batch[1].to(device)
            labels = batch[2].to(device)
            with torch.no_grad():
                 logits = model(inputs_0,inputs_1)
            # We can still compute the loss (but not the gradient).
            loss = criterion(logits, labels.to(device))
            # Compute the accuracy for current batch.
            acc = (logits.argmax(dim=-1) == labels.to(device)).float().mean()

            # Record the loss and accuracy.
            valid_loss.append(loss.item())
            valid_accs.append(acc)
        
        valid_loss = sum(valid_loss) / len(valid_loss)
        valid_acc = sum(valid_accs) / len(valid_accs)

        # Print the information.
        if valid_acc> best_acc:
            best_acc = valid_acc
        else:
            stale += 1
            if stale > 50:
                #print(f"No improvment {50} consecutive epochs, early stopping")
                break

    return best_acc

def evaluate_model(model, dataloader, device):
    model.eval()
    correct = 0
    total = 0

    with torch.no_grad():
        for batch in dataloader:
            inputs_0 = batch[0].to(device)
            inputs_1 = batch[1].to(device)
            labels = batch[2].to(device)

            outputs = model(inputs_0, inputs_1)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()

    accuracy = correct / total

    return accuracy

def train_with_random_search():
    # Define the hyperparameter space you want to search
    hyperparameter_space = {
        #'learning_rate': [0.003, 3e-4, 3e-5],
        'num_layers': [1, 2, 3],
        'hidden_dim': [64, 128, 256],
        #'dropout': [0, 0.1, 0.15 ,0.2],
        'token_label_dim':[64,128,256],
        'batch_size':[64 ,128, 256],
        'weight_decay':[1e-5,1e-4,5e-6],
        'warm_epochs':[25,50,100]
        # Add more hyperparameters as needed
    }

    # Number of random hyper
    # parameter combinations to try
    num_searches = 50

    best_acc = 0.0
    best_hyperparameters = None
    for search in range(num_searches):
        # Randomly sample hyperparameters from the space
        hyperparameters = {
            'learning_rate': random.uniform(1e-5,3e-3),
            'num_layers': random.choice(hyperparameter_space['num_layers']),
            'hidden_dim': random.choice(hyperparameter_space['hidden_dim']),
            'dropout':  random.uniform(0,0.2),
            'token_label_dim' : random.choice(hyperparameter_space['token_label_dim']),
            'batch_size':random.choice(hyperparameter_space['batch_size']),
            'weight_decay':random.choice(hyperparameter_space['weight_decay']),
            'warm_epochs':random.choice(hyperparameter_space['warm_epochs']),
            'smoothing':  random.uniform(0,0.2)
            # Add more hyperparameters as needed
        }
        vocab,transformed_embedding,trainloader,validloader,testloader = get_dataloader_vocab_embedding(batch_size = hyperparameters['batch_size'])
        vocab_size = vocab.__len__()
        embed_size = transformed_embedding.shape[1]
        model = LSTM_net(transformed_embedding, embed_size, vocab_size, hidden_dim = hyperparameters['hidden_dim'], token_label_dim = hyperparameters['token_label_dim'], num_layers = hyperparameters['num_layers'], dropout=hyperparameters['dropout'])
        # Train the model and evaluate it
        accuracy = train_and_evaluate_model(model, trainloader, validloader, 500 ,lr_rate=hyperparameters['learning_rate'],weight_decay=hyperparameters['weight_decay'],warm_epochs = hyperparameters['warm_epochs'],smoothing = hyperparameters['smoothing'])

        # Track the best model and hyperparameters
        if accuracy > best_acc:
            best_acc = accuracy
            best_hyperparameters = hyperparameters

    print(f"Best hyperparameters: {best_hyperparameters}")
    print(f"Best accuracy: {best_acc}")
    with open('config.txt', 'w+') as config_file:
        for key, value in best_hyperparameters.items():
            config_file.write(f"{key}: {value}\n")
        config_file.write("\n")


train_with_random_search()


正式训练模型，通过earlystop（设置patience）来缓解过拟合，每次准确率提升都会更新best_acc，并保存模型参数到本地，如果超过patience次没有提升就会结束训练。

### 实验记录：每次结果都是五次测试取平均值
- 两种embedding 方式（以LSTM模型为例）

| Embedding | Accuracy |
| ----  | ---- |
| No   | 0.371 |
| Yes  | 0.447 |

- dropout对实验的影响（以LSTM模型为例）

| Dropout | Accuracy |
| ----  | ---- |
| 0   | 0.425 |
| 0.1  | 0.439 |
| 0.2 | 0.447 |

- 分类器前加attention对实验的影响（以LSTM模型为例）

| Attention | Accuracy |
| ----  | ---- |
| No   | 0.415 |
| Yes  | 0.426 |

- 三种不同模型对实验的影响（epoch = 100）（以各自的最佳参数）

| Model_name | Accuracy | Time/s |
| ----  | ---- | ---- |
| LSTM   | 0.442 | 21.6 |
| RNN | 0.492 | 44.3 | 
| Transformer | 0.439 | 37.9 |

可以看到RNN的效果最好但是使用时间较长，LSTM的表现适中，Transformer的表现较差。

模型性能上：RNN>LSTM>Transformer

收敛速度上：LSTM>Transformer>RNN


In [83]:
def train():
    stale = config.stale
    best_acc = config.best_acc  
    for epoch in range(config.num_epochs):
        # ---------- Training ----------
        # Make sure the model is in train mode before training.
        model.train()

        # These are used to record information in training.
        train_loss = []
        train_accs = []
        for batch in tqdm(trainloader):
            inputs_0 = batch[0].to(device)
            inputs_1 = batch[1].to(device)
            labels = batch[2].to(device)
            if(Trans_flag):
                logits = model(inputs_0,len(inputs_0))
            else:
                logits =model(inputs_0,inputs_1)
            loss = criterion(logits,labels.to(device))
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            acc = (logits.argmax(dim=-1) == labels.to(device)).float().mean()
            train_loss.append(loss.item())
            train_accs.append(acc)
        train_loss = sum(train_loss) / len(train_loss)
        train_acc = sum(train_accs) / len(train_accs)

        # Print the information.
        print(f"[ Train | {epoch + 1:03d}/{config.num_epochs:03d} ] loss = {train_loss:.5f}, acc = {train_acc:.5f}")

        # ---------- Validation ----------
        # Make sure the model is in eval mode so that some modules like dropout are disabled and work normally.
        scheduler.step()
        model.eval()
        # These are used to record information in validation.
        valid_loss = []
        valid_accs = []

        # Iterate the validation set by batches.
        for batch in tqdm(validloader):
            inputs_0 = batch[0].to(device)
            inputs_1 = batch[1].to(device)
            labels = batch[2].to(device)
            with torch.no_grad():
                 if(Trans_flag):
                    logits = model (inputs_0,len(inputs_0))
                 else: 
                    logits = model(inputs_0,inputs_1)
            # We can still compute the loss (but not the gradient).
            loss = criterion(logits, labels.to(device))
            # Compute the accuracy for current batch.
            acc = (logits.argmax(dim=-1) == labels.to(device)).float().mean()

            # Record the loss and accuracy.
            valid_loss.append(loss.item())
            valid_accs.append(acc)
            #break

        # The average loss and accuracy for entire validation set is the average of the recorded values.
        valid_loss = sum(valid_loss) / len(valid_loss)
        valid_acc = sum(valid_accs) / len(valid_accs)

        # Print the information.
        print(f"[ Valid | {epoch + 1:03d}/{config.num_epochs:03d} ] loss = {valid_loss:.5f}, acc = {valid_acc:.5f}")


        # save models
        if valid_acc > best_acc:
            print(f"Best model found at epoch {epoch}, saving model")
            torch.save(net.state_dict(), PATH)
            best_acc = valid_acc
            stale = 0
        else:
            stale += 1
            if stale > config.patience:
                print(f"No improvment {config.patience} consecutive epochs, early stopping")
                break
train()

100%|██████████| 67/67 [00:00<00:00, 119.88it/s]


[ Train | 001/100 ] loss = 1.62864, acc = 0.15757


100%|██████████| 9/9 [00:00<00:00, 316.16it/s]


[ Valid | 001/100 ] loss = 1.62678, acc = 0.16981
Best model found at epoch 0, saving model


100%|██████████| 67/67 [00:00<00:00, 113.98it/s]


[ Train | 002/100 ] loss = 1.58087, acc = 0.27013


100%|██████████| 9/9 [00:00<00:00, 298.68it/s]


[ Valid | 002/100 ] loss = 1.57814, acc = 0.28379
Best model found at epoch 1, saving model


100%|██████████| 67/67 [00:00<00:00, 116.31it/s]


[ Train | 003/100 ] loss = 1.56917, acc = 0.28992


100%|██████████| 9/9 [00:00<00:00, 313.99it/s]


[ Valid | 003/100 ] loss = 1.56504, acc = 0.31297
Best model found at epoch 2, saving model


100%|██████████| 67/67 [00:00<00:00, 112.78it/s]


[ Train | 004/100 ] loss = 1.55417, acc = 0.32350


100%|██████████| 9/9 [00:00<00:00, 314.93it/s]


[ Valid | 004/100 ] loss = 1.54841, acc = 0.32831
Best model found at epoch 3, saving model


100%|██████████| 67/67 [00:00<00:00, 114.21it/s]


[ Train | 005/100 ] loss = 1.52670, acc = 0.36155


100%|██████████| 9/9 [00:00<00:00, 314.01it/s]


[ Valid | 005/100 ] loss = 1.53308, acc = 0.34825
Best model found at epoch 4, saving model


100%|██████████| 67/67 [00:00<00:00, 114.55it/s]


[ Train | 006/100 ] loss = 1.51300, acc = 0.37966


100%|██████████| 9/9 [00:00<00:00, 303.85it/s]


[ Valid | 006/100 ] loss = 1.54672, acc = 0.33844


100%|██████████| 67/67 [00:00<00:00, 118.10it/s]


[ Train | 007/100 ] loss = 1.50593, acc = 0.38736


100%|██████████| 9/9 [00:00<00:00, 312.31it/s]


[ Valid | 007/100 ] loss = 1.51702, acc = 0.37025
Best model found at epoch 6, saving model


100%|██████████| 67/67 [00:00<00:00, 116.16it/s]


[ Train | 008/100 ] loss = 1.49568, acc = 0.39646


100%|██████████| 9/9 [00:00<00:00, 314.76it/s]


[ Valid | 008/100 ] loss = 1.51741, acc = 0.36967


100%|██████████| 67/67 [00:00<00:00, 113.08it/s]


[ Train | 009/100 ] loss = 1.48181, acc = 0.40878


100%|██████████| 9/9 [00:00<00:00, 297.37it/s]


[ Valid | 009/100 ] loss = 1.52000, acc = 0.37402
Best model found at epoch 8, saving model


100%|██████████| 67/67 [00:00<00:00, 115.40it/s]


[ Train | 010/100 ] loss = 1.47083, acc = 0.42086


100%|██████████| 9/9 [00:00<00:00, 338.29it/s]


[ Valid | 010/100 ] loss = 1.51201, acc = 0.37980
Best model found at epoch 9, saving model


100%|██████████| 67/67 [00:00<00:00, 112.19it/s]


[ Train | 011/100 ] loss = 1.46828, acc = 0.42149


100%|██████████| 9/9 [00:00<00:00, 305.93it/s]


[ Valid | 011/100 ] loss = 1.51046, acc = 0.38067
Best model found at epoch 10, saving model


100%|██████████| 67/67 [00:00<00:00, 109.12it/s]


[ Train | 012/100 ] loss = 1.46686, acc = 0.42782


100%|██████████| 9/9 [00:00<00:00, 310.58it/s]


[ Valid | 012/100 ] loss = 1.50897, acc = 0.37836


100%|██████████| 67/67 [00:00<00:00, 114.24it/s]


[ Train | 013/100 ] loss = 1.45854, acc = 0.43373


100%|██████████| 9/9 [00:00<00:00, 302.09it/s]


[ Valid | 013/100 ] loss = 1.51857, acc = 0.36390


100%|██████████| 67/67 [00:00<00:00, 123.92it/s]


[ Train | 014/100 ] loss = 1.47485, acc = 0.42013


100%|██████████| 9/9 [00:00<00:00, 301.72it/s]


[ Valid | 014/100 ] loss = 1.51631, acc = 0.37983


100%|██████████| 67/67 [00:00<00:00, 111.49it/s]


[ Train | 015/100 ] loss = 1.45772, acc = 0.43322


100%|██████████| 9/9 [00:00<00:00, 316.06it/s]


[ Valid | 015/100 ] loss = 1.50874, acc = 0.38215
Best model found at epoch 14, saving model


100%|██████████| 67/67 [00:00<00:00, 116.01it/s]


[ Train | 016/100 ] loss = 1.45229, acc = 0.44030


100%|██████████| 9/9 [00:00<00:00, 313.87it/s]


[ Valid | 016/100 ] loss = 1.50200, acc = 0.37544


100%|██████████| 67/67 [00:00<00:00, 115.57it/s]


[ Train | 017/100 ] loss = 1.44100, acc = 0.45227


100%|██████████| 9/9 [00:00<00:00, 272.41it/s]


[ Valid | 017/100 ] loss = 1.51670, acc = 0.37344


100%|██████████| 67/67 [00:00<00:00, 115.85it/s]


[ Train | 018/100 ] loss = 1.45438, acc = 0.44022


100%|██████████| 9/9 [00:00<00:00, 314.66it/s]


[ Valid | 018/100 ] loss = 1.51406, acc = 0.36533


100%|██████████| 67/67 [00:00<00:00, 113.26it/s]


[ Train | 019/100 ] loss = 1.44634, acc = 0.44764


100%|██████████| 9/9 [00:00<00:00, 412.10it/s]


[ Valid | 019/100 ] loss = 1.51424, acc = 0.36914


100%|██████████| 67/67 [00:00<00:00, 118.80it/s]


[ Train | 020/100 ] loss = 1.45634, acc = 0.43777


100%|██████████| 9/9 [00:00<00:00, 308.12it/s]


[ Valid | 020/100 ] loss = 1.52031, acc = 0.36765


100%|██████████| 67/67 [00:00<00:00, 113.59it/s]


[ Train | 021/100 ] loss = 1.46021, acc = 0.43175


100%|██████████| 9/9 [00:00<00:00, 426.73it/s]


[ Valid | 021/100 ] loss = 1.51078, acc = 0.38326
Best model found at epoch 20, saving model


100%|██████████| 67/67 [00:00<00:00, 125.29it/s]


[ Train | 022/100 ] loss = 1.46582, acc = 0.42751


100%|██████████| 9/9 [00:00<00:00, 282.86it/s]


[ Valid | 022/100 ] loss = 1.51126, acc = 0.37516


100%|██████████| 67/67 [00:00<00:00, 121.26it/s]


[ Train | 023/100 ] loss = 1.44506, acc = 0.44935


100%|██████████| 9/9 [00:00<00:00, 417.10it/s]


[ Valid | 023/100 ] loss = 1.52381, acc = 0.36275


100%|██████████| 67/67 [00:00<00:00, 118.49it/s]


[ Train | 024/100 ] loss = 1.44552, acc = 0.44912


100%|██████████| 9/9 [00:00<00:00, 310.19it/s]


[ Valid | 024/100 ] loss = 1.51793, acc = 0.37865


100%|██████████| 67/67 [00:00<00:00, 117.31it/s]


[ Train | 025/100 ] loss = 1.44286, acc = 0.44842


100%|██████████| 9/9 [00:00<00:00, 375.76it/s]


[ Valid | 025/100 ] loss = 1.52278, acc = 0.36679


100%|██████████| 67/67 [00:00<00:00, 122.41it/s]


[ Train | 026/100 ] loss = 1.43519, acc = 0.45880


100%|██████████| 9/9 [00:00<00:00, 386.64it/s]


[ Valid | 026/100 ] loss = 1.49804, acc = 0.38905
Best model found at epoch 25, saving model


100%|██████████| 67/67 [00:00<00:00, 116.58it/s]


[ Train | 027/100 ] loss = 1.43119, acc = 0.46024


100%|██████████| 9/9 [00:00<00:00, 343.28it/s]


[ Valid | 027/100 ] loss = 1.50233, acc = 0.38647


100%|██████████| 67/67 [00:00<00:00, 125.22it/s]


[ Train | 028/100 ] loss = 1.43839, acc = 0.45546


100%|██████████| 9/9 [00:00<00:00, 314.22it/s]


[ Valid | 028/100 ] loss = 1.51682, acc = 0.37344


100%|██████████| 67/67 [00:00<00:00, 113.04it/s]


[ Train | 029/100 ] loss = 1.41978, acc = 0.47380


100%|██████████| 9/9 [00:00<00:00, 315.87it/s]


[ Valid | 029/100 ] loss = 1.52434, acc = 0.36824


100%|██████████| 67/67 [00:00<00:00, 115.78it/s]


[ Train | 030/100 ] loss = 1.41856, acc = 0.47442


100%|██████████| 9/9 [00:00<00:00, 305.62it/s]


[ Valid | 030/100 ] loss = 1.49497, acc = 0.40354
Best model found at epoch 29, saving model


100%|██████████| 67/67 [00:00<00:00, 110.80it/s]


[ Train | 031/100 ] loss = 1.40677, acc = 0.48449


100%|██████████| 9/9 [00:00<00:00, 263.91it/s]


[ Valid | 031/100 ] loss = 1.49173, acc = 0.40064


100%|██████████| 67/67 [00:00<00:00, 117.68it/s]


[ Train | 032/100 ] loss = 1.40231, acc = 0.48982


100%|██████████| 9/9 [00:00<00:00, 408.09it/s]


[ Valid | 032/100 ] loss = 1.49104, acc = 0.40325


100%|██████████| 67/67 [00:00<00:00, 124.27it/s]


[ Train | 033/100 ] loss = 1.38711, acc = 0.50917


100%|██████████| 9/9 [00:00<00:00, 319.43it/s]


[ Valid | 033/100 ] loss = 1.50530, acc = 0.38704


100%|██████████| 67/67 [00:00<00:00, 116.03it/s]


[ Train | 034/100 ] loss = 1.38955, acc = 0.50707


100%|██████████| 9/9 [00:00<00:00, 314.48it/s]


[ Valid | 034/100 ] loss = 1.50428, acc = 0.39163


100%|██████████| 67/67 [00:00<00:00, 109.41it/s]


[ Train | 035/100 ] loss = 1.39133, acc = 0.50529


100%|██████████| 9/9 [00:00<00:00, 417.69it/s]


[ Valid | 035/100 ] loss = 1.48094, acc = 0.41799
Best model found at epoch 34, saving model


100%|██████████| 67/67 [00:00<00:00, 111.43it/s]


[ Train | 036/100 ] loss = 1.38109, acc = 0.51574


100%|██████████| 9/9 [00:00<00:00, 330.41it/s]


[ Valid | 036/100 ] loss = 1.49271, acc = 0.40034


100%|██████████| 67/67 [00:00<00:00, 132.31it/s]


[ Train | 037/100 ] loss = 1.39631, acc = 0.50012


100%|██████████| 9/9 [00:00<00:00, 416.56it/s]


[ Valid | 037/100 ] loss = 1.52191, acc = 0.37144


100%|██████████| 67/67 [00:00<00:00, 126.88it/s]


[ Train | 038/100 ] loss = 1.40587, acc = 0.49098


100%|██████████| 9/9 [00:00<00:00, 343.39it/s]


[ Valid | 038/100 ] loss = 1.48353, acc = 0.41538


100%|██████████| 67/67 [00:00<00:00, 113.53it/s]


[ Train | 039/100 ] loss = 1.39062, acc = 0.50482


100%|██████████| 9/9 [00:00<00:00, 316.29it/s]


[ Valid | 039/100 ] loss = 1.51922, acc = 0.37665


100%|██████████| 67/67 [00:00<00:00, 128.09it/s]


[ Train | 040/100 ] loss = 1.39005, acc = 0.50742


100%|██████████| 9/9 [00:00<00:00, 321.39it/s]


[ Valid | 040/100 ] loss = 1.48906, acc = 0.40381


100%|██████████| 67/67 [00:00<00:00, 117.64it/s]


[ Train | 041/100 ] loss = 1.39217, acc = 0.50544


100%|██████████| 9/9 [00:00<00:00, 407.30it/s]


[ Valid | 041/100 ] loss = 1.49265, acc = 0.40525


100%|██████████| 67/67 [00:00<00:00, 117.91it/s]


[ Train | 042/100 ] loss = 1.37517, acc = 0.52231


100%|██████████| 9/9 [00:00<00:00, 370.84it/s]


[ Valid | 042/100 ] loss = 1.51700, acc = 0.37518


100%|██████████| 67/67 [00:00<00:00, 112.03it/s]


[ Train | 043/100 ] loss = 1.36896, acc = 0.52775


100%|██████████| 9/9 [00:00<00:00, 311.41it/s]


[ Valid | 043/100 ] loss = 1.50681, acc = 0.38244


100%|██████████| 67/67 [00:00<00:00, 106.70it/s]


[ Train | 044/100 ] loss = 1.37262, acc = 0.52573


100%|██████████| 9/9 [00:00<00:00, 314.84it/s]


[ Valid | 044/100 ] loss = 1.51139, acc = 0.38270


100%|██████████| 67/67 [00:00<00:00, 115.39it/s]


[ Train | 045/100 ] loss = 1.36132, acc = 0.53630


100%|██████████| 9/9 [00:00<00:00, 370.37it/s]


[ Valid | 045/100 ] loss = 1.50535, acc = 0.38330


100%|██████████| 67/67 [00:00<00:00, 111.60it/s]


[ Train | 046/100 ] loss = 1.36273, acc = 0.53626


100%|██████████| 9/9 [00:00<00:00, 316.75it/s]


[ Valid | 046/100 ] loss = 1.50586, acc = 0.38298


100%|██████████| 67/67 [00:00<00:00, 110.89it/s]


[ Train | 047/100 ] loss = 1.36142, acc = 0.53638


100%|██████████| 9/9 [00:00<00:00, 347.72it/s]


[ Valid | 047/100 ] loss = 1.52375, acc = 0.36998


100%|██████████| 67/67 [00:00<00:00, 108.66it/s]


[ Train | 048/100 ] loss = 1.35359, acc = 0.54485


100%|██████████| 9/9 [00:00<00:00, 324.56it/s]


[ Valid | 048/100 ] loss = 1.49623, acc = 0.40064


100%|██████████| 67/67 [00:00<00:00, 106.90it/s]


[ Train | 049/100 ] loss = 1.35154, acc = 0.54408


100%|██████████| 9/9 [00:00<00:00, 349.98it/s]


[ Valid | 049/100 ] loss = 1.49627, acc = 0.39688


100%|██████████| 67/67 [00:00<00:00, 111.13it/s]


[ Train | 050/100 ] loss = 1.33775, acc = 0.56114


100%|██████████| 9/9 [00:00<00:00, 303.98it/s]


[ Valid | 050/100 ] loss = 1.48955, acc = 0.40208


100%|██████████| 67/67 [00:00<00:00, 102.06it/s]


[ Train | 051/100 ] loss = 1.33769, acc = 0.56168


100%|██████████| 9/9 [00:00<00:00, 316.95it/s]


[ Valid | 051/100 ] loss = 1.48944, acc = 0.40379


100%|██████████| 67/67 [00:00<00:00, 106.64it/s]


[ Train | 052/100 ] loss = 1.34155, acc = 0.55733


100%|██████████| 9/9 [00:00<00:00, 277.16it/s]


[ Valid | 052/100 ] loss = 1.50448, acc = 0.39168


100%|██████████| 67/67 [00:00<00:00, 117.16it/s]


[ Train | 053/100 ] loss = 1.34019, acc = 0.55752


100%|██████████| 9/9 [00:00<00:00, 314.55it/s]


[ Valid | 053/100 ] loss = 1.48729, acc = 0.40816


100%|██████████| 67/67 [00:00<00:00, 113.23it/s]


[ Train | 054/100 ] loss = 1.32324, acc = 0.57587


100%|██████████| 9/9 [00:00<00:00, 312.71it/s]


[ Valid | 054/100 ] loss = 1.50624, acc = 0.38415


100%|██████████| 67/67 [00:00<00:00, 112.70it/s]


[ Train | 055/100 ] loss = 1.32452, acc = 0.57362


100%|██████████| 9/9 [00:00<00:00, 308.27it/s]


[ Valid | 055/100 ] loss = 1.49032, acc = 0.40064


100%|██████████| 67/67 [00:00<00:00, 116.66it/s]


[ Train | 056/100 ] loss = 1.31165, acc = 0.58994


100%|██████████| 9/9 [00:00<00:00, 385.12it/s]

[ Valid | 056/100 ] loss = 1.50579, acc = 0.38589
No improvment 20 consecutive epochs, early stopping





# Test the trained model
使用测试集得出准确率，（仅是测试）不代表实验过程中最高准确率

In [93]:
# @title
# Measure accuracy for each class
net = Transformer_net(transformed_embedding,embed_size,vocab_size,num_classes,num_layers=num_layers,dropout=dropout)
#net = LSTM_net(transformed_embedding, embed_size, vocab_size, hidden_dim, token_label_dim, num_layers, dropout)
#net = RNN_net(transformed_embedding, embed_size, vocab_size, hidden_dim, token_label_dim, num_layers, dropout)
net.load_state_dict(torch.load(PATH))
device = config.device
model = net.to(device)
correct_pred = {classname: 0 for classname in classes}
total_pred = {classname: 0 for classname in classes}
test_accs = []
with torch.no_grad():
    for batch in testloader:
            inputs_0 = batch[0].to(device)
            inputs_1 = batch[1].to(device)
            labels = batch[2].to(device)
            if(Trans_flag):
                logits = model (inputs_0,len(inputs_0))
            else:
                logits = model(inputs_0,inputs_1)
            acc = (logits.argmax(dim=-1) == labels.to(device)).float().mean()
            test_accs.append(acc)
    test_acc = sum(test_accs) / len(test_accs)
    print(f"acc = {test_acc:.5f}")
           

acc = 0.41416


# Conclusion

在SST-5情感分析中，本次使用使用了LSTM、RNN、Transformer三个模型，测试过程中通过网格和随机搜索寻找了各自模型的最佳超参值，三个模型中RNN的效果最好但是耗时较长，LSTM的综合水平较好，Transformer的效果较差，实验中使用dropout、attention、Glove_embedding均能对实验准确率提高，其中embedding的效果最明显。