# Deep Continuous Bag of Words (Deep CBOW) Text Classifier

The code below implements a continuous bag of words text classifier.
- We tokenize the text, create a vocabulary and encode each piece of text in the dataset
- We create embeddings for inputs and sum them together
- The resulting vector is fed to a hidden neural network, which produces a new vector that is multiplied by a weight matrix
- We then add the bias and obtain scores
- A softmax is applied to the scores to generate probabilities, which are used for the final classification

The code used in this notebook was inspired by code from the [official repo](https://github.com/neubig/nn4nlp-code) used in the [CMU Neural Networks for NLP class](http://www.phontron.com/class/nn4nlp2021/schedule.html) by [Graham Neubig](http://www.phontron.com/index.php). 

![img txt](https://github.com/dair-ai/ML-Notebooks/blob/main/img/deep_cbow.png?raw=true)

In [1]:
import torch#导入了PyTorch库，用于进行深度学习任务和张量运算
import random#导入了random模块，用于生成随机数
import torch.nn as nn#导入了PyTorch的nn模块，其中包含了定义神经网络层和模型的基本类

In [None]:
''' uncomment to download the data
%%capture

# download the files
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/dev.txt
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/test.txt
!wget https://raw.githubusercontent.com/neubig/nn4nlp-code/master/data/classes/train.txt

# create the data folders
!mkdir data data/classes
!cp dev.txt data/classes
!cp test.txt data/classes
!cp train.txt data/classes
'''

## Read and Process the Data

In [2]:
# function to read in data, process each line and split columns by " ||| "
def read_data(filename):
    """Read a labelled text file and split every line into its columns.

    Each line is lower-cased, stripped of surrounding whitespace and split
    on the " ||| " separator. Blank lines are skipped because they carry no
    columns at all.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one entry per non-blank line; each entry is the list of
        column strings produced by the split.
    """
    data = []
    # Explicit encoding so the result does not depend on the platform locale.
    with open(filename, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.lower().strip()
            if not line:
                # Nothing to split on an empty line; skip it.
                continue
            data.append(line.split(' ||| '))
    return data

# Load the train and test splits; each entry is the list of columns read from one line.
train_data = read_data('data/classes/train.txt')
test_data = read_data('data/classes/test.txt')

# creating the word and tag indices
# Word vocabulary; the unknown-word token takes the first index (0).
word_to_index = {"<unk>": 0}
# Tag vocabulary, filled in by create_dict.
tag_to_index = {}


# create word to index dictionary and tag to index dictionary from data
def create_dict(data, check_unk=False):
    """Register the words and tags found in ``data`` in the global indices.

    Args:
        data: Iterable of records where ``record[0]`` is the tag and
            ``record[1]`` is the space-separated text.
        check_unk: When it compares equal to ``False`` every unseen word
            receives a fresh index; otherwise every unseen word is stored
            with the index of ``<unk>``.
    """
    # Same comparison the branch has always used, evaluated once up front.
    assign_fresh_index = check_unk == False  # noqa: E712
    for record in data:
        for token in record[1].split(" "):
            if token in word_to_index:
                continue
            if assign_fresh_index:
                word_to_index[token] = len(word_to_index)
            else:
                word_to_index[token] = word_to_index["<unk>"]

        # Tags always receive a fresh index the first time they are seen.
        tag_to_index.setdefault(record[0], len(tag_to_index))

# Build the indices from the training split first, then register words seen
# only in the test split under the <unk> index.
create_dict(train_data)
create_dict(test_data, check_unk=True)

# create word and tag tensors from data
def create_tensor(data):
    """Yield ``(word_indices, tag_index)`` pairs for every record in ``data``.

    Words missing from ``word_to_index`` fall back to the ``<unk>`` index
    instead of raising ``KeyError``.

    Args:
        data: Iterable of records where ``record[0]`` is the tag and
            ``record[1]`` is the space-separated text.

    Yields:
        A tuple ``(list of word indices, tag index)``.

    Raises:
        KeyError: If a record's tag is not present in ``tag_to_index``.
    """
    for line in data:
        word_ids = [
            word_to_index[word] if word in word_to_index else word_to_index["<unk>"]
            for word in line[1].split(" ")
        ]
        yield word_ids, tag_to_index[line[0]]
# Turn the generators into lists of (word indices, tag index) pairs.
train_data = list(create_tensor(train_data))
test_data = list(create_tensor(test_data))

# The vocabulary size is the number of distinct indices, not the number of
# keys: several words can share the <unk> index, so counting keys would
# size the embedding table with rows that are never looked up.
number_of_words = max(word_to_index.values()) + 1
number_of_tags = len(tag_to_index)

## Model

In [3]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# create a simple neural network with embedding layer, bias, and xavier initialization
class DeepCBoW(nn.Module):
    """Deep continuous bag-of-words classifier.

    The embeddings of the input word indices are summed into one vector,
    passed through ``num_layers`` tanh-activated linear layers and finally
    projected to one score per tag.

    Args:
        nwords: Number of rows in the embedding table.
        ntags: Number of output scores.
        hidden_size: Width of every hidden layer.
        num_layers: Number of hidden layers.
        emb_size: Dimension of the word embeddings.
    """

    def __init__(self, nwords, ntags, hidden_size, num_layers, emb_size):
        super().__init__()

        self.num_layers = num_layers

        # layers
        self.embedding = nn.Embedding(nwords, emb_size)

        # The first hidden layer consumes the summed embedding; every later
        # one consumes the previous hidden state.
        hidden_layers = []
        in_features = emb_size
        for _ in range(num_layers):
            hidden_layers.append(nn.Linear(in_features, hidden_size))
            in_features = hidden_size
        self.linears = nn.ModuleList(hidden_layers)

        # use xavier initialization for the embedding and hidden weights
        nn.init.xavier_uniform_(self.embedding.weight)
        for layer in self.linears:
            nn.init.xavier_uniform_(layer.weight)

        # output layer
        self.output_layer = nn.Linear(hidden_size, ntags)

    def forward(self, x):
        """Return the tag scores for one sequence of word indices.

        Args:
            x: Tensor of word indices for a single sentence.

        Returns:
            The output of the final linear layer; the hidden state is
            reshaped to a single row before the hidden layers are applied.
        """
        summed = self.embedding(x).sum(dim=0)  # emb_size
        hidden = summed.view(1, -1)  # (1, emb_size)
        for layer in self.linears:
            hidden = torch.tanh(layer(hidden))
        return self.output_layer(hidden)  # 1 x ntags

# Model hyperparameters.
HIDDEN_SIZE = 64
NUM_LAYERS = 2  # hidden layers
EMB_SIZE = 64

# Build the classifier on the selected device, together with its loss and optimizer.
model = DeepCBoW(number_of_words, number_of_tags, HIDDEN_SIZE, NUM_LAYERS, EMB_SIZE).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters())

# Pick the LongTensor type matching the hardware in use.
if not torch.cuda.is_available():
    type = torch.LongTensor
else:
    model.to(device)
    type = torch.cuda.LongTensor

## Model Training

In [4]:
# Train the Deep CBoW model and report train/test metrics after every epoch.

for epoch in range(10):
    # perform training
    model.train()
    random.shuffle(train_data)
    total_loss = 0.0
    train_correct = 0
    for sentence, tag in train_data:
        # Build the index tensors directly with the right dtype and device.
        sentence_tensor = torch.tensor(sentence, dtype=torch.long, device=device)
        tag_tensor = torch.tensor([tag], dtype=torch.long, device=device)

        output = model(sentence_tensor)
        predicted = torch.argmax(output.detach()).item()

        loss = criterion(output, tag_tensor)
        total_loss += loss.item()

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Compare plain Python ints rather than an int against a tensor.
        if predicted == tag:
            train_correct += 1

    # perform testing of the model
    model.eval()
    test_correct = 0
    # No gradients are needed for evaluation, so skip building the autograd graph.
    with torch.no_grad():
        for sentence, tag in test_data:
            sentence_tensor = torch.tensor(sentence, dtype=torch.long, device=device)
            output = model(sentence_tensor)
            predicted = torch.argmax(output).item()
            if predicted == tag:
                test_correct += 1

    # print model performance results
    log = f'epoch: {epoch+1} | ' \
        f'train loss/sent: {total_loss/len(train_data):.4f} | ' \
        f'train accuracy: {train_correct/len(train_data):.4f} | ' \
        f'test accuracy: {test_correct/len(test_data):.4f}'

    print(log)

epoch: 1 | train loss/sent: 1.4293 | train accuracy: 0.3765 | test accuracy: 0.3941
epoch: 2 | train loss/sent: 1.0343 | train accuracy: 0.5729 | test accuracy: 0.4127
epoch: 3 | train loss/sent: 0.6565 | train accuracy: 0.7583 | test accuracy: 0.3801
epoch: 4 | train loss/sent: 0.4013 | train accuracy: 0.8586 | test accuracy: 0.3783
epoch: 5 | train loss/sent: 0.2659 | train accuracy: 0.9079 | test accuracy: 0.3959
epoch: 6 | train loss/sent: 0.1747 | train accuracy: 0.9419 | test accuracy: 0.3787
epoch: 7 | train loss/sent: 0.1257 | train accuracy: 0.9573 | test accuracy: 0.3805
epoch: 8 | train loss/sent: 0.0860 | train accuracy: 0.9702 | test accuracy: 0.3719
epoch: 9 | train loss/sent: 0.0652 | train accuracy: 0.9768 | test accuracy: 0.3747
epoch: 10 | train loss/sent: 0.0434 | train accuracy: 0.9860 | test accuracy: 0.3887


Bad pipe message: %s [b'I7{\xddYY9\x10\xe5', b"\xee\x8a\xf0\xff\xe6\x1a\xd2\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x00", b'\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00']
Bad pipe message: %s [b'\xe1\x05', b'\xb0\x87g\xc6U\xd5G\xa2.\xd2\xf7\x05\x9fL\x00\x00\xa6\xc0,\xc0', b'\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0\xaf\xc0\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V']
Bad pipe message: %s [b"\xc0$\xc0(\x00k\x00j\xc0s\xc0w\x00\xc4\x00\xc3\xc0#\xc0'\x00g\x00@\xc0r\xc0v\x00\xbe\x00\xbd\xc0\n\xc0\x14\x009\x008\x00\x88\x00\x87\xc0\t\xc0\x13\x003\x002\x00\x9a\x00\x99