# Environment

In [110]:
try:
    import transformers
except:
    !pip install transformers

In [111]:
import transformers
import pickle

import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel

In [112]:
transformers.__version__

'4.21.2'

# Global Config

In [113]:
max_length = 128
batch_size = 32

log_after_step = 20

epochs = 10

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [114]:
test_mode = True
if test_mode:
    data_size = 64
    batch_size = 16
    log_after_step = 1
    epochs = 1000

# Data

In [115]:
!gdown '1dC09i57lobL91lEbpebDuUBS0fGz-LAk' --folder --output data

'gdown' 不是内部或外部命令，也不是可运行的程序
或批处理文件。


In [116]:
tokenizer = AutoTokenizer.from_pretrained("hfl/chinese-roberta-wwm-ext")

In [117]:
sentence = "昨天下雨了，天气非常凉爽，今天我们出门去玩吧，就去昨天那个地方。"

In [118]:
class CSCDataset(Dataset):

    def __init__(self):
        super(CSCDataset, self).__init__()
        with open("data/trainall.times2.pkl", mode='br') as f:
            train_data = pickle.load(f)

        self.train_data = train_data

    def __getitem__(self, index):
        tgt = self.train_data[index]['tgt']
        return tgt

    def __len__(self):
        if test_mode:
            return batch_size
        return len(self.train_data)

In [119]:
train_set = CSCDataset()

In [120]:
train_set.__getitem__(0)

'纽约早盘作为基准的低硫轻油，五月份交割价攀升一点三四美元，来到每桶二十八点二五美元，而上周五曾下挫一美元以上。'

# DataLoader

In [121]:
def collate_fn(batch):
    text = list(batch)

    inputs = tokenizer(text, padding='max_length', max_length=max_length, return_tensors='pt', truncation=True)
    targets = inputs['input_ids']

    return inputs, targets

In [122]:
train_loader = DataLoader(train_set, batch_size=batch_size, collate_fn=collate_fn)

In [123]:
inputs, targets = next(iter(train_loader))

# Model

In [124]:
class CopyModel(nn.Module):

    def __init__(self, max_length=128):
        super(CopyModel, self).__init__()

        self.max_length = max_length

        self.bert = AutoModel.from_pretrained("hfl/chinese-roberta-wwm-ext")

        # self.interlayer = nn.Sequential(
        #     nn.Linear(max_length * 768, 2048),
        #     nn.ReLU(),
        #     nn.Linear(2048, max_length * 768),
        #     nn.ReLU(),
        # )

        # decoder_layer = nn.TransformerDecoderLayer(d_model=768, nhead=12, batch_first=True)
        # self.decoder = nn.TransformerDecoder(decoder_layer, num_layers=1)
        # self.decoder = AutoModel.from_pretrained("hfl/chinese-roberta-wwm-ext").encoder

        self.predictor = nn.Sequential(
            nn.Linear(768, len(tokenizer)),
            # nn.ReLU(),
            # nn.Linear(1024, max_length * len(tokenizer))
            # nn.Softmax(dim=2)
        )

    def forward(self, inputs):
        outputs = self.bert(**inputs)['last_hidden_state']

        # outputs = self.interlayer(outputs.view(-1, self.max_length * 768))
        # outputs = outputs.view(-1, self.max_length * 768)

        # n_tokens = outputs.size(1)
        # tgt = torch.ones(1, n_tokens, 768)
        # outputs = self.decoder(outputs)['last_hidden_state']
        # outputs = self.decoder(outputs, outputs)

        outputs = self.predictor(outputs)

        return outputs

In [125]:
model = CopyModel().to(device)

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [126]:
outputs = model(inputs.to(device))
outputs.size()

torch.Size([16, 128, 21128])

# Training

In [127]:
criteria = nn.CrossEntropyLoss(ignore_index=0)
optimizer = torch.optim.Adam(model.parameters(), lr=3e-4)
model = model.train()

In [128]:
total_loss = 0.
step = 0

for epoch in range(epochs):
    for inputs, targets in train_loader:
        optimizer.zero_grad()
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        loss = criteria(outputs.view(-1, len(tokenizer)), targets.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        step += 1

        if step % log_after_step == 0:
            print("step {}, loss {:.4f}".format(step, total_loss / log_after_step))
            total_loss = 0.

    # if loss.item() <= 3.7:
    #     break

    # break

step 1, loss 10.0497
step 2, loss 9.0754
step 3, loss 8.3315
step 4, loss 7.6060
step 5, loss 6.9177
step 6, loss 6.3054
step 7, loss 5.7080
step 8, loss 5.1490
step 9, loss 4.6416
step 10, loss 4.1701
step 11, loss 3.7393
step 12, loss 3.3311
step 13, loss 2.9423
step 14, loss 2.5933
step 15, loss 2.2565
step 16, loss 1.9566
step 17, loss 1.6999
step 18, loss 1.4454
step 19, loss 1.2311
step 20, loss 1.0337
step 21, loss 0.8729
step 22, loss 0.7320
step 23, loss 0.6122
step 24, loss 0.5070
step 25, loss 0.4230
step 26, loss 0.3522
step 27, loss 0.2963
step 28, loss 0.2484
step 29, loss 0.2125
step 30, loss 0.1816
step 31, loss 0.1597


KeyboardInterrupt: 

In [None]:
for param in model.interlayer.parameters():
    print(param.grad)

tensor([ 101, 5294, 5276, 3193, 4669,  868,  711, 1825, 1114, 4638,  856, 4800,
        6768, 3779, 8024,  758, 3299,  819,  769, 1200,  817, 3102, 1285,  671,
        4157,  676, 1724, 5401, 1039, 8024, 3341, 1168, 3680, 3446,  753, 1282,
        1061, 4157,  753,  758, 5401, 1039, 8024, 5445,  677, 1453,  758, 3295,
         678, 2919,  671, 5401, 1039,  809,  677,  511,  102,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0])

In [82]:
tokenizer.convert_ids_to_tokens(inputs['input_ids'].squeeze())

['[CLS]',
 '纽',
 '约',
 '早',
 '盘',
 '作',
 '为',
 '基',
 '准',
 '的',
 '低',
 '硫',
 '轻',
 '油',
 '，',
 '五',
 '月',
 '份',
 '交',
 '割',
 '价',
 '攀',
 '升',
 '一',
 '点',
 '三',
 '四',
 '美',
 '元',
 '，',
 '来',
 '到',
 '每',
 '桶',
 '二',
 '十',
 '八',
 '点',
 '二',
 '五',
 '美',
 '元',
 '，',
 '而',
 '上',
 '周',
 '五',
 '曾',
 '下',
 '挫',
 '一',
 '美',
 '元',
 '以',
 '上',
 '。',
 '[SEP]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',
 '[PAD]',


# Inference

In [87]:
sentence = "纽约早盘作为基准的低硫轻油，五越份交割价攀升一点三四人民币，来到每桶二十八点二五美元，而上周五曾下挫一美元以上。"

In [88]:
inputs = tokenizer(sentence, return_tensors='pt', max_length=128, padding='max_length', add_special_tokens=True,
                   truncation=True)

In [89]:
outputs = model(inputs.to(device))

In [90]:
"".join(tokenizer.convert_ids_to_tokens(outputs.argmax(-1)[0]))

'[CLS]纽约早盘作为基准的低硫轻油，五四份交割价攀升一点三四美元元，来到每桶二十八点二五美元，而上周五曾下挫一美元以上。[CLS]。，，，，桶一，一一一，，，，，，桶，，，，一，一一，，，，，，油，，桶，，，，，，，，，，，，，桶，，，，，一，，，，点，，，，，，，，，'

In [261]:
model = AutoModel.from_pretrained("hfl/chinese-roberta-wwm-ext")

Some weights of the model checkpoint at hfl/chinese-roberta-wwm-ext were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [266]:
model.encoder(torch.ones(1, 34, 768))['last_hidden_state'].size()

torch.Size([1, 34, 768])