In [1]:
import csv
import numpy as np
import pandas as pd
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import MBart50TokenizerFast, MBartForConditionalGeneration, MBartTokenizer
import random
import os

In [152]:
import torch
from torch.utils.data import Dataset

In [153]:
# Paths of the two line-aligned training text files. In this notebook the
# `_m` file is passed as processdata()'s first argument (it becomes the
# 'target' column) and the `_c` file as the second (the 'source' column).
# NOTE(review): presumably `_c` = classical and `_m` = modern Chinese — confirm.
# NOTE(review): "train_24-historoes" is spelled differently from
# "train_24_histories"; this is a runtime path, so only rename it together
# with the file on disk.
train_data_m = 'EvaHan2023_train_data/train_24_histories_m_utf8.txt'
train_data_c = 'EvaHan2023_train_data/train_24-historoes_c_utf8.txt'

In [154]:
# Select the first CUDA GPU when torch reports CUDA as available; otherwise
# run on the CPU. Read as a notebook-level global by the cells below.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [155]:
# Checkpoint identifier handed to the `from_pretrained()` calls for both the
# tokenizer and the model.
model_name = "facebook/mbart-large-cc25"
# Local directory name for saved model artifacts (presumably checkpoints —
# confirm).
output_dir = './model_save/mbart-large-cc25'

In [156]:
def processdata(filename_m, filename_c):
    """Load two line-aligned UTF-8 text files into a two-column DataFrame.

    Each line is stripped of surrounding whitespace and stored as a
    one-element list, so every cell of the frame holds ``[text]`` rather than
    a bare string.

    Args:
        filename_m: Path of the file whose lines fill the ``'target'`` column.
        filename_c: Path of the file whose lines fill the ``'source'`` column.

    Returns:
        pandas.DataFrame with the columns ``'source'`` and ``'target'``, one
        row per input line.
    """
    loaded = {}
    # The `_m` file is opened first, then the `_c` file.
    for column, path in (('target', filename_m), ('source', filename_c)):
        with open(path, 'r', encoding='utf-8') as handle:
            loaded[column] = [[line.strip()] for line in handle]
    return pd.DataFrame({'source': loaded['source'], 'target': loaded['target']})

In [185]:
class CustomDataset(Dataset):
    """Map-style dataset that tokenizes one source/target pair per item.

    Rows are looked up by position, so the wrapped frame does not need a
    freshly reset index. Returned tensors stay on the CPU; moving them to an
    accelerator is left to the consuming loop, which keeps the dataset usable
    from DataLoader worker processes.

    Args:
        data: DataFrame with a ``'source'`` column and, when ``with_labels``
            is true, a ``'target'`` column.
        src_lang: Language code forwarded to the tokenizer as ``src_lang``.
        tgt_lang: Language code forwarded to the tokenizer as ``tgt_lang``.
        model_name: Checkpoint name used to load ``MBartTokenizer``.
        with_labels: When true, items are ``(input_ids, target_ids)``;
            otherwise only ``input_ids`` is returned.

    Note:
        Items have variable length and no padding is applied here, so batch
        sizes above 1 need a custom ``collate_fn``.
    """

    def __init__(self, data, src_lang, tgt_lang, model_name, with_labels = True):
        self.tokenizer = MBartTokenizer.from_pretrained(model_name)
        self.src_lang = src_lang
        self.tgt_lang = tgt_lang
        self.with_labels = with_labels
        self.data = data

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize the row at position ``index``.

        Returns:
            ``(input_ids, target_ids)`` when ``with_labels`` is true, else
            ``input_ids``. The leading batch dimension added by the tokenizer
            is squeezed away.
        """
        # Positional lookup: samplers hand out 0..len-1, which only matches
        # index labels when the frame carries a default RangeIndex.
        source_text = self.data['source'].iloc[index]

        encode_args = {'src_lang': self.src_lang, 'return_tensors': "pt"}
        if self.with_labels:
            encode_args['tgt_texts'] = self.data['target'].iloc[index]
            encode_args['tgt_lang'] = self.tgt_lang

        # NOTE(review): prepare_seq2seq_batch is deprecated in recent
        # transformers releases; kept so the installed version keeps working.
        batch = self.tokenizer.prepare_seq2seq_batch(source_text, **encode_args)

        input_ids = batch["input_ids"].squeeze(0)
        if not self.with_labels:
            return input_ids
        return input_ids, batch["labels"].squeeze(0)

In [186]:
class MyModel(nn.Module):
    """Wrapper around ``MBartForConditionalGeneration`` plus its tokenizer.

    ``forward`` returns only the loss of the wrapped model, and ``generate``
    decodes the first generated sequence next to the first label sequence.

    Args:
        model_name: Checkpoint name for both tokenizer and model.
        freeze_bert: When true, ``requires_grad`` is switched off on every
            parameter of the wrapped model.
    """

    def __init__(self, model_name, freeze_bert = False):
        super().__init__()
        self.tokenizer = MBartTokenizer.from_pretrained(model_name)
        self.model = MBartForConditionalGeneration.from_pretrained(model_name).to(device)
        if not freeze_bert:
            return
        for weight in self.model.parameters():
            weight.requires_grad = False

    def forward(self, input_ids, labels):
        """Run the wrapped model with ``labels`` and return its ``loss``."""
        return self.model(input_ids, labels=labels).loss

    def generate(self, input_ids, labels, decoder_start_token):
        """Generate from ``input_ids`` and decode the result and the labels.

        Args:
            input_ids: Token ids passed to the wrapped model's ``generate``.
            labels: Reference token ids, decoded for side-by-side display.
            decoder_start_token: Language code looked up in the tokenizer's
                ``lang_code_to_id`` table to obtain the decoder start id.

        Returns:
            ``(generated, reference)``: the first decoded string of each.
        """
        start_id = self.tokenizer.lang_code_to_id[decoder_start_token]
        token_ids = self.model.generate(input_ids, decoder_start_token_id = start_id)
        hypothesis = self.tokenizer.batch_decode(token_ids, skip_special_tokens=True)[0]
        reference = self.tokenizer.batch_decode(labels, skip_special_tokens=True)[0]
        return hypothesis, reference

In [187]:
def set_seed(seed):
    """Seed torch, numpy and ``random``, and request deterministic cuDNN.

    Also writes ``seed`` into the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator.
    """
    for seeder in (torch.manual_seed, torch.cuda.manual_seed_all):
        seeder(seed)

    # Deterministic kernels on, auto-tuner off.
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

    for seeder in (np.random.seed, random.seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)

In [188]:
def save(model, optimizer, path=None):
    """Write a checkpoint with the model and optimizer state dicts.

    Args:
        model: Module whose ``state_dict()`` is stored under
            ``'model_state_dict'``.
        optimizer: Optimizer whose ``state_dict()`` is stored under
            ``'optimizer_state_dict'``.
        path: Destination file. When omitted, ``checkpoint.pt`` inside the
            notebook-level ``output_dir`` is used.
    """
    if path is None:
        path = os.path.join(output_dir, 'checkpoint.pt')

    # torch.save does not create missing directories.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)

    # The second argument must be a file path (or file object), not the model.
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        }, path)
    print('the best model has been saved')

In [189]:
def train_test_split(data, test_size=0.2, shuffle=True, random_state=None):
    """Split a pandas object into train and test parts.

    Args:
        data: DataFrame (or Series) to split.
        test_size: Fraction of rows assigned to the test part; the row count
            is ``int(len(data) * test_size)``.
        shuffle: When true, rows are randomly permuted before splitting.
            When false, the test part is the leading rows in original order.
        random_state: Seed forwarded to ``data.sample`` for a reproducible
            permutation; ``None`` draws from numpy's global random state.

    Returns:
        ``(train, test)``, each with a fresh 0-based index.
    """
    n_test = int(len(data) * test_size)
    if shuffle:
        # frac=1 returns every row exactly once, in random order.
        data = data.sample(frac=1, random_state=random_state)
    test = data.iloc[:n_test].reset_index(drop=True)
    train = data.iloc[n_test:].reset_index(drop=True)
    return train, test

In [190]:
def train_eval(model, optimizer, train_loader, val_loader, epochs=50):
    """Train ``model`` for ``epochs`` passes, validating after each one.

    Per batch: move the tensors to the notebook-level ``device``, compute the
    loss via ``model(inputs, labels)``, then zero gradients, back-propagate
    and step the optimizer. After each pass the mean training loss is printed
    and the notebook-level ``eval`` function is called on ``val_loader``.

    Args:
        model: Callable module returning a scalar loss tensor.
        optimizer: Optimizer bound to the model's parameters.
        train_loader: Iterable of training batches (first two entries used).
        val_loader: Iterable of validation batches, handed to ``eval``.
        epochs: Number of passes over ``train_loader``.
    """
    print('start training')
    for epoch in range(epochs):
        model.train()
        print('epoch:', epoch+1)

        running_loss = 0
        for step, items in enumerate(tqdm(train_loader)):
            on_device = tuple(tensor.to(device) for tensor in items)
            step_loss = model(on_device[0], on_device[1])
            running_loss += step_loss.item()

            optimizer.zero_grad()
            step_loss.backward()
            optimizer.step()

        train_loss = running_loss / len(train_loader)
        print(f"Epoch [{epoch + 1}/{epochs}] - Train Loss: {train_loss:.4f}")
        eval(model, optimizer, val_loader)


In [191]:
def eval(model, optimizer, val_loader, epoch=None, epochs=None):
    """Compute the mean validation loss and print one decoded sample.

    NOTE(review): this name shadows Python's built-in ``eval``; it is kept
    because other cells call it by this name.

    Args:
        model: Module returning a scalar loss from ``model(inputs, labels)``
            and exposing ``generate(inputs, labels, lang_code)``.
        optimizer: Unused; accepted so existing call sites keep working.
        val_loader: Iterable of validation batches (first two entries used).
        epoch: Optional zero-based epoch index, used only in the log line.
        epochs: Optional total epoch count, used only in the log line.

    Returns:
        Mean loss over the batches of ``val_loader``, or ``nan`` when the
        loader yields no batches.
    """
    model.eval()
    eval_loss = 0
    with torch.no_grad():
        for batch_idx, batch in enumerate(tqdm(val_loader)):
            batch = tuple(t.to(device) for t in batch)
            # Show one generated/reference pair per validation pass.
            if batch_idx == 1:
                generated_sentences, ground_truth_sentences = model.generate(batch[0], batch[1], 'zh_CN')
                print('generated_sentences:', generated_sentences)
                print('ground_truth_sentences:', ground_truth_sentences)
            loss = model(batch[0], batch[1])
            eval_loss += loss.item()

    num_batches = len(val_loader)
    eval_loss = eval_loss / num_batches if num_batches else float('nan')

    # Epoch information is not visible from this scope unless passed in.
    if epoch is not None and epochs is not None:
        prefix = f"Epoch [{epoch + 1}/{epochs}] - "
    else:
        prefix = ""
    print(f"{prefix}Valid Loss: {eval_loss:.4f}")
    return eval_loss

In [192]:
# Seed all random number generators before touching data or model weights.
set_seed(42)

# Load the parallel corpus and split it with test_size=0.2.
df = processdata(train_data_m, train_data_c)
train, test = train_test_split(df, test_size=0.2, shuffle=True, random_state=42)

# Both loaders use the same settings: one example per batch, shuffled,
# loaded in the main process.
loader_options = dict(batch_size=1, shuffle=True, num_workers=0)

# NOTE(review): 'ja_XX' is passed as the source language code and 'zh_CN' as
# the target — confirm this pairing is intentional for this corpus.
train_set = CustomDataset(train, 'ja_XX', 'zh_CN', model_name)
train_dataset = DataLoader(train_set, **loader_options)

test_set = CustomDataset(test, 'ja_XX', 'zh_CN', model_name)
test_dataset = DataLoader(test_set, **loader_options)

# Fine-tune every parameter (nothing frozen) with AdamW.
model = MyModel(model_name, freeze_bert = False)
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
train_eval(model, optimizer, train_dataset, test_dataset, epochs=50)

start training
epoch: 1


  0%|          | 0/245996 [00:00<?, ?it/s]

(tensor([[     6, 123551,  59600,   3302,    630, 246455, 245306,      4, 123052,
          10560,     37,  26335,     37,  41318,   1971,  29958,   3169,    635,
          46989,     30,      2, 250012]], device='cuda:0'), tensor([[     6, 123551,  59600,   3302,    630, 246455, 245306,    635,      4,
         123052,  10560,     37,  26335,     37,  41318,   1971,  29958,   3169,
            635,  21381,  46989,     30,      2, 250025]], device='cuda:0'))
0 4.659609794616699


  0%|          | 0/245996 [00:02<?, ?it/s]


OutOfMemoryError: CUDA out of memory. Tried to allocate 978.00 MiB (GPU 0; 6.00 GiB total capacity; 4.97 GiB already allocated; 0 bytes free; 5.02 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [3]:
from IPython.display import display, HTML

# Self-contained cell: it uses no names from the cells above.
# `text` is an HTML fragment (headings, paragraphs and a two-column table
# with border="1") written in Chinese.
# NOTE(review): "intel80836" inside the string may be meant as "80386" —
# confirm with the author; the literal is runtime text and is left unchanged.
text = '<h1>全新一代32位高速处理性能</h1> <h2>Intel推出intel80836 - 领先的32位微处理器</h2> <p>32位架构,大幅提升计算性能</p> <p>内存管理单元,优化系统资源利用</p> <p>支持多任务并行,效率翻倍</p> <p>丰富接口,随心扩展无限可能</p> <h2>intel80836,向高效计算的新纪元进发</h2> <p>性能强劲,拓展无限,创造属于你的专业计算体验</p> <table border="1"> <tr><td>数据和地址总线宽度</td><td>32位</td></tr> <tr><td>寻址空间</td><td>物理地址4GB</td></tr> <tr><td>缓存</td><td>支持L1 Cache</td></tr> <tr><td>总线接口</td><td>支持多种外部总线接口,如ISA、PCI等</td></tr> <tr><td>内存管理</td><td>支持虚拟内存和分页机制</td></tr> <tr><td>指令集</td><td>完整支持 x86 指令集</td></tr> <tr><td>时钟频率</td><td>12.5MHz</td></tr> </table>'

# Wrap the markup in an HTML object and show it as the cell's rich output.
display(HTML(text))


0,1
数据和地址总线宽度,32位
寻址空间,物理地址4GB
缓存,支持L1 Cache
总线接口,"支持多种外部总线接口,如ISA、PCI等"
内存管理,支持虚拟内存和分页机制
指令集,完整支持 x86 指令集
时钟频率,12.5MHz
