In [1]:
# mount google drive in colab
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir("/content/drive/MyDrive/Colab Notebooks/mount")

Mounted at /content/drive


In [2]:
!pip install transformers > /dev/null
!pip install datasets > /dev/null

In [3]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

plm = 'EleutherAI/pythia-160m'

bos = '<|endoftext|>'
eos = '<|END|>'
pad = '<|pad|>'
sep ='\n\n####\n\n'

special_tokens_dict = {'eos_token': eos, 'bos_token': bos, 'pad_token': pad, 'sep_token': sep}

tokenizer = AutoTokenizer.from_pretrained(plm, revision="step3000")
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
tokenizer.padding_side = "left"

PAD_IDX = tokenizer.convert_tokens_to_ids(tokenizer.pad_token)
IGNORED_PAD_IDX = -100
PAD_IDX


tokenizer_config.json:   0%|          | 0.00/396 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.11M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/99.0 [00:00<?, ?B/s]

50278

In [4]:
from datasets import load_dataset, Features, Value
dataset = load_dataset(
  "csv",
  data_files="train_data_must_choose.txt",
  delimiter='\t',
  features = Features({'fid': Value('string'), 'idx': Value('int64'),'content': Value('string'), 'label': Value('string')}),
  column_names=['fid', 'idx', 'content', 'label'],
  keep_default_na=False
)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

In [5]:
sub_datasets = torch.utils.data.random_split(dataset['train'], [10000 , len(dataset['train'])-10000])
print(len(sub_datasets[0]))
for i in range(4): print(sub_datasets[0][i])

10000
{'fid': '1614', 'idx': 5044, 'content': 'Supplementary report by Dr Z Picklesimer', 'label': 'DOCTOR: Z Picklesimer'}
{'fid': '1340', 'idx': 2646, 'content': 'Deeper levels have been examined and no definite invasion of perinephric fat is identified.', 'label': 'PHI: NULL'}
{'fid': 'file9224', 'idx': 7237, 'content': 'Sigmoid nodule: High grade Serous carcinoma including one involved lymph node.', 'label': 'PHI: NULL'}
{'fid': '529', 'idx': 1957, 'content': 'Specimen labelled "Right deep femoral node" consists of an irregular piece of fatty tissue 15 x 10 x 2mm.', 'label': 'PHI: NULL'}


In [6]:
from torch.utils.data import DataLoader
import torch

# train_data = list(sub_datasets[0])
train_data = list(dataset['train'])

def collate_batch(batch):
    texts = [f"{bos} {data['content']} {sep}"+ data['label'].replace('\\n','\n')+f" {eos}" for data in list(batch)] # 範例 prompt
    encoded_seq = tokenizer(texts, padding=True,)

    indexed_tks = torch.tensor(encoded_seq['input_ids'])
    attention_mask = torch.tensor(encoded_seq['attention_mask'])
    encoded_label = torch.tensor(encoded_seq['input_ids'])
    encoded_label[encoded_label == tokenizer.pad_token_id] = IGNORED_PAD_IDX

    return indexed_tks, encoded_label, attention_mask

# train_dataloader = DataLoader(train_data, batch_size=2, shuffle=False, collate_fn=collate_batch)
# titer = iter(train_dataloader)
# tks, labels, masks= next(titer)
# print(tks.shape)
# next(iter(titer))

In [7]:
import random
# BATCH_SIZE = 11 # 自行決定大小
BATCH_SIZE = 6 # 自行決定大小

class BatchSampler():
    def __init__(self, data, batch_size):
        self.pooled_indices = []
        self.data = data
        self.batch_size = batch_size
        self.len = len(list(data))
    def __iter__(self):
        self.pooled_indices = []
        indices = [(index, len(data["content"])) for index, data in enumerate(self.data)]
        random.shuffle(indices)
        for i in range(0, len(indices), BATCH_SIZE * 100):
            self.pooled_indices.extend(sorted(indices[i:i + BATCH_SIZE * 100], key=lambda x: x[1], reverse=True))
        self.pooled_indices = [x[0] for x in self.pooled_indices]

        for i in range(0, len(self.pooled_indices), BATCH_SIZE):
            yield self.pooled_indices[i:i + BATCH_SIZE]
    def __len__(self):
        return (self.len + self.batch_size - 1) // self.batch_size

bucket_train_dataloader = DataLoader(
                train_data,
                batch_sampler=BatchSampler(train_data, BATCH_SIZE),
                collate_fn=collate_batch,
                pin_memory=True
              )

In [8]:
from transformers import AutoConfig
config = AutoConfig.from_pretrained(
        plm,
        bos_token_id=tokenizer.bos_token_id,
        eos_token_id=tokenizer.eos_token_id,
        pad_token_id=tokenizer.pad_token_id,
        sep_token_id=tokenizer.sep_token_id,
        output_hidden_states=False
      )

model = AutoModelForCausalLM.from_pretrained(plm, revision="step3000", config=config).to(device)
# model

config.json:   0%|          | 0.00/569 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/375M [00:00<?, ?B/s]

In [9]:
from transformers import get_linear_schedule_with_warmup , get_constant_schedule_with_warmup ,get_cosine_schedule_with_warmup
from torch.optim import AdamW

EPOCHS = 15 # 設定你的訓練次數
optimizer = AdamW(model.parameters(),lr=5e-5)

steps = len(bucket_train_dataloader)
total_steps = steps * EPOCHS
print(steps, total_steps)

# scheduler = get_linear_schedule_with_warmup(
#     optimizer,
#     num_warmup_steps=total_steps*0.1,
#     num_training_steps=total_steps
# )

scheduler = get_cosine_schedule_with_warmup(
    optimizer,
    num_warmup_steps=total_steps*0.1,
    num_training_steps=total_steps
)

model.resize_token_embeddings(len(tokenizer))
model.to(device)
print(f'Total numbers of steps: {total_steps}')
# model


4298 64470
Total numbers of steps: 64470


In [10]:
import datetime
from datetime import datetime
import pytz

def save_model(path):

    timezone_TW=pytz.timezone('ROC')
    time=datetime.now(timezone_TW).strftime("%Y_%m%d_%H%M_%S")
    name = f"{path}_{time}"
    model.save_pretrained(name)
    tokenizer.save_pretrained(name)

In [11]:
from tqdm import tqdm,trange

global_step = 0
total_loss = 0


model.train()
for ep in trange(EPOCHS, desc="Epoch"):
    model.train()
    total_loss = 0

    predictions , true_labels = [], []

    for step, (seqs, labels, masks) in enumerate(bucket_train_dataloader):
        seqs = seqs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)
        model.zero_grad()
        outputs = model(seqs, labels=labels , attention_mask=masks)
        logits = outputs.logits
        loss = outputs.loss
        loss = loss.mean()
        total_loss += loss.item()
        loss.backward()
        optimizer.step()
        scheduler.step()  # Update learning rate schedule
    avg_train_loss = total_loss / len(bucket_train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))

    save_model( f'./V5/model_v5_EP{ep}')




# from datetime import datetime
# time = datetime.now().strftime("%Y_%m%d_%H%M_%S")
# model.save_pretrained(f"./V3/model_V3_small_batch{time}")
# tokenizer.save_pretrained(f"./V3/model_V3_small_batch{time}")

Epoch:   0%|          | 0/15 [00:00<?, ?it/s]

Average train loss: 1.839977933729233


Epoch:   7%|▋         | 1/15 [03:37<50:49, 217.80s/it]

Average train loss: 1.2491208198127108


Epoch:  13%|█▎        | 2/15 [07:13<46:56, 216.66s/it]

Average train loss: 1.026994262672458


Epoch:  20%|██        | 3/15 [10:46<43:00, 215.03s/it]

Average train loss: 0.8514972693798651


Epoch:  27%|██▋       | 4/15 [14:20<39:21, 214.67s/it]

Average train loss: 0.7244451400039916


Epoch:  33%|███▎      | 5/15 [17:52<35:34, 213.45s/it]

Average train loss: 0.6408679298657826


Epoch:  40%|████      | 6/15 [21:25<31:59, 213.29s/it]

Average train loss: 0.5897020326936838


Epoch:  47%|████▋     | 7/15 [24:57<28:25, 213.14s/it]

Average train loss: 0.5547459748746518


Epoch:  53%|█████▎    | 8/15 [28:31<24:52, 213.19s/it]

Average train loss: 0.5296751173839784


Epoch:  60%|██████    | 9/15 [32:04<21:19, 213.17s/it]

Average train loss: 0.5093539165668346


Epoch:  67%|██████▋   | 10/15 [35:35<17:43, 212.67s/it]

Average train loss: 0.49285207593119273


Epoch:  73%|███████▎  | 11/15 [39:11<14:14, 213.51s/it]

Average train loss: 0.47904468767376157


Epoch:  80%|████████  | 12/15 [42:45<10:41, 213.75s/it]

Average train loss: 0.46700318526908263


Epoch:  87%|████████▋ | 13/15 [46:20<07:07, 213.99s/it]

Average train loss: 0.45787479006977844


Epoch:  93%|█████████▎| 14/15 [49:50<03:32, 212.88s/it]

Average train loss: 0.45195501199769605


Epoch: 100%|██████████| 15/15 [53:30<00:00, 214.06s/it]
