# Dataset

In [None]:
from datasets import load_dataset, Features, Value

# Column schema for the training TSV: every field is a string except the
# integer `idx` column.
tsv_schema = Features({
    'fid': Value('string'),
    'idx': Value('int64'),
    'content': Value('string'),
    'label': Value('string'),
})

# Column names are supplied explicitly, and keep_default_na=False is passed
# through to the CSV reader so that the default NA markers are not applied.
dataset = load_dataset(
    "csv",
    data_files='tsv/notnormalize.tsv',
    delimiter='\t',
    features=tsv_schema,
    column_names=['fid', 'idx', 'content', 'label'],
    keep_default_na=False,
)

In [None]:
# Display the loaded dataset object (the cell's last expression is rendered as its output).
dataset

In [None]:
# Inspect the first record of the 'train' split.
dataset['train'][0]

In [None]:
# Inspect the second record of the 'train' split.
dataset['train'][1]

In [None]:
# Inspect the record at index 7 of the 'train' split.
dataset['train'][7]

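For demonstration purposes, training can be restricted to 20,000 randomly sampled instances. Note that the sampling code in the next cell is currently commented out, so the full training split is used.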

In [None]:
import torch
# sub_datasets = torch.utils.data.random_split(dataset['train'], [20000, 65736])
# print(len(sub_datasets[0]))
# for i in range(4): print(sub_datasets[0][i])

#sub_datasets = dataset['train'][0]

# Data loader

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

# Name of the pretrained language model on the Hugging Face Hub.
plm = "EleutherAI/pythia-70m-deduped"

# Special markers used by the prompt template.
bos, eos, pad = '<|endoftext|>', '<|END|>', '<|pad|>'
sep = '\n\n####\n\n'

# Insertion order is kept as eos, bos, pad, sep.
special_tokens_dict = {
    'eos_token': eos,
    'bos_token': bos,
    'pad_token': pad,
    'sep_token': sep,
}

# Load the tokenizer at the "step3000" revision and pad on the left.
tokenizer = AutoTokenizer.from_pretrained(plm, revision="step3000")
tokenizer.padding_side = 'left'

# Register the special tokens; the return value is the number of tokens added.
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)
print(f"{tokenizer.pad_token}: {tokenizer.pad_token_id}")

In [None]:
from torch.utils.data import DataLoader
from islab.aicup import collate_batch_with_prompt_template

# Materialise the training split as a plain list of row dicts.
train_data = list(dataset['train'])

# Small, unshuffled loader used only to sanity-check the collate function.
train_dataloader = DataLoader(
    train_data,
    batch_size=3,
    shuffle=False,
    collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer),
)

# Pull the first batch and report the shape of its token tensor.
titer = iter(train_dataloader)
tks, labels, masks = next(titer)
print(tks.shape)

# Display the second batch as the cell output (titer is already an iterator).
next(titer)

In [None]:
# Tokenize two sample strings with padding to see how the tokenizer pads them.
results = tokenizer(["Lab No: 14H02780", "“STOCKDALE” 653 MONAGHAN RD"], padding=True)

# Whole batch first, then each sequence next to its decoded text.
print(results['input_ids'])
print()
for token_ids in results['input_ids']:
    print(token_ids)
    print(tokenizer.decode(token_ids))

In [None]:
# Two hand-written examples in the "<bos> text <sep> answer <eos>" layout.
prompt_examples = [
    f"{bos} 9364819.RAN\\nMINTANIA, JEFFRY {sep} ID: 9364819.RAN\\nNAME: MINTANIA, JEFFRY {eos}",
    f"{bos} This is a sentence {sep} PHI: NULL {eos}",
]
results = tokenizer(prompt_examples, padding=True)

# Attention masks for both examples, then the decoded token sequences.
for attention_mask in results['attention_mask']:
    print(attention_mask)
for token_ids in results['input_ids']:
    print(tokenizer.decode(token_ids))

In [None]:
from islab.aicup import OpenDeidBatchSampler

BATCH_SIZE = 16

# Batches are chosen by the project's OpenDeidBatchSampler rather than by a
# fixed batch_size / shuffle combination.
bucket_sampler = OpenDeidBatchSampler(train_data, BATCH_SIZE)
bucket_train_dataloader = DataLoader(
    train_data,
    batch_sampler=bucket_sampler,
    collate_fn=lambda batch: collate_batch_with_prompt_template(batch, tokenizer),
    pin_memory=True,
)

# Peek at the first batch only: print it, then the shapes of its first two items.
for idx, batch in enumerate(bucket_train_dataloader):
    print(batch)
    for item in (batch[0], batch[1]):
        print(item.shape)
    break

# Model

In [None]:
from transformers import AutoConfig
# The model config to which we add the special-token ids.
# The config is loaded from the same "step3000" revision as the tokenizer and
# the weights, so that all three artefacts come from one checkpoint.
config = AutoConfig.from_pretrained(plm,
                                    revision="step3000",
                                    bos_token_id=tokenizer.bos_token_id,
                                    eos_token_id=tokenizer.eos_token_id,
                                    pad_token_id=tokenizer.pad_token_id,
                                    sep_token_id=tokenizer.sep_token_id,
                                    output_hidden_states=False)

# Causal LM initialised from the pinned checkpoint with the customised config.
model = AutoModelForCausalLM.from_pretrained(plm, revision="step3000", config=config)
model

In [None]:
# Use the GPU when one is available and fall back to the CPU otherwise, so the
# notebook still runs (slowly) on machines without CUDA instead of failing
# when the model is moved to the device.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

In [None]:
from transformers import get_linear_schedule_with_warmup
from torch.optim import AdamW

EPOCHS = 10 # CHANGE TO THE NUMBER OF EPOCHS YOU WANT

# Resize the embeddings and move the model BEFORE building the optimizer.
# Resizing can replace the embedding parameters (depending on the transformers
# version), and an optimizer created earlier would keep references to the old
# tensors and never update the resized ones.
model.resize_token_embeddings(len(tokenizer))
model.to(device)

optimizer = AdamW(model.parameters(), lr=1e-4) # YOU CAN ADJUST LEARNING RATE

In [None]:
from tqdm import tqdm

# Fine-tune the model for EPOCHS passes over the bucketed training loader,
# reporting the running loss in the progress bar and the mean loss per epoch.
global_step = 0  # number of optimizer updates performed so far
total_loss = 0

for epoch in range(EPOCHS):
    # Make sure the model is in training mode at the start of every epoch.
    model.train()
    total_loss = 0

    # Create a tqdm progress bar for the training data loader
    data_loader = tqdm(bucket_train_dataloader, desc="Training")

    for step, (seqs, labels, masks) in enumerate(data_loader):
        seqs = seqs.to(device)
        labels = labels.to(device)
        masks = masks.to(device)

        # Clear gradients left over from the previous step.
        model.zero_grad()
        outputs = model(seqs, labels=labels, attention_mask=masks)
        # .mean() leaves a scalar loss unchanged; presumably kept so a
        # per-replica loss vector would also work — TODO confirm.
        loss = outputs.loss.mean()

        loss.backward()
        optimizer.step()
        global_step += 1

        # Read the scalar once and reuse it for both the total and the display.
        loss_value = loss.item()
        total_loss += loss_value
        data_loader.set_postfix({"Loss": loss_value})

    avg_train_loss = total_loss / len(bucket_train_dataloader)
    print("Average train loss: {}".format(avg_train_loss))


In [None]:
import os

# Persist the fine-tuned weights. The target directory is created first so a
# missing "temp" folder cannot make the save fail after training has finished.
checkpoint_path = "temp/allopen70m-none.pt"
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(model.state_dict(), checkpoint_path)

In [None]:
from datasets import load_dataset, Features, Value
# Load the validation TSV with the same schema and parsing options as the
# training file. keep_default_na=False matches the training load, so literal
# text such as "NA" or "NULL" in a column is kept as a string instead of
# being parsed as a missing value.
valid_data = load_dataset("csv", data_files="AICUP/Opendid/opendid_valid.tsv", delimiter='\t',
                          features = Features({
                              'fid': Value('string'), 'idx': Value('int64'),
                              'content': Value('string'), 'label': Value('string')}),
                              column_names=['fid', 'idx', 'content', 'label'],
                              keep_default_na=False)
# The loaded rows are read from the 'train' key and materialised as a list of dicts.
valid_list= list(valid_data['train'])

In [None]:
from tqdm import tqdm
from islab.aicup import aicup_predict
import io
BATCH_SIZE = 64

# Inference only: switch the model to evaluation mode and disable gradient
# tracking for the whole prediction pass.
model.eval()

# Run the validation rows through aicup_predict in chunks of BATCH_SIZE and
# write each returned prediction string on its own line of answer.txt.
with io.open("./answer.txt", 'w', encoding='utf8') as f, torch.no_grad():
    for i in tqdm(range(0, len(valid_list), BATCH_SIZE)):
        seeds = valid_list[i:i+BATCH_SIZE]
        outputs = aicup_predict(model, tokenizer, input=seeds)
        for o in outputs:
            f.write(o)
            f.write('\n')