In [1]:
%cd FiD
%ls

/home/joeyliang/自然語言處理作業/final report/FiD
[0m[01;34mcheckpoint[0m/                     get-model.sh          setup.py
CODE_OF_CONDUCT.md              LICENSE               [01;34msrc[0m/
CONTRIBUTING.md                 [01;34mmodel[0m/                test_reader.py
[01;34mdata[0m/                           passage_retrieval.py  train_reader.py
evaluate_retrieved_passages.py  README.md             train_retriever.py
generate_passage_embeddings.py  requirements.txt
get-data.sh                     run_command.txt


In [2]:
import torch
from torch.utils import tensorboard
import random
import json
import numpy as np
from pathlib import Path

# Load Data

In [3]:
def load_data(data_path=None, global_rank=-1, world_size=-1):
    """Load question/passage examples from a ``.json`` or ``.jsonl`` file.

    A ``.json`` file is parsed as a whole and iterated; a ``.jsonl`` file is
    read line by line, one JSON object per line.

    Args:
        data_path: Path to the data file. Must end with ``.json`` or ``.jsonl``.
        global_rank: When greater than -1, only examples whose position ``k``
            satisfies ``k % world_size == global_rank`` are kept.
        world_size: Modulus used together with ``global_rank``.

    Returns:
        A list of example dicts. Examples without an ``'id'`` key get their
        position in the file as id, and every entry of ``example['ctxs']``
        without a ``'score'`` key gets a default score.

    Raises:
        AssertionError: If ``data_path`` is empty or ``None``.
        ValueError: If ``data_path`` has an unsupported extension.
    """
    assert data_path
    is_jsonl = data_path.endswith('.jsonl')
    if not is_jsonl and not data_path.endswith('.json'):
        raise ValueError(
            f"Unsupported data file {data_path!r}: expected a .json or .jsonl file"
        )

    examples = []
    # The context manager closes the file even when parsing an example fails.
    with open(data_path, 'r') as fin:
        records = fin if is_jsonl else json.load(fin)
        for k, example in enumerate(records):
            # Skip examples that belong to another rank before parsing them.
            if global_rank > -1 and k % world_size != global_rank:
                continue
            if is_jsonl:
                example = json.loads(example)
            if 'id' not in example:
                example['id'] = k
            for c in example['ctxs']:
                if 'score' not in c:
                    # NOTE(review): the default depends on the example index k,
                    # not on the passage's rank, so all unscored passages of one
                    # example share the same score. Kept unchanged.
                    c['score'] = 1.0 / (k + 1)
            examples.append(example)

    return examples

# Read the training and evaluation examples from the JSON files under data/
# (relative paths; the notebook changed into the FiD directory with `%cd FiD`).
train_examples = load_data("data/train_data.json")
eval_examples  = load_data("data/eval_data.json")

# Convert to Dataset Format

In [4]:
class Dataset(torch.utils.data.Dataset):
    """Map-style dataset that turns raw examples into question/passages items.

    Args:
        data: List of example dicts. Each has a ``'question'`` and may have
            ``'target'``, ``'answers'`` and ``'ctxs'`` (a list of dicts with
            ``'title'``, ``'text'`` and ``'score'``).
        n_context: Maximum number of passages returned per example. When
            ``None`` no passages are returned and the data is not sorted.
        question_prefix: Text prepended to every question.
        title_prefix: Text prepended to every passage title.
        passage_prefix: Text prepended to every passage text.
    """

    def __init__(self,
                 data,
                 n_context=None,
                 question_prefix='question:',
                 title_prefix='title:',
                 passage_prefix='context:'):
        self.data = data
        self.n_context = n_context
        self.question_prefix = question_prefix
        self.title_prefix = title_prefix
        self.passage_prefix = passage_prefix
        self.sort_data()

    def __len__(self):
        """Return the number of examples."""
        return len(self.data)

    def get_target(self, example):
        """Return the training target for ``example`` with ``' </s>'`` appended.

        Uses ``example['target']`` when present, otherwise a randomly chosen
        entry of ``example['answers']``; returns ``None`` when neither exists.
        """
        if 'target' in example:
            return example['target'] + ' </s>'
        if 'answers' in example:
            return random.choice(example['answers']) + ' </s>'
        return None

    def __getitem__(self, index):
        """Build the item dict (index, question, target, passages, scores)."""
        example = self.data[index]
        question = self.question_prefix + " " + example['question']
        target = self.get_target(example)

        if 'ctxs' in example and self.n_context is not None:
            template = self.title_prefix + " {} " + self.passage_prefix + " {}"
            contexts = example['ctxs'][:self.n_context]
            passages = [template.format(c['title'], c['text']) for c in contexts]
            scores = torch.tensor([float(c['score']) for c in contexts])
            # NOTE(review): the original code assigned `contexts = [question]`
            # here when no passages were available, which had no effect because
            # `contexts` is not read afterwards. If a question-only fallback
            # passage is wanted, `passages` is the variable to set - confirm
            # against the collator before enabling it.
        else:
            passages, scores = None, None

        return {
            'index': index,
            'question': question,
            'target': target,
            'passages': passages,
            'scores': scores
        }

    def sort_data(self):
        """Sort each example's passages in place by descending score.

        Does nothing when ``n_context`` is ``None``, when the dataset is empty,
        or when the first example has no passages or no ``'score'`` on its
        first passage.
        """
        if self.n_context is None or len(self.data) == 0:
            return
        # Guard the probe of the first example: it may have no 'ctxs' at all
        # or an empty list, which previously raised KeyError / IndexError.
        first_ctxs = self.data[0].get('ctxs')
        if not first_ctxs or 'score' not in first_ctxs[0]:
            return
        for ex in self.data:
            ctxs = ex.get('ctxs')
            if ctxs:
                ctxs.sort(key=lambda x: float(x['score']), reverse=True)

    def get_example(self, index):
        """Return the raw example dict stored at ``index``."""
        return self.data[index]

In [5]:
# Maximum number of passages per question: Dataset.__getitem__ keeps only the
# first N_CONTEXT entries of each example's 'ctxs' list.
N_CONTEXT = 40
train_dataset = Dataset(train_examples, N_CONTEXT)
eval_dataset  = Dataset(eval_examples, N_CONTEXT)

# Define Parameters via Options

In [11]:
class Options:
    """Plain container for the run settings used by this notebook."""

    def __init__(self):
        # Output location: checkpoints and logs are written under
        # <checkpoint_dir>/<name>.
        self.checkpoint_dir = './checkpoint/'
        self.name = "baseline_FiD"

        # Batching and process settings.
        self.batch_size = 1
        self.local_rank = 0
        self.main_port = -1
        self.seed = 0

        # Training schedule: loop bound, gradient accumulation interval,
        # gradient-norm clipping value, and evaluation / checkpoint intervals
        # (all counted in training steps).
        self.total_steps = 100
        self.accumulation_steps = 1
        self.clip = 1.
        self.eval_freq = 5
        self.save_freq = 10

        # Distributed settings.
        self.world_size = -1
        # Unknown parameter; presumably enables multi-GPU training.
        self.is_distributed = False
        self.is_main = True


opt = Options()

# Load Model

In [12]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [13]:
import transformers
import src.slurm
import src.util
import src.evaluation
import src.data
import src.model
# Load the pretrained 't5-base' model, build a FiDT5 model from its config and
# copy the pretrained weights into it via load_t5.
t5 = transformers.T5ForConditionalGeneration.from_pretrained('t5-base')
model = src.model.FiDT5(t5.config)
model.load_t5(t5.state_dict())
model.to(device)
# AdamW over all model parameters.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, weight_decay=0.01)
# NOTE(review): warmup_steps and scheduler_steps (1000) are larger than
# opt.total_steps (100) - confirm this is intended for such a short run.
scheduler = src.util.WarmupLinearScheduler(optimizer, warmup_steps=1000, scheduler_steps=1000, min_ratio=0.,fixed_lr = True)
# Initial step counter and best dev exact-match score handed to train().
step, best_dev_em = 0, 0.0

4444444
5555555
4444444
6666666


In [14]:
# Tokenizer for the same 't5-base' checkpoint the model weights come from.
tokenizer = transformers.T5Tokenizer.from_pretrained('t5-base')
# NOTE(review): 125 is presumably the maximum passage text length and
# answer_maxlength=-1 presumably disables answer truncation - confirm against
# src.data.Collator.
collator = src.data.Collator(125, tokenizer, answer_maxlength=-1)

# Training 

In [10]:
from torch.utils.data import DataLoader, RandomSampler, DistributedSampler, SequentialSampler
def train(model, optimizer, scheduler, step, train_dataset, eval_dataset, opt, collator, best_dev_em, checkpoint_path):
    """Train ``model`` on ``train_dataset`` until ``step`` reaches ``opt.total_steps``.

    Every ``opt.eval_freq`` steps the model is scored with ``evaluate`` on
    ``eval_dataset``; a new best score is saved as ``'best_dev'``. Every
    ``opt.save_freq`` steps a ``'step-<n>'`` checkpoint is saved.

    Relies on the module-level ``logger`` and ``tokenizer`` objects, which must
    be defined before this function is called.

    Args:
        model: Model to train; called with ``input_ids``, ``attention_mask``
            and ``labels`` and expected to return the loss first.
        optimizer: Optimizer stepped every ``opt.accumulation_steps`` steps.
        scheduler: Learning-rate scheduler stepped together with the optimizer.
        step: Step counter to start from.
        train_dataset: Dataset sampled randomly for training batches.
        eval_dataset: Dataset passed to ``evaluate``.
        opt: Options object (batch size, frequencies, clipping, flags).
        collator: ``collate_fn`` used by the data loaders.
        best_dev_em: Best evaluation score seen so far.
        checkpoint_path: Directory passed to ``src.util.save``.
    """
    # Stays None when this is not the main process or the writer cannot be
    # created, so the later `is not None` check is always safe.
    tb_logger = None
    if opt.is_main:
        try:
            tb_logger = torch.utils.tensorboard.SummaryWriter(Path(opt.checkpoint_dir)/opt.name)
        except Exception:
            # Narrower than a bare `except:` so Ctrl-C is not swallowed.
            logger.warning('Tensorboard is not available.')

    # Fixed seed for torch's global RNG, set before the sampler is created.
    torch.manual_seed(200)
    train_sampler = RandomSampler(train_dataset)
    train_dataloader = DataLoader(
        train_dataset,
        sampler=train_sampler,
        batch_size=opt.batch_size,
        drop_last=True,
        num_workers=10,
        collate_fn=collator
    )

    # Send batches to wherever the model lives instead of assuming CUDA, so
    # the notebook's CPU fallback keeps working.
    device = next(model.parameters()).device

    curr_loss = 0.0
    model.train()
    while step < opt.total_steps:
        steps_at_epoch_start = step
        for batch in train_dataloader:
            step += 1
            (_, labels, _, context_ids, context_mask) = batch

            train_loss = model(
                input_ids=context_ids.to(device),
                attention_mask=context_mask.to(device),
                labels=labels.to(device),
                return_dict=False
            )[0]

            train_loss.backward()

            if step % opt.accumulation_steps == 0:
                torch.nn.utils.clip_grad_norm_(model.parameters(), opt.clip)
                optimizer.step()
                scheduler.step()
                model.zero_grad()

            train_loss = src.util.average_main(train_loss, opt)
            curr_loss += train_loss.item()

            if step % opt.eval_freq == 0:
                dev_em = evaluate(model, eval_dataset, tokenizer, collator, opt)
                # evaluate() switches the model to eval mode; switch back.
                model.train()
                if opt.is_main:
                    if dev_em > best_dev_em:
                        best_dev_em = dev_em
                        src.util.save(model, optimizer, scheduler, step, best_dev_em,
                                  opt, checkpoint_path, 'best_dev')
                    log = f"{step} / {opt.total_steps} |"
                    log += f"train: {curr_loss/opt.eval_freq:.3f} |"
                    log += f"evaluation: {100*dev_em:.2f}EM |"
                    log += f"lr: {scheduler.get_last_lr()[0]:.5f}"
                    logger.info(log)
                    if tb_logger is not None:
                        tb_logger.add_scalar("Evaluation", dev_em, step)
                        tb_logger.add_scalar("Training", curr_loss / (opt.eval_freq), step)
                    curr_loss = 0.

            if opt.is_main and step % opt.save_freq == 0:
                src.util.save(model, optimizer, scheduler, step, best_dev_em,
                          opt, checkpoint_path, f"step-{step}")
            # `>=` (not `>`) so exactly total_steps batches are processed.
            if step >= opt.total_steps:
                break

        if step == steps_at_epoch_start:
            # The loader produced no batch (e.g. fewer examples than
            # batch_size with drop_last=True); stop instead of looping forever.
            logger.warning('Training dataloader yielded no batches; stopping.')
            break

def evaluate(model, dataset, tokenizer, collator, opt):
    """Generate answers for ``dataset`` and return the exact-match score.

    Args:
        model: Model to evaluate; when it has a ``module`` attribute the
            wrapped module is used for generation.
        dataset: Dataset whose raw examples provide the gold ``'answers'``.
        tokenizer: Tokenizer used to decode generated token ids.
        collator: ``collate_fn`` used by the data loader.
        opt: Options object (provides ``batch_size``; also passed to
            ``src.util.weighted_average``).

    Returns:
        The first value returned by ``src.util.weighted_average`` for the mean
        of the per-example scores from ``src.evaluation.ems``.
    """
    sampler = SequentialSampler(dataset)
    dataloader = DataLoader(dataset,
        sampler=sampler,
        batch_size=opt.batch_size,
        drop_last=False,
        num_workers=10,
        collate_fn=collator
    )
    model.eval()
    total = 0
    exactmatch = []
    model = model.module if hasattr(model, "module") else model
    # Send batches to wherever the model lives instead of assuming CUDA, so
    # the notebook's CPU fallback keeps working.
    device = next(model.parameters()).device
    with torch.no_grad():
        for batch in dataloader:
            (idx, _, _, context_ids, context_mask) = batch

            outputs = model.generate(
                input_ids=context_ids.to(device),
                attention_mask=context_mask.to(device),
                max_length=50
            )

            for k, o in enumerate(outputs):
                ans = tokenizer.decode(o, skip_special_tokens=True)
                # idx holds the dataset positions of the batch items, used to
                # look up the gold answers of the k-th generated output.
                gold = dataset.get_example(idx[k])['answers']
                score = src.evaluation.ems(ans, gold)
                total += 1
                exactmatch.append(score)

    exactmatch, total = src.util.weighted_average(np.mean(exactmatch), total, opt)
    return exactmatch

In [14]:
# Create the run directory <checkpoint_dir>/<name> if it does not exist yet.
checkpoint_path = Path(opt.checkpoint_dir)/opt.name
checkpoint_path.mkdir(parents=True, exist_ok=True)
# Module-level logger; train() uses this global, so it must be created before
# train() is called. The log file is run.log inside the run directory.
logger = src.util.init_logger(
        opt.is_main,
        opt.is_distributed,
        checkpoint_path / 'run.log'
    )
# The model was already moved to `device` when it was created; repeated here.
model.to(device)
logger.info("Start training")
# Start training from the initial step counter and best score defined earlier.
train(
        model,
        optimizer,
        scheduler,
        step,
        train_dataset,
        eval_dataset,
        opt,
        collator,
        best_dev_em,
        checkpoint_path
    )



RuntimeError: CUDA out of memory. Tried to allocate 16.00 MiB (GPU 0; 7.93 GiB total capacity; 7.01 GiB already allocated; 10.38 MiB free; 7.30 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [None]:
# Note on an error encountered:
# FiD's requirements pin transformers 3.0.2,
# and in that version the model output is returned as a tuple.
# On the latest transformers version, you can fix this issue by adding the argument 'return_dict=False' to the model call,
# as follows:
#train_loss = model( input_ids=context_ids.cuda(), attention_mask=context_mask.cuda(), labels=labels.cuda(), return_dict=False )[0]