2_finetune_init_lf_bigpurple.py
import logging
import argparse
import os
from os.path import join
import math
import torch
import torch.nn as nn
from dataclasses import dataclass, field
from torch.utils.data import ConcatDataset
import glob
import numpy as np
from transformers import TextDataset, LineByLineTextDataset, DataCollatorForLanguageModeling, Trainer
# Use the Longformer classes from transformers directly instead of converting a RoBERTa model into a long model.
from transformers import LongformerForMaskedLM, LongformerConfig, BertTokenizer
from transformers import TrainingArguments, HfArgumentParser
from transformers.modeling_longformer import LongformerSelfAttention

# Choose GPU
#os.environ["CUDA_VISIBLE_DEVICES"] = "0"
def use_embeddings_fasttext(model, word_embeddings):
    emb_tensor = torch.from_numpy(word_embeddings).float()
    model.longformer.embeddings.word_embeddings.weight.data = emb_tensor
    return model
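# Note (added for clarity): word_embeddings is expected to be a (vocab_size, hidden_size) matrix
# whose row order matches the tokenizer vocabulary; the assignment above replaces the weight tensor
# without any shape check, so a mismatched matrix would be accepted silently.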
def pretrain_and_evaluate(args, model, tokenizer, train_only, eval_only, model_path=None):
    # Train from scratch if model_path is None.
    def _dataset(file_path):
        return LineByLineTextDataset(tokenizer=tokenizer, file_path=file_path, block_size=512)

    if train_only:
        logger.info(f'Loading and tokenizing training data is usually slow: {args.train_datapath}')
        train_dataset = ConcatDataset([_dataset(f) for f in glob.glob('/gpfs/scratch/xl3119/capstone/data/splited_train/*')])
        val_dataset = _dataset(args.val_datapath)
    elif eval_only:
        print("Assign validation dataset")
        val_dataset = _dataset(args.val_datapath)
        train_dataset = val_dataset
    else:
        logger.info(f'Loading and tokenizing training data is usually slow: {args.train_datapath}')
        train_dataset = ConcatDataset([_dataset(f) for f in glob.glob('/scratch/xl3119/capstone/data/splited_train/*')])
        val_dataset = _dataset(args.val_datapath)

    print("Creating data collator with mlm")
    data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer,
                                                    mlm=True,
                                                    mlm_probability=0.15)
    print("Start Trainer")
    trainer = Trainer(model=model,
                      args=args,
                      data_collator=data_collator,
                      train_dataset=train_dataset,
                      eval_dataset=val_dataset,
                      prediction_loss_only=True)

    if not eval_only:
        trainer.train(model_path=model_path)  # model_path=None trains from scratch
        trainer.save_model(args.output_dir)   # save the final model to output_dir

    # Evaluation
    results = {}
    logger.info("*** Evaluate ***")
    eval_output = trainer.evaluate()
    eval_loss = eval_output['eval_loss']
    perplexity = math.exp(eval_loss)
    results["perplexity"] = perplexity
    results["bpc"] = eval_loss / math.log(2)

    output_eval_file = os.path.join(args.output_dir, "eval_results_mlm.txt")
    with open(output_eval_file, "a") as writer:
        writer.write("***** Eval results *****\n")
        logger.info("***** Eval results *****")
        for key, value in results.items():
            logger.info(f"  {key} = {value}")
            writer.write(f"{key} = {value}\n")
@dataclass
class ModelArgs:
    attention_window: int = field(default=512, metadata={"help": "Size of attention window"})
    max_pos: int = field(default=4096, metadata={"help": "Maximum position"})
if __name__ == "__main__":
    logger = logging.getLogger(__name__)
    logging.basicConfig(level=logging.INFO)

    parser = HfArgumentParser((TrainingArguments, ModelArgs,))
    training_args, model_args = parser.parse_args_into_dataclasses(look_for_args_file=False, args=[
        '--output_dir', '/gpfs/scratch/xl3119/capstone/checkpoints/longformer_mimic_tokenizer_gpu4_short',
        '--warmup_steps', '250',
        '--learning_rate', '1e-4',
        '--weight_decay', '0.01',
        '--adam_epsilon', '1e-6',
        '--max_steps', '150000',
        '--logging_steps', '500',
        '--save_steps', '500',
        '--max_grad_norm', '5.0',
        '--per_gpu_eval_batch_size', '1',
        '--per_gpu_train_batch_size', '1',  # 32GB GPU with fp32
        '--gradient_accumulation_steps', '16',
        #'--evaluate_during_training',  # removed to reduce training time
        '--do_train',
    ])
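    # Note (added for clarity, not in the original script): with per_gpu_train_batch_size=1 and
    # gradient_accumulation_steps=16, each optimizer step accumulates an effective batch of 16
    # sequences per GPU. model_args (attention_window, max_pos) is parsed but never used below;
    # the model geometry comes from the Longformer config JSON loaded further down.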
    #train_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_train_patients.txt'
    #val_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_val_patients.txt'
    # These are smaller files used for testing.
    train_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_train_patients_token.txt'
    val_fn = '/gpfs/scratch/xl3119/capstone/data/Preproc0_clinical_sentences_all_without_number_val_patients_token.txt'
    training_args.train_datapath = train_fn
    training_args.val_datapath = val_fn

    ##################### Use the Longformer implementation shipped with transformers
    init_config = LongformerConfig.from_json_file('config_files/longformer_base_4096/config.json')
    mimic_tokenizer = BertTokenizer.from_pretrained('mimic_tokenizer')
    word_embeddings = np.loadtxt(join('/gpfs/scratch/xl3119/capstone/wd_emb', "word_embedding_matrix.txt"))
    longformer_model = LongformerForMaskedLM(init_config)
    longformer_model = use_embeddings_fasttext(longformer_model, word_embeddings)
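    # np.loadtxt returns a float64 array; use_embeddings_fasttext casts it to float32 before
    # assigning it as the word-embedding weight, so no extra dtype conversion is needed here.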
    # longformer_tokenizer = LongformerTokenizer.from_pretrained('allenai/longformer-base-4096')
    logger.info('Train and eval with Longformer pretrained ...')
    pretrain_and_evaluate(training_args,
                          longformer_model,
                          mimic_tokenizer,
                          train_only=True,
                          eval_only=False,
                          model_path=None)
    # Pass model_path=training_args.output_dir instead to resume training from a model that was
    # instantiated from a local checkpoint path.
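    # Example launch (hypothetical): all TrainingArguments are hard-coded in the args list above
    # (look_for_args_file=False), so the script can be started without command-line flags, e.g.
    #   python 2_finetune_init_lf_bigpurple.py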