In [1]:
from transformers import PreTrainedTokenizerFast, LineByLineTextDataset

# Load the already-trained fast tokenizer published/saved as `bert-base-dv`.
tokenizer = PreTrainedTokenizerFast.from_pretrained('bert-base-dv')

# Plain-text training corpus; the dataset class reads it line by line and
# tokenizes each line with the tokenizer above, using a block size of 128.
train_file = '../data/dv-corpus-clean-unique-2m.txt'
dataset = LineByLineTextDataset(tokenizer=tokenizer, file_path=train_file, block_size=128)



In [2]:
from transformers import DataCollatorForLanguageModeling

# Batch builder for the masked-language-modelling objective: masking is
# enabled and applied with probability 0.15.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=True,
    mlm_probability=0.15,
)

In [3]:
from transformers import BertConfig

# Architecture of the model trained below: 6 hidden layers, 12 attention
# heads, hidden size 768, 514 position embeddings and a single token type.
#
# The embedding-table size and the padding index are taken from the tokenizer
# instead of being hard-coded. A literal vocab size that is smaller than the
# tokenizer's would make token ids index past the embedding matrix, and a
# padding index that differs from the tokenizer's pad id would freeze the
# embedding of whatever real token happens to own that id.
config = BertConfig(
    vocab_size=len(tokenizer),            # base vocabulary plus any added tokens
    pad_token_id=tokenizer.pad_token_id,  # keep the padding row aligned with the tokenizer
    max_position_embeddings=514,
    hidden_size=768,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [4]:
from transformers import BertForMaskedLM, set_seed

# The model is built from a bare config, so its weights are randomly
# initialised right here -- before any Trainer exists. Fix the RNG seed first
# so the starting weights are the same after every kernel restart.
# 42 is the same value TrainingArguments uses as its default `seed`.
set_seed(42)

model = BertForMaskedLM(config)

In [5]:
from transformers import Trainer, TrainingArguments

# NOTE(review): the recorded output of the training cell reports a per-device
# batch size of 8 and warns about the deprecated `per_gpu_train_batch_size`
# flag, so it looks like it was produced by an earlier version of these
# arguments -- confirm before relying on the logged numbers.
training_args = TrainingArguments(
    # Where checkpoints are written and how many are retained.
    output_dir="bert-base-dv",
    overwrite_output_dir=True,
    save_steps=10_000,
    save_total_limit=2,
    # Training schedule.
    num_train_epochs=1,
    per_device_train_batch_size=32,
    prediction_loss_only=True,
)

# Tie together the model, the arguments above, the line-by-line dataset and
# the masked-LM collator.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    data_collator=data_collator,
)

In [6]:
# Run training; keep the returned TrainOutput and show it as the cell result.
train_result = trainer.train()
train_result

Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
***** Running training *****
  Num examples = 2000001
  Num Epochs = 1
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 32
  Gradient Accumulation steps = 1
  Total optimization steps = 62501
Using deprecated `--per_gpu_train_batch_size` argument which will be removed in a future version. Using `--per_device_train_batch_size` is preferred.
  1%|          | 502/62501 [00:39<1:20:35, 12.82it/s]

{'loss': 8.7396, 'learning_rate': 4.96000063998976e-05, 'epoch': 0.01}


  2%|▏         | 1001/62501 [01:18<1:27:30, 11.71it/s]

{'loss': 8.291, 'learning_rate': 4.92000127997952e-05, 'epoch': 0.02}


  2%|▏         | 1502/62501 [01:55<1:18:03, 13.02it/s]

{'loss': 8.0642, 'learning_rate': 4.880001919969281e-05, 'epoch': 0.02}


  3%|▎         | 2002/62501 [02:31<1:15:45, 13.31it/s]

{'loss': 7.8933, 'learning_rate': 4.840002559959041e-05, 'epoch': 0.03}


  4%|▍         | 2502/62501 [03:08<1:14:29, 13.42it/s]

{'loss': 7.7937, 'learning_rate': 4.800003199948801e-05, 'epoch': 0.04}


  5%|▍         | 3002/62501 [03:44<1:12:59, 13.58it/s]

{'loss': 7.6676, 'learning_rate': 4.760003839938561e-05, 'epoch': 0.05}


  6%|▌         | 3503/62501 [04:22<1:11:43, 13.71it/s]

{'loss': 7.6015, 'learning_rate': 4.7200044799283216e-05, 'epoch': 0.06}


  6%|▋         | 4001/62501 [05:02<1:22:44, 11.78it/s]

{'loss': 7.5397, 'learning_rate': 4.680005119918082e-05, 'epoch': 0.06}


  7%|▋         | 4503/62501 [05:41<1:10:12, 13.77it/s]

{'loss': 7.4722, 'learning_rate': 4.6400057599078416e-05, 'epoch': 0.07}


  8%|▊         | 5001/62501 [06:20<1:25:13, 11.24it/s]

{'loss': 7.4127, 'learning_rate': 4.600006399897602e-05, 'epoch': 0.08}


  9%|▉         | 5501/62501 [06:58<1:00:27, 15.71it/s]

{'loss': 7.3379, 'learning_rate': 4.560007039887362e-05, 'epoch': 0.09}


 10%|▉         | 6002/62501 [07:35<1:12:03, 13.07it/s]

{'loss': 7.2808, 'learning_rate': 4.520007679877122e-05, 'epoch': 0.1}


 10%|█         | 6502/62501 [08:13<1:02:12, 15.00it/s]

{'loss': 7.1996, 'learning_rate': 4.4800083198668825e-05, 'epoch': 0.1}


 11%|█         | 7002/62501 [08:50<1:08:39, 13.47it/s]

{'loss': 7.121, 'learning_rate': 4.440008959856642e-05, 'epoch': 0.11}


 12%|█▏        | 7501/62501 [09:28<1:06:58, 13.69it/s]

{'loss': 7.0239, 'learning_rate': 4.4000095998464026e-05, 'epoch': 0.12}


 13%|█▎        | 8001/62501 [10:05<1:14:37, 12.17it/s]

{'loss': 6.8887, 'learning_rate': 4.360010239836163e-05, 'epoch': 0.13}


 14%|█▎        | 8501/62501 [10:43<1:07:33, 13.32it/s]

{'loss': 6.8052, 'learning_rate': 4.3200108798259233e-05, 'epoch': 0.14}


 14%|█▍        | 9001/62501 [11:20<1:07:45, 13.16it/s]

{'loss': 6.6964, 'learning_rate': 4.280011519815683e-05, 'epoch': 0.14}


 15%|█▌        | 9502/62501 [11:58<1:05:40, 13.45it/s]

{'loss': 6.5947, 'learning_rate': 4.2400121598054434e-05, 'epoch': 0.15}


 16%|█▌        | 10000/62501 [12:35<1:10:46, 12.36it/s]Saving model checkpoint to bert-base-dv\checkpoint-10000
Configuration saved in bert-base-dv\checkpoint-10000\config.json


{'loss': 6.5121, 'learning_rate': 4.200012799795203e-05, 'epoch': 0.16}


Model weights saved in bert-base-dv\checkpoint-10000\pytorch_model.bin
 17%|█▋        | 10502/62501 [13:13<1:02:19, 13.91it/s]

{'loss': 6.418, 'learning_rate': 4.1600134397849635e-05, 'epoch': 0.17}


 18%|█▊        | 11002/62501 [13:50<1:02:50, 13.66it/s]

{'loss': 6.3651, 'learning_rate': 4.120014079774724e-05, 'epoch': 0.18}


 18%|█▊        | 11501/62501 [14:27<1:07:00, 12.68it/s]

{'loss': 6.2611, 'learning_rate': 4.0800147197644836e-05, 'epoch': 0.18}


 19%|█▉        | 12003/62501 [15:05<1:00:21, 13.94it/s]

{'loss': 6.1837, 'learning_rate': 4.040015359754244e-05, 'epoch': 0.19}


 20%|██        | 12502/62501 [15:42<1:01:04, 13.64it/s]

{'loss': 6.0915, 'learning_rate': 4.0000159997440044e-05, 'epoch': 0.2}


 21%|██        | 13001/62501 [16:19<1:06:04, 12.49it/s]

{'loss': 6.0472, 'learning_rate': 3.960016639733765e-05, 'epoch': 0.21}


 22%|██▏       | 13501/62501 [16:56<1:08:00, 12.01it/s]

{'loss': 6.0004, 'learning_rate': 3.9200172797235245e-05, 'epoch': 0.22}


 22%|██▏       | 14002/62501 [17:34<1:02:34, 12.92it/s]

{'loss': 5.9526, 'learning_rate': 3.880017919713285e-05, 'epoch': 0.22}


 23%|██▎       | 14501/62501 [18:11<1:03:12, 12.66it/s]

{'loss': 5.8898, 'learning_rate': 3.8400185597030445e-05, 'epoch': 0.23}


 24%|██▍       | 15001/62501 [18:48<51:54, 15.25it/s]

{'loss': 5.8364, 'learning_rate': 3.8000191996928056e-05, 'epoch': 0.24}


 25%|██▍       | 15502/62501 [19:26<1:04:49, 12.08it/s]

{'loss': 5.8091, 'learning_rate': 3.760019839682565e-05, 'epoch': 0.25}


 26%|██▌       | 16001/62501 [20:02<55:07, 14.06it/s]

{'loss': 5.7239, 'learning_rate': 3.720020479672326e-05, 'epoch': 0.26}


 26%|██▋       | 16501/62501 [20:40<56:03, 13.68it/s]

{'loss': 5.7062, 'learning_rate': 3.6800211196620854e-05, 'epoch': 0.26}


 27%|██▋       | 17001/62501 [21:17<54:15, 13.98it/s]

{'loss': 5.6145, 'learning_rate': 3.640021759651846e-05, 'epoch': 0.27}


 28%|██▊       | 17502/62501 [21:54<1:00:55, 12.31it/s]

{'loss': 5.606, 'learning_rate': 3.600022399641606e-05, 'epoch': 0.28}


 29%|██▉       | 18002/62501 [22:32<1:04:29, 11.50it/s]

{'loss': 5.5229, 'learning_rate': 3.560023039631366e-05, 'epoch': 0.29}


 30%|██▉       | 18502/62501 [23:09<57:17, 12.80it/s]

{'loss': 5.5304, 'learning_rate': 3.520023679621126e-05, 'epoch': 0.3}


 30%|███       | 19002/62501 [23:46<50:55, 14.24it/s]

{'loss': 5.4259, 'learning_rate': 3.480024319610886e-05, 'epoch': 0.3}


 31%|███       | 19503/62501 [24:23<48:06, 14.90it/s]

{'loss': 5.4166, 'learning_rate': 3.440024959600647e-05, 'epoch': 0.31}


 32%|███▏      | 20000/62501 [25:01<57:59, 12.22it/s]Saving model checkpoint to bert-base-dv\checkpoint-20000
Configuration saved in bert-base-dv\checkpoint-20000\config.json


{'loss': 5.3895, 'learning_rate': 3.400025599590407e-05, 'epoch': 0.32}


Model weights saved in bert-base-dv\checkpoint-20000\pytorch_model.bin
 33%|███▎      | 20501/62501 [25:39<45:04, 15.53it/s]

{'loss': 5.3612, 'learning_rate': 3.360026239580167e-05, 'epoch': 0.33}


 34%|███▎      | 21001/62501 [26:17<54:16, 12.74it/s]

{'loss': 5.3012, 'learning_rate': 3.320026879569927e-05, 'epoch': 0.34}


 34%|███▍      | 21501/62501 [26:54<53:39, 12.73it/s]

{'loss': 5.2987, 'learning_rate': 3.280027519559687e-05, 'epoch': 0.34}


 35%|███▌      | 22001/62501 [27:32<53:22, 12.65it/s]

{'loss': 5.25, 'learning_rate': 3.2400281595494476e-05, 'epoch': 0.35}


 36%|███▌      | 22501/62501 [28:09<51:44, 12.88it/s]

{'loss': 5.1976, 'learning_rate': 3.200028799539207e-05, 'epoch': 0.36}


 37%|███▋      | 23001/62501 [28:47<47:04, 13.98it/s]

{'loss': 5.1795, 'learning_rate': 3.1600294395289677e-05, 'epoch': 0.37}


 38%|███▊      | 23502/62501 [29:24<47:41, 13.63it/s]

{'loss': 5.1235, 'learning_rate': 3.1200300795187274e-05, 'epoch': 0.38}


 38%|███▊      | 24001/62501 [30:01<43:01, 14.91it/s]

{'loss': 5.1173, 'learning_rate': 3.0800307195084884e-05, 'epoch': 0.38}


 39%|███▉      | 24500/62501 [30:38<46:59, 13.48it/s]

{'loss': 5.1143, 'learning_rate': 3.040031359498248e-05, 'epoch': 0.39}


 40%|████      | 25002/62501 [31:16<42:34, 14.68it/s]

{'loss': 5.081, 'learning_rate': 3.000031999488008e-05, 'epoch': 0.4}


 41%|████      | 25501/62501 [31:54<47:45, 12.91it/s]

{'loss': 5.0587, 'learning_rate': 2.9600326394777682e-05, 'epoch': 0.41}


 42%|████▏     | 26002/62501 [32:31<49:39, 12.25it/s]

{'loss': 5.0184, 'learning_rate': 2.920033279467529e-05, 'epoch': 0.42}


 42%|████▏     | 26502/62501 [33:08<47:02, 12.75it/s]

{'loss': 4.9875, 'learning_rate': 2.880033919457289e-05, 'epoch': 0.42}


 43%|████▎     | 27002/62501 [33:45<40:38, 14.56it/s]

{'loss': 4.9831, 'learning_rate': 2.840034559447049e-05, 'epoch': 0.43}


 44%|████▍     | 27502/62501 [34:22<45:03, 12.95it/s]

{'loss': 4.9554, 'learning_rate': 2.800035199436809e-05, 'epoch': 0.44}


 45%|████▍     | 28002/62501 [34:59<40:55, 14.05it/s]

{'loss': 4.9189, 'learning_rate': 2.760035839426569e-05, 'epoch': 0.45}


 46%|████▌     | 28501/62501 [35:37<46:57, 12.07it/s]

{'loss': 4.8981, 'learning_rate': 2.7200364794163298e-05, 'epoch': 0.46}


 46%|████▋     | 29002/62501 [36:15<45:20, 12.31it/s]

{'loss': 4.8886, 'learning_rate': 2.68003711940609e-05, 'epoch': 0.46}


 47%|████▋     | 29502/62501 [36:52<43:20, 12.69it/s]

{'loss': 4.8727, 'learning_rate': 2.6400377593958496e-05, 'epoch': 0.47}


 48%|████▊     | 30000/62501 [37:29<42:44, 12.67it/s]Saving model checkpoint to bert-base-dv\checkpoint-30000
Configuration saved in bert-base-dv\checkpoint-30000\config.json


{'loss': 4.87, 'learning_rate': 2.6000383993856096e-05, 'epoch': 0.48}


Model weights saved in bert-base-dv\checkpoint-30000\pytorch_model.bin
Deleting older checkpoint [bert-base-dv\checkpoint-10000] due to args.save_total_limit
 49%|████▉     | 30502/62501 [38:08<37:44, 14.13it/s]

{'loss': 4.819, 'learning_rate': 2.5600390393753703e-05, 'epoch': 0.49}


 50%|████▉     | 31003/62501 [38:45<37:44, 13.91it/s]

{'loss': 4.7908, 'learning_rate': 2.5200396793651304e-05, 'epoch': 0.5}


 50%|█████     | 31502/62501 [39:22<34:33, 14.95it/s]

{'loss': 4.7886, 'learning_rate': 2.4800403193548904e-05, 'epoch': 0.5}


 51%|█████     | 32003/62501 [39:59<35:27, 14.33it/s]

{'loss': 4.7746, 'learning_rate': 2.4400409593446505e-05, 'epoch': 0.51}


 52%|█████▏    | 32501/62501 [40:36<41:03, 12.18it/s]

{'loss': 4.7641, 'learning_rate': 2.400041599334411e-05, 'epoch': 0.52}


 53%|█████▎    | 33001/62501 [41:14<37:57, 12.95it/s]

{'loss': 4.7278, 'learning_rate': 2.360042239324171e-05, 'epoch': 0.53}


 54%|█████▎    | 33502/62501 [41:51<33:26, 14.45it/s]

{'loss': 4.7158, 'learning_rate': 2.3200428793139313e-05, 'epoch': 0.54}


 54%|█████▍    | 34002/62501 [42:28<37:48, 12.56it/s]

{'loss': 4.714, 'learning_rate': 2.2800435193036913e-05, 'epoch': 0.54}


 55%|█████▌    | 34501/62501 [43:06<38:06, 12.24it/s]

{'loss': 4.6824, 'learning_rate': 2.2400441592934514e-05, 'epoch': 0.55}


 56%|█████▌    | 35001/62501 [43:43<33:20, 13.75it/s]

{'loss': 4.6637, 'learning_rate': 2.2000447992832114e-05, 'epoch': 0.56}


 57%|█████▋    | 35501/62501 [44:20<35:52, 12.54it/s]

{'loss': 4.6588, 'learning_rate': 2.1600454392729718e-05, 'epoch': 0.57}


 58%|█████▊    | 36003/62501 [44:57<30:39, 14.41it/s]

{'loss': 4.6259, 'learning_rate': 2.120046079262732e-05, 'epoch': 0.58}


 58%|█████▊    | 36501/62501 [45:34<35:02, 12.37it/s]

{'loss': 4.6465, 'learning_rate': 2.080046719252492e-05, 'epoch': 0.58}


 59%|█████▉    | 37001/62501 [46:12<32:26, 13.10it/s]

{'loss': 4.6042, 'learning_rate': 2.0400473592422523e-05, 'epoch': 0.59}


 60%|██████    | 37501/62501 [46:49<30:54, 13.48it/s]

{'loss': 4.6076, 'learning_rate': 2.0000479992320123e-05, 'epoch': 0.6}


 61%|██████    | 38001/62501 [47:26<31:42, 12.88it/s]

{'loss': 4.5923, 'learning_rate': 1.9600486392217727e-05, 'epoch': 0.61}


 62%|██████▏   | 38501/62501 [48:03<33:13, 12.04it/s]

{'loss': 4.5844, 'learning_rate': 1.9200492792115327e-05, 'epoch': 0.62}


 62%|██████▏   | 39002/62501 [48:41<31:08, 12.58it/s]

{'loss': 4.5566, 'learning_rate': 1.880049919201293e-05, 'epoch': 0.62}


 63%|██████▎   | 39502/62501 [49:19<30:48, 12.44it/s]

{'loss': 4.5726, 'learning_rate': 1.840050559191053e-05, 'epoch': 0.63}


 64%|██████▍   | 40000/62501 [49:56<29:00, 12.93it/s]Saving model checkpoint to bert-base-dv\checkpoint-40000
Configuration saved in bert-base-dv\checkpoint-40000\config.json


{'loss': 4.535, 'learning_rate': 1.8000511991808132e-05, 'epoch': 0.64}


Model weights saved in bert-base-dv\checkpoint-40000\pytorch_model.bin
Deleting older checkpoint [bert-base-dv\checkpoint-20000] due to args.save_total_limit
 65%|██████▍   | 40502/62501 [50:34<25:58, 14.12it/s]

{'loss': 4.5324, 'learning_rate': 1.7600518391705732e-05, 'epoch': 0.65}


 66%|██████▌   | 41002/62501 [51:12<27:09, 13.20it/s]

{'loss': 4.5063, 'learning_rate': 1.7200524791603333e-05, 'epoch': 0.66}


 66%|██████▋   | 41502/62501 [51:48<27:47, 12.59it/s]

{'loss': 4.4647, 'learning_rate': 1.6800531191500937e-05, 'epoch': 0.66}


 67%|██████▋   | 42002/62501 [52:25<24:55, 13.70it/s]

{'loss': 4.4943, 'learning_rate': 1.6400537591398537e-05, 'epoch': 0.67}


 68%|██████▊   | 42503/62501 [53:03<24:25, 13.64it/s]

{'loss': 4.4723, 'learning_rate': 1.600054399129614e-05, 'epoch': 0.68}


 69%|██████▉   | 43002/62501 [53:40<25:12, 12.89it/s]

{'loss': 4.4981, 'learning_rate': 1.560055039119374e-05, 'epoch': 0.69}


 70%|██████▉   | 43502/62501 [54:17<25:29, 12.42it/s]

{'loss': 4.4688, 'learning_rate': 1.5200556791091345e-05, 'epoch': 0.7}


 70%|███████   | 44002/62501 [54:54<23:40, 13.02it/s]

{'loss': 4.4385, 'learning_rate': 1.4800563190988944e-05, 'epoch': 0.7}


 71%|███████   | 44502/62501 [55:32<22:51, 13.12it/s]

{'loss': 4.4361, 'learning_rate': 1.4400569590886548e-05, 'epoch': 0.71}


 72%|███████▏  | 45002/62501 [56:09<20:58, 13.91it/s]

{'loss': 4.4146, 'learning_rate': 1.4000575990784148e-05, 'epoch': 0.72}


 73%|███████▎  | 45502/62501 [56:46<19:27, 14.56it/s]

{'loss': 4.4255, 'learning_rate': 1.3600582390681749e-05, 'epoch': 0.73}


 74%|███████▎  | 46001/62501 [57:23<20:22, 13.50it/s]

{'loss': 4.4163, 'learning_rate': 1.3200588790579352e-05, 'epoch': 0.74}


 74%|███████▍  | 46501/62501 [58:00<22:54, 11.64it/s]

{'loss': 4.3826, 'learning_rate': 1.2800595190476953e-05, 'epoch': 0.74}


 75%|███████▌  | 47001/62501 [58:37<21:45, 11.88it/s]

{'loss': 4.3806, 'learning_rate': 1.2400601590374553e-05, 'epoch': 0.75}


 76%|███████▌  | 47501/62501 [59:15<19:30, 12.82it/s]

{'loss': 4.3525, 'learning_rate': 1.2000607990272155e-05, 'epoch': 0.76}


 77%|███████▋  | 48002/62501 [59:52<17:47, 13.58it/s]

{'loss': 4.3635, 'learning_rate': 1.1600614390169758e-05, 'epoch': 0.77}


 78%|███████▊  | 48502/62501 [1:00:30<19:10, 12.17it/s]

{'loss': 4.3626, 'learning_rate': 1.120062079006736e-05, 'epoch': 0.78}


 78%|███████▊  | 49001/62501 [1:01:07<17:58, 12.51it/s]

{'loss': 4.3716, 'learning_rate': 1.0800627189964962e-05, 'epoch': 0.78}


 79%|███████▉  | 49501/62501 [1:01:44<16:40, 12.99it/s]

{'loss': 4.3305, 'learning_rate': 1.0400633589862562e-05, 'epoch': 0.79}


 80%|███████▉  | 50000/62501 [1:02:21<15:46, 13.21it/s]Saving model checkpoint to bert-base-dv\checkpoint-50000
Configuration saved in bert-base-dv\checkpoint-50000\config.json


{'loss': 4.341, 'learning_rate': 1.0000639989760164e-05, 'epoch': 0.8}


Model weights saved in bert-base-dv\checkpoint-50000\pytorch_model.bin
Deleting older checkpoint [bert-base-dv\checkpoint-30000] due to args.save_total_limit
 81%|████████  | 50502/62501 [1:02:59<15:18, 13.06it/s]

{'loss': 4.3448, 'learning_rate': 9.600646389657766e-06, 'epoch': 0.81}


 82%|████████▏ | 51002/62501 [1:03:36<14:34, 13.14it/s]

{'loss': 4.3215, 'learning_rate': 9.200652789555369e-06, 'epoch': 0.82}


 82%|████████▏ | 51501/62501 [1:04:13<13:40, 13.40it/s]

{'loss': 4.3329, 'learning_rate': 8.800659189452969e-06, 'epoch': 0.82}


 83%|████████▎ | 52002/62501 [1:04:51<12:12, 14.34it/s]

{'loss': 4.3095, 'learning_rate': 8.400665589350571e-06, 'epoch': 0.83}


 84%|████████▍ | 52502/62501 [1:05:29<12:54, 12.91it/s]

{'loss': 4.3209, 'learning_rate': 8.000671989248172e-06, 'epoch': 0.84}


 85%|████████▍ | 53001/62501 [1:06:07<12:58, 12.20it/s]

{'loss': 4.309, 'learning_rate': 7.600678389145774e-06, 'epoch': 0.85}


 86%|████████▌ | 53501/62501 [1:06:44<11:45, 12.75it/s]

{'loss': 4.2963, 'learning_rate': 7.200684789043376e-06, 'epoch': 0.86}


 86%|████████▋ | 54002/62501 [1:07:22<09:54, 14.30it/s]

{'loss': 4.297, 'learning_rate': 6.800691188940977e-06, 'epoch': 0.86}


 87%|████████▋ | 54502/62501 [1:07:59<10:28, 12.73it/s]

{'loss': 4.2721, 'learning_rate': 6.400697588838579e-06, 'epoch': 0.87}


 88%|████████▊ | 55003/62501 [1:08:36<08:47, 14.20it/s]

{'loss': 4.2842, 'learning_rate': 6.0007039887361805e-06, 'epoch': 0.88}


 89%|████████▉ | 55502/62501 [1:09:14<08:58, 13.01it/s]

{'loss': 4.2663, 'learning_rate': 5.600710388633782e-06, 'epoch': 0.89}


 90%|████████▉ | 56002/62501 [1:09:52<07:59, 13.54it/s]

{'loss': 4.2773, 'learning_rate': 5.200716788531384e-06, 'epoch': 0.9}


 90%|█████████ | 56502/62501 [1:10:29<07:27, 13.42it/s]

{'loss': 4.2774, 'learning_rate': 4.800723188428985e-06, 'epoch': 0.9}


 91%|█████████ | 57002/62501 [1:11:06<06:43, 13.62it/s]

{'loss': 4.2729, 'learning_rate': 4.4007295883265865e-06, 'epoch': 0.91}


 92%|█████████▏| 57502/62501 [1:11:43<06:56, 12.00it/s]

{'loss': 4.2855, 'learning_rate': 4.000735988224189e-06, 'epoch': 0.92}


 93%|█████████▎| 58003/62501 [1:12:21<05:23, 13.88it/s]

{'loss': 4.2892, 'learning_rate': 3.6007423881217903e-06, 'epoch': 0.93}


 94%|█████████▎| 58503/62501 [1:12:58<04:40, 14.23it/s]

{'loss': 4.2684, 'learning_rate': 3.200748788019392e-06, 'epoch': 0.94}


 94%|█████████▍| 59002/62501 [1:13:36<04:48, 12.14it/s]

{'loss': 4.2787, 'learning_rate': 2.8007551879169933e-06, 'epoch': 0.94}


 95%|█████████▌| 59502/62501 [1:14:14<03:39, 13.66it/s]

{'loss': 4.2454, 'learning_rate': 2.4007615878145954e-06, 'epoch': 0.95}


 96%|█████████▌| 60000/62501 [1:14:51<02:58, 14.00it/s]Saving model checkpoint to bert-base-dv\checkpoint-60000
Configuration saved in bert-base-dv\checkpoint-60000\config.json


{'loss': 4.2455, 'learning_rate': 2.0007679877121967e-06, 'epoch': 0.96}


Model weights saved in bert-base-dv\checkpoint-60000\pytorch_model.bin
Deleting older checkpoint [bert-base-dv\checkpoint-40000] due to args.save_total_limit
 97%|█████████▋| 60501/62501 [1:15:29<02:37, 12.66it/s]

{'loss': 4.233, 'learning_rate': 1.6007743876097984e-06, 'epoch': 0.97}


 98%|█████████▊| 61002/62501 [1:16:07<01:54, 13.10it/s]

{'loss': 4.2438, 'learning_rate': 1.2007807875074e-06, 'epoch': 0.98}


 98%|█████████▊| 61502/62501 [1:16:44<01:23, 11.97it/s]

{'loss': 4.2411, 'learning_rate': 8.007871874050015e-07, 'epoch': 0.98}


 99%|█████████▉| 62002/62501 [1:17:22<00:39, 12.70it/s]

{'loss': 4.2689, 'learning_rate': 4.007935873026032e-07, 'epoch': 0.99}


100%|█████████▉| 62500/62501 [1:17:59<00:00, 11.90it/s]

Training completed. Do not forget to share your model on huggingface.co/models =)


100%|██████████| 62501/62501 [1:17:59<00:00, 13.36it/s]

{'loss': 4.2429, 'learning_rate': 7.999872002047968e-10, 'epoch': 1.0}
{'train_runtime': 4679.9178, 'train_samples_per_second': 427.358, 'train_steps_per_second': 13.355, 'train_loss': 5.240891586652582, 'epoch': 1.0}





TrainOutput(global_step=62501, training_loss=5.240891586652582, metrics={'train_runtime': 4679.9178, 'train_samples_per_second': 427.358, 'train_steps_per_second': 13.355, 'train_loss': 5.240891586652582, 'epoch': 1.0})

In [7]:
# Write the final model (config + weights) into the model directory.
final_model_dir = 'bert-base-dv'
trainer.save_model(output_dir=final_model_dir)

Saving model checkpoint to bert-base-dv
Configuration saved in bert-base-dv\config.json
Model weights saved in bert-base-dv\pytorch_model.bin
