In [1]:
# Step 1: Loading the dataset
# Load kant.txt using colab file manager
#2 Downloading the file from Github
!curl -L https://raw.githubusercontent.com/Denis2054/Transformers-for-NLP-2nd-Edition/master/Chapter04/kant.txt --output "kant.txt"

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 10.7M  100 10.7M    0     0  16.4M      0 --:--:-- --:--:-- --:--:-- 16.4M


In [2]:
# Step 2: Installing HuggingFace Transformers
!pip uninstall -y tensorflow
# Install transfomers from master
!pip install git+https://github.com/huggingface/Transformers
!pip list | grep -E 'transformers|tokenizers'

[0mCollecting git+https://github.com/huggingface/Transformers
  Cloning https://github.com/huggingface/Transformers to /tmp/pip-req-build-07v49bwc
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/Transformers /tmp/pip-req-build-07v49bwc
  Resolved https://github.com/huggingface/Transformers to commit 4ed0e51cc3cb0c997038f5e04ed3eca45b34bc3f
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
tokenizers                       0.19.1
transformers                     4.41.0.dev0


In [3]:
# Step 3: Training a tokenizer
%%time
from pathlib import Path
from tokenizers import ByteLevelBPETokenizer
paths = [str(x) for x in Path(".").glob("**/*.txt")]
# initialize a tokenizer
tokenizer = ByteLevelBPETokenizer()
# Customize training
tokenizer.train(files=paths, vocab_size=52_000, min_frequency=2, special_tokens=[
    "<s>",
    "<pad>",
    "</s>",
    "<unk>",
    "<mask>",
])



CPU times: user 7.83 s, sys: 227 ms, total: 8.06 s
Wall time: 7.13 s


In [4]:
# Step 4: Saving the files to disk
import os
token_dir = '/content/KantaiBERT'
if not os.path.exists(token_dir):
  os.makedirs(token_dir)
tokenizer.save_model('KantaiBERT')

['KantaiBERT/vocab.json', 'KantaiBERT/merges.txt']

In [5]:
# Step 5: Loading the Trained Tokenizer Files
from tokenizers.implementations import ByteLevelBPETokenizer
from tokenizers.processors import BertProcessing
tokenizer = ByteLevelBPETokenizer(
    "./KantaiBERT/vocab.json",
    "./KantaiBERT/merges.txt"
)

In [6]:
tokenizer.encode("The Critque of Pure Reason.").tokens

['The', 'ĠC', 'rit', 'que', 'Ġof', 'ĠPure', 'ĠReason', '.']

In [7]:
tokenizer._tokenizer.post_processor = BertProcessing(
    ("</s>", tokenizer.token_to_id("</s>")),
    ("<s>", tokenizer.token_to_id("<s>")),
)
tokenizer.enable_truncation(max_length=512)

In [8]:
tokenizer.encode("The Critique of Pure Reason.")

Encoding(num_tokens=8, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflowing])

In [9]:
tokenizer.encode("The critique of pure reason.").tokens

['<s>', 'The', 'Ġcritique', 'Ġof', 'Ġpure', 'Ġreason', '.', '</s>']

In [10]:
# Step 6: Checking Resource Constraints: GPU and NVIDIA
!nvidia-smi


Fri Apr 19 11:33:33 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P8              10W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [11]:
# Checking that pytorch sees CUDA
import torch
torch.cuda.is_available()

True

In [12]:
# Step 7: Defining the configuration of the Model
from transformers import RobertaConfig
config = RobertaConfig(
    vocab_size=52_000,
    max_position_embeddings=514,
    num_attention_heads=12,
    num_hidden_layers=6,
    type_vocab_size=1,
)

In [13]:
# Step 8: Re-creating the Tokenizer in Transformers
from transformers import RobertaTokenizer
tokenizer = RobertaTokenizer.from_pretrained("./KantaiBERT", max_length=512)

In [14]:
# Step 9: Initializing a model from scratch
from transformers import RobertaForMaskedLM
model = RobertaForMaskedLM(config=config)
print(model)

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): La

In [15]:
print(model.num_parameters)

<bound method ModuleUtilsMixin.num_parameters of RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(52000, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-5): 6 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_featu

In [16]:
# Exploring the parameters
LP=list(model.parameters())
lp = len(LP)
print(lp)

106


In [17]:
for p in range(0,lp):
  print(LP[p])

Parameter containing:
tensor([[-2.1681e-02, -8.1113e-04, -2.2322e-02,  ...,  1.2246e-03,
         -1.8122e-02, -6.0616e-02],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        [ 1.2762e-02, -9.0379e-03, -9.9065e-03,  ..., -1.0969e-03,
         -1.4554e-02, -1.8661e-02],
        ...,
        [ 9.1144e-03,  1.4986e-02, -4.5775e-03,  ..., -2.1664e-02,
          2.4946e-02,  8.3299e-03],
        [-1.2688e-02, -1.8987e-02, -1.7122e-02,  ..., -7.8786e-03,
          1.3387e-02,  7.3897e-03],
        [-6.9921e-03,  9.2463e-04,  4.6264e-05,  ...,  1.0505e-02,
         -7.9882e-03,  1.1022e-02]], requires_grad=True)
Parameter containing:
tensor([[ 0.0111,  0.0232, -0.0249,  ..., -0.0248, -0.0071, -0.0146],
        [ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 0.0282,  0.0107,  0.0012,  ...,  0.0175, -0.0299,  0.0545],
        ...,
        [ 0.0010,  0.0230,  0.0016,  ..., -0.0498,  0.0203,  0.0246],
        [ 0

In [18]:
# Counting the parameters
np =0
for p in range(0,lp):
  PL2=True
  try:
    L2=len(LP[p][0]) # check if 2D
  except:
    L2=1
    PL2=False
    L1=len(LP[p])
    L3=L1*L2
    np+=L3
    if PL2==True:
      print(p,L1,L2,L3)
    if PL2==False:
      print(p, L1,L3)
print(np)

3 768 768
4 768 768
6 768 768
8 768 768
10 768 768
12 768 768
13 768 768
14 768 768
16 3072 3072
18 768 768
19 768 768
20 768 768
22 768 768
24 768 768
26 768 768
28 768 768
29 768 768
30 768 768
32 3072 3072
34 768 768
35 768 768
36 768 768
38 768 768
40 768 768
42 768 768
44 768 768
45 768 768
46 768 768
48 3072 3072
50 768 768
51 768 768
52 768 768
54 768 768
56 768 768
58 768 768
60 768 768
61 768 768
62 768 768
64 3072 3072
66 768 768
67 768 768
68 768 768
70 768 768
72 768 768
74 768 768
76 768 768
77 768 768
78 768 768
80 3072 3072
82 768 768
83 768 768
84 768 768
86 768 768
88 768 768
90 768 768
92 768 768
93 768 768
94 768 768
96 3072 3072
98 768 768
99 768 768
100 768 768
101 52000 52000
103 768 768
104 768 768
105 768 768
115744


In [22]:
# Step 10: Building the dataset
%%time
from transformers import LineByLineTextDataset
dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="./kant.txt",
    block_size=128
)



CPU times: user 24.6 s, sys: 391 ms, total: 25 s
Wall time: 29.2 s


In [23]:
# Step 11: Defining a Daa Collator
from transformers import DataCollatorForLanguageModeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [24]:
# Step 12: initializing the trainer
from transformers import Trainer, TrainingArguments
training_args = TrainingArguments(
    output_dir="./KantaiBERT",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=64,
    save_steps=10_000,
    save_total_limit=2,
)
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [25]:
# Step 13: Pre-training the model
%%time
trainer.train()

Step,Training Loss
500,6.5777
1000,5.7167
1500,5.2381
2000,5.0054
2500,4.8973


CPU times: user 9min 59s, sys: 2.08 s, total: 10min 1s
Wall time: 10min 20s


TrainOutput(global_step=2672, training_loss=5.445222135075552, metrics={'train_runtime': 620.4184, 'train_samples_per_second': 275.562, 'train_steps_per_second': 4.307, 'total_flos': 873691623267840.0, 'train_loss': 5.445222135075552, 'epoch': 1.0})

In [27]:
# Step 14: Saving the Final Model(+tokenizer + config) to disk
trainer.save_model("./KantaiBERT")

In [29]:
# Step 15: Language Modeling with the FillMaskPipeline
from transformers import pipeline
fill_mask = pipeline(
    "fill-mask",
    model="./KantaiBERT",
    tokenizer="./KantaiBERT"
)

In [31]:
fill_mask("Human thinking involves human <mask>.")

[{'score': 0.015040884725749493,
  'token': 396,
  'token_str': ' object',
  'sequence': 'Human thinking involves human object.'},
 {'score': 0.01342266146093607,
  'token': 394,
  'token_str': ' reason',
  'sequence': 'Human thinking involves human reason.'},
 {'score': 0.010482360608875751,
  'token': 588,
  'token_str': ' nature',
  'sequence': 'Human thinking involves human nature.'},
 {'score': 0.008680302649736404,
  'token': 610,
  'token_str': ' conceptions',
  'sequence': 'Human thinking involves human conceptions.'},
 {'score': 0.00785766914486885,
  'token': 604,
  'token_str': ' understanding',
  'sequence': 'Human thinking involves human understanding.'}]