In [14]:
import os
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# !pip install transformers

# !pip install tokenizers

In [None]:
# os.chdir("drive/My Drive/llms/")

In [21]:
os.listdir()


['corpus.txt']

In [23]:
from tokenizers import BertWordPieceTokenizer

bert_wordpiece_tokenizer = BertWordPieceTokenizer()
bert_wordpiece_tokenizer.train("corpus.txt")

In [None]:
# access the trained vocabulary by using the get_vocab() function of the trained tokenizer object.

bert_wordpiece_tokenizer.get_vocab()

In [25]:
# Using the save_model() function of the object and providing the directory will save the tokenizer vocabulary for further usage

!mkdir tokenizer
bert_wordpiece_tokenizer.save_model("tokenizer")

['tokenizer/vocab.txt']

In [26]:
# reload

tokenizer = BertWordPieceTokenizer.from_file("tokenizer/vocab.txt")

In [28]:
tokenized_sentence = tokenizer.encode("Oh it works just fine")

tokenized_sentence.tokens

# The special [CLS] and [SEP] tokens will be automatically added to the list of tokens because BERT needs them for processing input.

['[CLS]', 'oh', 'it', 'works', 'just', 'fine', '[SEP]']

In [29]:
tokenized_sentence = tokenizer.encode("ohoh i thougt it might be workingg well")

In [30]:
from transformers import BertTokenizerFast

tokenizer = BertTokenizerFast.from_pretrained("tokenizer")

# In most of the pretrained models’ documentation and cards, it is highly recommended to use the BertTokenizerFast version.

In [31]:
# prepare the corpus for faster training

from transformers import LineByLineTextDataset

dataset = LineByLineTextDataset(
    tokenizer=tokenizer,
    file_path="corpus.txt",
    block_size=128,
)



In [32]:
# provide a data collator for MLM

from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=True, mlm_probability=0.15
)

In [33]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="BERT",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=128)

In [34]:
from transformers import BertConfig, BertForMaskedLM

bert = BertForMaskedLM(BertConfig())

BertForMaskedLM has generative capabilities, as `prepare_inputs_for_generation` is explicitly overwritten. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


In [35]:
# ***** the final step: make a Trainer object

# trainer = Trainer(
#     model=bert,               # 要训练的模型, 通常是一个预训练模型
#     args=training_args,       # 训练参数, 通过 TrainingArguments 类设置
#     data_collator=data_collator,  # 数据整理器, 用于将数据集中的样本整理成批次
#     train_dataset=dataset,    # 训练数据集, 通常是一个 Dataset 对象
# )

from transformers import Trainer

trainer = Trainer(
    model=bert,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
)

In [None]:
# train

trainer.train()

In [None]:
# save model

trainer.save_model("my-BERT")

In [37]:
from transformers import BertConfig

BertConfig()

# Note that changing these parameters, especially max_position_embedding, num_attention_heads, num_hidden_layers,
# intermediate_size, and hidden_size, directly affects the training time.

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.48.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [38]:
# example: a tiny version of BERT for faster training

tiny_bert_config = BertConfig(max_position_embeddings=512,
        hidden_size=128,
        num_attention_heads=2,
        num_hidden_layers=2,
        intermediate_size=512)

tiny_bert_config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 128,
  "initializer_range": 0.02,
  "intermediate_size": 512,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 2,
  "num_hidden_layers": 2,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.48.3",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [39]:
tiny_bert = BertForMaskedLM(tiny_bert_config)

In [None]:
trainer = Trainer(model=tiny_bert, args=training_args,
    data_collator=data_collator,
    train_dataset=dataset)

trainer.train()

In [41]:
from transformers import (
    TFBertModel, BertTokenizerFast)

bert = TFBertModel.from_pretrained("bert-base-uncased")

tokenizer = \
    BertTokenizerFast.from_pretrained("bert-base-uncased")

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [42]:
bert.layers

[<transformers.models.bert.modeling_tf_bert.TFBertMainLayer at 0x7dd97278a690>]

In [43]:
tokenized_text = tokenizer.batch_encode_plus(
    ["hello how is it going with you",
    "lets test it"],
    return_tensors="tf",
    max_length=256,
    truncation=True,
    pad_to_max_length=True)

bert(tokenized_text)



TFBaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=<tf.Tensor: shape=(2, 256, 768), dtype=float32, numpy=
array([[[ 1.00471482e-01,  6.77027702e-02, -8.33593458e-02, ...,
         -4.93304521e-01,  1.16539404e-01,  2.26647347e-01],
        [ 3.23624790e-01,  3.70718509e-01,  6.14685833e-01, ...,
         -6.27267718e-01,  3.79083008e-01,  7.05303252e-02],
        [ 1.99533507e-01, -8.75509262e-01, -6.47860318e-02, ...,
         -1.28087141e-02,  3.07651460e-01, -2.07329299e-02],
        ...,
        [-6.53300136e-02,  1.19046159e-01,  5.76847076e-01, ...,
         -2.95460641e-01,  2.49744691e-02,  1.13964200e-01],
        [-2.64715314e-01, -7.86391348e-02,  5.47280669e-01, ...,
         -1.37515366e-01, -5.94692305e-02, -5.17934039e-02],
        [-2.44958907e-01, -1.14799649e-01,  5.92174232e-01, ...,
         -1.56881928e-01, -3.39757986e-02, -8.46138969e-02]],

       [[ 2.94559058e-02,  2.30868489e-01,  2.92651713e-01, ...,
         -1.30421668e-01,  1.89659417e-01,  

In [44]:
# make a Keras model using this new embedding

from tensorflow import keras
import tensorflow as tf

max_length = 256

tokens = keras.layers.Input(shape=(max_length,), dtype=tf.dtypes.int32)

masks = keras.layers.Input(shape=(max_length,), dtype=tf.dtypes.int32)

embedding_layer = bert.layers[0]([tokens,masks])[0][:,0,:]

dense = tf.keras.layers.Dense(units=2, activation="softmax")(embedding_layer)

model = keras.Model([tokens,masks],dense)

The model object, which is a **Keras** model, has two inputs: one for **tokens** and one for **masks**. Tokens have **inputs_ids** from the tokenizer output and the masks will have **attention_mask**.

In [45]:
tokenized = tokenizer.batch_encode_plus(
    ["hello how is it going with you",
    "lets test it"],
    return_tensors="tf",
    max_length=256,
    truncation=True,
    pad_to_max_length=True)

# It is important to use max_length, truncation, and pad_to_max_length when using a tokenizer.



In [46]:
model([tokenized["input_ids"],tokenized["attention_mask"]])

<tf.Tensor: shape=(2, 2), dtype=float32, numpy=
array([[0.61480767, 0.38519228],
       [0.6112972 , 0.38870284]], dtype=float32)>

In [47]:
# When training the model, you need to compile it using the compile function

model.compile(
    optimizer = "Adam",
    loss = "categorical_crossentropy",
    metrics = ["accuracy"]
)

model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 256)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 256)]                0         []                            
                                                                                                  
 bert (TFBertMainLayer)      TFBaseModelOutputWithPooli   1094822   ['input_1[0][0]',             
                             ngAndCrossAttentions(last_   40         'input_2[0][0]']             
                             hidden_state=(None, 256, 7                                           
                             68),                                                             

In [48]:
model.layers[2].trainable = False

In [49]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_1 (InputLayer)        [(None, 256)]                0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 256)]                0         []                            
                                                                                                  
 bert (TFBertMainLayer)      TFBaseModelOutputWithPooli   1094822   ['input_1[0][0]',             
                             ngAndCrossAttentions(last_   40         'input_2[0][0]']             
                             hidden_state=(None, 256, 7                                           
                             68),                                                             

## Working with tokenization algorithms

In [50]:
from transformers import AutoModel, AutoTokenizer

tokenizerTUR = AutoTokenizer.from_pretrained("dbmdz/bert-base-turkish-uncased")

print(f"VOC size is: {tokenizerTUR.vocab_size}")
print(f"The model is: {type(tokenizerTUR)}")

tokenizer_config.json:   0%|          | 0.00/59.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/263k [00:00<?, ?B/s]

VOC size is: 32000
The model is: <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>


In [51]:
# loads an English BERT tokenizer for the bert-base-uncased model

from transformers import AutoModel, AutoTokenizer

tokenizerEN = AutoTokenizer.from_pretrained("bert-base-uncased")

print(f"VOC size is: {tokenizerEN.vocab_size}")
print(f"The model is {type(tokenizerEN)}")

VOC size is: 30522
The model is <class 'transformers.models.bert.tokenization_bert_fast.BertTokenizerFast'>


In [52]:
word_en = "telecommunication"

print(f"is in Turkish Model ? {word_en in tokenizerTUR.vocab}")
print(f"is in English Model ? {word_en in tokenizerEN.vocab}")

is in Turkish Model ? False
is in English Model ? True


In [54]:
tokens=tokenizerTUR.tokenize(word_en)

tokens
['tel', '##eco', '##mm', '##un', '##ica', '##tion']

['tel', '##eco', '##mm', '##un', '##ica', '##tion']

In [55]:
[t in tokenizerTUR.vocab for t in tokens]

[True, True, True, True, True, True]

In [56]:
tokenizerEN.tokenize(word_en)

['telecommunication']

### Training BPE

In [64]:
# !pip install nltk



In [98]:
from nltk.corpus import gutenberg
# nltk.download('punkt')

# 加载 Gutenberg 语料库
print(gutenberg.fileids())  # 查看可用的文件
text = gutenberg.raw('shakespeare-caesar.txt')  # 加载文本
print(text[:200])

['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
[The Tragedie of Julius Caesar by William Shakespeare 1599]


Actus Primus. Scoena Prima.

Enter Flauius, Murellus, and certaine Commoners ouer the Stage.

  Flauius. Hence: home you idle Creatures, g


In [103]:

import nltk
from nltk.corpus import gutenberg

nltk.download('punkt')
nltk.download('gutenberg')

print(gutenberg.fileids())

file_id = 'shakespeare-macbeth.txt'

try:
    shakespeare = gutenberg.sents(file_id)
    print(shakespeare)

except Exception as e:
    print(f"Error: {e}")

# plays=['shakespeare-macbeth.txt','shakespeare-hamlet.txt', 'shakespeare-caesar.txt']

# shakespeare=[" ".join(s) for ply in plays \
#     for s in gutenberg.sents(ply)]



['austen-emma.txt', 'austen-persuasion.txt', 'austen-sense.txt', 'bible-kjv.txt', 'blake-poems.txt', 'bryant-stories.txt', 'burgess-busterbrown.txt', 'carroll-alice.txt', 'chesterton-ball.txt', 'chesterton-brown.txt', 'chesterton-thursday.txt', 'edgeworth-parents.txt', 'melville-moby_dick.txt', 'milton-paradise.txt', 'shakespeare-caesar.txt', 'shakespeare-hamlet.txt', 'shakespeare-macbeth.txt', 'whitman-leaves.txt']
Error: No sentence tokenizer for this corpus


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [86]:
from tokenizers.processors import TemplateProcessing

special_tokens= ["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"]

temp_proc= TemplateProcessing(
    single="[CLS] $A [SEP]",
    pair="[CLS] $A [SEP] $B:1 [SEP]:1",
    special_tokens=[
        ("[CLS]", special_tokens.index("[CLS]")),
        ("[SEP]", special_tokens.index("[SEP]")),
    ],
)

In [82]:
from tokenizers import Tokenizer
from tokenizers.normalizers import ( Sequence,Lowercase, NFD, StripAccents)
from tokenizers.pre_tokenizers import Whitespace
from tokenizers.models import BPE
from tokenizers.decoders import BPEDecoder

In [87]:
tokenizer = Tokenizer(BPE())

In [88]:
tokenizer.normalizer = Sequence([NFD(),Lowercase(),StripAccents()])
tokenizer.pre_tokenizer = Whitespace()

tokenizer.decoder = BPEDecoder()
tokenizer.post_processor=temp_proc

In [89]:
from tokenizers.trainers import BpeTrainer
trainer = BpeTrainer(vocab_size=5000, special_tokens= special_tokens)

tokenizer.train_from_iterator(shakespeare, trainer=trainer)

print(f"Trained vocab size:{tokenizer.get_vocab_size()}" )

Trained vocab size:53


In [105]:
sen= "Is this a dagger which I see before me, the handle toward my hand?"
sen_enc=tokenizer.encode(sen)

print(f"Output: {format(sen_enc.tokens)}")

Output: ['[CLS]', 'i', 's', 't', 'h', 'i', 's', 'a', 'd', 'a', 'g', 'g', 'e', 'r', 'w', 'h', 'i', 'c', 'h', 'i', 's', 'e', 'e', 'b', 'e', 'f', 'o', 'r', 'e', 'm', 'e', ',', 't', 'h', 'e', 'h', 'a', 'n', 'd', 'l', 'e', 't', 'o', 'w', 'a', 'r', 'd', 'm', 'y', 'h', 'a', 'n', 'd', '?', '[SEP]']


#### Training WordPiece

In [106]:
from tokenizers.models import WordPiece
from tokenizers.decoders import WordPiece as WordPieceDecoder
from tokenizers.normalizers import BertNormalizer

#BERT normalizer includes cleaning the text, handling accents, chinese chars and lowercasing

tokenizer = Tokenizer(WordPiece())
tokenizer.normalizer=BertNormalizer()
tokenizer.pre_tokenizer = Whitespace()

tokenizer.decoder= WordPieceDecoder()

In [107]:
from tokenizers.trainers import WordPieceTrainer
trainer = WordPieceTrainer(vocab_size=5000, special_tokens=["[UNK]", "[CLS]", "[SEP]", "[PAD]", "[MASK]"])

tokenizer.train_from_iterator(shakespeare, trainer=trainer)
output = tokenizer.encode(sen)
print(output.tokens)

['[UNK]', '[UNK]', 'a', '[UNK]', '[UNK]', 'i', '[UNK]', '[UNK]', '[UNK]', ',', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '[UNK]', '?']


In [108]:
tokenizer.decode(output.ids)

'a i,?'