In [None]:
!pip install SentencePiece
!pip install transformers

In [None]:
import pandas as pd

In [None]:
# BERT-BASE (L=12, H=768, A=12, Total Parameters=110M)
from transformers import BertConfig, BertModel

# A BertConfig created with no arguments is used as the BERT-base setup here.
bert_base = BertConfig()
# The model is built from the configuration alone; no checkpoint is loaded.
model = BertModel(bert_base)

# Report the parameter count in millions.
params_in_millions = model.num_parameters() / 1_000_000
print(f"{params_in_millions} million parameters")

In [None]:
# Albert-base Configuration
from transformers import AlbertConfig, AlbertModel

# Only these three sizes are overridden; every other setting keeps the
# AlbertConfig default.
albert_base_dims = {
    "hidden_size": 768,
    "num_attention_heads": 12,
    "intermediate_size": 3072,
}
albert_base = AlbertConfig(**albert_base_dims)
# The model is built from the configuration alone; no checkpoint is loaded.
model = AlbertModel(albert_base)

# Report the parameter count in millions.
params_in_millions = model.num_parameters() / 1_000_000
print(f"{params_in_millions} million parameters")

In [None]:
# Table from the original ALBERT paper
from IPython.display import Image

# The image file is read from the notebook's working directory.
albert_table_path = "albert.png"
albert = Image(filename=albert_table_path)

# Render the image in the cell output.
display(albert)

In [None]:
# BERT-LARGE (L=24, H=1024, A=16, Total Parameters=340M).
from transformers import BertConfig, BertModel

# Sizes that differ from the BertConfig defaults for the large variant.
bert_large_dims = {
    "hidden_size": 1024,
    "num_hidden_layers": 24,
    "num_attention_heads": 16,
    "intermediate_size": 4096,
}
bert_large = BertConfig(**bert_large_dims)
# The model is built from the configuration alone; no checkpoint is loaded.
model = BertModel(bert_large)

# Report the parameter count in millions.
params_in_millions = model.num_parameters() / 1_000_000
print(f"{params_in_millions} million parameters")

In [None]:
# ALBERT-xxlarge configuration by default:
# AlbertConfig() is created without arguments, so all default sizes apply.
from transformers import AlbertConfig, AlbertModel

albert_xxlarge = AlbertConfig()
# The model is built from the configuration alone; no checkpoint is loaded.
model = AlbertModel(albert_xxlarge)

# Report the parameter count in millions.
params_in_millions = model.num_parameters() / 1_000_000
print(f"{params_in_millions} million parameters")

In [None]:
# How to use the ALBERT model

In [None]:
from transformers import AlbertTokenizer, AlbertModel

# Tokenizer and model are loaded from the same pretrained checkpoint name.
albert_checkpoint = "albert-large-v2"
tokenizer = AlbertTokenizer.from_pretrained(albert_checkpoint)
model = AlbertModel.from_pretrained(albert_checkpoint)

# Encode one example sentence as PyTorch tensors ("pt") and run it through
# the model; the result is kept in `output`.
text = "The cat is so sad ."
encoded_input = tokenizer(text, return_tensors="pt")
output = model(**encoded_input)

You can work with the `output` object, or use the model directly through a pipeline for masked language modeling:

In [None]:
from transformers import pipeline

# Fill-mask pipeline backed by the albert-base-v2 checkpoint.
fillmask = pipeline("fill-mask", model="albert-base-v2")

# The prompt uses the literal "[MASK]" placeholder; the pipeline's result is
# shown as a DataFrame.
albert_predictions = fillmask("The cat is so [MASK] .")
pd.DataFrame(albert_predictions)

## RoBERTa

In [None]:
from transformers import RobertaTokenizer, RobertaModel

# Tokenizer and model are loaded from the same pretrained checkpoint name.
roberta_checkpoint = "roberta-base"
tokenizer = RobertaTokenizer.from_pretrained(roberta_checkpoint)
model = RobertaModel.from_pretrained(roberta_checkpoint)

# Encode one example sentence as PyTorch tensors ("pt") and run it through
# the model; the result is kept in `output`.
text = "The cat is so sad ."
encoded_input = tokenizer(text, return_tensors="pt")
output = model(**encoded_input)

In [None]:
from transformers import RobertaConfig, RobertaModel

# RobertaConfig() is created without arguments, so all default sizes apply.
conf = RobertaConfig()
# The model is built from the configuration alone; no checkpoint is loaded.
model = RobertaModel(conf)

# Report the parameter count in millions.
params_in_millions = model.num_parameters() / 1_000_000
print(f"{params_in_millions} million parameters")

In [None]:
from transformers import pipeline

# Fill-mask pipeline backed by the roberta-base checkpoint.
fillmask = pipeline("fill-mask", model="roberta-base")

# The prompt uses the literal "<mask>" placeholder; the pipeline's result is
# shown as a DataFrame.
roberta_predictions = fillmask("The cat is so <mask> .")
pd.DataFrame(roberta_predictions)

In [None]:
# Print the mask token each tokenizer defines. The fill-mask prompts in this
# notebook use "[MASK]" for ALBERT and "<mask>" for RoBERTa, so this cell shows
# where those placeholders come from.
#
# The tokenizer classes are imported here so the cell does not depend on
# imports made by other cells.
from transformers import AlbertTokenizer, RobertaTokenizer

tokenizer = AlbertTokenizer.from_pretrained("albert-base-v2")
print(tokenizer.mask_token)
# `tokenizer` is rebound; after this cell it refers to the RoBERTa tokenizer.
tokenizer = RobertaTokenizer.from_pretrained("roberta-base")
print(tokenizer.mask_token)

In [None]:
from transformers import pipeline

# Fill-mask pipeline backed by the google/electra-small-generator checkpoint.
fillmask = pipeline("fill-mask", model="google/electra-small-generator")

# Take the placeholder from the pipeline's own tokenizer instead of
# hard-coding it in the prompt.
mask = fillmask.tokenizer.mask_token
q = fillmask(f"The cat is very {mask} .")
pd.DataFrame(q)