<a href="https://colab.research.google.com/github/Jean-Rd/Algoritms_Intro_machineLearningWithPython/blob/master/my_beto.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Fist install the library and download the models from github

!pip install transformers
!wget https://users.dcc.uchile.cl/~jperez/beto/cased_2M/pytorch_weights.tar.gz 
!wget https://users.dcc.uchile.cl/~jperez/beto/cased_2M/vocab.txt 
!wget https://users.dcc.uchile.cl/~jperez/beto/cased_2M/config.json 
!tar -xzvf pytorch_weights.tar.gz
!mv config.json pytorch/.
!mv vocab.txt pytorch/.

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d5/43/cfe4ee779bbd6a678ac6a97c5a5cdeb03c35f9eaebbb9720b036680f9a2d/transformers-4.6.1-py3-none-any.whl (2.2MB)
[K     |████████████████████████████████| 2.3MB 6.9MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 33.2MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/d4/e2/df3543e8ffdab68f5acc73f613de9c2b155ac47f162e725dcac87c521c11/tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 29.9MB/s 
Collecting huggingface-hub==0.0.8
  Downloading https://files.pythonhosted.org/packages/a1/88/7b1e45720ecf59c6c6737ff332f41c955963090a18e72acbcbe

In [2]:
# import the necessary

import math

import torch
from transformers import BertForMaskedLM, BertTokenizer

In [3]:
# Build the tokenizer and the masked-LM model from the local "pytorch/" folder.

# do_lower_case=False keeps the input casing untouched.
tokenizer = BertTokenizer.from_pretrained(
    "pytorch/",
    do_lower_case=False,
)
model = BertForMaskedLM.from_pretrained("pytorch/")
# Switch to evaluation mode; binding the return value to a name keeps the
# cell from echoing the module repr.
e = model.eval()

Some weights of the model checkpoint at pytorch/ were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [29]:
# Now test it: print the five highest-scoring tokens for every [MASK] in the sentence.

text = "[CLS] Bolivia perdio su [MASK], en el [MASK]. [SEP]"

tokens = tokenizer.tokenize(text)
# Locate the masks from the tokenization itself: hard-coded positions silently
# point at the wrong token as soon as the sentence changes or a word is split
# into a different number of word pieces.
masked_indxs = tuple(i for i, token in enumerate(tokens) if token == tokenizer.mask_token)

indexed_tokens = tokenizer.convert_tokens_to_ids(tokens)
# Wrap the id list in an outer list so the tensor has a leading batch dimension of 1.
tokens_tensor = torch.tensor([indexed_tokens])

# Inference only: no need to record the autograd graph.
with torch.no_grad():
    predictions = model(tokens_tensor)[0]

for i, midx in enumerate(masked_indxs):
    # Vocabulary ids sorted from highest to lowest score at this mask position.
    idxs = torch.argsort(predictions[0, midx], descending=True)
    predicted_token = tokenizer.convert_ids_to_tokens(idxs[:5])
    print('MASK', i, ':', predicted_token)

MASK 0 : ['independencia', 'soberanía', 'territorio', 'autonomía', 'nombre']
MASK 1 : ['[UNK]', '2008', '2010', '2014', '2009']


In [54]:
# Show the class of the last child module of the pretrained model.
type(list(model.children())[-1])

transformers.models.bert.modeling_bert.BertOnlyMLMHead

In [57]:
# Print the module tree of the last child module of the pretrained model.
print(list(model.children())[-1])

BertOnlyMLMHead(
  (predictions): BertLMPredictionHead(
    (transform): BertPredictionHeadTransform(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    )
    (decoder): Linear(in_features=768, out_features=31002, bias=True)
  )
)


In [55]:
import transformers
import pytorch

In [62]:
# Look up the head class by its fully qualified name (needs `import transformers`).
transformers.models.bert.modeling_bert.BertOnlyMLMHead

In [60]:
def gelu(x):
    """Exact (erf-based) GELU activation.

    Original implementation of the gelu activation function in the Google
    BERT repo when initially created. Computes, element-wise,
    ``x * 0.5 * (1 + erf(x / sqrt(2)))``.

    For information: OpenAI GPT's gelu is slightly different (and gives
    slightly different results):
    0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))
    Also see https://arxiv.org/abs/1606.08415

    Args:
        x: Input torch tensor.

    Returns:
        Tensor with the activation applied to every element of ``x``.
    """
    # `math` is the standard-library module; it has to be imported by the
    # notebook's import cell for this call to resolve.
    return x * 0.5 * (1.0 + torch.erf(x / math.sqrt(2.0)))


def gelu_new(x):
    """Tanh-approximated GELU activation.

    Implementation of the gelu activation function currently in the Google
    BERT repo (identical to OpenAI GPT). Computes, element-wise,
    ``0.5 * x * (1 + tanh(sqrt(2 / pi) * (x + 0.044715 * x**3)))``.
    Also see https://arxiv.org/abs/1606.08415

    Args:
        x: Input torch tensor.

    Returns:
        Tensor with the activation applied to every element of ``x``.
    """
    # `math` is the standard-library module; it has to be imported by the
    # notebook's import cell for these calls to resolve.
    return 0.5 * x * (1 + torch.tanh(math.sqrt(2 / math.pi) * (x + 0.044715 * torch.pow(x, 3))))


def swish(x):
    """Swish activation: every element of ``x`` multiplied by its own sigmoid."""
    gate = torch.sigmoid(x)
    return x * gate


In [90]:
# Lookup table from an activation name to the callable that implements it.
ACT2FN = {
    "gelu": gelu,
    "relu": torch.nn.functional.relu,
    "swish": swish,
    "gelu_new": gelu_new,
}

In [109]:
class BertPredictionHeadTransform(torch.nn.Module):
  """Dense -> activation -> LayerNorm transform applied to hidden states.

  Args:
    config: Object exposing `hidden_size`, `hidden_act` and `layer_norm_eps`.
      `hidden_act` is either a key of the module-level `ACT2FN` table or a
      callable that is used directly as the activation.
  """

  def __init__(self, config):

    super().__init__()
    self.dense = torch.nn.Linear(config.hidden_size, config.hidden_size)
    if isinstance(config.hidden_act, str):
      self.transform_act_fn = ACT2FN[config.hidden_act]
    else:
      # ACT2FN is a dict and cannot be called; a non-string `hidden_act` is
      # already the activation function itself.
      self.transform_act_fn = config.hidden_act
    # NOTE(review): the pretrained head printed earlier in this notebook names
    # this sub-module `LayerNorm`; the lowercase name is kept here so existing
    # attribute access keeps working.
    self.layernorm = torch.nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)


  def forward(self, hidden_states):
    """Apply the dense layer, the activation and the layer norm, once each.

    Args:
      hidden_states: Tensor whose last dimension is `config.hidden_size`.

    Returns:
      Tensor of the same shape as `hidden_states`.
    """
    hidden_states = self.dense(hidden_states)
    hidden_states = self.transform_act_fn(hidden_states)
    hidden_states = self.layernorm(hidden_states)

    return hidden_states


class BertLMPredictionHead(torch.nn.Module):
  """Turns hidden states into one score per vocabulary entry.

  The input goes through a `BertPredictionHeadTransform` and is then projected
  from `config.hidden_size` to `config.vocab_size` by a linear decoder.
  """

  def __init__(self, config):
    super().__init__()
    self.transform = BertPredictionHeadTransform(config)

    hidden_dim, vocab_dim = config.hidden_size, config.vocab_size
    # The decoder is built without its own bias; a separate zero-initialised
    # parameter is then attached as that bias, so `self.bias` and
    # `self.decoder.bias` are the same object.
    self.decoder = torch.nn.Linear(hidden_dim, vocab_dim, bias=False)
    self.bias = torch.nn.Parameter(torch.zeros(vocab_dim))
    self.decoder.bias = self.bias

  def forward(self, hidden_states):
    """Return the decoder output for the transformed `hidden_states`."""
    return self.decoder(self.transform(hidden_states))


class BertOnlyMLMHead(torch.nn.Module):
  """Wrapper module that holds a single `BertLMPredictionHead`."""

  def __init__(self, config):
    super().__init__()
    self.prediction = BertLMPredictionHead(config)

  def forward(self, sequence_output):
    """Return the prediction head's scores for `sequence_output`."""
    return self.prediction(sequence_output)

In [110]:
class Config:
  """Minimal settings holder read by the prediction-head classes.

  The keyword arguments `hidden_size`, `vocab_size`, `layer_norm_eps` and
  `hidden_act` are all required (a missing one raises `KeyError`); any other
  keyword argument is ignored.
  """

  def __init__(self, **kwargs):
    # Copy the four required entries onto the instance, in this order.
    for field in ('hidden_size', 'vocab_size', 'layer_norm_eps', 'hidden_act'):
      setattr(self, field, kwargs[field])

In [111]:
# Settings for the re-implemented head; 768 and 31002 are the sizes shown in
# the pretrained head's printout earlier in this notebook.
config = Config(
    hidden_size=768,
    vocab_size=31002,
    layer_norm_eps=1e-12,
    hidden_act='gelu_new',
)

In [112]:
# Instantiate the re-implemented head so the cell displays its module tree.
BertOnlyMLMHead(config)

BertOnlyMLMHead(
  (prediction): BertLMPredictionHead(
    (transform): BertPredictionHeadTransform(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (layernorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    )
    (decoder): Linear(in_features=768, out_features=31002, bias=True)
  )
)

In [123]:
class Model(torch.nn.Module):
  """Pretrained encoder followed by a freshly initialised masked-LM head.

  Args:
    out_features: Accepted for backward compatibility but currently unused;
      the head's output size comes from the notebook-level `config`.
    freeze: If True, every parameter of the pretrained part gets
      `requires_grad = False`, leaving only the new head trainable.
  """

  def __init__(self, out_features=31002, freeze=False):

    super().__init__()
    # NOTE: relies on the notebook-global `model` loaded in an earlier cell.
    model_beto = model

    # Keep every child module of the pretrained model except the last one
    # (its original prediction head).
    self.model_beto = torch.nn.Sequential(*list(model_beto.children())[:-1])

    if freeze:
      for param in self.model_beto.parameters():
        param.requires_grad = False

    # NOTE: relies on the notebook-global `config` for the head's sizes.
    self.fc_output = BertOnlyMLMHead(config)

  def forward(self, status_hidden):
    """Return the new head's scores for the given input.

    Args:
      status_hidden: Either a floating-point tensor of hidden states, which is
        passed straight to the head (the original behaviour), or an integer
        tensor, which is first run through the pretrained encoder.

    Returns:
      The output of `self.fc_output`.
    """
    if not torch.is_floating_point(status_hidden):
      # Integer input cannot be fed to the head's linear layers; treat it as
      # token ids and encode it first. Element 0 of the encoder output is
      # taken as the per-token hidden states -- assumes the kept child is a
      # Hugging Face BertModel; TODO confirm.
      status_hidden = self.model_beto(status_hidden)[0]

    status_hidden = self.fc_output(status_hidden)
    return status_hidden


In [124]:
# Build the wrapper with freeze=True, so the pretrained part's parameters stop requiring gradients.
my_beto = Model(freeze=True)