# Bert tutorial using huggingface pytorch library

In [147]:
from collections import Counter
import logging
import torch
from transformers import (pipeline, BertTokenizer, BertModel, BertConfig,
                          BertForNextSentencePrediction,
                          BertForSequenceClassification,
                          BertForTokenClassification,
                          BertForQuestionAnswering
                         )
import transformers

In [158]:
# suppress the verbose logging.
logging.getLogger().setLevel(logging.ERROR)

In [157]:
ner_pipeline = pipeline("ner")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




In [25]:
ner_pipeline("X is a competitor to Yasd")

[{'word': 'X', 'score': 0.9485399723052979, 'entity': 'I-ORG'},
 {'word': 'Ya', 'score': 0.936007022857666, 'entity': 'I-ORG'},
 {'word': '##s', 'score': 0.4494897127151489, 'entity': 'I-ORG'},
 {'word': '##d', 'score': 0.8810768127441406, 'entity': 'I-ORG'}]

In [159]:
text_classification = pipeline("sentiment-analysis")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




In [33]:
text_classification("movie is hmm ok")

[{'label': 'POSITIVE', 'score': 0.9991417527198792}]

In [160]:
# Tokenizer specific to Bert model.

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [44]:
tokens = tokenizer.encode("I like to watch movies like jurasicpart")

In [45]:
[tokenizer.decode(t) for t in tokens]

['[ C L S ]',
 'i',
 'l i k e',
 't o',
 'w a t c h',
 'm o v i e s',
 'l i k e',
 'j u',
 '# # r a s',
 '# # i c',
 '# # p a r k',
 '[ S E P ]']

In [161]:
bert_model = BertModel.from_pretrained('bert-base-uncased')

In [49]:
inputs = torch.tensor(tokens).unsqueeze(0)
outputs = bert_model(inputs)

In [54]:
# outputs[0] -> each token it gives contextual 768 vector in order, here we have 12 tokens. 
#.              This information can be used to do the NER
# outputs[1] -> overall summary of the sentance or pooled output or sentiment analysis etc.
outputs[0].shape, len(tokens), outputs[1].shape

(torch.Size([1, 12, 768]), 12, torch.Size([1, 768]))

In [162]:
mask_pipeline = pipeline('fill-mask')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=230.0, style=ProgressStyle(description_…




In [57]:
preds = mask_pipeline(f"i'm really {mask_pipeline.tokenizer.mask_token} now")

In [58]:
preds

[{'sequence': "<s> i'm really tired now</s>",
  'score': 0.10747408866882324,
  'token': 7428},
 {'sequence': "<s> i'm really happy now</s>",
  'score': 0.10746168345212936,
  'token': 1372},
 {'sequence': "<s> i'm really excited now</s>",
  'score': 0.05675781890749931,
  'token': 2283},
 {'sequence': "<s> i'm really sad now</s>",
  'score': 0.040050286799669266,
  'token': 5074},
 {'sequence': "<s> i'm really pissed now</s>",
  'score': 0.039652034640312195,
  'token': 34449}]

## Next sentance prediction

In [163]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')

In [118]:
first_sentance = "How are you ?"
second_sent_right = "I'm good"
second_sent_wrong = "asd hasd movie is good"

first_sentence = "I cut my finger."
second_sent_right = "The blood started flowing."
second_sent_wrong = "This website uses cookies."

right = tokenizer.encode_plus(first_sentance, text_pair=second_sent_right)
wrong = tokenizer.encode_plus(first_sentance, text_pair=second_sent_wrong)

In [119]:
# tokenizer.encode_plus
r1, r2, r3 = (torch.tensor(right['input_ids']).unsqueeze(0),
              torch.tensor(right['token_type_ids']).unsqueeze(0), 
              torch.tensor(right['attention_mask']).unsqueeze(0))
w1, w2, w3 = (torch.tensor(wrong['input_ids']).unsqueeze(0),
              torch.tensor(wrong['token_type_ids']).unsqueeze(0), 
              torch.tensor(wrong['attention_mask']).unsqueeze(0))

In [120]:
right_outputs = model(input_ids=r1, token_type_ids=r2, attention_mask=r3)
wrong_outputs = model(input_ids=w1, token_type_ids=w2, attention_mask=w3)

In [121]:
right_rel_score = right_outputs[0]
wrong_rel_score = wrong_outputs[0]

In [122]:
right_rel_score.detach().numpy().flatten()[0] > 0

False

In [126]:
# wrong_rel_score.detach()

# Retrain Bert

In [128]:
bconfig = BertConfig()

bconfig

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type_vocab_size": 2,
  "vocab_size": 30522
}

In [131]:
model = BertModel(bconfig)

print(model.embeddings)

BertEmbeddings(
  (word_embeddings): Embedding(30522, 768, padding_idx=0)
  (position_embeddings): Embedding(512, 768)
  (token_type_embeddings): Embedding(2, 768)
  (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (dropout): Dropout(p=0.1, inplace=False)
)


In [136]:
len(model.encoder.layer)

12

In [137]:
model.encoder.layer[0]

BertLayer(
  (attention): BertAttention(
    (self): BertSelfAttention(
      (query): Linear(in_features=768, out_features=768, bias=True)
      (key): Linear(in_features=768, out_features=768, bias=True)
      (value): Linear(in_features=768, out_features=768, bias=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (output): BertSelfOutput(
      (dense): Linear(in_features=768, out_features=768, bias=True)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (intermediate): BertIntermediate(
    (dense): Linear(in_features=768, out_features=3072, bias=True)
  )
  (output): BertOutput(
    (dense): Linear(in_features=3072, out_features=768, bias=True)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
)

In [138]:
model.pooler

BertPooler(
  (dense): Linear(in_features=768, out_features=768, bias=True)
  (activation): Tanh()
)

In [145]:
Coutokenizer.vocab.keys())

30522