In [10]:
import torch
import torch.nn as nn
from pytorch_pretrained_bert import BertTokenizer, BertModel, BertForMaskedLM, BertForPreTraining, modeling, BertConfig, BertForNextSentencePrediction 
import os
import json
import numpy as np

## 1st version to reproduce our results in the paper

In [11]:
# Register the file locations of the General_TinyBERT_4L_312D checkpoint in the process environment.
# NOTE(review): "bert_config" points at config.json here, while the other path cells in this
# notebook use bert_config.json — confirm which file name actually exists on disk.
os.environ.update({
    "model_dir": "/Users/ikuhiro/Desktop/BERT/model/General_TinyBERT_4L_312D/",
    "pytorch_model": "/Users/ikuhiro/Desktop/BERT/model/General_TinyBERT_4L_312D/pytorch_model.bin",
    "vocab_txt": "/Users/ikuhiro/Desktop/BERT/model/General_TinyBERT_4L_312D/vocab.txt",
    "bert_config": "/Users/ikuhiro/Desktop/BERT/model/General_TinyBERT_4L_312D/config.json",
})

In [83]:
# Register the file locations of the General_TinyBERT_6L_768D checkpoint in the process environment.
os.environ.update({
    "model_dir": "/Users/ikuhiro/Desktop/BERT/model/General_TinyBERT_6L_768D/",
    "pytorch_model": "/Users/ikuhiro/Desktop/BERT/model/General_TinyBERT_6L_768D/pytorch_model.bin",
    "vocab_txt": "/Users/ikuhiro/Desktop/BERT/model/General_TinyBERT_6L_768D/vocab.txt",
    "bert_config": "/Users/ikuhiro/Desktop/BERT/model/General_TinyBERT_6L_768D/bert_config.json",
})

## 2nd version (2019/11/18) trained with more (book+wiki) and no [MASK] corpus 

In [88]:
# Register the file locations of the 2nd_General_TinyBERT_4L_312D checkpoint in the process environment.
os.environ.update({
    "model_dir": "/Users/ikuhiro/Desktop/BERT/model/2nd_General_TinyBERT_4L_312D/",
    "pytorch_model": "/Users/ikuhiro/Desktop/BERT/model/2nd_General_TinyBERT_4L_312D/pytorch_model.bin",
    "vocab_txt": "/Users/ikuhiro/Desktop/BERT/model/2nd_General_TinyBERT_4L_312D/vocab.txt",
    "bert_config": "/Users/ikuhiro/Desktop/BERT/model/2nd_General_TinyBERT_4L_312D/bert_config.json",
})

In [94]:
# Register the file locations of the 2nd_General_TinyBERT_6L_768D checkpoint in the process environment.
os.environ.update({
    "model_dir": "/Users/ikuhiro/Desktop/BERT/model/2nd_General_TinyBERT_6L_768D/",
    "pytorch_model": "/Users/ikuhiro/Desktop/BERT/model/2nd_General_TinyBERT_6L_768D/pytorch_model.bin",
    "vocab_txt": "/Users/ikuhiro/Desktop/BERT/model/2nd_General_TinyBERT_6L_768D/vocab.txt",
    "bert_config": "/Users/ikuhiro/Desktop/BERT/model/2nd_General_TinyBERT_6L_768D/bert_config.json",
})

## ●Bert設定

In [12]:
# BERT model for masked-word prediction and next-sentence prediction
# (this cell loads the masked-LM head from the directory stored in os.environ["model_dir"]).
model = BertForMaskedLM.from_pretrained(os.environ["model_dir"])
# Switch to evaluation mode so dropout is disabled; without this the
# predictions printed further down can differ from one run to the next.
model.eval()
tokenizer = BertTokenizer.from_pretrained(os.environ["model_dir"], do_lower_case=True, do_basic_tokenize=True)

In [13]:
# Sanity check of the loaded checkpoint: print the mean of a few weight
# matrices (embeddings, first encoder layer, pooler, and the LM head).
for _weight in (
    model.bert.embeddings.word_embeddings.weight,
    model.bert.encoder.layer[0].attention.output.dense.weight,
    model.bert.pooler.dense.weight,
    model.cls.predictions.transform.dense.weight,
    model.cls.predictions.decoder.weight,
):
    print(_weight.mean())

tensor(0.0017, grad_fn=<MeanBackward0>)
tensor(0.0001, grad_fn=<MeanBackward0>)
tensor(0.0001, grad_fn=<MeanBackward0>)
tensor(1.9590e-05, grad_fn=<MeanBackward0>)
tensor(0.0017, grad_fn=<MeanBackward0>)


In [5]:
# vocab_list = list(tokenizer.vocab.keys())
# tokenizer.vocab.keys()

In [6]:
# # 2171 - name
# tokenizer.convert_ids_to_tokens([2171])

## bert based model

In [39]:
# BERT model for masked-word prediction and next-sentence prediction
# (this cell loads the masked-LM head of the 'bert-base-uncased' weights).
# Tokenizer options noted by the original author but not applied here:
# do_lower_case=False, do_basic_tokenize=False
model = BertForMaskedLM.from_pretrained('bert-base-uncased')
# Switch to evaluation mode so dropout is disabled; without this the
# predictions printed further down can differ from one run to the next.
model.eval()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [258]:
# print(model.bert.embeddings.word_embeddings.weight.mean())
# print(model.bert.encoder.layer[0].attention.output.dense.weight.mean())
# print(model.bert.encoder.layer[11].attention.output.dense.weight.mean())
# print(model.bert.pooler.dense.weight.mean())
# print(model.cls.predictions.transform.dense.weight.mean())
# print(model.cls.predictions.decoder.weight.mean())

In [259]:
# vocab_list = list(tokenizer.vocab.keys())
# tokenizer.vocab.keys()

In [260]:
# # 2171 - name
# tokenizer.convert_ids_to_tokens([2171])

## ●マスク語予測
### masked_lm_labels: 
* masked language modeling labels: torch.LongTensor of shape [batch_size, sequence_length] with indices selected in [-1, 0, ..., vocab_size]. 
* All labels set to -1 are ignored (masked), the loss is only computed for the labels set in [0, ..., vocab_size]

In [14]:
def predict_mask(text, tag_bert_tokens, mask_id, segments_ids):
    """Mask the tokens at the given positions and print the top-5 predictions for each.

    Uses the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        text: Raw input text. Not used by the function; kept so existing
            calls keep working.
        tag_bert_tokens: List of token strings (special tokens included).
            The list is copied, so the caller's list is left untouched.
        mask_id: Iterable of positions in ``tag_bert_tokens`` to replace
            with "[MASK]".
        segments_ids: List of segment ids, one per token.

    Returns:
        None. All results are printed.
    """
    # Work on a copy: masking the caller's list in place made a second call
    # with the same arguments report "[MASK]" as the true token.
    masked_tokens = list(tag_bert_tokens)
    true_tokens = [tag_bert_tokens[i] for i in mask_id]
    for i in mask_id:
        masked_tokens[i] = "[MASK]"
    ids = tokenizer.convert_tokens_to_ids(masked_tokens)
    print("# ids: {}\n".format(ids))
    print("# tokens: {}\n".format(tokenizer.convert_ids_to_tokens(ids)))

    # Build the input tensors; wrapping in a list adds the batch dimension
    # (equivalent to unsqueeze(0) / reshape(1, -1)).
    input_ids = torch.tensor([ids])
    segments_tensors = torch.tensor([segments_ids])

    print("# input_ids: {}\n".format(input_ids))
    print("# segments_tensors: {}\n".format(segments_tensors))

    # Make sure dropout is off so the printed predictions are repeatable.
    model.eval()
    with torch.no_grad():
        predictions = model(input_ids, segments_tensors)

    print("# true: \n{}\n".format(true_tokens))
    for i in mask_id:
        # predictions[0, i] is indexed as (batch item 0, token position i);
        # presumably one score per vocabulary entry — confirm against the model docs.
        scores, predicted_indexes = torch.topk(predictions[0, i], k=5)
        predicted_tokens = tokenizer.convert_ids_to_tokens(predicted_indexes.tolist())
        print("# scores: {}\n".format(scores))
        print("# predicted_indexes: {}\n".format(predicted_indexes))
        print("# predicted_tokens: {}\n".format(predicted_tokens))
        print()

In [15]:
# Single-text input (the leading/trailing newlines and the space after the
# final period are kept exactly as in the original literal).
text1 = "\nthe man went to the store . \n"
# Number of texts
cnt = 1

In [121]:
# Single-text input (leading/trailing newlines kept as in the original literal).
text1 = "\nhe bought a gallon of milk .\n"
# Number of texts
cnt = 1

In [127]:
# Two-text input (leading/trailing newlines kept as in the original literals).
text1 = "\nthe man went to the store .\n"
text2 = "\nhe bought a gallon of milk .\n"
# Number of texts
cnt = 2

In [None]:
# Two-text input (leading/trailing newlines kept as in the original literals).
text1 = "\nWho was Jim Henson ?\n"
text2 = "\nJim Henson was a puppeteer\n"
# Number of texts
cnt = 2

In [16]:
# Upper bound on the length of the final token sequence, special tokens included.
max_seq_len = 128

if cnt == 1:
    # Single text: [CLS] tokens [SEP], all in segment 0.
    text2 = ""
    bert_tokens = tokenizer.tokenize(text1)

    # Reserve two positions for [CLS] and [SEP] so the result never exceeds max_seq_len.
    tag_bert_tokens = ["[CLS]"] + bert_tokens[:max_seq_len - 2] + ["[SEP]"]
    segments_ids = [0] * len(tag_bert_tokens)

elif cnt == 2:
    # Text pair: [CLS] A [SEP] in segment 0, B [SEP] in segment 1.
    tokens_a = tokenizer.tokenize(text1)
    tokens_b = tokenizer.tokenize(text2)

    # Reserve three positions for [CLS], [SEP], [SEP]; drop tokens from the
    # end of whichever text is currently longer until the pair fits.
    while len(tokens_a) + len(tokens_b) > max_seq_len - 3:
        longer = tokens_a if len(tokens_a) > len(tokens_b) else tokens_b
        longer.pop()

    tokens_a = ['[CLS]'] + tokens_a + ['[SEP]']
    tokens_b = tokens_b + ['[SEP]']
    tag_bert_tokens = tokens_a + tokens_b
    segments_ids = [0] * len(tokens_a) + [1] * len(tokens_b)

else:
    # Any other value would silently leave tag_bert_tokens / segments_ids
    # from an earlier run in place, so fail loudly instead.
    raise ValueError("cnt must be 1 or 2, got {!r}".format(cnt))

print("tokens: \n{}\n".format(tag_bert_tokens))
print("segments_ids: \n{}\n".format(segments_ids))

# Position/token pairs, handy for choosing which indices to mask.
print([[i, j] for i, j in enumerate(tag_bert_tokens)])

tokens: 
['[CLS]', 'the', 'man', 'went', 'to', 'the', 'store', '.', '[SEP]']

segments_ids: 
[0, 0, 0, 0, 0, 0, 0, 0, 0]

[[0, '[CLS]'], [1, 'the'], [2, 'man'], [3, 'went'], [4, 'to'], [5, 'the'], [6, 'store'], [7, '.'], [8, '[SEP]']]


In [17]:
# Positions (indices into tag_bert_tokens) to hide before asking the model to predict them.
mask_id = [6]
predict_mask(
    text=text1 + text2,
    tag_bert_tokens=tag_bert_tokens,
    mask_id=mask_id,
    segments_ids=segments_ids,
)

# ids: [101, 1996, 2158, 2253, 2000, 1996, 103, 1012, 102]

# tokens: ['[CLS]', 'the', 'man', 'went', 'to', 'the', '[MASK]', '.', '[SEP]']

# input_ids: tensor([[ 101, 1996, 2158, 2253, 2000, 1996,  103, 1012,  102]])

# segments_tensors: tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]])

# true: 
['store']

# scores: tensor([2.1023, 2.0991, 2.0584, 2.0334, 2.0319])

# predicted_indexes: tensor([16511,  8303,  9549, 10844,  7001])

# predicted_tokens: ['resorts', '##ress', 'convoy', 'cruiser', 'resort']




## ● the man went to the store. masked=store
### bert_based
* predicted_tokens: ['door', 'window', 'bathroom', 'kitchen', 'doorway']

### 1st version (small)
* predicted_tokens: ['resorts', '##ress', 'convoy', 'cruiser', 'resort']

### 1st version (large)
* predicted_tokens: ['fabian', 'poems', 'breach', 'lenny', 'andreas']

### 2nd version no mask（small）
* predicted_tokens: ['formerly', 'https', '##rcus', 'previously', 'subsp']

### 2nd version no mask（large）
* predicted_tokens: ['##ntes', '##oire', '##hdi', 'apron', '##ending']

## ● he bought a gallon of milk. masked=milk
### bert_based
* predicted_tokens: ['milk', 'water', 'coffee', 'beer', 'wine']

### 1st version (small)
* predicted_tokens: ['##heart', '##rran', '##drome', '##stream', '##llon']

### 1st version (large)
* predicted_tokens: ['dos', 'lana', 'bind', 'edit', '##aves']

### 2nd version no mask（small）
* predicted_tokens: ['##uba', 'https', 'www', '##ensis', 'http']

### 2nd version no mask（large）
* predicted_tokens: ['##bedo', '##onate', '##riety', '##bution', '##enia']

## ●隣接文予測
### next_sentence_label: 
* next sentence classification loss: torch.LongTensor of shape [batch_size] with indices selected in [0, 1]. 
* 0 => next sentence is the continuation, 
* 1 => next sentence is a random sentence.

## 1st version to reproduce our results in the paper

In [42]:
# Register the file locations of the General_TinyBERT_4L_312D checkpoint in the process environment.
os.environ.update({
    "model_dir": "/Users/ikuhiro/Desktop/BERT/model/General_TinyBERT_4L_312D/",
    "pytorch_model": "/Users/ikuhiro/Desktop/BERT/model/General_TinyBERT_4L_312D/pytorch_model.bin",
    "vocab_txt": "/Users/ikuhiro/Desktop/BERT/model/General_TinyBERT_4L_312D/vocab.txt",
    "bert_config": "/Users/ikuhiro/Desktop/BERT/model/General_TinyBERT_4L_312D/bert_config.json",
})

In [38]:
# Register the file locations of the General_TinyBERT_6L_768D checkpoint in the process environment.
os.environ.update({
    "model_dir": "/Users/ikuhiro/Desktop/BERT/model/General_TinyBERT_6L_768D/",
    "pytorch_model": "/Users/ikuhiro/Desktop/BERT/model/General_TinyBERT_6L_768D/pytorch_model.bin",
    "vocab_txt": "/Users/ikuhiro/Desktop/BERT/model/General_TinyBERT_6L_768D/vocab.txt",
    "bert_config": "/Users/ikuhiro/Desktop/BERT/model/General_TinyBERT_6L_768D/bert_config.json",
})

## 2nd version (2019/11/18) trained with more (book+wiki) and no [MASK] corpus 

In [34]:
# Register the file locations of the 2nd_General_TinyBERT_4L_312D checkpoint in the process environment.
os.environ.update({
    "model_dir": "/Users/ikuhiro/Desktop/BERT/model/2nd_General_TinyBERT_4L_312D/",
    "pytorch_model": "/Users/ikuhiro/Desktop/BERT/model/2nd_General_TinyBERT_4L_312D/pytorch_model.bin",
    "vocab_txt": "/Users/ikuhiro/Desktop/BERT/model/2nd_General_TinyBERT_4L_312D/vocab.txt",
    "bert_config": "/Users/ikuhiro/Desktop/BERT/model/2nd_General_TinyBERT_4L_312D/bert_config.json",
})

In [None]:
# Register the file locations of the 2nd_General_TinyBERT_6L_768D checkpoint in the process environment.
os.environ.update({
    "model_dir": "/Users/ikuhiro/Desktop/BERT/model/2nd_General_TinyBERT_6L_768D/",
    "pytorch_model": "/Users/ikuhiro/Desktop/BERT/model/2nd_General_TinyBERT_6L_768D/pytorch_model.bin",
    "vocab_txt": "/Users/ikuhiro/Desktop/BERT/model/2nd_General_TinyBERT_6L_768D/vocab.txt",
    "bert_config": "/Users/ikuhiro/Desktop/BERT/model/2nd_General_TinyBERT_6L_768D/bert_config.json",
})

## input

In [49]:
# IsNextSentence: text2 is a plausible continuation of text1
# (leading/trailing newlines kept as in the original literals).
text1 = "\nthe man went to the store .\n"
text2 = "\nhe bought a gallon of milk .\n"

In [43]:
# NotNextSentence: text2 is unrelated to text1
# (leading/trailing newlines kept as in the original literals).
text1 = "\nthe man went to the store .\n"
text2 = "\npenguins are flightless .\n"

## bert_based

In [47]:
# BERT model for masked-word prediction and next-sentence prediction
# (this cell loads the next-sentence-prediction head of the 'bert-base-uncased' weights).
model = BertForNextSentencePrediction.from_pretrained('bert-base-uncased')
# Switch to evaluation mode so dropout is disabled; without this the
# scores printed further down can differ from one run to the next.
model.eval()
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

## tinyBERT

In [45]:
# BERT model for masked-word prediction and next-sentence prediction
# (this cell loads the next-sentence-prediction head from the directory stored in os.environ["model_dir"]).
model = BertForNextSentencePrediction.from_pretrained(os.environ["model_dir"])
# Switch to evaluation mode so dropout is disabled; without this the
# scores printed further down can differ from one run to the next.
model.eval()
tokenizer = BertTokenizer.from_pretrained(os.environ["model_dir"], do_lower_case=True, do_basic_tokenize=True)

In [50]:
# Build the "[CLS] text1 [SEP] text2 [SEP]" token sequence; text1 (with
# [CLS] and its [SEP]) is segment 0, text2 (with its [SEP]) is segment 1.
tokens_a = tokenizer.tokenize(text1)
tokens_b = tokenizer.tokenize(text2)
tokens_a = ['[CLS]'] + tokens_a + ['[SEP]']
tokens_b = tokens_b + ['[SEP]']
tag_bert_tokens = tokens_a + tokens_b
segments_ids = [0]*len(tokens_a) + [1]*len(tokens_b)

ids = tokenizer.convert_tokens_to_ids(tag_bert_tokens)
print("# ids: {}\n".format(ids))
print("# tokens: {}\n".format(tokenizer.convert_ids_to_tokens(ids)))
print("# segments_ids: {}\n".format(segments_ids))

# Build the input tensors; wrapping in a list adds the batch dimension
# (equivalent to unsqueeze(0) / reshape(1, -1)).
input_ids = torch.tensor([ids])
segments_tensors = torch.tensor([segments_ids])

# Make sure dropout is off so the printed scores are repeatable.
model.eval()
with torch.no_grad():
    outputs = model(input_ids, token_type_ids=segments_tensors)
    # First (and only) item of the batch: two scores, index 0 = "is the
    # continuation", index 1 = "is a random sentence" (per the label
    # convention described in this notebook's markdown).
    predictions = outputs[0]

# True when the higher score is at index 0, i.e. text2 is predicted to follow text1.
# NOTE: results recorded earlier in this notebook print this same boolean under
# the label "second-text is random next", which reads as the opposite meaning.
is_next_sentence = int(torch.argmax(predictions)) == 0
print("second-text is next sentence: ['{}'] (next sentence: ['{:.5f}'], random sentence: ['{:.5f}'])".format(
    is_next_sentence, predictions[0].item(), predictions[1].item()))

# ids: [101, 1996, 2158, 2253, 2000, 1996, 3573, 1012, 102, 2002, 4149, 1037, 25234, 1997, 6501, 1012, 102]

# tokens: ['[CLS]', 'the', 'man', 'went', 'to', 'the', 'store', '.', '[SEP]', 'he', 'bought', 'a', 'gallon', 'of', 'milk', '.', '[SEP]']

# segments_ids: [0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1]

second-text is random next: ['True'] (next sentence: ['5.86740']', random sentence: ['-5.31490'])


## ● the man went to the store . he bought a gallon of milk . -> IsNextSentence
### bert_based
* second-text is random next: ['True'] (next sentence: ['5.86740']', random sentence: ['-5.31490'])

### 1st version (small)
* second-text is random next: ['True'] (next sentence: ['0.02345']', random sentence: ['-0.02809'])

### 1st version (large)
* second-text is random next: ['True'] (next sentence: ['0.10266']', random sentence: ['-0.13284'])

### 2nd version no mask（small）
* second-text is random next: ['True'] (next sentence: ['-0.00289']', random sentence: ['-0.00343'])

### 2nd version no mask（large）
* second-text is random next: ['True'] (next sentence: ['0.00094'], random sentence: ['-0.00591'])

## ● the man went to the store . penguins are flightless . -> NotNextSentence
### bert_based
* second-text is random next: ['False'] (next sentence: ['-2.93883']', random sentence: ['5.82727'])

### 1st version (small)
* second-text is random next: ['True'] (next sentence: ['0.01661']', random sentence: ['-0.00665'])

### 1st version (large)
* second-text is random next: ['True'] (next sentence: ['0.14387']', random sentence: ['0.02148'])

### 2nd version no mask（small）
* second-text is random next: ['True'] (next sentence: ['0.00018'], random sentence: ['-0.00087'])

### 2nd version no mask（large）
* second-text is random next: ['True'] (next sentence: ['0.00186'], random sentence: ['0.00106'])
