In [66]:
# https://mccormickml.com/2019/05/14/BERT-word-embeddings-tutorial/#3-extracting-embeddings

import os
import requests
import torch
from transformers import BertConfig, BertTokenizer, BertModel

class BertEmbeddingsDK(object):
    r"""Class for creating Danish BERT word and sentence on the fly."""
    
    def __init__(self, bert_path=None):
        
        # Danish BERT url
        self.url = 'https://www.dropbox.com/s/19cjaoqvv2jicq9/danish_bert_uncased_v2.zip?dl=1'
        
        # If custom model path
        if bert_path == None: 
            self.bert_path = './bert-base-danish/'
        else: 
            self.bert_path = bert_path
            self._load_model()
    
        # Check model
        pt_check, tf_check = self._check_model_files()
        if pt_check:
            self._load_model()
        elif tf_check:
            self._convert_to_pytorch()
            self._load_model()
        else:
            self._download_danish_bert()
            self._convert_to_pytorch()
            self._load_model()
        
        
    
    def _check_model_files(self):
        print('Checking required model files..')
        tf_required_files = ['bert_model.ckpt.data-00000-of-00001', 'bert_model.ckpt.index',
                             'bert_model.ckpt.meta', 'bert_config.json', 'vocab.txt']
        pt_required_files = ['bert_model.ckpt.data-00000-of-00001', 'bert_model.ckpt.index',
                             'bert_model.ckpt.meta', 'config.json', 'pytorch_model.bin', 'vocab.txt']
        if os.path.isdir(self.bert_path):
            pt_check = all([file in pt_required_files for file in os.listdir(self.bert_path)])
            tf_check = all([file in tf_required_files for file in os.listdir(self.bert_path)])
        else:
            print('Model was not found..')
            pt_check = False
            tf_check = False

        return pt_check, tf_check
    
            
    def _load_model(self):
        print('Loading model..')
        self.config = BertConfig.from_json_file(self.bert_path+'config.json')
        self.tokenizer = BertTokenizer(vocab_file=self.bert_path+'vocab.txt')
        self.model = BertModel(config=self.config).from_pretrained(self.bert_path)
        self.model.eval()
        print('Ready for embedding!')
    
    def _convert_to_pytorch(self):
        print('Converting model to PyTorch..')
        os.system('transformers-cli convert --model_type bert --tf_checkpoint ./bert-base-danish/bert_model.ckpt --config ./bert-base-danish/bert_config.json --pytorch_dump_output ./bert-base-danish/pytorch_model.bin')
        os.system('mv bert-base-danish/bert_config.json bert-base-danish/config.json')
        return None
    
    def _download_danish_bert(self):
        print('Downloading danish BERT (this may take some time ^^)..')
        r = requests.get(self.url, allow_redirects=True)
        open('bert-base-danish.zip', 'wb').write(r.content)
        os.system('unzip bert-base-danish.zip')
        os.system('mv danish_bert_uncased_v2 bert-base-danish')
        os.system('rm bert-base-danish.zip')
        os.system('rm -rf __MACOSX')
        return None
    
    def _embedding_method(self):
        # bla bla sum + avg last x layers
        return None
    
    def embed(self, text, print_num_tokens=False):
        
        tokenized = self.tokenizer(text, return_tensors='pt')
        if print_num_tokens: print(tokenized['input_ids'].shape)
        with torch.no_grad():
            output = self.model(**tokenized, output_hidden_states=True)
        
        hidden_states = torch.cat(output[2])
        hidden_states = hidden_states.permute(1,0,2) # re-arrange dimensions: [token, layer, embedding_dim]
        
        return hidden_states
        
        
        
embedder = BertEmbeddingsDK()




Checking required model files..
Model was not found..
Downloading danish BERT..
Converting model to PyTorch..
Loading model..


In [68]:
text = 'Igen en sober og saglig dialog. I lytter til gæsten og I underspiller ikke det komplekse i, at deltagelse i en bæredygtig rejse kun kræver omtanke og et valg.... helt nede i de små detaljer. Tøv ikke - Bare få begyndt! Og så er der udfordringerne ... tankevækkende at lytte med på'
hidden_states = embedder.embed(text, print_num_tokens=False)
hidden_states.shape

torch.Size([69, 13, 768])

## Download and convert Danish BERT

In [49]:


# Download Danish Bert
url = 'https://www.dropbox.com/s/19cjaoqvv2jicq9/danish_bert_uncased_v2.zip?dl=1'
r = requests.get(url, allow_redirects=True)
open('bert-base-danish.zip', 'wb').write(r.content)


os.system('unzip bert-base-danish.zip')
os.system('mv danish_bert_uncased_v2 bert-base-danish')
os.system('rm bert-base-danish.zip')
os.system('rm -rf __MACOSX')
os.system('transformers-cli convert --model_type bert --tf_checkpoint ./bert-base-danish/bert_model.ckpt --config ./bert-base-danish/bert_config.json --pytorch_dump_output ./bert-base-danish/pytorch_model.bin')
os.system('mv bert-base-danish/bert_config.json bert-base-danish/config.json')


## Get danish BERT on linux:
    # 1. download           --> wget https://www.dropbox.com/s/19cjaoqvv2jicq9/danish_bert_uncased_v2.zip?dl=1
    # 2. unzip              --> unzip danish_bert_uncased_v2.zip?dl=1
    # 3. rm zipped          --> rm danish_bert_uncased_v2.zip\?dl\=1
    # 4. rename             --> mv danish_bert_uncased_v2 bert-base-danish
    # 5. convert to pytorch --> transformers-cli convert --model_type bert --tf_checkpoint ./bert-base-danish/bert_model.ckpt --config ./bert-base-danish/bert_config.json --pytorch_dump_output ./bert-base-danish/pytorch_model.bin
    # 6. rename config      --> mv bert-base-danish/bert_config.json bert-base-danish/config.json    
    

1222631135

In [56]:
import os
tf_required_files = ['bert_model.ckpt.data-00000-of-00001', 'bert_model.ckpt.index',
                     'bert_model.ckpt.meta', 'bert_config.json', 'vocab.txt']

pt_required_files = ['bert_model.ckpt.data-00000-of-00001', 'bert_model.ckpt.index',
                     'bert_model.ckpt.meta', 'config.json', 
                     'pytorch_model.bin', 'vocab.txt']


if os.path.isdir('./bert-base-danish'):
    pt_check = [file in pt_required_files for file in os.listdir('/home/juunge/models/bert-base-danish/')]
    tf_check = [file in tf_required_files for file in os.listdir('/home/juunge/models/bert-base-danish/')]

    if all(pt_check):
        _load_model()
    elif all(tf_check):
        _convert_model()
        _load_model()
    else:
        pass

    print(pt_check)
    print(tf_check)