In [50]:
import json
from transformers import (AutoTokenizer, EncoderDecoderModel, BertConfig, 
                          EncoderDecoderConfig, BertTokenizerFast, BertModel)

import tensorflow as tf
import pickle
tf.compat.v1.enable_eager_execution()
import time
import torch


In [56]:
class ArticlePreprocessor(object):
    """Tokenizes article text and embeds it with a pretrained model.

    For every article the ``content`` and ``title`` strings are tokenized with
    ``self.tokenizer`` and passed through ``self.model``; token ids, attention
    masks and the first element of the model output are returned as NumPy
    arrays.
    """

    def __init__(self, tokenizer_class=BertTokenizerFast, model_class=BertModel,
                 tokenizer_model_checkpoint='bert-base-uncased',
                 model_checkpoint='bert-base-uncased',
                 max_encoder_length=512,
                 max_decoder_length=32, padding='max_length', truncation=True, return_tensors='tf'):
        """Load the tokenizer and model and store the tokenization settings.

        Args:
            tokenizer_class: Class exposing ``from_pretrained`` that builds the tokenizer.
            model_class: Class exposing ``from_pretrained`` that builds the embedding model.
            tokenizer_model_checkpoint: Checkpoint name/path given to the tokenizer.
            model_checkpoint: Checkpoint name/path given to the model.
            max_encoder_length: ``max_length`` used when tokenizing article content.
            max_decoder_length: ``max_length`` used when tokenizing article titles.
            padding: Padding strategy forwarded to the tokenizer.
            truncation: Truncation setting forwarded to the tokenizer.
            return_tensors: Stored on the instance only. The embedding methods
                always request PyTorch ('pt') tensors because the tokens are fed
                directly to ``self.model`` and converted with ``.detach().numpy()``.
        """
        self.tokenizer_model_checkpoint = tokenizer_model_checkpoint
        self.model_checkpoint = model_checkpoint
        self.tokenizer = tokenizer_class.from_pretrained(tokenizer_model_checkpoint)
        # Honor the requested checkpoint; previously 'bert-base-uncased' was
        # hard-coded here, so the `model_checkpoint` argument had no effect.
        self.model = model_class.from_pretrained(model_checkpoint)

        self.max_encoder_length = max_encoder_length
        self.max_decoder_length = max_decoder_length
        self.padding = padding
        self.truncation = truncation
        self.return_tensors = return_tensors
        # Accumulates results across calls to tokenize_embed_articles().
        self.embedded_articles = []

    def _tokenize(self, text, max_len, padding, truncation, return_tensors):
        """Run ``self.tokenizer`` on ``text`` with the given settings and return its output."""
        return self.tokenizer(text, max_length=max_len, truncation=truncation,
                              padding=padding, return_tensors=return_tensors)

    def _tokenize_and_embed(self, text, max_len):
        """Tokenize ``text`` to PyTorch tensors and return ``(tokens, embeddings)``."""
        tokens = self._tokenize(text, max_len=max_len, padding=self.padding,
                                truncation=self.truncation, return_tensors='pt')
        # Inference only: skip building the autograd graph for each forward pass.
        with torch.no_grad():
            embeddings = self.model(**tokens)[0]
        return tokens, embeddings

    def tokenize_embed_article(self, content, title):
        """Tokenize and embed one article.

        Args:
            content: Article body text; tokenized with ``max_encoder_length``.
            title: Article title text; tokenized with ``max_decoder_length``.

        Returns:
            dict of NumPy arrays with keys ``content``, ``content_mask``,
            ``content_embeddings``, ``title``, ``title_mask`` and
            ``title_embeddings`` (input ids, attention mask and the first
            element of the model output for each text).
        """
        content_tokens, content_embeddings = self._tokenize_and_embed(
            content, self.max_encoder_length)
        title_tokens, title_embeddings = self._tokenize_and_embed(
            title, self.max_decoder_length)

        return {
            'content': content_tokens['input_ids'].detach().numpy(),
            'content_mask': content_tokens['attention_mask'].detach().numpy(),
            'content_embeddings': content_embeddings.detach().numpy(),
            'title': title_tokens['input_ids'].detach().numpy(),
            'title_mask': title_tokens['attention_mask'].detach().numpy(),
            'title_embeddings': title_embeddings.detach().numpy(),
        }

    def tokenize_embed_articles(self, articles, break_point=None):
        """Embed a sequence of articles, printing progress every 100 items.

        Args:
            articles: Iterable of mappings that are unpacked as keyword
                arguments into ``tokenize_embed_article`` (so each needs
                exactly the keys ``content`` and ``title``).
            break_point: If not None, stop once this many articles have been
                processed in this call.

        Returns:
            ``self.embedded_articles``. Results are appended to this list, so
            calling the method again on the same instance extends it rather
            than starting over.
        """
        start_time = time.time()
        for i, article in enumerate(articles):
            if i % 100 == 0:
                # Progress line: index reached and seconds elapsed so far.
                print(i, time.time()-start_time)
            if break_point is not None and i == break_point:
                break
            self.embedded_articles.append(self.tokenize_embed_article(**article))
        return self.embedded_articles
        

# Load the collated articles. JSON text is UTF-8, so state the encoding
# explicitly instead of relying on the platform's default text encoding.
with open('../data/nytfox_collate.json', 'r', encoding='utf-8') as f:
    articles = json.load(f)


In [57]:
# Build a preprocessor with its default settings, then embed articles,
# stopping at the break point of 200.
article_preprocessor = ArticlePreprocessor()
embedded_articles = article_preprocessor.tokenize_embed_articles(
    articles=articles,
    break_point=200,
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0 1.9073486328125e-06
100 26.89378571510315
200 53.918614864349365


In [46]:
# Show the field names of the first embedded record only (nothing is printed
# when the list is empty).
for article in embedded_articles[:1]:
    print(article.keys())

dict_keys(['content', 'content_mask', 'content_embeddings', 'title', 'title_mask', 'title_embeddings'])


In [31]:
# Number of embedded article records currently held in `embedded_articles`.
len(embedded_articles)

500

In [None]:
# Persist the embedded articles to disk so they can be reloaded later.
with open('../data/embedded_articles.pkl','wb+') as pickle_file:
    pickle.dump(embedded_articles, pickle_file)

In [4]:
# NOTE(security): pickle.load can execute arbitrary code while unpickling, so
# only load a file that this notebook itself wrote.
with open('../data/embedded_articles.pkl', 'rb') as f:
    # An empty file makes pickle.load fail with the opaque "Ran out of input";
    # raise the same exception type with an actionable message instead.
    if not f.read(1):
        raise EOFError(
            "'../data/embedded_articles.pkl' is empty; re-run the cell that "
            "pickles embedded_articles before loading it."
        )
    f.seek(0)
    embeddings_data = pickle.load(f)

EOFError: Ran out of input

In [None]:
from transformers import T5Tokenizer, T5Model, AutoTokenizer, AutoModel

# Load the pretrained "t5-small" tokenizer and model.
tokenizer = AutoTokenizer.from_pretrained("t5-small")
model = AutoModel.from_pretrained("t5-small")

# Tokenize one encoder input and one decoder input (batch size 1 each).
input_ids = tokenizer("A", return_tensors="pt").input_ids
decoder_input_ids = tokenizer("B", return_tensors="pt").input_ids

# Single forward pass; keep the last hidden state from the model output.
outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
last_hidden_states = outputs.last_hidden_state

In [16]:
# Shape of the last hidden state returned by the T5 forward pass.
last_hidden_states.shape

torch.Size([1, 2, 512])