In [1]:
import sys
sys.path.append("..")
from general_module.read import *

import torch
from tqdm.auto import tqdm
from transformers import AutoTokenizer, AutoModel



In [2]:
class FeatureExtractor:
    """Turn raw documents into fixed-size vectors using a transformer encoder.

    Each document is tokenized on its own, run through the transformer with
    gradient tracking disabled, and represented by the hidden state of its
    first token (the [CLS] position for BERT-style models).
    """

    def __init__(self, tokenizer, transformer, max_length=512):
        """Store the tokenizer/model pair used for feature extraction.

        Args:
            tokenizer: object exposing ``encode_plus`` (e.g. a Hugging Face
                tokenizer).
            transformer: callable model; the first element of its output is
                indexed as (batch, token, hidden) below.
            max_length: number of tokens every document is truncated/padded
                to. Defaults to 512, the value that used to be hard-coded.
        """
        self.tokenizer = tokenizer
        self.transformer = transformer
        self.max_length = max_length

    # method to convert a list of sentence into BERT embeddings
    def get_embeddings(self, documents):
        """Embed every document and return the vectors as a list.

        Args:
            documents: iterable of texts; each one is encoded separately.

        Returns:
            list with one 1-D numpy array per document, in input order. An
            empty input yields an empty list.
        """
        # Hugging Face models expose `.device`. When it is available, the
        # inputs are moved next to the weights so a model placed on an
        # accelerator works too; otherwise the tensors are left where they are.
        device = getattr(self.transformer, "device", None)

        embeddings_list = []
        for document in tqdm(documents):
            tokens = self.tokenizer.encode_plus(
                document,
                max_length=self.max_length,
                truncation=True,
                padding="max_length",
                return_tensors="pt",
            )
            if device is not None:
                tokens = {
                    name: value.to(device) if hasattr(value, "to") else value
                    for name, value in tokens.items()
                }
            with torch.no_grad():
                outputs = self.transformer(**tokens)
                cls_state = outputs[0][:, 0, :]  # only use the first token [CLS] as the sentence embedding
                cls_state = cls_state.squeeze(0)  # remove the 1 dimension
                # Tensor.numpy() only works on CPU tensors, so copy back first
                # (a no-op when the tensor already lives on the CPU).
                embeddings_list.append(cls_state.cpu().numpy())

        return embeddings_list
    

In [6]:
def generate(corpus_name, set_number, feature_extraction_name, tokenizer, transformer):
    """Embed the "text" entry of every split of one corpus set and save it.

    For each of the train/validation/test splits, the pickle under
    ``../corpus/ready/<corpus_name>/set_<set_number>/`` is loaded with
    ``extract``, its ``"text"`` entry is replaced by the embeddings produced
    by a ``FeatureExtractor``, and the result is handed to ``save_dataset``
    together with the output directory
    ``./ready/<feature_extraction_name>/<corpus_name>/set_<set_number>``
    and the split name.

    Args:
        corpus_name: name of the corpus folder.
        set_number: set index; converted with ``str`` when building paths.
        feature_extraction_name: model identifier; every "/" is replaced by
            "--" before it is used as a folder name.
        tokenizer: forwarded to ``FeatureExtractor``.
        transformer: forwarded to ``FeatureExtractor``.
    """
    source_root = "../corpus/ready"
    target_root = "./ready"
    # Model ids such as "org/name" contain a slash; swap it out so the id
    # stays a single path component.
    model_folder = feature_extraction_name.replace("/", "--")

    extractor = FeatureExtractor(tokenizer=tokenizer, transformer=transformer)

    set_folder = "set_" + str(set_number)
    # The destination does not depend on the split, so build it once.
    output_dir = "/".join([target_root, model_folder, corpus_name, set_folder])

    for split in ("train", "validation", "test"):
        pickle_path = "/".join([source_root, corpus_name, set_folder, split + ".pickle"])
        corpus = extract(pickle_path)
        corpus["text"] = extractor.get_embeddings(corpus["text"])
        save_dataset(corpus, output_dir, split)
    

In [5]:
# Feature extraction with bert-base-uncased: load the pretrained tokenizer and
# encoder once, then process set 0 of each corpus with them.
feature_extraction_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(feature_extraction_name)
transformer = AutoModel.from_pretrained(feature_extraction_name)
for corpus_name in ("essays", "mbti"):
    generate(
        corpus_name,
        0,
        feature_extraction_name=feature_extraction_name,
        tokenizer=tokenizer,
        transformer=transformer,
    )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1479 [00:00<?, ?it/s]

  0%|          | 0/494 [00:00<?, ?it/s]

  0%|          | 0/494 [00:00<?, ?it/s]

  0%|          | 0/5205 [00:00<?, ?it/s]

  0%|          | 0/1735 [00:00<?, ?it/s]

  0%|          | 0/1735 [00:00<?, ?it/s]

In [7]:
# Feature extraction with allenai/longformer-base-4096: load the pretrained
# tokenizer and encoder once, then process set 0 of each corpus with them.
# The "/" in the model id is replaced inside generate() when it names the
# output folder.
feature_extraction_name = "allenai/longformer-base-4096"
tokenizer = AutoTokenizer.from_pretrained(feature_extraction_name)
transformer = AutoModel.from_pretrained(feature_extraction_name)
for corpus_name in ("essays", "mbti"):
    generate(
        corpus_name,
        0,
        feature_extraction_name=feature_extraction_name,
        tokenizer=tokenizer,
        transformer=transformer,
    )

Some weights of the model checkpoint at allenai/longformer-base-4096 were not used when initializing LongformerModel: ['lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.bias', 'lm_head.bias']
- This IS expected if you are initializing LongformerModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1479 [00:00<?, ?it/s]

  0%|          | 0/494 [00:00<?, ?it/s]

  0%|          | 0/494 [00:00<?, ?it/s]

  0%|          | 0/5205 [00:00<?, ?it/s]

  0%|          | 0/1735 [00:00<?, ?it/s]

  0%|          | 0/1735 [00:00<?, ?it/s]