In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the pickled tweets object; later cells use it as a DataFrame and read
# its 'processed_text' column (presumably English news tweets, going by the
# file name — confirm against the upstream preprocessing).
# NOTE(review): unpickling can execute arbitrary code — only load this file
# if it comes from a trusted source.
df_news_tweets = pd.read_pickle('news_tweets_filtered_en.pkl')

In [3]:
import os
# Restrict this notebook to a single GPU. Both variables are set here, before
# torch is imported further down; they are expected to be read when CUDA is
# first initialised, so setting them later would presumably have no effect.
# NOTE(review): device index "2" is specific to the machine this was run on.
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]="2"

## Sentence Transformers
Models: 
**paraphrase-mpnet-base-v2,
paraphrase-distilroberta-base-v2,
paraphrase-MiniLM-L6-v2,
nli-mpnet-base-v2,
nli-roberta-base-v2**

In [4]:
from sentence_transformers import SentenceTransformer, util

In [18]:
# Exactly one checkpoint is active at a time; the commented-out lines are the
# other candidates. When switching the active line, also switch the matching
# output file name in the save cell below so the pickle is labelled with the
# model that actually produced the embeddings.
# model = SentenceTransformer('paraphrase-mpnet-base-v2')
# model = SentenceTransformer('paraphrase-distilroberta-base-v2')
# model = SentenceTransformer('paraphrase-MiniLM-L6-v2')
# model = SentenceTransformer('nli-mpnet-base-v2')
model = SentenceTransformer('nli-roberta-base-v2')

In [19]:
# Embed every processed tweet in a single encode() call, then store the
# result as plain Python lists in a 'tweet_embeddings' column.
# NOTE: this adds/overwrites the column on df_news_tweets in place.
# Presumably encode() returns a 2-D array (one row per tweet), so .tolist()
# yields one list of floats per DataFrame row — confirm.
tweets_emds = model.encode(df_news_tweets['processed_text'].tolist())
df_news_tweets['tweet_embeddings'] = tweets_emds.tolist()

In [20]:
# Persist the frame (including the new 'tweet_embeddings' column) under a file
# name that encodes the model used. The active line must correspond to the
# checkpoint selected in the model cell above; the commented-out lines are the
# file names for the other candidates.
# df_news_tweets.to_pickle('en-emd-paraphrase-mpnet-base-v2.pkl')
# df_news_tweets.to_pickle('en-emd-paraphrase-distilroberta-base-v2.pkl')
# df_news_tweets.to_pickle('en-emd-paraphrase-MiniLM-L6-v2.pkl')
# df_news_tweets.to_pickle('en-emd-nli-mpnet-base-v2.pkl')
df_news_tweets.to_pickle('en-emd-nli-roberta-base-v2.pkl')

### Hugging Face
Models: **digitalepidemiologylab/covid-twitter-bert-v2, cardiffnlp/twitter-roberta-base**

In [21]:
import torch
from transformers import AutoTokenizer, AutoModel
from torch.utils.data import Dataset, DataLoader, sampler

In [36]:
# Checkpoint identifier used for both the tokenizer (inside MyDataset) and the
# encoder loaded below. Exactly one line is active; when switching, also
# switch the output file name in the final save cell.
# pretrain_model_path = 'digitalepidemiologylab/covid-twitter-bert-v2'
pretrain_model_path = 'cardiffnlp/twitter-roberta-base'

In [37]:
# Passed to the tokenizer as max_length in MyDataset; with truncation enabled
# there, longer token sequences are cut to this length.
max_seq_len = 128

In [38]:
class MyDataset(Dataset):
    """Pre-tokenised sentences exposed as ``(input_ids, attention_mask)`` pairs.

    All sentences are tokenised once, in the constructor, with a single
    tokenizer call using ``padding=True`` and ``truncation=True``. Every row
    therefore has the same length, so the default DataLoader collation can
    stack them into a batch.

    Args:
        sentences: the texts to tokenise (a list of strings in this notebook).
        model_path: checkpoint name or path handed to
            ``AutoTokenizer.from_pretrained``. Defaults to the notebook-level
            ``pretrain_model_path``.
        max_length: truncation length in tokens. Defaults to the
            notebook-level ``max_seq_len``.

    Attributes:
        tokenizer: the tokenizer loaded from ``model_path``.
        pad_idx: the tokenizer's ``pad_token_id``.
        input_ids: list with one 1-D tensor of token ids per sentence.
        attention_mask: list with one 1-D mask tensor per sentence.
    """

    def __init__(self, sentences, model_path=None, max_length=None):
        # Fall back to the notebook globals so the existing
        # ``MyDataset(sentences)`` call keeps working unchanged.
        if model_path is None:
            model_path = pretrain_model_path
        if max_length is None:
            max_length = max_seq_len

        self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        self.pad_idx = self.tokenizer.pad_token_id

        encoded_input = self.tokenizer(
            sentences,
            padding=True,
            truncation=True,
            max_length=max_length,
            return_tensors='pt',
        )

        # Iterating a 2-D tensor yields its rows, giving one tensor per
        # sentence. token_type_ids are deliberately not kept.
        self.input_ids = list(encoded_input['input_ids'])
        self.attention_mask = list(encoded_input['attention_mask'])

    def __getitem__(self, idx):
        """Return ``(input_ids, attention_mask)`` for row ``idx`` as int64 tensors."""
        # as_tensor returns int64 tensors unchanged (no copy) and converts any
        # other integer dtype or plain list, unlike the legacy
        # ``torch.LongTensor(...)`` constructor.
        return torch.as_tensor(self.input_ids[idx], dtype=torch.long),\
               torch.as_tensor(self.attention_mask[idx], dtype=torch.long)

    def __len__(self):
        """Number of tokenised sentences."""
        return len(self.input_ids)

In [39]:
# Tokenise the whole 'processed_text' column up front (MyDataset does all
# tokenisation in its constructor), keeping the DataFrame's row order.
text_set = MyDataset(df_news_tweets['processed_text'].tolist())

In [40]:
# DataLoader settings used in the next cell: rows per batch and the
# num_workers value handed to the loader.
batch_size = 128
num_workers = 8

In [41]:
# shuffle=False is required: the pooled embeddings are later written back to
# df_news_tweets positionally, so batches must follow the DataFrame row order.
text_loader = DataLoader(dataset=text_set, batch_size=batch_size, num_workers = num_workers, shuffle=False)

In [42]:
def mean_pooling(model_output, attention_mask):
    """Average each sequence's token embeddings, ignoring masked-out positions.

    Args:
        model_output: forward output of the encoder; element ``0`` must hold
            the token embeddings (expected shape ``(batch, seq_len, hidden)``).
        attention_mask: mask with one entry per token (expected shape
            ``(batch, seq_len)``), non-zero for real tokens and ``0`` for
            padding. It may live on any device.

    Returns:
        A CPU tensor with one pooled vector per sequence (dimension 1 of the
        embeddings is summed out). A sequence whose mask is all zeros pools to
        a zero vector rather than dividing by zero.
    """
    # First element of model_output contains all token embeddings. Detach and
    # move to CPU so pooled results do not keep GPU memory or autograd state.
    token_embeddings = model_output[0].detach().cpu()

    # Put the mask on the same device as the embeddings so that callers may
    # pass either the CPU mask from the loader or the GPU copy fed to the model.
    mask = attention_mask.to(token_embeddings.device)
    input_mask_expanded = mask.unsqueeze(-1).expand(token_embeddings.size()).float()

    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    # Clamp the token count so a fully masked sequence does not divide by zero.
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

In [43]:
# Load the tokenizer and encoder for the selected checkpoint and move the
# encoder to the GPU when one is available (CPU otherwise).
# NOTE(review): `tokenizer` is not referenced by the cells visible below —
# MyDataset builds its own — and `model` rebinds the name previously used for
# the SentenceTransformer in the section above.
tokenizer = AutoTokenizer.from_pretrained(pretrain_model_path)
model = AutoModel.from_pretrained(pretrain_model_path)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [44]:
# Run the encoder over every batch and collect mean-pooled sentence vectors.
# tweet_emds ends up with one entry per batch (a list of per-tweet vectors),
# in loader order.
tweet_emds = []
with torch.no_grad():  # inference only: no autograd graph is needed
    for input_ids, input_mask in text_loader:
        text_ids = input_ids.to(device)
        text_mask = input_mask.to(device)

        features = model(text_ids, attention_mask = text_mask)

        # mean_pooling moves the embeddings to CPU, so give it the CPU mask
        # that came from the loader rather than the device copy.
        feature_mean = mean_pooling(features, input_mask)
        tweet_emds.append(feature_mean.tolist())

# Release cached GPU blocks once at the end. Doing this after every batch
# forces the allocator to re-request memory each iteration and slows the loop.
torch.cuda.empty_cache()

In [45]:
# Join the per-batch results along axis 0 into a single array with one row
# per tweet, in the order the loader produced them.
emdarray = np.concatenate(tweet_emds, axis=0)

In [46]:
# Store one embedding (as a Python list) per row. This overwrites the
# 'tweet_embeddings' column written by the Sentence-Transformers section if
# that section was run on the same frame. Alignment is positional, so it
# relies on the loader having iterated the rows unshuffled.
df_news_tweets['tweet_embeddings'] = emdarray.tolist()

In [47]:
# Persist the frame under a file name that encodes the checkpoint used; the
# active line must correspond to the active pretrain_model_path above.
# df_news_tweets.to_pickle('en-emd-digitalepidemiologylab-covid-twitter-bert-v2.pkl')
df_news_tweets.to_pickle('en-emd-cardiffnlp-twitter-roberta-base.pkl')