In [1]:
import os

import transformers
import torch
import torch.nn as nn

In [2]:
# Directory handed to every `from_pretrained(..., cache_dir=...)` call in this notebook.
cache_dir = 'cache/'
# exist_ok=True keeps the cell idempotent and avoids the check-then-create race.
os.makedirs(cache_dir, exist_ok=True)

In [3]:
# Fetch the configuration published with the "FacebookAI/roberta-base" checkpoint.
cfg_roberta = transformers.RobertaConfig.from_pretrained(
    "FacebookAI/roberta-base",
    cache_dir=cache_dir,
)
# Display every attribute stored on the config object.
vars(cfg_roberta)

{'return_dict': True,
 'output_hidden_states': False,
 'output_attentions': False,
 'torchscript': False,
 'torch_dtype': None,
 'use_bfloat16': False,
 'tf_legacy_loss': False,
 'pruned_heads': {},
 'tie_word_embeddings': True,
 'chunk_size_feed_forward': 0,
 'is_encoder_decoder': False,
 'is_decoder': False,
 'cross_attention_hidden_size': None,
 'add_cross_attention': False,
 'tie_encoder_decoder': False,
 'max_length': 20,
 'min_length': 0,
 'do_sample': False,
 'early_stopping': False,
 'num_beams': 1,
 'num_beam_groups': 1,
 'diversity_penalty': 0.0,
 'temperature': 1.0,
 'top_k': 50,
 'top_p': 1.0,
 'typical_p': 1.0,
 'repetition_penalty': 1.0,
 'length_penalty': 1.0,
 'no_repeat_ngram_size': 0,
 'encoder_no_repeat_ngram_size': 0,
 'bad_words_ids': None,
 'num_return_sequences': 1,
 'output_scores': False,
 'return_dict_in_generate': False,
 'forced_bos_token_id': None,
 'forced_eos_token_id': None,
 'remove_invalid_values': False,
 'exponential_decay_length_penalty': None,
 'su

In [4]:
# Load the masked-LM model from the same checkpoint, using the config object `cfg_roberta`.
model = transformers.RobertaForMaskedLM.from_pretrained(
    "FacebookAI/roberta-base",
    cache_dir=cache_dir,
    config=cfg_roberta,
)
model

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

In [5]:
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = transformers.RobertaTokenizer.from_pretrained(
    "FacebookAI/roberta-base",
    cache_dir=cache_dir,
)

## Data

In [6]:
import pandas as pd

# Parquet file to read, relative to the notebook's working directory.
PARQUET_PATH = "../data/arxiv_metadata.parquet.gzip"
# How many titles (taken from the top of the table) the rest of the notebook works with.
NUM_TITLES = 1000

df = pd.read_parquet(PARQUET_PATH)
# Slice before lowercasing so only the rows that are kept get processed,
# instead of lowercasing and materialising the whole column first.
titles = df["title"].head(NUM_TITLES).str.lower().tolist()

In [7]:
def collate_fn(batch):
    """
    Tokenize `batch` with the notebook-level `tokenizer`.

    Padding and truncation are enabled and the result is requested as
    PyTorch tensors (`return_tensors="pt"`).
    """
    encoded = tokenizer(batch, return_tensors="pt", truncation=True, padding=True)
    return encoded

# Batches of four titles, served in dataset order and tokenized by `collate_fn`.
dataloader = torch.utils.data.DataLoader(
    titles,
    collate_fn=collate_fn,
    batch_size=4,
    shuffle=False,
)

In [9]:
def num_params(model):
    """
    Count the scalar entries across all tensors yielded by `model.parameters()`.
    """
    total = 0
    for param in model.parameters():
        total += param.numel()
    return total

In [10]:
# Report the parameter count in millions, rounded to two decimals.
print(f'{num_params(model)/1e6:.2f}M')

124.70M


In [45]:
# --- Training / masking configuration -------------------------------------
# Use the GPU when one is available; otherwise fall back to the CPU instead of
# failing on the first `.to(device)` call.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Number of passes over the dataloader in the training loop.
epochs = 500
# Special token ids taken from the tokenizer.
mask_token_id = tokenizer.mask_token_id
pad_token_id = tokenizer.pad_token_id
# Fraction handed to `get_mask_subset_with_prob` when choosing positions to predict.
mask_prob = 0.15
# Threshold handed to `prob_mask_like` when deciding which chosen positions
# actually receive the mask token.
replace_prob = 0.90
# Token ids passed to `mask_with_tokens` in the random-replacement branch.
mask_ignore_token_ids = []
# Exclusive upper bound for `torch.randint` when drawing random replacement tokens.
num_tokens = tokenizer.vocab_size
# The random-replacement branch of the training loop only runs when this is > 0.
random_token_prob = 0

In [12]:
# `nn.CrossEntropyLoss` has no `padding_idx` keyword (passing it raises TypeError);
# the argument that makes the loss skip a given target value is `ignore_index`.
criterion = nn.CrossEntropyLoss(ignore_index=tokenizer.pad_token_id)
# AdamW over all model parameters with a fixed learning rate.
optim = torch.optim.AdamW(model.parameters(), lr=0.0001)
ntokens = tokenizer.vocab_size

In [40]:
from functools import reduce
import math
from tqdm.notebook import tqdm

def prob_mask_like(t, prob):
    """
    Draw one uniform sample per element of `t` and report which fall below `prob`.

    The result is a boolean tensor with the same shape as `t`.
    """
    uniform_draws = torch.zeros_like(t, dtype=torch.float).uniform_(0, 1)
    return torch.lt(uniform_draws, prob)

def mask_with_tokens(t, token_ids):
    """
    Flag every position of `t` whose value equals one of `token_ids`.

    Returns a boolean tensor shaped like `t`; it is all `False` when
    `token_ids` is empty.
    """
    hits = torch.zeros_like(t, dtype=torch.bool)
    for token_id in token_ids:
        hits = hits | (t == token_id)
    return hits

def get_mask_subset_with_prob(mask, prob):
    """
    Randomly pick a subset of the `True` positions in each row of a 2-D `mask`.

    At most `ceil(prob * seq_len)` positions are picked per row, and rows with
    fewer `True` entries are further limited to `ceil(prob * row_count)`.
    Returns a boolean tensor with the same shape as `mask`.
    """
    n_rows, n_cols = mask.shape
    target_device = mask.device
    # Number of candidate slots drawn per row by top-k.
    k = math.ceil(prob * n_cols)

    # Per-row budget derived from how many positions are eligible in that row.
    eligible_per_row = mask.sum(dim=-1, keepdim=True)
    row_budget = (eligible_per_row * prob).ceil()
    # Slots whose running count exceeds the row budget are discarded below.
    over_budget = (mask.cumsum(dim=-1) > row_budget)[:, :k]

    # Random scores; ineligible positions get a very low score so top-k picks them last.
    scores = torch.rand((n_rows, n_cols), device=target_device)
    scores = scores.masked_fill(~mask, -1e9)
    picked = scores.topk(k, dim=-1).indices
    # Shift indices by one so index 0 can act as a dump slot for discarded picks.
    picked = (picked + 1).masked_fill_(over_budget, 0)

    selection = torch.zeros((n_rows, n_cols + 1), device=target_device)
    selection.scatter_(-1, picked, 1)
    # Drop the dump column again.
    return selection[:, 1:].bool()


In [None]:
# Masked-language-model fine-tuning loop.
#
# Fixes relative to the previous version of this cell:
#   * gradients are reset every step (`optim.zero_grad()`), so they no longer accumulate;
#   * the tokenizer's attention mask is passed to the model, not the MLM selection mask;
#   * the running loss is accumulated as a Python float, so no autograd graph is retained;
#   * the global `random_token_prob` is no longer overwritten by a tensor inside the loop;
#   * every tensor is moved to `device` once, up front, so no CPU/GPU mixing can occur;
#   * padding positions are never selected for masking.
model.train().to(device)
for epoch in range(epochs):
    with tqdm(dataloader, total=len(dataloader), desc=f"Epoch {epoch+1}") as t:
        total_loss = 0.0
        for batch in dataloader:
            inputs = batch.input_ids.to(device)
            attention_mask = batch.attention_mask.to(device)

            # Positions that must never be selected: padding plus any configured ids.
            no_mask = mask_with_tokens(inputs, [pad_token_id, *mask_ignore_token_ids])
            mask = get_mask_subset_with_prob(~no_mask, mask_prob)
            masked_seq = inputs.clone().detach()
            # Unselected positions carry pad_token_id as their label.
            # NOTE(review): this relies on `criterion` ignoring pad_token_id targets — confirm.
            labels = inputs.masked_fill(~mask, pad_token_id)

            if random_token_prob > 0:
                assert num_tokens is not None, 'num_tokens keyword must be supplied when instantiating MLM if using random token replacement'
                # Use a distinct name so the float setting `random_token_prob` survives the step.
                random_token_mask = prob_mask_like(inputs, random_token_prob)
                random_tokens = torch.randint(0, num_tokens, inputs.shape, device=device)
                random_no_mask = mask_with_tokens(random_tokens, mask_ignore_token_ids)
                random_token_mask &= ~random_no_mask
                masked_seq = torch.where(random_token_mask, random_tokens, masked_seq)

                # remove tokens that were substituted by random to be [mask]ed later
                mask = mask & ~random_token_mask

            # [mask] input
            replace_prob_mask = prob_mask_like(inputs, replace_prob)
            masked_seq = masked_seq.masked_fill(mask & replace_prob_mask, mask_token_id)

            optim.zero_grad()
            output = model(input_ids=masked_seq, attention_mask=attention_mask).logits
            # The loss expects the class dimension second: (batch, vocab, seq).
            loss = criterion(output.transpose(1, 2), labels)
            loss.backward()
            optim.step()

            total_loss += loss.item()
            # The bar is advanced by hand because the loop iterates `dataloader`, not `t`.
            t.update()

        total_loss /= len(dataloader)
        t.set_postfix(loss=total_loss)

In [53]:
# Persist only the model's state dict (not the whole module object).
model_state = model.state_dict()
torch.save(model_state, 'roberta_arxiv.pt')

In [62]:
# Display the model's repr (its nested module layout).
model

RobertaForMaskedLM(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-11): 12 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): 

In [73]:
# Create title embeddings
model.eval()
with torch.no_grad():
    inputs = tokenizer(titles, padding=True, truncation=True, return_tensors="pt").to(device)
    # Output of the input-embedding layer, kept under its original name.
    embeddings = model.roberta.embeddings(inputs.input_ids)
    # Go through `model.roberta` rather than calling the encoder directly, so the
    # attention mask is applied and padding tokens are not attended to.
    # The returned object still exposes `.last_hidden_state`.
    representations = model.roberta(
        input_ids=inputs.input_ids,
        attention_mask=inputs.attention_mask,
    )


torch.Size([1000, 67])


In [79]:
# Make embedding vectors out of torch.Size([1000, 67, 768])
# Average over the sequence dimension, counting only real (non-padding) tokens
# torch.Size([1000, 768])

# NOTE(review): assumes `inputs` is still the tokenizer output that produced
# `representations` — confirm the cells were run in order.
hidden_states = representations.last_hidden_state
# 1.0 for real tokens, 0.0 for padding; trailing axis added so it broadcasts over hidden size.
token_weights = inputs.attention_mask.unsqueeze(-1).to(hidden_states.dtype)
# Guard against division by zero for a row without any real token.
token_counts = token_weights.sum(dim=1).clamp(min=1)
title_embeddings = (hidden_states * token_weights).sum(dim=1) / token_counts
title_embeddings.shape

torch.Size([1000, 768])

In [87]:
import numpy as np
import pandas as pd
import plotly.express as px
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA

# Move the title embeddings to NumPy and project them to 2-D with t-SNE (fixed seed).
embeddings = title_embeddings.cpu().numpy()
tsne = TSNE(n_components=2, random_state=0)
embeddings_2d = tsne.fit_transform(embeddings)

# One row per title: its 2-D coordinates plus the title text for hovering.
tsne_df = pd.DataFrame(embeddings_2d, columns=['x', 'y'])
tsne_df['title'] = titles

# Boolean column marking titles that contain the substring of interest.
specific_string = "x-ray"  # Change this to your desired string
tsne_df['highlight'] = [specific_string in title_text for title_text in tsne_df['title']]

# Interactive scatter plot, coloured by the highlight flag.
fig = px.scatter(
    tsne_df,
    x='x',
    y='y',
    color='highlight',
    hover_data=['title'],
    title='t-SNE Visualization of Word Embeddings',
    width=1000,
    height=1000,
)
fig.update_layout(xaxis_title='x', yaxis_title='y')
fig.show()


In [84]:
# Get vocab of titles

from collections import Counter

# Tally whitespace-separated words across all titles.
vocab = Counter(word for title_text in titles for word in title_text.split())

vocab

Counter({'of': 589,
         'the': 467,
         'in': 324,
         'and': 321,
         'a': 202,
         'for': 152,
         'on': 126,
         'with': 103,
         'quantum': 65,
         'to': 61,
         'from': 58,
         'at': 43,
         'theory': 41,
         'model': 39,
         'an': 37,
         'energy': 31,
         'field': 30,
         'by': 29,
         'phase': 25,
         'spin': 25,
         'systems': 24,
         'analysis': 22,
         'evolution': 19,
         'as': 19,
         'emission': 19,
         'approach': 19,
         'effects': 18,
         'new': 18,
         'equations': 18,
         'dynamics': 18,
         'dark': 17,
         'states': 17,
         'finite': 17,
         'solutions': 17,
         'state': 17,
         'star': 17,
         'structure': 16,
         'properties': 15,
         'data': 15,
         'transition': 15,
         'magnetic': 15,
         'x-ray': 15,
         'formation': 14,
         'fields': 14,
         '