## Conditional BERT

In [1]:
import os
import shutil
import sys
import subprocess

import nltk

REPO_LOCATION = 'https://gitlab.com/korzeniowski.renard/text-augmentation.git'
REPO_NAME = 'text-augmentation'
REPO_BRANCH = 'master'
PACKAGES = ["datasets", "fastai", "nlpaug", "transformers"]

# Clone the repository, always starting from a clean checkout.
# NOTE: REPO_NAME is resolved against the current working directory and this
# cell ends by chdir-ing into the checkout, so re-running it without restarting
# the kernel clones a nested copy inside the existing one.
if os.path.exists(REPO_NAME):
    print("Removing existing repo")
    shutil.rmtree(REPO_NAME)
print('cloning the repository')
# check_call raises CalledProcessError on a non-zero exit status, so a failed
# clone stops the cell here instead of surfacing later as a confusing error.
subprocess.check_call(['git', 'clone', '-b', REPO_BRANCH, REPO_LOCATION])

# Make the repository importable. The path is made absolute because the cell
# changes directory below, which would leave a relative entry pointing nowhere.
sys.path.append(os.path.abspath(REPO_NAME))

# Install packages
print('installing packages:', ', '.join(PACKAGES))
# 'install' and '-U' must be separate argv entries; a single 'install -U'
# token is rejected by pip as an unknown command. Running pip through
# sys.executable installs into the interpreter that runs this kernel.
subprocess.check_call([sys.executable, '-m', 'pip', 'install', '-U'] + PACKAGES)
nltk.download('punkt')

# Run code: the training script is invoked relative to the repository root.
os.chdir(REPO_NAME)

cloning the repository
installing packages: datasets, fastai, nlpaug, transformers
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [2]:
# Install the notebook's dependencies with the %pip magic. These are the same
# four packages listed in PACKAGES in the setup cell; only fastai is given -U.
%pip install datasets
%pip install -U fastai
%pip install nlpaug
%pip install transformers

Collecting datasets
[?25l  Downloading https://files.pythonhosted.org/packages/1a/38/0c24dce24767386123d528d27109024220db0e7a04467b658d587695241a/datasets-1.1.3-py3-none-any.whl (153kB)
[K     |████████████████████████████████| 163kB 13.9MB/s 
Collecting pyarrow>=0.17.1
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e1/27958a70848f8f7089bff8d6ebe42519daf01f976d28b481e1bfd52c8097/pyarrow-2.0.0-cp36-cp36m-manylinux2014_x86_64.whl (17.7MB)
[K     |████████████████████████████████| 17.7MB 206kB/s 
Collecting xxhash
[?25l  Downloading https://files.pythonhosted.org/packages/f7/73/826b19f3594756cb1c6c23d2fbd8ca6a77a9cd3b650c9dec5acc85004c38/xxhash-2.0.0-cp36-cp36m-manylinux2010_x86_64.whl (242kB)
[K     |████████████████████████████████| 245kB 57.2MB/s 
Installing collected packages: pyarrow, xxhash, datasets
  Found existing installation: pyarrow 0.14.1
    Uninstalling pyarrow-0.14.1:
      Successfully uninstalled pyarrow-0.14.1
Successfully installed datasets-1.1.3 p

# Training Conditional Masked Model

In [11]:
# Train the conditional masked LM on the yelp dataset, starting from roberta-base.
# Checkpoints are written every 10 steps and evaluation runs every 10 steps.
!python3 run_conditional_language_modeling.py \
--output_dir /content/drive/MyDrive/Colab\ Notebooks/nlp/pretrained_models/yelp_conditional \
--model_name_or_path roberta-base \
--do_train \
--do_eval \
--dataset_name yelp \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--gradient_accumulation_steps 4 \
--num_train_epochs 10 \
--logging_steps 1 \
--save_steps 10 \
--eval_steps 10 \
--evaluation_strategy steps \
--overwrite_output_dir

2020-12-07 21:12:07.890000: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
12/07/2020 21:12:10 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='/content/drive/MyDrive/Colab Notebooks/nlp/pretrained_models/yelp_conditional', overwrite_output_dir=True, do_train=True, do_eval=True, do_predict=False, evaluation_strategy=<EvaluationStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=10.0, max_steps=-1, warmup_steps=0, logging_dir='runs/Dec07_21-12-10_4892d22f056d', logging_first_step=False, logging_steps=1, save_steps=10, save_total_limit=None, no_cuda=False, seed=42, fp16=False,

# Evaluating Standard MLM

In [41]:
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

In [71]:
# Baseline: the stock roberta-base tokenizer and masked-LM model, in eval mode.
tokenizer = AutoTokenizer.from_pretrained('roberta-base')
model = AutoModelForMaskedLM.from_pretrained('roberta-base')
model = model.eval()
# vocab_words[i] is the decoded text of token id i; later cells zip it with
# the per-token probabilities to get readable (word, probability) pairs.
vocab_words = list(map(tokenizer.decode, range(len(tokenizer))))

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at roberta-base and are newly initialized: ['lm_head.decoder.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [72]:
# Probe sentence with a single masked slot; the mask string is taken from the
# tokenizer rather than hard-coded.
sentence = f'I think this restaurant is the {tokenizer.mask_token} in the city'
sentence

'I think this restaurant is the <mask> in the city'

In [73]:
# Encode the sentence as PyTorch tensors and keep the token ids for locating
# the mask afterwards.
tokenizer_output = tokenizer(sentence, return_tensors='pt')
input_ids = tokenizer_output['input_ids']
# Inference only, so gradient tracking is switched off for the forward pass.
with torch.no_grad():
    logits = model(**tokenizer_output).logits

In [75]:
# Index of the mask token: argmax over the 0/1 match tensor returns the first match.
# NOTE(review): argmax is taken over the flattened input_ids, so this equals the
# sequence position only for a single-sentence batch, and it yields 0 when the
# sentence contains no mask token.
masked_position = (input_ids == tokenizer.mask_token_id).long().argmax().item()

In [86]:
# Softmax over the vocabulary at the masked slot, then rank every token by
# probability, highest first.
probas = torch.softmax(logits[0, masked_position].detach(), dim=0).numpy()
words_probas = sorted(
    zip(vocab_words, probas),
    key=lambda word_proba: word_proba[1],
    reverse=True,
)
# Ten most likely fillers for the mask.
words_probas[:10]

[(' best', 0.94857913),
 (' finest', 0.0108271325),
 (' coolest', 0.005482896),
 (' worst', 0.0053674923),
 (' newest', 0.004469985),
 (' hottest', 0.0044308784),
 (' greatest', 0.0027584091),
 (' smartest', 0.0020890823),
 (' biggest', 0.0017668337),
 (' top', 0.0014481086)]

In [84]:
# Spot-check a single id -> text mapping from the tokenizer.
tokenizer.decode(3)

'<unk>'

# Evaluating Conditional MLM

In [110]:
# Swap in the conditional model saved by the training run above (checkpoint-40).
# `tokenizer` and `vocab_words` from the roberta-base cell are reused as-is.
model = AutoModelForMaskedLM.from_pretrained('/content/drive/MyDrive/Colab Notebooks/nlp/pretrained_models/yelp_conditional/checkpoint-40').eval()

## Positive sentence

In [111]:
# Condition on sentiment by prefixing the label word and a separator token.
# NOTE(review): presumably mirrors the input format used by
# run_conditional_language_modeling.py during training; confirm against the script.
sentence = f'positive {tokenizer.sep_token} I think this restaurant is among the {tokenizer.mask_token} in the city'
sentence

'positive </s> I think this restaurant is among the <mask> in the city'

In [113]:
# Encode the positive-conditioned sentence and keep the token ids for locating
# the mask afterwards.
tokenizer_output = tokenizer(sentence, return_tensors='pt')
input_ids = tokenizer_output['input_ids']
# Inference only, so gradient tracking is switched off for the forward pass.
with torch.no_grad():
    logits = model(**tokenizer_output).logits

In [114]:
# Flat index of the first mask token in input_ids.
masked_position = torch.argmax((input_ids == tokenizer.mask_token_id).long()).item()
# Probability of every vocabulary entry at that slot, ranked highest first.
probas = torch.softmax(logits[0, masked_position].detach(), dim=0).numpy()
words_probas = sorted(
    zip(vocab_words, probas),
    key=lambda word_proba: word_proba[1],
    reverse=True,
)
# Ten most likely fillers under the positive condition.
words_probas[:10]

[(' best', 0.9108875),
 (' finest', 0.04119079),
 (' better', 0.018189408),
 (' top', 0.014489585),
 (' hottest', 0.0024455837),
 (' greatest', 0.0018530154),
 (' great', 0.00081572926),
 (' stars', 0.0007540227),
 (' coolest', 0.00060782227),
 (' safest', 0.00051225425)]

In [115]:
# Same probe as the positive case, with the label word switched to 'negative'.
sentence = f'negative {tokenizer.sep_token} I think this restaurant is among the {tokenizer.mask_token} in the city'
sentence

'negative </s> I think this restaurant is among the <mask> in the city'

In [116]:
# Encode the negative-conditioned sentence and keep the token ids for locating
# the mask afterwards.
tokenizer_output = tokenizer(sentence, return_tensors='pt')
input_ids = tokenizer_output['input_ids']
# Inference only, so gradient tracking is switched off for the forward pass.
with torch.no_grad():
    logits = model(**tokenizer_output).logits

In [117]:
# Flat index of the first mask token in input_ids.
masked_position = torch.argmax((input_ids == tokenizer.mask_token_id).long()).item()
# Probability of every vocabulary entry at that slot, ranked highest first.
probas = torch.softmax(logits[0, masked_position].detach(), dim=0).numpy()
words_probas = sorted(
    zip(vocab_words, probas),
    key=lambda word_proba: word_proba[1],
    reverse=True,
)
# Ten most likely fillers under the negative condition.
words_probas[:10]

[(' best', 0.8484191),
 (' better', 0.045180257),
 (' worst', 0.040889587),
 (' finest', 0.03282445),
 (' top', 0.010033123),
 (' hottest', 0.0025719462),
 (' cheapest', 0.0021640842),
 (' greatest', 0.0018757473),
 (' smallest', 0.0011033808),
 (' busiest', 0.00088208786)]

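When conditioned on the negative token, 'worst' is the third most probable word (about 4%). It does not appear in the top 10 when conditioned on the positive token. It also appears in the standard model's top 10 (about 0.5%), although that probe used a slightly different sentence.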



# Training Conditional MLM starting from a finetuned model

In [109]:
# Same conditional training as before, but initialised from the Yelp-finetuned
# RoBERTa checkpoint; the tokenizer is still loaded from roberta-base.
!python3 run_conditional_language_modeling.py \
--output_dir '/content/drive/MyDrive/Colab Notebooks/nlp/pretrained_models/yelp_conditional_finetuned' \
--model_name_or_path '/content/drive/MyDrive/Colab Notebooks/nlp/pretrained_models/yelp_roberta/checkpoint-350' \
--tokenizer_name roberta-base \
--do_train \
--do_eval \
--dataset_name yelp \
--per_device_train_batch_size 8 \
--per_device_eval_batch_size 8 \
--gradient_accumulation_steps 4 \
--num_train_epochs 10 \
--logging_steps 1 \
--save_steps 10 \
--eval_steps 10 \
--evaluation_strategy steps

2020-12-07 21:59:16.077979: I tensorflow/stream_executor/platform/default/dso_loader.cc:48] Successfully opened dynamic library libcudart.so.10.1
12/07/2020 21:59:18 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='/content/drive/MyDrive/Colab Notebooks/nlp/pretrained_models/yelp_conditional_finetuned', overwrite_output_dir=False, do_train=True, do_eval=True, do_predict=False, evaluation_strategy=<EvaluationStrategy.STEPS: 'steps'>, prediction_loss_only=False, per_device_train_batch_size=8, per_device_eval_batch_size=8, per_gpu_train_batch_size=None, per_gpu_eval_batch_size=None, gradient_accumulation_steps=4, eval_accumulation_steps=None, learning_rate=5e-05, weight_decay=0.0, adam_beta1=0.9, adam_beta2=0.999, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=10.0, max_steps=-1, warmup_steps=0, logging_dir='runs/Dec07_21-59-18_4892d22f056d', logging_first_step=False, logging_steps=1, save_steps=10, save_total_limit=None, no_cuda=False, seed=42, 

In [128]:
# Load checkpoint-40 of the run that started from the finetuned model.
# `tokenizer` and `vocab_words` from the roberta-base cell are reused as-is.
model = AutoModelForMaskedLM.from_pretrained('/content/drive/MyDrive/Colab Notebooks/nlp/pretrained_models/yelp_conditional_finetuned/checkpoint-40').eval()

In [129]:
# Positive-conditioned probe: label word, separator, then the masked sentence.
sentence = f'positive {tokenizer.sep_token} I think this restaurant is among the {tokenizer.mask_token} in the city'

# Forward pass without gradient tracking.
tokenizer_output = tokenizer(sentence, return_tensors='pt')
input_ids = tokenizer_output['input_ids']
with torch.no_grad():
    logits = model(**tokenizer_output).logits

# Locate the mask, turn its logits into probabilities and rank the vocabulary.
masked_position = torch.argmax((input_ids == tokenizer.mask_token_id).long()).item()
probas = torch.softmax(logits[0, masked_position].detach(), dim=0).numpy()
words_probas = sorted(
    zip(vocab_words, probas),
    key=lambda word_proba: word_proba[1],
    reverse=True,
)
words_probas[:10]

[(' best', 0.92760354),
 (' better', 0.028655292),
 (' finest', 0.019014917),
 (' top', 0.010744136),
 (' greatest', 0.0019361579),
 (' hottest', 0.0011975307),
 (' worst', 0.00088472496),
 (' great', 0.00066560134),
 (' BEST', 0.0005510724),
 (' stars', 0.00053330633)]

In [130]:
# Negative-conditioned probe: label word, separator, then the masked sentence.
sentence = f'negative {tokenizer.sep_token} I think this restaurant is among the {tokenizer.mask_token} in the city'

# Forward pass without gradient tracking.
tokenizer_output = tokenizer(sentence, return_tensors='pt')
input_ids = tokenizer_output['input_ids']
with torch.no_grad():
    logits = model(**tokenizer_output).logits

# Locate the mask, turn its logits into probabilities and rank the vocabulary.
masked_position = torch.argmax((input_ids == tokenizer.mask_token_id).long()).item()
probas = torch.softmax(logits[0, masked_position].detach(), dim=0).numpy()
words_probas = sorted(
    zip(vocab_words, probas),
    key=lambda word_proba: word_proba[1],
    reverse=True,
)
words_probas[:10]

[(' best', 0.4735862),
 (' worst', 0.43094367),
 (' better', 0.044027366),
 (' worse', 0.009041737),
 (' finest', 0.007861966),
 (' cheapest', 0.00685714),
 (' top', 0.0043341867),
 (' smallest', 0.0027911544),
 (' weakest', 0.0020172282),
 (' poorest', 0.0015362091)]

This works much better. 'Worst' is now the second most probable word, with a probability of 43% (up from about 4% when training started from plain roberta-base).

## Last checkpoint
We took the fourth checkpoint (checkpoint-40) because it had a lower validation loss. However, the validation loss is noisy (the validation dataset has only 50 examples). Let's take the latest checkpoint (checkpoint-100) to see how it does.

In [140]:
# Load the last checkpoint (checkpoint-100) of the same finetuned-start run.
# `tokenizer` and `vocab_words` from the roberta-base cell are reused as-is.
model = AutoModelForMaskedLM.from_pretrained('/content/drive/MyDrive/Colab Notebooks/nlp/pretrained_models/yelp_conditional_finetuned/checkpoint-100').eval()

In [141]:
# Positive-conditioned probe: label word, separator, then the masked sentence.
sentence = f'positive {tokenizer.sep_token} I think this restaurant is among the {tokenizer.mask_token} in the city'

# Forward pass without gradient tracking.
tokenizer_output = tokenizer(sentence, return_tensors='pt')
input_ids = tokenizer_output['input_ids']
with torch.no_grad():
    logits = model(**tokenizer_output).logits

# Locate the mask, turn its logits into probabilities and rank the vocabulary.
masked_position = torch.argmax((input_ids == tokenizer.mask_token_id).long()).item()
probas = torch.softmax(logits[0, masked_position].detach(), dim=0).numpy()
words_probas = sorted(
    zip(vocab_words, probas),
    key=lambda word_proba: word_proba[1],
    reverse=True,
)
words_probas[:10]

[(' best', 0.9244944),
 (' better', 0.032192133),
 (' finest', 0.021731451),
 (' top', 0.006033973),
 (' greatest', 0.0022383414),
 (' worst', 0.0013920541),
 (' safest', 0.0010354719),
 (' great', 0.0008828963),
 (' hottest', 0.0008741495),
 (' cheapest', 0.00076614734)]

In [142]:
# Negative-conditioned probe: label word, separator, then the masked sentence.
sentence = f'negative {tokenizer.sep_token} I think this restaurant is among the {tokenizer.mask_token} in the city'

# Forward pass without gradient tracking.
tokenizer_output = tokenizer(sentence, return_tensors='pt')
input_ids = tokenizer_output['input_ids']
with torch.no_grad():
    logits = model(**tokenizer_output).logits

# Locate the mask, turn its logits into probabilities and rank the vocabulary.
masked_position = torch.argmax((input_ids == tokenizer.mask_token_id).long()).item()
probas = torch.softmax(logits[0, masked_position].detach(), dim=0).numpy()
words_probas = sorted(
    zip(vocab_words, probas),
    key=lambda word_proba: word_proba[1],
    reverse=True,
)
words_probas[:10]

[(' worst', 0.59666413),
 (' best', 0.31662795),
 (' better', 0.03332686),
 (' worse', 0.01389257),
 (' cheapest', 0.010395007),
 (' finest', 0.006077247),
 (' smallest', 0.0027908573),
 (' weakest', 0.001942544),
 (' top', 0.0016419988),
 (' poorest', 0.0015085399)]

It does even better after more training. 'Worst' is now the most probable word, so this model seems to capture sentiment quite well. Maybe it would keep improving with more training - let's check.