# Google Colab file for paraphrasing the MultiNLI development set. 
#### This version uses a large BART seq2seq (text2text generation) model that was fine-tuned on three paraphrase datasets.

Model page: https://huggingface.co/eugenesiow/bart-paraphrase

In [None]:
# install libraries
!pip install SentencePiece 
!pip install transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting SentencePiece
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m63.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SentencePiece
Successfully installed SentencePiece-0.1.99
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m104.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
Collecting t

In [None]:
# import libraries
import torch
import json
from datetime import datetime
from transformers import BartForConditionalGeneration, BartTokenizer

In [None]:
# set seed and torch device
# Colab notebook setting "runtime type" should be set to GPU; when no GPU is
# attached the notebook falls back to the CPU instead of failing as soon as
# the model is moved to the device
torch.manual_seed(0)
torch.cuda.manual_seed_all(0)
device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cpu":
  print("No GPU detected: running on CPU, expect paraphrasing to be slow.")

In [None]:
# fetch the pretrained paraphrase tokenizer and model, then place the model on the torch device
tokenizer = BartTokenizer.from_pretrained('eugenesiow/bart-paraphrase')
model = BartForConditionalGeneration.from_pretrained('eugenesiow/bart-paraphrase')
model = model.to(device)

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.69k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/332 [00:00<?, ?B/s]

In [None]:
# define functions
def paraphrase_sentence(sentence):
  """Paraphrase a single sentence with the BART model.

  Uses the module-level `tokenizer`, `model` and `device`.

  Args:
    sentence: the text to paraphrase.

  Returns:
    The first decoded sequence produced by `model.generate`, with special
    tokens removed.
  """
  # tokenize the sentence; clip inputs longer than the model's position limit
  # so an unusually long text cannot overflow the position embeddings
  batch = tokenizer(sentence,
                    return_tensors='pt',
                    truncation=True,
                    max_length=model.config.max_position_embeddings).to(device)
  # run through the BART model, forwarding the attention mask explicitly
  # (generated sequences are capped at 128 tokens)
  generated_ids = model.generate(batch['input_ids'],
                                 attention_mask=batch['attention_mask'],
                                 max_length=128)
  return tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]

def paraphrase_dataset(path_original, path_out):
  """Paraphrase every example of a JSON-lines dataset and write the result.

  Each non-blank input line is parsed as a JSON object from which the
  "label", "premise" and "hypothesis" keys are read. The label is copied,
  the premise and hypothesis are passed through `paraphrase_sentence`, and
  the new object is written as one JSON line to the output file. The line
  index and current time are printed for every 400th line as a progress log.

  Args:
    path_original: path of the input JSON-lines file.
    path_out: path of the output JSON-lines file (overwritten if it exists).

  Raises:
    OSError: if either file cannot be opened.
    json.JSONDecodeError: if a non-blank line is not valid JSON.
    KeyError: if a record lacks one of the three expected keys.
  """
  # open the input first: a wrong or missing input path then fails before the
  # output file is created or an existing one is truncated
  with open(path_original, "r", encoding="utf-8") as f, open(path_out, "w", encoding="utf-8") as out_f:
    for id_, row in enumerate(f):
      # tolerate blank lines (e.g. an empty last line) instead of crashing in json.loads
      if not row.strip():
        continue
      data = json.loads(row)
      # paraphrase the sentences
      new_data = {"label": data["label"],
                  "premise": paraphrase_sentence(data["premise"]),
                  "hypothesis": paraphrase_sentence(data["hypothesis"])}

      # checking progress
      if id_ % 400 == 0:
        print(id_, datetime.now())

      # export the sentences
      json.dump(new_data, out_f)
      out_f.write('\n')

In [None]:
# mount Google Drive at /content/drive so the dataset files under
# /content/drive/MyDrive can be read and written by the cells below
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# paraphrase both MultiNLI dev splits: each input is read from Google Drive
# and its paraphrased version is exported back to Google Drive
dataset_paths = [
    ("/content/drive/MyDrive/thesis/multinli_dev_matched.jsonl",
     "/content/drive/MyDrive/thesis/multinli_dev_matched_bart.jsonl"),
    ("/content/drive/MyDrive/thesis/multinli_dev_mismatched.jsonl",
     "/content/drive/MyDrive/thesis/multinli_dev_mismatched_bart.jsonl"),
]
for source_path, target_path in dataset_paths:
  paraphrase_dataset(source_path, target_path)

0 2023-05-30 13:07:23.367927
400 2023-05-30 13:11:59.434058
800 2023-05-30 13:16:32.527289
1200 2023-05-30 13:20:59.015041
1600 2023-05-30 13:25:30.014289
2000 2023-05-30 13:30:03.424603
2400 2023-05-30 13:34:35.423831
2800 2023-05-30 13:39:03.668556
3200 2023-05-30 13:43:20.697612
3600 2023-05-30 13:47:54.439071
4000 2023-05-30 13:52:07.572423
4400 2023-05-30 13:56:28.263303
4800 2023-05-30 14:00:46.083281
5200 2023-05-30 14:05:09.343819
5600 2023-05-30 14:09:23.700766
6000 2023-05-30 14:13:45.886004
6400 2023-05-30 14:18:06.525594
6800 2023-05-30 14:22:25.314218
7200 2023-05-30 14:26:54.439776
7600 2023-05-30 14:31:35.800136
8000 2023-05-30 14:36:21.398219
8400 2023-05-30 14:41:08.413496
8800 2023-05-30 14:45:37.925339
9200 2023-05-30 14:50:15.880195
9600 2023-05-30 14:54:56.467328
0 2023-05-30 14:59:33.284349
400 2023-05-30 15:04:18.342472
800 2023-05-30 15:08:59.916630
1200 2023-05-30 15:13:29.475931
1600 2023-05-30 15:18:13.925483
2000 2023-05-30 15:23:10.581145
2400 2023-05-30 15