## Installing dependencies

In [4]:
# Install the libraries used by this notebook.
# The recorded run below resolved simpletransformers 0.61.9 and datasets 1.8.0.
!pip install simpletransformers datasets tqdm pandas

Collecting simpletransformers
[?25l  Downloading https://files.pythonhosted.org/packages/24/fc/3da256b01385dcecd52f79c11cc493a4cfbef8e6d1a6a62d98e8536c2993/simpletransformers-0.61.9-py3-none-any.whl (220kB)
[K     |████████████████████████████████| 225kB 7.7MB/s 
[?25hCollecting datasets
[?25l  Downloading https://files.pythonhosted.org/packages/08/a2/d4e1024c891506e1cee8f9d719d20831bac31cb5b7416983c4d2f65a6287/datasets-1.8.0-py3-none-any.whl (237kB)
[K     |████████████████████████████████| 245kB 12.9MB/s 
Collecting tensorboardx
[?25l  Downloading https://files.pythonhosted.org/packages/42/36/2b147652c40c3a858efa0afbf7b8236fae968e88ff530511a4cfa299a506/tensorboardX-2.3-py2.py3-none-any.whl (124kB)
[K     |████████████████████████████████| 133kB 14.6MB/s 
Collecting wandb>=0.10.32
[?25l  Downloading https://files.pythonhosted.org/packages/e0/b4/9d92953d8cddc8450c859be12e3dbdd4c7754fb8def94c28b3b351c6ee4e/wandb-0.10.32-py2.py3-none-any.whl (1.8MB)
[K     |█████████████████████

## Loading data from Hugging Face (optional)

In [None]:
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm

In [None]:
# Fetch the 'en' configuration of the TaPaCo dataset.
dataset = load_dataset('tapaco', 'en')

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=3015.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=4687.0, style=ProgressStyle(description…


Downloading and preparing dataset tapaco/en (download: 30.72 MiB, generated: 14.37 MiB, post-processed: Unknown size, total: 45.09 MiB) to /root/.cache/huggingface/datasets/tapaco/en/1.0.0/71d200534b520a174927a8f0479c06220a0a6fb5201a84ebfce19006c6354698...


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=32213126.0, style=ProgressStyle(descrip…




HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Dataset tapaco downloaded and prepared to /root/.cache/huggingface/datasets/tapaco/en/1.0.0/71d200534b520a174927a8f0479c06220a0a6fb5201a84ebfce19006c6354698. Subsequent calls will reuse this data.


In [None]:
def process_tapaco_dataset(dataset, out_file):
    """Flatten the TaPaCo ``train`` split into a DataFrame and save it to disk.

    Args:
        dataset: Mapping with a ``"train"`` entry that iterates over dict-like
            records, each holding the six fields listed in ``columns`` below.
        out_file: Destination path. The frame is written tab-separated and
            without the index.

    Returns:
        pandas.DataFrame with one row per record and the six columns below.

    Raises:
        KeyError: If a record is missing one of the expected fields.
    """
    columns = [
        "language",
        "lists",
        "paraphrase",
        "paraphrase_set_id",
        "sentence_id",
        "tags",
    ]
    # The dataset has only train split.
    # Every field is looked up by name so each value lands under its own
    # column label regardless of the order in which a record lists its keys.
    tapaco = [
        [data[column] for column in columns] for data in tqdm(dataset["train"])
    ]
    tapaco_df = pd.DataFrame(data=tapaco, columns=columns)
    tapaco_df.to_csv(out_file, sep="\t", index=False)
    return tapaco_df

In [None]:
# Flatten the train split and save it; the file is tab-separated even though it is named .csv.
tapaco_df = process_tapaco_dataset(dataset,"tapaco_huggingface.csv")

100%|██████████| 158053/158053 [00:22<00:00, 6997.11it/s]


In [None]:
tapaco_df.head()

Unnamed: 0,language,lists,paraphrase,paraphrase_set_id,sentence_id,tags
0,en,"['907', '4000', '6677', '7361', '7415']",I ate the cheese.,1,416554,['']
1,en,['907'],I eat cheese.,1,2481696,['']
2,en,"['992', '3800']",I'm eating a yogurt.,1,2721028,['']
3,en,['6905'],I'm eating cheese.,1,3010891,['']
4,en,['6905'],I'm having some cheese.,1,4129977,['']


## Preprocessing TaPaCo for training (optional)

In [None]:
import pandas as pd
from tqdm import tqdm

In [None]:
# Reload the tab-separated dump written by the previous section.
tapaco_df = pd.read_csv("tapaco_huggingface.csv",sep="\t")

In [None]:
def generate_tapaco_paraphrase_dataset(dataset, out_file):
    """Pair up consecutive sentences inside each TaPaCo paraphrase set.

    Rows that share a ``paraphrase_set_id`` are taken in their original order
    and split into non-overlapping pairs (1st with 2nd, 3rd with 4th, ...).
    The leftover sentence of an odd-sized set is dropped, and sets with a
    single sentence produce no pair. Sets are emitted in order of first
    appearance in ``dataset``.

    Args:
        dataset: DataFrame with ``paraphrase`` and ``paraphrase_set_id`` columns.
        out_file: Destination path. The pairs are written tab-separated and
            without the index.

    Returns:
        pandas.DataFrame with ``Text`` and ``Paraphrase`` columns, one row per
        pair.
    """
    dataset_df = dataset[["paraphrase", "paraphrase_set_id"]]

    # Count set sizes once and keep only sets that can form at least one pair.
    set_sizes = dataset_df["paraphrase_set_id"].value_counts()
    non_single_labels = set_sizes[set_sizes > 1].index
    tapaco_df_sorted = dataset_df.loc[
        dataset_df["paraphrase_set_id"].isin(non_single_labels)
    ]

    # A single groupby pass replaces re-filtering the whole frame for every
    # set id. sort=False keeps sets in first-appearance order, and rows keep
    # their original order inside each set.
    phrases_by_set = tapaco_df_sorted.groupby("paraphrase_set_id", sort=False)[
        "paraphrase"
    ]

    tapaco_paraphrases_dataset = []
    for _, set_phrases in tqdm(phrases_by_set):
        phrases = set_phrases.tolist()
        # zip stops at the shorter slice, which drops the unpaired last
        # sentence of an odd-sized set.
        tapaco_paraphrases_dataset.extend(
            [current_phrase, next_phrase]
            for current_phrase, next_phrase in zip(phrases[0::2], phrases[1::2])
        )

    tapaco_paraphrases_dataset_df = pd.DataFrame(
        tapaco_paraphrases_dataset, columns=["Text", "Paraphrase"]
    )
    tapaco_paraphrases_dataset_df.to_csv(out_file, sep="\t", index=False)
    return tapaco_paraphrases_dataset_df

In [None]:
# Build the (Text, Paraphrase) pair table and save it as a tab-separated file.
dataset_df = generate_tapaco_paraphrase_dataset(tapaco_df,"tapaco_paraphrases_dataset.csv")

100%|██████████| 62044/62044 [01:28<00:00, 698.45it/s]


In [None]:
dataset_df.head()

Unnamed: 0,Text,Paraphrase
0,I ate the cheese.,I eat cheese.
1,I'm eating a yogurt.,I'm eating cheese.
2,I'm having some cheese.,I eat some cheese.
3,It's Monday.,It is Monday today.
4,It's Monday today.,Today is Monday.


## Load already preprocessed version of TaPaCo

In [None]:
# Download the already-preprocessed pair file, so the two sections marked optional can be skipped.
!wget https://github.com/hetpandya/paraphrase-datasets-pretrained-models/raw/main/datasets/tapaco/tapaco_paraphrases_dataset.csv

In [None]:
# pandas is otherwise only imported inside the sections marked optional;
# import it here too so this cell works when those sections are skipped.
import pandas as pd

# The file is tab-separated even though it is named .csv.
dataset_df = pd.read_csv("tapaco_paraphrases_dataset.csv", sep="\t")

## Model Training

In [None]:
from simpletransformers.t5 import T5Model
from sklearn.model_selection import train_test_split
import sklearn

In [None]:
# Label the first two columns as input/target and tag every row with the task
# prefix. This builds a new frame instead of assigning to `.columns` in place,
# so re-running the cell after `prefix` has been added no longer fails with a
# length mismatch.
dataset_df = (
    dataset_df.iloc[:, :2]
    .set_axis(["input_text", "target_text"], axis=1)
    .assign(prefix="paraphrase")
)

In [None]:
# Hold out 10% of the pairs. The split seed is fixed so the same rows end up
# in the held-out set on every run.
train_data, test_data = train_test_split(dataset_df, test_size=0.1, random_state=42)

In [None]:
train_data

Unnamed: 0,input_text,target_text,prefix
10426,Everyone kept quiet.,Everything is quiet.,paraphrase
58725,I'm going to cancel in another two days.,I'm going to cancel in a couple of days.,paraphrase
28478,That's what troubles me.,That's what bothers me.,paraphrase
62231,When was the last time you visited Boston?,When did you visit Boston last?,paraphrase
62067,I came in early.,I came early.,paraphrase
...,...,...,...
44779,A lot of people look up to you. Don't let them...,Many people worship you. Do not disappoint them.,paraphrase
23039,I must have lost my purse in the supermarket.,It seems I forgot my wallet at the supermarket.,paraphrase
16486,Tom doesn't understand anything of french.,Tom speaks no French.,paraphrase
58384,I've considered that possibility as well.,I've also considered that possibility.,paraphrase


In [None]:
test_data

Unnamed: 0,input_text,target_text,prefix
66729,Tom should've been more patient.,Tom should have been more patient.,paraphrase
48228,"If you don't study, you won't pass the exam.",You won't pass the exam if you don't study for...,paraphrase
51009,It's wrong of you to talk back to her.,You're out of order in talking back to her in ...,paraphrase
17720,He asked after you last night.,"Last night, he asked if you were well.",paraphrase
71420,Tom and Mary live in Australia with their father.,Tom and Mary live with their father in Australia.,paraphrase
...,...,...,...
35111,The substance is light enough to float on the ...,The material is light enough to float in water.,paraphrase
1358,They know him.,They know you.,paraphrase
67137,Tom has no one to advise him.,Tom has nobody to advise him.,paraphrase
31977,I see a boy.,I see the man.,paraphrase


In [None]:
# Training / generation options passed to T5Model.
args = {
    "reprocess_input_data": True,
    "overwrite_output_dir": True,
    "max_seq_length": 256,
    "num_train_epochs": 4,
    # Generation settings: sample with top-k / nucleus filtering.
    "num_beams": None,
    "do_sample": True,
    "top_k": 50,
    "top_p": 0.95,
    "use_multiprocessing": False,
    "save_steps": -1,
    "save_eval_checkpoints": True,
    "evaluate_during_training": False,
    "adam_epsilon": 1e-08,
    "eval_batch_size": 6,
    # The option is spelled "fp16"; the previous "fp_16" key was not a
    # recognised option name, so mixed precision was never actually disabled.
    "fp16": False,
    "gradient_accumulation_steps": 16,
    "learning_rate": 0.0003,
    "max_grad_norm": 1.0,
    "n_gpu": 1,
    # The option is spelled "manual_seed"; the previous "seed" key was not a
    # recognised option name, so the run was left unseeded.
    "manual_seed": 42,
    "train_batch_size": 6,
    "warmup_steps": 0,
    "weight_decay": 0.0,
}

In [None]:
# Create the T5 model from the "t5-small" checkpoint, configured with the args above.
model = T5Model("t5","t5-small", args=args)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1197.0, style=ProgressStyle(description…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=242065649.0, style=ProgressStyle(descri…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=791656.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1389353.0, style=ProgressStyle(descript…




In [None]:
# Extra keyword arguments to train_model are treated as metric functions
# (name -> callable), so `use_cuda=True` does not belong here: device selection
# is a T5Model constructor option, which already defaults to CUDA.
model.train_model(train_data, eval_data=test_data, acc=sklearn.metrics.accuracy_score)

HBox(children=(FloatProgress(value=0.0, max=66166.0), HTML(value='')))




Using Adafactor for T5


HBox(children=(FloatProgress(value=0.0, description='Epoch', max=4.0, style=ProgressStyle(description_width='i…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 0 of 4', max=11028.0, style=ProgressStyle(d…





HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 4', max=11028.0, style=ProgressStyle(d…

HBox(children=(FloatProgress(value=0.0, description='Running Epoch 1 of 4', max=11028.0, style=ProgressStyle(d…

## Loading the Trained Model & Generating Predictions

In [6]:
from simpletransformers.t5 import T5Model
from pprint import pprint
import os

In [10]:
# Load the model from ./outputs under the current working directory.
# NOTE(review): presumably the directory train_model saved to by default; confirm.
root_dir = os.getcwd()
trained_model_path = os.path.join(root_dir,"outputs")

In [11]:
# Generation options for prediction. Note that this rebinds `args`, replacing
# the training options defined earlier in the notebook.
args = {
    "overwrite_output_dir": True,
    "max_seq_length": 256,
    "max_length": 50,
    # top_k / top_p and several return sequences only apply when sampling.
    # Enable it explicitly rather than relying on whatever configuration was
    # saved alongside the trained model.
    "do_sample": True,
    "top_k": 50,
    "top_p": 0.95,
    "num_return_sequences": 5,
}

In [12]:
# Load the model stored at trained_model_path, using the generation options above.
trained_model = T5Model("t5",trained_model_path,args=args)

In [13]:
# The input is prefixed with "paraphrase: ", the same prefix value given to the training rows.
prefix = "paraphrase"
# predict takes a list of inputs; the recorded output shows one inner list of
# 5 candidates for the single input sentence.
pred = trained_model.predict([f"{prefix}: The house will be cleaned by me every Saturday."])
pprint(pred)

HBox(children=(FloatProgress(value=0.0, description='Generating outputs', max=1.0, style=ProgressStyle(descrip…






HBox(children=(FloatProgress(value=0.0, description='Decoding outputs', max=5.0, style=ProgressStyle(descripti…


[['My home will be cleaned on Saturdays.',
  'I will clean the house every Saturday.',
  'The house is going to be clean every Saturday.',
  "I'll clean the house every Saturday.",
  'I will clean the house every Saturday.']]
