<a href="https://colab.research.google.com/github/iAmKankan/Paraphrase/blob/main/rephraseit.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install simpletransformers==0.60.9

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting simpletransformers==0.60.9
  Downloading simpletransformers-0.60.9-py3-none-any.whl (206 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m206.7/206.7 KB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting seqeval
  Downloading seqeval-1.2.2.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 KB[0m [31m5.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting streamlit
  Downloading streamlit-1.16.0-py2.py3-none-any.whl (9.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.2/9.2 MB[0m [31m83.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece
  Downloading sentencepiece-0.1.97-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m72.3 MB/s[0m eta 

In [2]:
import warnings
import os
from datetime import datetime
import logging

import pandas as pd
from sklearn.model_selection import train_test_split
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs

In [3]:
# Quieten the "transformers" logger to ERROR and above, while the root logger
# is configured at INFO level.
transformers_logger = logging.getLogger("transformers")
transformers_logger.setLevel(logging.ERROR)
logging.basicConfig(level=logging.INFO)

### Data Loading Function

In [4]:
def load_data(
    file_path, input_text_column, target_text_column, label_column, keep_label=1
):
    """Load a tab-separated paraphrase corpus and keep only the selected label.

    Args:
        file_path: Path of the tab-separated file to read.
        input_text_column: Name of the column holding the source sentence.
        target_text_column: Name of the column holding the paraphrase.
        label_column: Name of the column holding the pair label.
        keep_label: Only rows whose label equals this value are kept (default 1).

    Returns:
        A DataFrame with exactly three columns: ``input_text``, ``target_text``
        and a constant ``prefix`` column set to ``"paraphrase"``.
    """
    # `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0.
    # `on_bad_lines="warn"` keeps the same behaviour: malformed rows are skipped
    # and reported instead of aborting the load.
    raw = pd.read_csv(file_path, sep="\t", on_bad_lines="warn")

    return (
        raw.loc[raw[label_column] == keep_label]
        .rename(
            columns={input_text_column: "input_text", target_text_column: "target_text"}
        )
        .loc[:, ["input_text", "target_text"]]
        .assign(prefix="paraphrase")
    )

### Data Cleaning Operations

In [5]:
def clean_unnecessary_spaces(out_string):
    """Remove the spaces that tokenised text leaves before punctuation and clitics.

    Non-string input triggers a warning and is converted with ``str()`` before
    cleaning, so the function always returns a string.
    """
    if not isinstance(out_string, str):
        warnings.warn(f">>> {out_string} <<< is not a string.")
        out_string = str(out_string)

    # Substitutions are applied one after another, in exactly this order.
    substitutions = (
        (" .", "."),
        (" ?", "?"),
        (" !", "!"),
        (" ,", ","),
        (" ' ", "'"),
        (" n't", "n't"),
        (" 'm", "'m"),
        (" 's", "'s"),
        (" 've", "'ve"),
        (" 're", "'re"),
    )
    cleaned = out_string
    for spaced, tight in substitutions:
        cleaned = cleaned.replace(spaced, tight)
    return cleaned

In [6]:
# Mount Google Drive at /content/drive (Colab-only) so the files under MyDrive
# referenced by the following cells are reachable.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
# Extract the dataset archive stored on Drive.
# NOTE(review): no -d target is given, so unzip writes into the current working
# directory, whereas a later cell changes into the RephraseitDev folder on Drive.
# Confirm that folder already holds the extracted files, or extract into it with -d.
!unzip /content/drive/MyDrive/Projects-NLP/Paraphrasing/data/RephraseitDev.zip

Archive:  /content/drive/MyDrive/Projects-NLP/Paraphrasing/data/RephraseitDev.zip
  inflating: RephraseitDev/dev.tsv   
  inflating: RephraseitDev/msr_paraphrase_test.txt  
  inflating: RephraseitDev/msr_paraphrase_train.txt  
  inflating: RephraseitDev/parabank_5m.tsv  
  inflating: RephraseitDev/quora_duplicate_questions.tsv  
  inflating: RephraseitDev/rephrase.py  
  inflating: RephraseitDev/test.tsv  
  inflating: RephraseitDev/train.tsv  


In [7]:
# Work from the dataset folder so the loading cells below can use bare file names
# such as "train.tsv".
%cd /content/drive/MyDrive/Projects-NLP/Paraphrasing/data/RephraseitDev

/content/drive/MyDrive/Projects-NLP/Paraphrasing/data/RephraseitDev


In [9]:
#!unzip /content/drive/MyDrive/nlp_para/RephraseitDev.zip

In [10]:
# %cd /content/drive/MyDrive/nlp_para/RephraseitDev

### Google PAWS Dataset

In [8]:
# Google Data
def _load_paws_pairs(tsv_path):
    """Read one PAWS split and return its positive pairs in training format.

    All columns are cast to str, only rows whose label is "1" are kept, the
    sentence columns are renamed to input_text / target_text, and a constant
    "paraphrase" prefix column is appended.
    """
    return (
        pd.read_csv(tsv_path, sep="\t")
        .astype(str)
        .loc[lambda frame: frame["label"] == "1"]
        .rename(columns={"sentence1": "input_text", "sentence2": "target_text"})
        .loc[:, ["input_text", "target_text"]]
        .assign(prefix="paraphrase")
    )


train_df = _load_paws_pairs("train.tsv")
eval_df = _load_paws_pairs("dev.tsv")

print(train_df)
print("-------------------------------------------------------------")
print(eval_df)

                                              input_text  \
1      The NBA season of 1975 -- 76 was the 30th seas...   
3      When comparable rates of flow can be maintaine...   
4      It is the seat of Zerendi District in Akmola R...   
5      William Henry Henry Harman was born on 17 Febr...   
7      With a discrete amount of probabilities Formul...   
...                                                  ...   
49384  The Romanesque language , Galician ( Galego ) ...   
49390  Note that k is a vector consisting of three in...   
49393  Tim Henman won in the final 6 -- 2 , 7 -- 6 , ...   
49395  He was considered an active member of the coun...   
49397  She was in Cork on June 24 and arrived on 8 Ju...   

                                             target_text      prefix  
1      The 1975 -- 76 season of the National Basketba...  paraphrase  
3      The results are high when comparable flow rate...  paraphrase  
4      It is the seat of the district of Zerendi in A...  paraphra

### MSRP Data Loading

In [9]:
# MSRP Data
# Load the MSRP train/test pairs via load_data (default keep_label=1) and append
# them to the frames built so far.
msrp_train = load_data("msr_paraphrase_train.txt", "#1 String", "#2 String", "Quality")
msrp_eval = load_data("msr_paraphrase_test.txt", "#1 String", "#2 String", "Quality")

train_df = pd.concat([train_df, msrp_train])
eval_df = pd.concat([eval_df, msrp_eval])




  load_data("msr_paraphrase_train.txt", "#1 String", "#2 String", "Quality"),
b'Skipping line 102: expected 5 fields, saw 6\nSkipping line 656: expected 5 fields, saw 6\nSkipping line 867: expected 5 fields, saw 6\nSkipping line 880: expected 5 fields, saw 6\nSkipping line 980: expected 5 fields, saw 6\nSkipping line 1439: expected 5 fields, saw 6\nSkipping line 1473: expected 5 fields, saw 6\nSkipping line 1822: expected 5 fields, saw 6\nSkipping line 1952: expected 5 fields, saw 6\nSkipping line 2009: expected 5 fields, saw 6\nSkipping line 2230: expected 5 fields, saw 6\nSkipping line 2506: expected 5 fields, saw 6\nSkipping line 2523: expected 5 fields, saw 6\nSkipping line 2809: expected 5 fields, saw 6\nSkipping line 2887: expected 5 fields, saw 6\nSkipping line 2920: expected 5 fields, saw 6\nSkipping line 2944: expected 5 fields, saw 6\nSkipping line 3241: expected 5 fields, saw 6\nSkipping line 3358: expected 5 fields, saw 6\nSkipping line 3459: expected 5 fields, saw 6\nSki

In [10]:
# Show both frames, separated by a rule, to check the result of the concatenation.
print(
    train_df,
    "-------------------------------------------------------------",
    eval_df,
    sep="\n",
)

                                             input_text  \
1     The NBA season of 1975 -- 76 was the 30th seas...   
3     When comparable rates of flow can be maintaine...   
4     It is the seat of Zerendi District in Akmola R...   
5     William Henry Henry Harman was born on 17 Febr...   
7     With a discrete amount of probabilities Formul...   
...                                                 ...   
3931  Knox County Health Department is following nat...   
3932  The new rules will allow a single company to o...   
3933  At this point, Mr. Brando announced: 'Somebody...   
3935  We have concluded that the outlook for price s...   
3936  The notification was first reported Friday by ...   

                                            target_text      prefix  
1     The 1975 -- 76 season of the National Basketba...  paraphrase  
3     The results are high when comparable flow rate...  paraphrase  
4     It is the seat of the district of Zerendi in A...  paraphrase  
5     Willi

### Quora Dataset

In [11]:
# Quora Data

# The Quora Dataset is not separated into train/test, so we do it manually the first time.
df = load_data(
    "quora_duplicate_questions.tsv", "question1", "question2", "is_duplicate"
)
# A fixed random_state makes the train/eval partition identical on every run;
# without it each kernel restart evaluates on a different subset.
q_train, q_test = train_test_split(df, random_state=42)



  df = load_data(


### ParaBank Dataset

In [12]:
# ParaBank data
# header=None: the file is read without a header row, so the columns are labelled 0 and 1.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0;
# on_bad_lines="warn" keeps the same behaviour (skip malformed rows and report them).
paradata = pd.read_csv("parabank_5m.tsv", sep="\t", header=None, on_bad_lines="warn")



  exec(code_obj, self.user_global_ns, self.user_ns)
b'Skipping line 1104: expected 2 fields, saw 3\nSkipping line 1638: expected 2 fields, saw 3\nSkipping line 3057: expected 2 fields, saw 3\nSkipping line 3114: expected 2 fields, saw 3\nSkipping line 4860: expected 2 fields, saw 3\nSkipping line 5558: expected 2 fields, saw 3\nSkipping line 5817: expected 2 fields, saw 3\nSkipping line 8544: expected 2 fields, saw 3\nSkipping line 8583: expected 2 fields, saw 3\nSkipping line 9089: expected 2 fields, saw 3\nSkipping line 9287: expected 2 fields, saw 3\nSkipping line 9708: expected 2 fields, saw 3\nSkipping line 9718: expected 2 fields, saw 3\nSkipping line 10829: expected 2 fields, saw 3\nSkipping line 12176: expected 2 fields, saw 3\nSkipping line 12371: expected 2 fields, saw 3\nSkipping line 12552: expected 2 fields, saw 3\nSkipping line 13634: expected 2 fields, saw 3\nSkipping line 16258: expected 2 fields, saw 3\nSkipping line 16544: expected 2 fields, saw 3\nSkipping line 169

In [13]:
# Name the two ParaBank columns and tag every row with the task prefix.
# Built as a new frame rather than mutated in place.
paradata = paradata.rename(columns={0: "input_text", 1: "target_text"}).assign(
    prefix="paraphrase"
)
# A fixed random_state keeps the train/eval partition reproducible across runs.
para_train, para_test = train_test_split(paradata, random_state=42)

### Data Transformation & Cleaning

In [14]:
# Merge every corpus, keep the three training columns in a fixed order, and
# drop rows with missing values.
ordered_columns = ["prefix", "input_text", "target_text"]
train_df = pd.concat([train_df, q_train, para_train])[ordered_columns].dropna()
eval_df = pd.concat([eval_df, q_test, para_test])[ordered_columns].dropna()

# Remove tokenisation spaces from both text columns of both frames.
for frame in (train_df, eval_df):
    for text_column in ("input_text", "target_text"):
        frame[text_column] = frame[text_column].apply(clean_unnecessary_spaces)

# My Dataset Is Ready... Let's Go for Training

### Simple Transformers

In [None]:
# --- Model configuration -----------------------------------------------------
model_args = Seq2SeqArgs()
# Generation settings: sampling with top-k / top-p is enabled and num_beams is
# unset; three sequences are requested per input.
model_args.do_sample = True
model_args.eval_batch_size = 16
model_args.evaluate_during_training = True
model_args.evaluate_during_training_steps = 1000
model_args.evaluate_during_training_verbose = True
model_args.fp16 = False
model_args.learning_rate = 5e-5
model_args.max_length = 128
model_args.max_seq_length = 128
model_args.num_beams = None
model_args.num_return_sequences = 3
model_args.num_train_epochs = 10
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.save_eval_checkpoints = False
model_args.save_steps = -1
model_args.top_k = 50
model_args.top_p = 0.95
model_args.train_batch_size = 4
model_args.use_multiprocessing = False
model_args.wandb_project = "Paraphrasing with BART"


# --- Training ----------------------------------------------------------------
model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-large",
    args=model_args,
)

model.train_model(train_df, eval_data=eval_df)

# --- Prediction on the evaluation set ----------------------------------------
eval_inputs = eval_df["input_text"].tolist()
truth = eval_df["target_text"].tolist()

# Each model input is "<prefix>: <text>".
to_predict = [
    prefix + ": " + str(input_text)
    for prefix, input_text in zip(eval_df["prefix"].tolist(), eval_inputs)
]

preds = model.predict(to_predict)

# --- Saving the predictions if needed ----------------------------------------
os.makedirs("predictions", exist_ok=True)

# str(datetime.now()) contains a space and colons, which are awkward or invalid in
# file names on some filesystems; use an explicit, filesystem-safe timestamp instead.
timestamp = datetime.now().strftime("%Y-%m-%d_%H-%M-%S")

# Explicit UTF-8 so non-ASCII text is written identically regardless of the
# platform's default encoding.
with open(f"predictions/predictions_{timestamp}.txt", "w", encoding="utf-8") as f:
    # Each entry of preds is iterated as a collection of candidate outputs for one input.
    for text, target, candidates in zip(eval_inputs, truth, preds):
        f.write(str(text) + "\n\n")

        f.write("Truth:\n")
        f.write(str(target) + "\n\n")

        f.write("Prediction:\n")
        for pred in candidates:
            f.write(str(pred) + "\n")
        f.write(
            "________________________________________________________________________________\n"
        )

Downloading:   0%|          | 0.00/1.63k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.02G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

  0%|          | 0/3641227 [00:00<?, ?it/s]

In [None]:
# Show the GPU(s) visible to this runtime and their current utilisation.
!nvidia-smi

In [None]:
# Install pinned PyTorch / torchvision builds for CUDA 10.1 via conda.
# NOTE(review): this cell has no execution count and sits after the training cell;
# presumably conda is not available on a stock Colab runtime — confirm whether this
# cell is still needed, and if so move it next to the other install step at the top.
!conda install pytorch==1.6.0 torchvision==0.7.0 cudatoolkit=10.1 -c pytorch -y