<a href="https://colab.research.google.com/github/jonathan-sudo/colab-BART/blob/main/BART_test.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [14]:
!pip install simpletransformers
!pip install torch
!pip install tqdm

import os
import logging
import warnings
import pandas as pd
from datetime import datetime

from sklearn.model_selection import train_test_split
from simpletransformers.seq2seq import Seq2SeqModel, Seq2SeqArgs





In [15]:

def load_data(
    file_path, 
    input_text_column, 
    target_text_column, 
    label_column, 
    keep_label=1,
    sep="\t"
):
    df = pd.read_csv(file_path, sep=sep, error_bad_lines=False)
    df = df.loc[df[label_column] == keep_label]
    df = df.rename(
        columns={input_text_column: "input_text", target_text_column: "target_text"}
    )
    df = df[["input_text", "target_text"]]
    df["prefix"] = "paraphrase"

    return df

def clean_unnecessary_spaces(out_string):
    if not isinstance(out_string, str):
        warnings.warn(f">>> {out_string} <<< is not a string.")
        out_string = str(out_string)
    out_string = (
        out_string.replace(" .", ".")
        .replace(" ?", "?")
        .replace(" !", "!")
        .replace(" ,", ",")
        .replace(" ' ", "'")
        .replace(" n't", "n't")
        .replace(" 'm", "'m")
        .replace(" 's", "'s")
        .replace(" 've", "'ve")
        .replace(" 're", "'re")
    )
    return out_string

# Nouvelle section

In [16]:

# Google Data
train_df = pd.read_csv("/content/drive/MyDrive/Data/BART-fine-tuning/train.tsv", sep="\t").astype(str)
eval_df = pd.read_csv("/content/drive/MyDrive/Data/BART-fine-tuning/dev.tsv", sep="\t").astype(str)

train_df = train_df.loc[train_df["label"] == "1"]
eval_df = eval_df.loc[eval_df["label"] == "1"]

train_df = train_df.rename(
    columns={"sentence1": "input_text", "sentence2": "target_text"}
)
eval_df = eval_df.rename(
    columns={"sentence1": "input_text", "sentence2": "target_text"}
)

train_df = train_df[["input_text", "target_text"]]
eval_df = eval_df[["input_text", "target_text"]]

train_df["prefix"] = "paraphrase"
eval_df["prefix"] = "paraphrase"


"""
# MSRP Data
train_df = pd.concat(
    [
        train_df,
        load_data("/content/drive/MyDrive/Data/BART-fine-tuning/msr_paraphrase_train.txt", "#1 String", "#2 String", "Quality"),
    ]
)
eval_df = pd.concat(
    [
        eval_df,
        load_data("/content/drive/MyDrive/Data/BART-fine-tuning/msr_paraphrase_test.txt", "#1 String", "#2 String", "Quality"),
    ]
)
"""

# Quora Data

# The Quora Dataset is not separated into train/test, so we do it manually the first time.
df = load_data(
    "/content/drive/MyDrive/Data/BART-fine-tuning/quora_duplicate_questions.tsv", 
    "question1", 
    "question2", 
    "is_duplicate"
)
q_train, q_test = train_test_split(df)

q_train.to_csv("/content/drive/MyDrive/Data/BART-fine-tuning/quora_train.tsv", sep="\t")
q_test.to_csv("/content/drive/MyDrive/Data/BART-fine-tuning/quora_test.tsv", sep="\t")

# The code block above only needs to be run once.
# After that, the two lines below are sufficient to load the Quora dataset.

# q_train = pd.read_csv("/content/drive/MyDrive/Data/BART-fine-tuning/quora_train.tsv", sep="\t")
# q_test = pd.read_csv("/content/drive/MyDrive/Data/BART-fine-tuning/quora_test.tsv", sep="\t")

train_df = pd.concat([train_df, q_train])
eval_df = pd.concat([eval_df, q_test])

train_df = train_df[["prefix", "input_text", "target_text"]]
eval_df = eval_df[["prefix", "input_text", "target_text"]]

train_df = train_df.dropna()
eval_df = eval_df.dropna()

train_df["input_text"] = train_df["input_text"].apply(clean_unnecessary_spaces)
train_df["target_text"] = train_df["target_text"].apply(clean_unnecessary_spaces)

eval_df["input_text"] = eval_df["input_text"].apply(clean_unnecessary_spaces)
eval_df["target_text"] = eval_df["target_text"].apply(clean_unnecessary_spaces)

print(train_df)

b'Skipping line 102: expected 5 fields, saw 6\nSkipping line 656: expected 5 fields, saw 6\nSkipping line 867: expected 5 fields, saw 6\nSkipping line 880: expected 5 fields, saw 6\nSkipping line 980: expected 5 fields, saw 6\nSkipping line 1439: expected 5 fields, saw 6\nSkipping line 1473: expected 5 fields, saw 6\nSkipping line 1822: expected 5 fields, saw 6\nSkipping line 1952: expected 5 fields, saw 6\nSkipping line 2009: expected 5 fields, saw 6\nSkipping line 2230: expected 5 fields, saw 6\nSkipping line 2506: expected 5 fields, saw 6\nSkipping line 2523: expected 5 fields, saw 6\nSkipping line 2809: expected 5 fields, saw 6\nSkipping line 2887: expected 5 fields, saw 6\nSkipping line 2920: expected 5 fields, saw 6\nSkipping line 2944: expected 5 fields, saw 6\nSkipping line 3241: expected 5 fields, saw 6\nSkipping line 3358: expected 5 fields, saw 6\nSkipping line 3459: expected 5 fields, saw 6\nSkipping line 3491: expected 5 fields, saw 6\nSkipping line 3643: expected 5 fields

            prefix  ...                                        target_text
1       paraphrase  ...  The 1975 -- 76 season of the National Basketba...
3       paraphrase  ...  The results are high when comparable flow rate...
4       paraphrase  ...  It is the seat of the district of Zerendi in A...
5       paraphrase  ...  William Henry Harman was born in Waynesboro, V...
7       paraphrase  ...  Given a discrete set of probabilities formula ...
...            ...  ...                                                ...
10237   paraphrase  ...               What are some substitutes for cumin?
37230   paraphrase  ...      What are the different factors of production?
97997   paraphrase  ...                    What are some good blog topics?
12699   paraphrase  ...                                How can I join MIT?
133574  paraphrase  ...  What are your views on the decision of Narendr...

[136422 rows x 3 columns]


In [17]:
model_args = Seq2SeqArgs()
model_args.do_sample = True
model_args.eval_batch_size = 64
model_args.evaluate_during_training = True
model_args.evaluate_during_training_steps = 2500
model_args.evaluate_during_training_verbose = True
model_args.fp16 = False
model_args.learning_rate = 5e-5
model_args.max_length = 128
model_args.max_seq_length = 128
model_args.num_beams = None
model_args.num_return_sequences = 3
model_args.num_train_epochs = 2
model_args.overwrite_output_dir = True
model_args.reprocess_input_data = True
model_args.save_eval_checkpoints = False
model_args.save_steps = -1
model_args.top_k = 50
model_args.top_p = 0.95
model_args.train_batch_size = 8
model_args.use_multiprocessing = False

In [19]:
model = Seq2SeqModel(
    encoder_decoder_type="bart",
    encoder_decoder_name="facebook/bart-large",
    args=model_args
)

model.train_model(train_df, eval_data=eval_df)

RuntimeError: ignored