#### **Installing Some required dependencies**

In [None]:
# !pip install rouge_score --quiet
# !pip install peft --quiet
# !pip install -U transformers

#### **Importing the required dependencies**

In [None]:
from datasets import load_dataset, Dataset, DatasetDict
from transformers import AutoTokenizer, TrainingArguments, Trainer, AutoModelForSeq2SeqLM
import torch
import gc
from peft import LoraConfig, get_peft_model, TaskType, PeftModel
import dask.dataframe as dd
from sklearn.model_selection import train_test_split

#### **Importing the dataset and removing all the samples with output length of 256**

In [6]:
dataset = dd.read_parquet("hf://datasets/sharad/chatgpt-paraphrases-simple/data/train-*-of-*.parquet").compute()
dataset = dataset[dataset['s2'].str.len() > 256]
dataset.head(5)

Unnamed: 0,s1,s2
4779,What's the best on-line calender/scheduling/bo...,Which on-line calendar/scheduling/booking syst...
4783,Which on-line calendar/scheduling/booking syst...,Which on-line calendar/scheduling/booking syst...
4786,What is the most effective on-line calendar/sc...,Which on-line calendar/scheduling/booking syst...
4788,Which on-line calendar/scheduling/booking syst...,Which on-line calendar/scheduling/booking syst...
4789,What is the most efficient on-line calendar/sc...,Which on-line calendar/scheduling/booking syst...


#### **Splitting the dataset into train test and validation set.**

In [8]:
dataset.rename(columns={'s1': 'input', 's2': 'output'}, inplace=True)
trainData, tempDF = train_test_split(dataset, test_size=0.3, random_state=42)
valData, testData = train_test_split(tempDF, test_size=1/3, random_state=42)

print(f"Training set size: {len(trainData)}")
print(f"Validation set size: {len(valData)}")
print(f"Test set size: {len(testData)}")

Training set size: 70209
Validation set size: 20060
Test set size: 10030


#### **Creating a model friendly transformers dataset dictionary for smooth training.**

In [9]:
trainData = Dataset.from_pandas(trainData.reset_index(drop=True))
valData = Dataset.from_pandas(valData.reset_index(drop=True))
testData = Dataset.from_pandas(testData.reset_index(drop=True))

datasetDict = DatasetDict({
    'train': trainData,
    'validation': valData,
    'test': testData
})
del trainData, valData, testData
datasetDict

DatasetDict({
    train: Dataset({
        features: ['input', 'output'],
        num_rows: 70209
    })
    validation: Dataset({
        features: ['input', 'output'],
        num_rows: 20060
    })
    test: Dataset({
        features: ['input', 'output'],
        num_rows: 10030
    })
})

#### **Importing the model and the tokenizer.**

In [10]:
MODEL_ID = "google/flan-t5-base"
def initModelAndTokenizer(modelID: str):
    model = AutoModelForSeq2SeqLM.from_pretrained(
        modelID,
        use_safetensors=True,
        trust_remote_code=True,
    )
    tokenizer = AutoTokenizer.from_pretrained(modelID)
    tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"
    return model, tokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
baseModel, tokenizer = initModelAndTokenizer(MODEL_ID)
baseModel.to(device)
print("Log ---------------- Model and tokenizer Loaded")

config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]



Log ---------------- Model and tokenizer Loaded


#### **Defining a dataset preprocessor function that takes in input a sample of examples and outputs their `input_ids`, `attention_masks` and `labels`**

In [11]:
def tokenizeInputText(sample):
    prompt = [
        f'Paraphrase this sentence without changing its meaning: \"{inp}\"' for inp in sample["input"]
    ]
    # print(prompt)
    tokenizedSample = tokenizer(
        prompt,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
        max_length=216,
    )

    sample["input_ids"] = tokenizedSample["input_ids"]
    sample["attention_mask"] = tokenizedSample["attention_mask"]
    # print(sample["output"])
    labels = tokenizer(
        sample["output"],
        padding="max_length",
        truncation=True,
        return_tensors="pt",
        max_length=216,
    )["input_ids"]
    
    labels[labels == tokenizer.pad_token_id] = -100
    sample["labels"] = labels

    return sample


#### **Applying the tokenizer function in batches to exploit multiprocessing**

In [12]:
tokenizedData = datasetDict.map(tokenizeInputText, batched=True)
tokenizedData = tokenizedData.remove_columns(['input', 'output'])

Map:   0%|          | 0/70209 [00:00<?, ? examples/s]

Map:   0%|          | 0/20060 [00:00<?, ? examples/s]

Map:   0%|          | 0/10030 [00:00<?, ? examples/s]

In [13]:
t = 1452
print(tokenizedData["train"][t]["input_ids"])
print()
print(tokenizedData["train"][t]["labels"])
print()
print(tokenizer.decode(tokenizedData["train"][t]["input_ids"]))

[4734, 27111, 48, 7142, 406, 2839, 165, 2530, 10, 96, 7825, 986, 31842, 47, 2650, 6381, 17, 106, 31, 7, 2743, 21, 8, 1025, 97, 16, 6622, 6, 68, 8, 1357, 737, 31, 17, 726, 326, 38, 8, 372, 31, 7, 30552, 189, 18, 4687, 1992, 16, 8, 6552, 2009, 3679, 79, 163, 3, 28423, 3, 60, 5772, 257, 57, 578, 3, 9, 4784, 1288, 1750, 12, 1491, 7377, 12832, 277, 535, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]

[3, 4868, 271, 7817, 38, 6381, 17, 106, 31, 7, 2743, 21, 8, 1025, 97, 16, 6622, 6, 13816, 31842, 31, 7, 20752, 47, 26684, 6, 28, 8, 372, 8619, 30552, 189, 16, 8, 6552, 2009, 11, 163, 3, 16217, 3, 60, 5772, 257, 788, 

## Training setup

In [14]:
def trainableParams(model):
    trainable_model_params = 0
    all_model_params = 0
    for _, param in model.named_parameters():
        all_model_params += param.numel()
        if param.requires_grad:
            trainable_model_params += param.numel()
    return f"trainable model parameters: {trainable_model_params}\nall model parameters: {all_model_params}\npercentage of trainable model parameters: {100 * trainable_model_params / all_model_params:.2f}%"

In [15]:
OUPUT_DIR = f'/kaggle/working/flan-peft-train-V3'
NUM_EPOCHS = 5
LEARNING_RATE = 1e-3
DROPOUT_RATE = 0.1

loraConfig = LoraConfig(
    r=16,
    lora_alpha=16,
    target_modules=['q', 'v'],
    lora_dropout=DROPOUT_RATE,
    bias="none",
    task_type=TaskType.SEQ_2_SEQ_LM,
)

peftModel = get_peft_model(baseModel, loraConfig)

trainArgs = TrainingArguments(
    output_dir=OUPUT_DIR,
    num_train_epochs=NUM_EPOCHS,
    learning_rate=LEARNING_RATE,
    
    save_strategy="epoch",
    eval_strategy="epoch",
    
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    weight_decay=0.01,
    save_total_limit=3,
    
    logging_strategy="steps", 
    logging_steps=5,
)

trainer = Trainer(
    model=peftModel,
    args=trainArgs,
    train_dataset=tokenizedData['train'],
    eval_dataset=tokenizedData['validation'],
)

In [16]:
print(trainableParams(peftModel))

trainable model parameters: 1769472
all model parameters: 249347328
percentage of trainable model parameters: 0.71%


In [17]:
gc.collect()
torch.cuda.empty_cache()
# 0df9404f6d8f654feeb2066151fcad924d9d4363
trainStats = trainer.train()
print(trainStats.metrics)

[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


VBox(children=(Label(value='Waiting for wandb.init()...\r'), FloatProgress(value=0.011112971711109922, max=1.0…

  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


Epoch,Training Loss,Validation Loss
0,0.8782,0.80239
1,0.8858,0.771587
2,0.8882,0.754733
4,0.9106,0.742265


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):


{'train_runtime': 30864.1301, 'train_samples_per_second': 11.374, 'train_steps_per_second': 0.355, 'total_flos': 1.0220600466830131e+17, 'train_loss': 0.8701929144350745, 'epoch': 4.999430329269682}


#### **Evaluation section**

In [18]:
gc.collect()
torch.cuda.empty_cache()
PEFT_MODEL_ID = "/kaggle/working/flan-peft-train-V3/checkpoint-10970"
baseModelImport = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")
peftModelTest = PeftModel.from_pretrained(baseModelImport,PEFT_MODEL_ID,is_trainable=False).to(device)

In [None]:
evalIndex = 13
inputIds = torch.stack([torch.tensor(ids) for ids in tokenizedData["test"]["input_ids"][evalIndex:evalIndex+1]]).to(device)
attentionMask = torch.stack([torch.tensor(mask) for mask in tokenizedData["test"]["attention_mask"][evalIndex:evalIndex+1]]).to(device)
print(inputIds)
print(attentionMask)
peftModel.eval()

outputs = peftModelTest.generate(input_ids=inputIds,
                             max_new_tokens=216,
                             temperature=0.8,
                             attention_mask=attentionMask,
                             pad_token_id=tokenizer.pad_token_id,
                             top_k=50,
                             top_p=0.9,

                            )
print(outputs)
textedOutput = tokenizer.decode(outputs[0], skip_special_tokens=True)

print("Prompt-------------")
print(tokenizer.decode(tokenizedData["test"][evalIndex]["input_ids"], skip_special_tokens=True))
print("\nModel Output-------")
print(textedOutput)

tensor([[ 4734, 27111,    48,  7142,   406,  2839,   165,  2530,    10,    96,
         11889,    15,    15,  9351,     7,    43,   118,  4792,    11,     3,
         14903,    72,  5830,  7532,    16, 10748,  6032,   437,  1671,  2628,
            28, 15721,     7, 11214,     3, 23606,   844,    95,    12,  1283,
         20325,    41,  1828,  2286,    61,  1096,  9351,  9964,    11,     3,
          3131,  8640,     6,  2313,  9351,     7,    16,  5129,     6,     8,
           934,   243,   535,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,  