In [1]:
import numpy as np 
import pandas as pd
import os
from datasets import load_dataset, Dataset
import torch

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer

import tensorflow as tf

2024-10-04 17:22:22.798159: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [3]:
data = pd.read_csv('./Bitext_Sample.csv')

In [4]:
data.columns

Index(['flags', 'instruction', 'category', 'intent', 'response'], dtype='object')

In [5]:
# sample
for col in data.columns:
    print(f'\n {col} :\n',data.iloc[0][col])


 flags :
 B

 instruction :
 question about cancelling order {{Order Number}}

 category :
 ORDER

 intent :
 cancel_order

 response :
 I've understood you have a question regarding canceling order {{Order Number}}, and I'm here to provide you with the information you need. Please go ahead and ask your question, and I'll do my best to assist you.


### Model Fetch

In [6]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")

input_text = "translate English to German: How old are you?"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids

outputs = model.generate(input_ids)
print(tokenizer.decode(outputs[0]))

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<pad> Wie old sind Sie?</s>




In [7]:
original_model = model.to('cpu')

`Function to get instruction prompt using an index`

In [8]:
# helper functions

def get_prompt(data, index):
    instruction = data.iloc[index]['instruction']
    response = data.iloc[index]['response']
    
    prompt = f'''
    instruction:
    {instruction}
    
    response:
    {response}
    
    '''
    
    return prompt

def get_instruction(data,index):
    instruction = data.iloc[index]['instruction']
    return f'''
    
    instruction:
    {instruction}
    
    response:
    
    '''
    

### Check Model Output [One Shot]

In [9]:
test_prompt = get_prompt(data,0) + get_instruction(data,10)
print(test_prompt)


    instruction:
    question about cancelling order {{Order Number}}
    
    response:
    I've understood you have a question regarding canceling order {{Order Number}}, and I'm here to provide you with the information you need. Please go ahead and ask your question, and I'll do my best to assist you.
    
    
    
    instruction:
    i dont know what to do to cancel order {{Order Number}}
    
    response:
    
    


In [10]:
input_text = tokenizer.encode(test_prompt,return_tensors="pt")
output = tokenizer.decode(original_model.generate(input_text)[0])
print(output)

<pad> I've understood you have a question regarding canceling order <unk> Order Number<unk>


### Create Dataset

In [11]:
def tokenize_function(example):
    start_prompt = 'Instruction:\n'
    end_prompt = '\nResponse:'
    prompt = [start_prompt + dialogue + end_prompt for dialogue in example["instruction"]]
    example['input_ids'] = tokenizer(prompt, padding="max_length", truncation=True, return_tensors="pt").input_ids
    example['labels'] = tokenizer(example["response"], padding="max_length", truncation=True, return_tensors="pt").input_ids
    
    return example

In [12]:
act_dataset = Dataset.from_csv('./Bitext_Sample.csv')
shuffled_dataset = act_dataset.shuffle(seed=42) 

split_datasets = shuffled_dataset.train_test_split(test_size = 0.2)

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


In [13]:
tokenized_datasets = split_datasets.map(tokenize_function, batched=True)
tokenized_datasets = tokenized_datasets.remove_columns(['flags', 'instruction', 'category', 'intent', 'response'])

Map:   0%|          | 0/21497 [00:00<?, ? examples/s]

Map:   0%|          | 0/5375 [00:00<?, ? examples/s]

In [14]:
tokenized_datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 21497
    })
    test: Dataset({
        features: ['input_ids', 'labels'],
        num_rows: 5375
    })
})

### PEFT

In [None]:
%pip install peft

In [16]:
from peft import LoraConfig, get_peft_model, TaskType

In [17]:
lora_config = LoraConfig(
    r=32, # Rank
    lora_alpha=32,
    lora_dropout=0.05,
    task_type=TaskType.SEQ_2_SEQ_LM
)

In [18]:
peft_model = get_peft_model(original_model, 
                            lora_config)
print(peft_model.print_trainable_parameters())

trainable params: 3,538,944 || all params: 251,116,800 || trainable%: 1.4093
None


In [30]:
import time

output_dir = f'./peft-dialogue-summary-training-{str(int(time.time()))}'

peft_training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=5,
    per_device_eval_batch_size=5,
    learning_rate=1e-3,
    num_train_epochs= 3
)
    
peft_trainer = Trainer(
    model=peft_model,
    args=peft_training_args,
    train_dataset= tokenized_datasets['train'],
    eval_dataset= tokenized_datasets['test']
)

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


In [31]:
from safetensors.torch import load_model, save_model

peft_trainer.train()
save_model(peft_model, "model.safetensors")

[34m[1mwandb[0m: Currently logged in as: [33mkalprikshbist[0m ([33mkalpriksh[0m). Use [1m`wandb login --relogin`[0m to force relogin


Step,Training Loss
500,1.2935
1000,0.4344
1500,0.3867
2000,0.3729
2500,0.352
3000,0.3406
3500,0.3783
4000,0.3263
4500,0.3136
5000,0.3129


In [32]:
# from peft import PeftModel, PeftConfig

# peft_model_base = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base", torch_dtype=torch.bfloat16)
# tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

# peft_model = PeftModel.from_pretrained(peft_model_base, 
#                                        '/kaggle/working/peft-dialogue-summary-training-1712733026/checkpoint-13500/', 
#                                        torch_dtype=torch.bfloat16,
#                                        is_trainable=False)

In [33]:
!zip -r file.zip /kaggle/working/peft-dialogue-summary-training-1712998561/checkpoint-12500

  adding: kaggle/working/peft-dialogue-summary-training-1712998561/checkpoint-12500/ (stored 0%)
  adding: kaggle/working/peft-dialogue-summary-training-1712998561/checkpoint-12500/trainer_state.json (deflated 76%)
  adding: kaggle/working/peft-dialogue-summary-training-1712998561/checkpoint-12500/adapter_config.json (deflated 52%)
  adding: kaggle/working/peft-dialogue-summary-training-1712998561/checkpoint-12500/optimizer.pt (deflated 9%)
  adding: kaggle/working/peft-dialogue-summary-training-1712998561/checkpoint-12500/rng_state.pth (deflated 25%)
  adding: kaggle/working/peft-dialogue-summary-training-1712998561/checkpoint-12500/adapter_model.safetensors (deflated 7%)
  adding: kaggle/working/peft-dialogue-summary-training-1712998561/checkpoint-12500/training_args.bin (deflated 51%)
  adding: kaggle/working/peft-dialogue-summary-training-1712998561/checkpoint-12500/scheduler.pt (deflated 55%)
  adding: kaggle/working/peft-dialogue-summary-training-1712998561/checkpoint-12500/READM

In [44]:
np.shape(input_text)

torch.Size([1, 89])

In [36]:
peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))

tensor([[8033,   10,  822,   81, 9179,  697,  455,    3,    2, 7395,  588, 7720,
            2, 1773,   10,   27,   31,  162, 7571,   25,   43,    3,    9,  822,
         1918, 9179,   53,  455,    3,    2, 7395,  588, 7720,    2,    6,   11,
           27,   31,   51,  270,   12,  370,   25,   28,    8,  251,   25,  174,
            5,  863,  281, 2177,   11,  987,   39,  822,    6,   11,   27,   31,
          195,  103,   82,  200,   12, 2094,   25,    5, 8033,   10,    3,   23,
         2483,  214,  125,   12,  103,   12, 9179,  455,    3,    2, 7395,  588,
         7720,    2, 1773,   10,    1]])

In [None]:
input_text = tokenizer.encode(test_prompt,return_tensors="pt").input_ids
output = tokenizer.decode(original_model.generate(input_text)[0])
print(output)

### load model

In [1]:
import numpy as np 
import pandas as pd
import os
from datasets import load_dataset, Dataset
import torch

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, GenerationConfig, TrainingArguments, Trainer

import tensorflow as tf

2024-04-14 07:37:28.667024: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-04-14 07:37:28.667175: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-04-14 07:37:28.844414: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
%pip install peft
from peft import PeftModel, PeftConfig

Installing collected packages: peft
Successfully installed peft-0.10.0
Note: you may need to restart the kernel to use updated packages.


In [14]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

tokenizer = T5Tokenizer.from_pretrained("google/flan-t5-base")
model = T5ForConditionalGeneration.from_pretrained("google/flan-t5-base")


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
peft_model = PeftModel.from_pretrained(model, 
                                       '/kaggle/input/flan-t5-chatbot-model/kaggle/working/peft-dialogue-summary-training-1712998561/checkpoint-12500', 
                                       torch_dtype=torch.bfloat16,
                                       is_trainable=False)

In [6]:
data = pd.read_csv('/kaggle/input/bitext-gen-ai-chatbot-customer-support-dataset/Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv')

idx = 2000
idx_var = 100

check = data.iloc[idx]['instruction']
check2 = data.iloc[idx]['response']

test = data.iloc[idx_var]['instruction']
test_response = data.iloc[idx_var]['response']

print(f'instruction:\n{test}\n\nresponse:\n')


print(f'\n\n======================\nexpected reponse: \n {test_response}')

instruction:
where can I cancel order {{Order Number}}?

response:



expected reponse: 
 I've taken note that you're looking for information on where to cancel order {{Order Number}}. You can easily initiate the cancellation process by following these steps:

1. Sign into Your Account: Access our platform by signing in to your {{Online Company Portal Info}}.
2. Navigate to Your Orders: Once you're signed in, go to the '{{Online Order Interaction}}' or '{{Online Order Interaction}}' section.
3. Locate the Relevant Order: Look for the purchase associated with the order number {{Order Number}}. Click on it to view the details.
4. Initiate Cancellation: Within the order details, you'll find the option labeled '{{Online Order Interaction}}'. Please select this to proceed.
5. Confirm Cancellation: The system may prompt you to confirm the cancellation or provide feedback. Please follow the instructions to complete the process.

If you encounter any difficulties or have further questions, our

In [7]:
peft_model = peft_model.to('cpu')


test_prompt = f'instruction:\n{test}\n\nresponse:\n'
input_text = tokenizer(test_prompt,return_tensors="pt").input_ids
output = peft_model.generate(input_ids=input_text, generation_config=GenerationConfig(max_new_tokens=200, num_beams=1))

peft_model_text_output = tokenizer.decode(output[0], skip_special_tokens=True)


print(peft_model_text_output)

Order Number can be cancelled by following the following steps: 1. Go to the 'Orders' section of your order. 2. Locate the specific order with the order number Order Number. 3. Click on the order to view the details. 4. You may be asked to provide the details of the order, such as the name, address, and any other information required. 5. Fill in the required fields with the details of the order, such as the order number, order number, and any other information required. 6. If the order is not listed, you can request a refund or a refund. 7. If the refund is not received, the refund will be charged to the original payment method. 8. If the refund is not received, the refund will be charged to the original payment method. 9. If the refund is not received, the refund will be charged to the original payment method. 10. If the refund is not received, the refund will


### Evaluate

In [17]:
act_dataset = Dataset.from_csv('/kaggle/input/bitext-gen-ai-chatbot-customer-support-dataset/Bitext_Sample_Customer_Support_Training_Dataset_27K_responses-v11.csv')
shuffled_dataset = act_dataset.shuffle(seed=42) 

split_datasets = shuffled_dataset.train_test_split(test_size = 0.2)


instructions = split_datasets['test'][0:150]['instruction']
response = split_datasets['test'][0:150]['response']

peft_model_responses = []
original_model_responses = []

for instruction in instructions:
    
    prompt = f"""
    instruction:
    {instruction}
    
    response:
    
    """
    
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    
    
    peft_model_outputs = peft_model.generate(input_ids=input_ids, generation_config=GenerationConfig(max_new_tokens=200))
    peft_model_text_output = tokenizer.decode(peft_model_outputs[0], skip_special_tokens=True)
    
    original_model_outputs = model.generate(input_ids=input_ids,
                                            generation_config=GenerationConfig(max_new_tokens=200)
                                           )
    original_model_text_output = tokenizer.decode(original_model_outputs[0], skip_special_tokens=True)

    
    
    peft_model_responses.append(peft_model_text_output)
    original_model_responses.append(original_model_text_output)
    

In [30]:
zipped_responses = list(zip(response,peft_model_responses,original_model_responses))
df = pd.DataFrame(zipped_responses,columns = ['human','peft','original'])
df

Unnamed: 0,human,peft,original
0,I'm happy to help! {{Delivery City}} is one of...,Delivery City is a city in the northern part o...,Delivery City
1,I grasp that you are looking for the appropria...,Customer complaint agaist,agaist ur company
2,Of course! I'm here to provide you with the ne...,Order Number is a purchase with the purchase n...,Order number
3,"Sure, I can help you with that. To view the ca...","To view the cancellation fees, you can visit t...",Find the cancellation fees on the website of t...
4,How truly incredible it is to witness your det...,,i need help with my account
...,...,...,...
145,I'm clearly cognizant your need for assistance...,Order Number is a specific order and you need ...,Order Number
146,We understand that you're eager to know the es...,We can provide you with the estimated delivery...,a few hours
147,Certainly! I understand your curiosity about t...,We can provide you with the estimated delivery...,a few minutes
148,Your reach-out is appreciated! I'm sensing tha...,We are here to help you with speaking with our...,i need assistance to speak with customer service


In [None]:
%pip install evaluate
%pip install rouge_score

In [11]:
import evaluate
rouge = evaluate.load('rouge')

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

In [27]:
response = split_datasets['test'][0:150]['response']

original_model_results = rouge.compute(
    predictions=original_model_responses,
    references=response,
    use_aggregator=True,
    use_stemmer=True,
)



peft_model_results = rouge.compute(
    predictions=peft_model_responses,
    references=response,
    use_aggregator=True,
    use_stemmer=True,
)

In [28]:
print('\noriginal\n',original_model_results)

print('\npeft\n',peft_model_results)


original
 {'rouge1': 0.08692217961485044, 'rouge2': 0.02684384296582492, 'rougeL': 0.07314723799537023, 'rougeLsum': 0.07537768796313195}

peft
 {'rouge1': 0.32672681286955485, 'rouge2': 0.15188963975065217, 'rougeL': 0.23395528602380755, 'rougeLsum': 0.24671992903640394}
