In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q  torch peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 accelerate

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from peft import LoraConfig
from trl import SFTTrainer

In [None]:
def LabelCategory(num):
  """Translate a numeric category code into its human-readable name.

  Codes 0 through 4 correspond to 'Event Duration', 'Event Ordering',
  'Frequency', 'Typical Time' and 'Stationarity'. Any value that does not
  compare equal to one of those codes yields None.
  """
  # (code, name) pairs are compared one by one; the first equal code wins.
  code_names = (
    (2, 'Frequency'),
    (3, 'Typical Time'),
    (0, 'Event Duration'),
    (1, 'Event Ordering'),
    (4, 'Stationarity'),
  )
  for code, name in code_names:
    if num == code:
      return name
  return None

In [None]:
import pandas as pd
from datasets import Dataset

# Read the first 1500 rows of each generated-output CSV and stack the two
# frames vertically under a fresh 0..n-1 index.
df1 = pd.read_csv('/content/outputs_Llama2.0_RealNews_2k.csv', nrows=1500)
df2 = pd.read_csv('/content/outputs_Llama2.0_final.csv', nrows=1500)
updated_dataset = pd.concat([df1, df2], ignore_index=True)

# Load the MC-TACO "test" split and keep only the rows whose label is not 0.
# NOTE(review): the notebook's stated intent was to drop Event Ordering and
# Stationarity (categories 1 and 4), but this filter looks only at 'label' and
# keeps every category -- confirm which behaviour is wanted.
data_name = 'mc_taco'
training_data1 = load_dataset(data_name, split="test")
mctaco = [row for row in training_data1 if row['label'] != 0]

In [None]:
# Training prompts built from the CSV rows, in row order, using the template
# "<s>[INST] context question [ category ] [/INST] answer</s>".
datas = [
    '<s>[INST] ' + context + ' ' + question + ' [ ' + category + ' ]' + ' [/INST] ' + answer + '</s>'
    for context, question, category, answer in zip(
        updated_dataset['context'],
        updated_dataset['question'],
        updated_dataset['category'],
        updated_dataset['answer'],
    )
]

# The MC-TACO rows follow, in the same template; their numeric category is
# turned into its name via LabelCategory.
datas.extend(
    '<s>[INST] ' + row['sentence'] + ' ' + row['question'] + ' [ ' + LabelCategory(row['category']) + ' ]' + ' [/INST] ' + row['answer'] + '</s>'
    for row in mctaco
)


In [None]:
from datasets import Dataset

# Names: the base checkpoint to fine-tune and the name reserved for the
# fine-tuned model.
base_model_name = "NousResearch/Llama-2-7b-chat-hf"
refined_model = "llama-2-TC" #You can give it your own name

# Wrap the list of prompt strings in a single-column ("text") dataset.
training_data = Dataset.from_dict({"text": datas})

# Tokenizer: the EOS token doubles as the padding token, and padding goes on
# the right (the author notes right-side padding as a fix for fp16).
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
llama_tokenizer.padding_side = "right"
llama_tokenizer.pad_token = llama_tokenizer.eos_token

# Quantization: 4-bit NF4 weights, float16 compute dtype, double quantization
# switched off.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4",
)

# Base model loaded with the quantization settings above; the device map
# assigns the root module ("") to device 0.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map={"": 0},
    quantization_config=quant_config,
)
base_model.config.pretraining_tp = 1
base_model.config.use_cache = False

In [None]:
# LoRA Config: rank-8 adapters with alpha 16 and 10% dropout, no bias
# parameters, configured for a causal-LM task.
peft_parameters = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM"
)

# Training Params: one epoch, batch size 4 per device without gradient
# accumulation, checkpoint + log every 25 steps, metrics sent to TensorBoard.
# NOTE(review): with the plain "constant" scheduler the warmup_ratio below is
# presumably ignored -- confirm, or switch to "constant_with_warmup" if a
# warmup phase is actually wanted.
train_params = TrainingArguments(
    output_dir="./results_modified",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

# Trainer: supervised fine-tuning on the "text" column of the training data,
# applying the LoRA configuration to the quantized base model.
fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=training_data,
    peft_config=peft_parameters,
    dataset_text_field="text",
    tokenizer=llama_tokenizer,
    args=train_params
)

# Training
fine_tuning.train()

# Save the final fine-tuned weights under the name chosen in `refined_model`.
# The periodic checkpoints are only written every `save_steps` steps, so they
# may not include the last steps of training.
fine_tuning.model.save_pretrained(refined_model)


# Test with McTACO

In [None]:
from re import I
import pandas as pd
from datasets import load_dataset
data_name = 'mc_taco'
# Build the evaluation prompts from the MC-TACO "validation" split, keeping
# only the rows whose label is not 0.
# NOTE(review): the notebook's stated intent was to drop Event Ordering and
# Stationarity (categories 1 and 4), but this filter looks only at 'label' and
# keeps every category -- confirm which behaviour is wanted.
training_data1 = load_dataset(data_name, split="validation")
updated_dataset_test = [x for x in training_data1 if x['label'] != 0]

# One prompt per (sentence, question) group. Every kept answer of a group is
# collected into a single list, so `datas` and the four lists in
# `mc_taco_data` always have the same length and stay index-aligned.
datas = []
mc_taco_data = {'context': [], 'question': [], 'answer': [], 'category': []}

# Key of the group currently being filled. The sentence is part of the key so
# that two consecutive contexts that happen to share the same question text
# are not merged into one group (which would drop the second prompt).
prev = None
for temp in updated_dataset_test:
    group_key = (temp['sentence'], temp['question'])
    if group_key == prev:
        # Same context and question as the previous row: just one more
        # acceptable answer for the prompt that was already emitted.
        mc_taco_data['answer'][-1].append(temp['answer'])
        continue

    category_name = LabelCategory(temp['category'])
    # Prompt ends at [/INST]; the model is expected to generate the answer.
    datas.append('<s>[INST] ' + temp['sentence'] + ' ' + temp['question'] + ' [ ' + category_name + ' ]' + ' [/INST]')
    mc_taco_data['context'].append(temp['sentence'])
    mc_taco_data['question'].append(temp['question'])
    mc_taco_data['answer'].append([temp['answer']])
    mc_taco_data['category'].append(category_name)
    prev = group_key


In [None]:
# Text-generation pipeline around the fine-tuned model and its tokenizer,
# created with max_length=200.
text_gen = pipeline(
    task="text-generation",
    model=fine_tuning.model,
    tokenizer=llama_tokenizer,
    max_length=200,
)

# 'query' holds the prompts; 'ans' receives, in the same order, the
# 'generated_text' of the first result returned for each prompt.
outputs = {'ans': [], 'query': datas}
for prompt in datas:
    first_result = text_gen(prompt)[0]
    outputs['ans'].append(first_result['generated_text'])

In [None]:
import pandas as pd


def _split_generation(line):
    """Split one generated string into a (question, answer) pair.

    The text considered starts after the first "<s>" (or at the beginning of
    the string when there is none) and stops at the following "</s>" (or at
    the end of the string when there is none). Inside that span:

    * question: everything before "[/INST]", minus a leading "[INST] ".
    * answer:   everything after "[/INST]".

    When "[/INST]" is missing, the whole span is returned as the question and
    the answer is the empty string.
    """
    start_index = line.find("<s>")
    body = line[start_index + len("<s>"):] if start_index != -1 else line

    end_index = body.find("</s>")
    if end_index != -1:
        body = body[:end_index]

    end_inst_index = body.find("[/INST]")
    if end_inst_index == -1:
        return body, ''

    question = body[:end_inst_index]
    if question.startswith("[INST] "):
        question = question[len("[INST] "):]
    return question, body[end_inst_index + len("[/INST]"):]


# Exactly one row is produced per generated output, even when the closing
# "</s>" (or any other marker) is absent. Skipping such outputs would leave
# tc['answer'] shorter than the list of prompts, and later cells attach
# per-prompt columns to `tc`, which requires all columns to have equal length.
tc = {'question': [], 'answer': []}
for line in outputs['ans']:
    question, answer = _split_generation(line)
    tc['question'].append(question)
    tc['answer'].append(answer)
    print(question, answer)

df = pd.DataFrame(tc)
# Define the file name
csv_file = "outputs_Llama2.0_Trained on SamSUM test Mctaco.csv"
# Save DataFrame to CSV
df.to_csv(csv_file, index=False)
print(f"Dataset saved to {csv_file}")

In [None]:
# Replace the parsed questions with the reference questions and add the
# reference answers, contexts and category names as further columns of `tc`.
for column, source_key in (
    ('question', 'question'),
    ('expected answer', 'answer'),
    ('context', 'context'),
    ('category', 'category'),
):
    tc[column] = mc_taco_data[source_key]

In [None]:
# Destination file for the combined predictions and reference columns.
csv_file = "Train_SamSUM_RealNews_McTACO_Test_Mctaco.csv"

# Build the frame from `tc` and write it without the index column.
df = pd.DataFrame(tc)
df.to_csv(csv_file, index=False)
print(f"Dataset saved to {csv_file}")