In [None]:
from google.colab import drive
# Mount Google Drive at /content/drive.
# NOTE(review): none of the file paths used in the visible cells live under
# /content/drive — confirm whether this mount is still required.
drive.mount('/content/drive')

In [None]:
!pip install -q  torch peft==0.4.0 bitsandbytes==0.40.2 transformers==4.31.0 trl==0.4.7 accelerate

import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline
)
from peft import LoraConfig
from trl import SFTTrainer

In [3]:
def LabelCategory(num):
  """Translate a numeric category id into its human-readable name.

  Ids 0 through 4 correspond to 'Event Duration', 'Event Ordering',
  'Frequency', 'Typical Time' and 'Stationarity'. Any value that equals
  none of those ids yields None.
  """
  # (id, name) pairs, compared against `num` with `==` one after another.
  id_name_pairs = (
      (2, 'Frequency'),
      (3, 'Typical Time'),
      (0, 'Event Duration'),
      (1, 'Event Ordering'),
      (4, 'Stationarity'),
  )
  for category_id, category_name in id_name_pairs:
    if num == category_id:
      return category_name
  return None

In [None]:
from datasets import Dataset
data_name = 'mc_taco'
# The fine-tuning examples are taken from the dataset's "test" split.
training_data1 = load_dataset(data_name, split="test")

# Keep only rows whose `label` is non-zero. No filtering on `category` happens
# here, so every category present in the split is retained.
# NOTE(review): presumably label 0 marks an implausible answer — confirm against the dataset card.
updated_dataset = list(filter(lambda row: row['label'] != 0, training_data1))

# Render every kept row as a single "<s>[INST] ... [/INST] answer</s>" string.
datas = []
for temp in updated_dataset:
    prompt_parts = [
        '<s>[INST] ', temp['sentence'], ' ', temp['question'],
        ' [ ', LabelCategory(temp['category']), ' ]', ' [/INST] ',
        temp['answer'], '</s>',
    ]
    datas.append(''.join(prompt_parts))

# Wrap the list of strings in a Dataset with one "text" column.
training_data = Dataset.from_dict({"text": datas})

# Model and tokenizer names
# Identifier passed to from_pretrained() for both the tokenizer and the model.
base_model_name = "NousResearch/Llama-2-7b-chat-hf"
# NOTE(review): `refined_model` is not referenced by any other visible cell —
# the fine-tuned model is never saved under this name.
refined_model = "llama-2-TC" #You can give it your own name

# Tokenizer
llama_tokenizer = AutoTokenizer.from_pretrained(base_model_name, trust_remote_code=True)
# Reuse the end-of-sequence token as the padding token.
llama_tokenizer.pad_token = llama_tokenizer.eos_token
# NOTE(review): the training arguments in this notebook set fp16=False — confirm
# whether the fp16 remark below still applies.
llama_tokenizer.padding_side = "right"  # Fix for fp16

# Quantization Config
# 4-bit loading with the "nf4" quantization type, float16 compute dtype and
# double quantization switched off.
quant_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=False
)

# Model
# Load the causal-LM checkpoint named by `base_model_name` with the
# quantization settings above.
base_model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    quantization_config=quant_config,
    device_map={"": 0}  # presumably places every module on device index 0 — confirm
)
# Config flags adjusted before training.
# NOTE(review): use_cache=False is presumably for training compatibility, and
# the generation cells later reuse this same model object — confirm.
base_model.config.use_cache = False
base_model.config.pretraining_tp = 1

In [None]:
# LoRA Config
# Adapter settings handed to the trainer via `peft_config`: rank 8, alpha 16,
# 10% adapter dropout, no bias terms, causal-LM task type.
# NOTE(review): no target modules are listed, so which layers receive adapters
# is left to the library default — confirm that is intended.
peft_parameters = LoraConfig(
    lora_alpha=16,
    lora_dropout=0.1,
    r=8,
    bias="none",
    task_type="CAUSAL_LM"
)

# Training Params
# Hyper-parameters for the fine-tuning run; outputs go under ./results_modified.
# NOTE(review): `warmup_ratio` is set together with lr_scheduler_type="constant";
# presumably the constant schedule performs no warmup — confirm.
train_params = TrainingArguments(
    output_dir="./results_modified",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=1,
    optim="paged_adamw_32bit",
    save_steps=25,
    logging_steps=25,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,  # mixed precision is disabled (both fp16 and bf16 are False)
    bf16=False,
    max_grad_norm=0.3,
    max_steps=-1,  # presumably "no step limit; use num_train_epochs" — confirm
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="constant",
    report_to="tensorboard"
)

# Trainer
# Supervised fine-tuning on the "text" column of `training_data`, using the
# quantized base model, the LoRA settings and the training arguments defined
# earlier in this notebook.
fine_tuning = SFTTrainer(
    model=base_model,
    train_dataset=training_data,
    peft_config=peft_parameters,
    dataset_text_field="text",
    tokenizer=llama_tokenizer,
    args=train_params
)

# Training
# NOTE(review): no visible cell saves the trained model explicitly; the
# generation cells read it straight from `fine_tuning.model`, so they only
# work in the same kernel session as this call.
fine_tuning.train()


# Test with McTACO

In [None]:
from re import I
import pandas as pd
# Generate Text
# Build one prompt per question from the "validation" split, then let the
# fine-tuned model complete each prompt.
training_data1 = load_dataset(data_name, split="validation")
# Keep only rows whose `label` is non-zero; rows are not filtered by category.
updated_dataset_test = list(filter(lambda row: row['label'] != 0, training_data1))

datas = []
prev = ''
for temp in updated_dataset_test:
    # A row is used only when its question differs from the last question that
    # produced a prompt, so runs of consecutive rows sharing a question yield
    # a single prompt.
    if temp['question'] != prev:
        datas.append(''.join([
            '<s>[INST] ', temp['sentence'], ' ', temp['question'],
            ' [ ', LabelCategory(temp['category']), ' ]', ' [/INST]',
        ]))
        prev = temp['question']

text_gen = pipeline(task="text-generation", model=fine_tuning.model, tokenizer=llama_tokenizer, max_length=200)

# 'query' holds the prompts and 'ans' the generated text for each, in order.
outputs = {'ans': [], 'query': datas}
for i in datas:
  generated_text = text_gen(i)[0]['generated_text']
  print(generated_text)
  outputs['ans'].append(generated_text)



In [None]:
import pandas as pd
# Parse "<s>[INST] question [/INST] answer</s>" samples (one per line) out of a
# text file and save them as a question/answer CSV.
# NOTE(review): no visible cell writes /content/textsamsum.txt — it must already
# exist before this cell runs.
inst_open_len = len("[INST] ")    # prefix stripped from the front of each sample
inst_close_len = len("[/INST]")   # marker separating question from answer

tc = {'question': [], 'answer': []}
with open("/content/textsamsum.txt", "r") as file:
    for line in file:
        start_index = line.find("<s>")
        end_index = line.find("</s>")

        if start_index != -1 and end_index != -1:  # Both <s> and </s> tags are found in the line
            parsed_text = line[start_index + 3:end_index]  # Extract the text between <s> and </s>
            end_inst_index = parsed_text.find("[/INST]")
            if end_inst_index == -1:
                # Without this guard the -1 from find() would be used as a slice
                # bound and silently produce a garbage question/answer pair.
                print("Incomplete tag sequence: [INST] without [/INST]")
                continue
            tc['question'].append(parsed_text[inst_open_len:end_inst_index])
            tc['answer'].append(parsed_text[end_inst_index + inst_close_len:])
            print(tc['question'][-1],tc['answer'][-1])
        elif start_index != -1:  # Only <s> tag is found
            print("Incomplete tag sequence: <s> without </s>")
        elif end_index != -1:  # Only </s> tag is found
            print("Incomplete tag sequence: </s> without <s>")
        # Lines containing neither tag are skipped silently.

df = pd.DataFrame(tc)
# Define the file name
csv_file = "outputs_Llama2.0.csv"
# Save DataFrame to CSV
df.to_csv(csv_file, index=False)
print(f"Dataset saved to {csv_file}")

In [None]:
import pandas as pd
import re

def separate_sentences_by_category(input_csv_path, categories, column_name, output_file_base):
    """Split the rows of a CSV into one text file per category.

    Each value in ``column_name`` is expected to look like
    ``"<sentence> [ <category> ]"``. The text before the first bracketed tag is
    written, followed by a space and the row's ``answer`` value, to the file of
    the first entry in ``categories`` that occurs (case-insensitively) inside
    the tag.

    Rows whose tag matches no category, and rows that contain no bracketed tag
    at all, are written to ``<output_file_base>_others.txt`` instead of being
    dropped or raising.

    Args:
        input_csv_path: Path of the CSV to read; it must contain ``column_name``
            and an ``answer`` column.
        categories: Category names; one output file
            ``<output_file_base>_<name lower-cased, spaces -> underscores>.txt``
            is created per name.
        column_name: Column holding the tagged text.
        output_file_base: Prefix (may include a directory) of all output files.

    Returns:
        None. Results are written to disk.
    """
    # Load the CSV file
    df = pd.read_csv(input_csv_path)

    # Captures (text before the first "[...]", content of that "[...]").
    sentence_pattern = re.compile(r"(.*?)[\s]*\[\s*(.*?)\s*\]")

    files = {}
    other_file = None
    try:
        for category in categories:
            files[category] = open(f"{output_file_base}_{category.replace(' ', '_').lower()}.txt", 'w')

        # Additional file for sentences that don't fit any category
        other_file = open(f"{output_file_base}_others.txt", 'w')

        for _, row in df.iterrows():
            context = str(row[column_name])
            # Empty CSV cells come back as NaN and numeric cells as numbers;
            # normalise to text so the concatenation below cannot fail.
            answer = row['answer']
            answer = '' if pd.isna(answer) else str(answer)

            match = sentence_pattern.search(context)
            if match is None:
                # No "[category]" tag on this row: keep it, uncategorized.
                other_file.write(context + ' ' + answer + '\n')
                continue

            sentence, cat = match.group(1), match.group(2)
            print((sentence, cat))

            for category in categories:
                # Case-insensitive containment; the first matching category wins.
                if category.lower() in cat.lower():
                    files[category].write(sentence + ' ' + answer + '\n')
                    break
            else:
                # The tag matched none of the requested categories.
                other_file.write(sentence + ' ' + answer + '\n')
    finally:
        # Close whatever was opened, even if reading or writing failed midway.
        for handle in files.values():
            handle.close()
        if other_file is not None:
            other_file.close()

# Example usage
# Splits the question/answer CSV into one text file per category, named
# sentences_categorized_<category>.txt, keyed on the tag found in the
# 'question' column.
input_csv_path = '/content/outputs_Llama2.0.csv'
# These names are the same strings that LabelCategory() returns.
categories = ['Stationarity', 'Typical Time', 'Event Duration', 'Event Ordering', 'Frequency']
column_name = 'question'
output_file_base = 'sentences_categorized'

separate_sentences_by_category(input_csv_path, categories, column_name, output_file_base)


# TempQ Tests





In [7]:
import pandas as pd

# Load the CSV file into a DataFrame
# Later code in this notebook reads its `context` and `predicted_question` columns.
df = pd.read_csv('/content/questions_with_temporal_connotation_samsum.csv')


In [8]:
import re

def move_content_in_brackets_to_end(context):
    """Separate the first ``[...]`` tag from the rest of ``context``.

    Despite its name, the function does not re-append the tag: it returns the
    two pieces and leaves re-assembly to the caller.

    Args:
        context: Text that may contain a bracketed tag such as ``"[Frequency]"``.

    Returns:
        A ``(text, tag)`` tuple. ``text`` is ``context`` with every occurrence
        of the first bracketed tag removed and surrounding whitespace stripped;
        ``tag`` is the stripped content of that tag. When ``context`` has no
        bracketed tag the result is ``(context.strip(), '')``, so callers can
        always unpack two values.
    """
    # Find content within brackets
    match = re.search(r'\[(.*?)\]', context)
    if match is None:
        # Keep the return shape identical to the matched case.
        return context.strip(), ''

    content_within_brackets = match.group(1).strip()
    # Remove the bracketed tag (brackets included) from the original string.
    context_without_brackets = context.replace(match.group(0), '').strip()
    return context_without_brackets, content_within_brackets

# # Example usage
# context = "[Frequency] Hannah needs Betty's number but Amanda doesn't have it. She needs to contact Larry."
# predicted_question = "How often does she need to contact Larry?"
# Build one query string per row: "<context without tag> <predicted question> [ <tag> ]".
model_query_data = []
for row_label in range(len(df['context'])):
  # Split the bracketed tag away from the context text.
  context_text, category = move_content_in_brackets_to_end(df.loc[row_label, 'context'])
  question_text = df.loc[row_label, 'predicted_question']

  # Context first, then the predicted question, then the tag re-wrapped in brackets.
  model_query_data.append(
      f"{context_text.strip()} {question_text.strip()} [ {category} ]"
  )

In [None]:
from re import I
import pandas as pd
# Wrap every query string from `model_query_data` in the [INST] prompt template.
datas = ['<s>[INST] ' + query + ' [/INST]' for query in model_query_data]

text_gen = pipeline(task="text-generation", model=fine_tuning.model, tokenizer=llama_tokenizer, max_length=200)

# 'query' holds the prompts and 'ans' the generated text for each, in order.
outputs = {'ans': [], 'query': datas}
# Append one result at a time so that partial results remain in `outputs`
# if generation is interrupted.
for prompt in datas:
  outputs['ans'].append(text_gen(prompt)[0]['generated_text'])


In [None]:
import pandas as pd
# Convert the outputs dictionary to a DataFrame
# `outputs` holds the 'ans' and 'query' lists; two generation cells in this
# notebook assign it, so the CSV reflects whichever of them ran last.
# NOTE(review): this rebinds `df`, replacing any DataFrame previously stored under that name.
df = pd.DataFrame(outputs)

# Save the DataFrame to a CSV file
df.to_csv('outputs.csv', index=False)