In [None]:
! pip install transformers
! pip install torch
! pip install peft

### importing libraries 

In [None]:
import torch
import numpy as np
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
import json
import random
from sklearn.model_selection import train_test_split
from itertools import zip_longest
from peft import get_peft_model, LoraConfig, TaskType

### defining general variables

In [None]:
# Global settings shared by the whole notebook.
EPOCH = 32                # number of passes train_data makes over the training loader
BATCH_SIZE = 64           # batch size of every DataLoader built below
MAX_INPUT_LENGTH = 65     # prompts are padded/truncated to this many tokens
MAX_LABEL_LENGTH = 8      # agent/target labels are padded/truncated to this many tokens
MODEL_LINK = "google/flan-t5-large"
FOLD_NUMBER = 0           # which MPQA fold folder is loaded
SEED = 0
srl_data_link = '/kaggle/input/srl-w-cluster-number103k/srl_orl-cluster_number.csv'
# The base model is loaded with T5ForConditionalGeneration (encoder-decoder), so the
# adapter is declared as a sequence-to-sequence task. CAUSAL_LM is meant for
# decoder-only models and makes PEFT wrap the model with the wrong task class.
lora_config = LoraConfig(
    r=16,
    task_type=TaskType.SEQ_2_SEQ_LM,
    lora_alpha=32,
    lora_dropout=0.05
)

In [None]:
def set_seed(seed=None):
    """Seed every random number generator the notebook relies on.

    Seeds Python's ``random``, NumPy, and PyTorch (CPU and all CUDA devices),
    and switches cuDNN to its deterministic, non-benchmarking mode.

    Args:
        seed: Seed value to apply. When omitted, the notebook-wide ``SEED``
            constant is used, so the original zero-argument call still works.
    """
    if seed is None:
        seed = SEED
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # manual_seed_all covers every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

In [None]:
# Seed first so that everything built afterwards starts from the same RNG state.
set_seed()
tokenizer = T5Tokenizer.from_pretrained(MODEL_LINK, legacy=False)
# Load the base model onto the GPU, then wrap it with the LoRA adapter
# described by lora_config.
model = get_peft_model(
    T5ForConditionalGeneration.from_pretrained(MODEL_LINK).to('cuda'),
    lora_config,
)

### Functions for reading the data files (get_data) and organizing them (organize_data)

In [None]:
# Reads one JSON-lines file (one JSON object per line) into a dataframe.
def get_data(address):
    """Load a JSON-lines dataset file into a dataframe.

    Every non-blank line of the file is parsed as a JSON object that must
    provide the keys ``'sentences'`` (a list of tokens) and ``'orl'``.

    Args:
        address: Path of the file to read.

    Returns:
        pd.DataFrame with the columns
        ``'sentence'`` (tokens joined with single spaces),
        ``'orl'`` (the record's ``'orl'`` value, unchanged) and
        ``'sep_sent'`` (the original token list).

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record lacks ``'sentences'`` or ``'orl'``.
    """
    records = []
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(address, encoding='utf-8') as file:
        for line in file:
            line = line.strip()
            if not line:
                # A trailing or stray blank line is not a record; skip it
                # instead of letting json.loads fail on it.
                continue
            records.append(json.loads(line))

    return pd.DataFrame({
        'sentence': [' '.join(record['sentences']) for record in records],
        'orl': [record['orl'] for record in records],
        'sep_sent': [record['sentences'] for record in records],
    })

# Collects the attribute entries that carry a given label, for later iteration.
def list_of(attributes, requested_atr):
    """Return the entries of ``attributes`` whose last element equals ``requested_atr``.

    The entries are returned in their original order and are the same objects
    as in ``attributes`` (no copies are made).
    """
    return [entry for entry in attributes if entry[-1] == requested_atr]

# Helper: text of every span of one role (agent or target) attached to a DSE.
def _spans_linked_to_dse(candidates, tokens, dse_start, dse_end, separator):
    """Join the text of every candidate entry that belongs to the given DSE.

    Args:
        candidates: Attribute entries of one role; entry[0]/entry[1] are the
            inclusive DSE span they refer to, entry[2]/entry[3] the inclusive
            span of the role itself.
        tokens: Token list of the sentence.
        dse_start: First token index of the DSE.
        dse_end: Exclusive end index of the DSE (inclusive end + 1).
        separator: String appended after each span's text.

    Returns:
        The concatenated span texts, or ``' |'`` when no candidate matches.
    """
    linked = ''
    for entry in candidates:
        if entry[0] == dse_start and int(entry[1] + 1) == dse_end:
            span_start = int(entry[2])
            span_end = int(entry[3] + 1)
            linked += ' '.join(tokens[span_start:span_end]) + separator
    # A lone ' |' marks "no span of this role for this DSE".
    return linked or ' |'


# Finds the target(s)/agent(s) of every DSE according to the list of attributes.
def organize_data(dataframe):
    """Expand a raw frame into one row per DSE with its agents and targets.

    Args:
        dataframe: Frame with the columns ``'sentence'``, ``'orl'`` (list of
            attribute entries whose last element is the label ``'DSE'``,
            ``'AGENT'`` or ``'TARGET'``) and ``'sep_sent'`` (token list).

    Returns:
        pd.DataFrame with the columns ``'sentence'``, ``'dse'``, ``'target'``
        and ``'agent'``. ``'dse'`` is the DSE text followed by ``'|'``;
        ``'target'`` joins the DSE's targets with ``' |'`` and ``'agent'``
        joins its agents with ``'|'``; either is ``' |'`` when the DSE has none.
    """
    sentence_column, dse_column, target_column, agent_column = [], [], [], []

    for sentence_text, attributes, tokens in zip(
        dataframe['sentence'], dataframe['orl'], dataframe['sep_sent']
    ):
        target_list = list_of(attributes, 'TARGET')
        agent_list = list_of(attributes, 'AGENT')

        for entry in attributes:
            if entry[-1] != 'DSE':
                continue
            dse_start = int(entry[0])
            dse_end = int(entry[1] + 1)

            # Each DSE gets its own fresh strings. Previously the accumulators
            # were reset only once per sentence, so the second DSE of a sentence
            # also carried the text, targets and agents of the first one.
            dse_column.append(' '.join(tokens[dse_start:dse_end]) + '|')
            target_column.append(
                _spans_linked_to_dse(target_list, tokens, dse_start, dse_end, ' |')
            )
            agent_column.append(
                _spans_linked_to_dse(agent_list, tokens, dse_start, dse_end, '|')
            )
            sentence_column.append(sentence_text)

    return pd.DataFrame({
        'sentence': sentence_column,
        'dse': dse_column,
        'target': target_column,
        'agent': agent_column,
    })

### A function that loads the train/dev/test files of a given fold folder, using the same extraction process for each

In [None]:
def get_files_of_folder(folder_number):
    """Load the train, dev and test files of one fold.

    Args:
        folder_number: Fold index; it selects both the folder and the file
            names. Anything ``int()`` accepts is allowed.

    Returns:
        Tuple ``(train_df, dev_df, test_df)`` of frames produced by ``get_data``.
    """
    # The original `folder_number = int()` discarded the argument and always
    # produced 0; convert the value that was actually passed in.
    folder_number = int(folder_number)
    folder = f"/kaggle/input/ds-json-format/json_format_dataset/{folder_number}"
    dev_df = get_data(f"{folder}/aaai19srl.dev{folder_number}.conll.json")
    df = get_data(f"{folder}/aaai19srl.train{folder_number}.conll.json")
    test_df = get_data(f"{folder}/aaai19srl.test{folder_number}.conll.json")
    return df, dev_df, test_df
# Load the fold once and unpack the three splits. Calling the loader once per
# split re-read all three files every time.
df, dev_df, test_df = get_files_of_folder(FOLD_NUMBER)

### getting training/validation/test data into data frames and dividing each of {agent, target, dse} elements

In [None]:
# Replace the raw train frame (sentence / orl / sep_sent) with one row per DSE
# (sentence / dse / target / agent). `df` is rebound here, so this cell cannot be
# re-run on its own: organize_data needs the 'orl' and 'sep_sent' columns of the raw frame.

df = organize_data(df)
df

In [None]:
# Replace the raw dev frame (sentence / orl / sep_sent) with one row per DSE
# (sentence / dse / target / agent). `dev_df` is rebound here, so this cell cannot be
# re-run on its own: organize_data needs the 'orl' and 'sep_sent' columns of the raw frame.

dev_df = organize_data(dev_df)
dev_df

In [None]:
# Replace the raw test frame (sentence / orl / sep_sent) with one row per DSE
# (sentence / dse / target / agent). `test_df` is rebound here, so this cell cannot be
# re-run on its own: organize_data needs the 'orl' and 'sep_sent' columns of the raw frame.

test_df = organize_data(test_df)
test_df

### Getting evaluation data into df and dividing each of {agent, target, dse} elements

In [None]:
# Writes the model prompts for every (sentence, DSE) row of a frame.
def create_prompt(input_df):
    """Build the agent and target prompts for each row of ``input_df``.

    Args:
        input_df: Frame with the columns ``'sentence'``, ``'dse'``,
            ``'target'`` and ``'agent'``; rows are looked up by the labels
            ``0 .. len(input_df) - 1``.

    Returns:
        pd.DataFrame with the columns ``'sentence'``, ``'dse'``,
        ``'target_prompt'``, ``'target'``, ``'agent_prompt'`` and ``'agent'``.
    """
    rows = range(len(input_df))

    target_prompt = [
        f"sentence is: {input_df['sentence'][i]} this is verb: {input_df['dse'][i]}. find target for DSE in the sentence?"
        for i in rows
    ]
    agent_prompt = [
        f"sentence is: {input_df['sentence'][i]} this is verb: {input_df['dse'][i]}. find agent for DSE in the sentence?"
        for i in rows
    ]

    # The remaining columns are carried over unchanged.
    carried = {
        column: [input_df[column][i] for i in rows]
        for column in ('target', 'agent', 'sentence', 'dse')
    }

    return pd.DataFrame({
        'sentence': carried['sentence'],
        'dse': carried['dse'],
        'target_prompt': target_prompt,
        'target': carried['target'],
        'agent_prompt': agent_prompt,
        'agent': carried['agent'],
    })

### Change the sentence and dse into a prompt according to information in that row 
#### change will be applied on all three data frames {train, test, validation}

In [None]:
# Build the prompt frames for the three splits. The train split gets the new
# name `train_df`, while dev/test rebind their existing names.
train_df = create_prompt(df).reset_index(drop=True)
dev_df = create_prompt(dev_df).reset_index(drop=True)
test_df = create_prompt(test_df).reset_index(drop=True)

In [None]:
from torch.utils.data import DataLoader, Dataset
# Dataset that tokenizes the agent/target prompts and their labels on access
class CustomDataset(Dataset):
    """Tokenizes one dataframe row per item.

    The frame must provide the columns ``'agent_prompt'``, ``'target_prompt'``,
    ``'agent'`` and ``'target'``. Prompts are padded/truncated to
    ``max_length4text`` tokens and labels to ``max_length4label`` tokens.
    Every encoding is moved to ``"cuda"`` inside ``__getitem__``.
    """

    def __init__(self, dataframe, tokenizer, max_length4text, max_length4label):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length4text = max_length4text
        self.max_length4label = max_length4label

    def __len__(self):
        return len(self.data)

    def _encode(self, text, max_length):
        """Tokenize ``text`` to exactly ``max_length`` tokens and move it to the GPU."""
        encoding = self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt',
        )
        return encoding.to("cuda")

    def __getitem__(self, idx):
        frame = self.data
        # Read all four fields before tokenizing anything.
        agent_prompt, target_prompt = frame['agent_prompt'][idx], frame['target_prompt'][idx]
        agent, target = frame['agent'][idx], frame['target'][idx]

        encodings = {
            'agent_prompt': self._encode(agent_prompt, self.max_length4text),
            'target_prompt': self._encode(target_prompt, self.max_length4text),
            'agent': self._encode(agent, self.max_length4label),
            'target': self._encode(target, self.max_length4label),
        }

        def flat(name, field):
            # squeeze() drops the size-1 dimensions of the returned tensor
            return encodings[name][field].squeeze()

        return {
            # agent prompt
            'agent_input_id': flat('agent_prompt', 'input_ids'),
            'agent_attention_mask': flat('agent_prompt', 'attention_mask'),
            # target prompt
            'target_input_id': flat('target_prompt', 'input_ids'),
            'target_attention_mask': flat('target_prompt', 'attention_mask'),
            # agent label
            'agent_id': flat('agent', 'input_ids'),
            'agent_mask': flat('agent', 'attention_mask'),
            # target label
            'target_id': flat('target', 'input_ids'),
            'target_mask': flat('target', 'attention_mask'),
        }

In [None]:
# Loss function object and the optimizer that updates the model's parameters
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=5e-4)

In [None]:
def f1_calculator(split_pred_list, split_actual_list):
    """Compute the micro-averaged F1 score of predicted items against gold items.

    The two arguments are parallel lists of lists: element ``k`` of each holds
    the items predicted / expected for the same unit. Items are compared for
    equality within a unit and a gold item can be matched at most once, so
    repeated predictions cannot push precision or recall above 1.

    Args:
        split_pred_list: Predicted items, one list per unit.
        split_actual_list: Gold items, one list per unit.

    Returns:
        The F1 score, or 0 when nothing matched (including empty inputs).
    """
    from collections import Counter  # local import: only this function needs it

    matched, predicted_len, actual_len = 0, 0, 0

    # The first argument holds predictions and the second the gold items; the
    # loop variables are named accordingly (they used to be swapped).
    for prediction_sublist, actual_sublist in zip(split_pred_list, split_actual_list):
        predicted_len += len(prediction_sublist)
        actual_len += len(actual_sublist)
        # Multiset intersection: each gold item pairs with one prediction at most.
        overlap = Counter(prediction_sublist) & Counter(actual_sublist)
        matched += sum(overlap.values())

    print(f"matched: {matched}, predicted_len: {predicted_len}, actual_len: {actual_len}")

    if matched == 0:
        # Covers empty inputs and the precision + recall == 0 case.
        return 0

    precision = matched / predicted_len
    recall = matched / actual_len
    return 2 * (precision * recall) / (precision + recall)

In [None]:
def clear_data(text_list):
    """Split '|'-separated strings into normalised items.

    Every string of ``text_list`` is split on ``'|'``; each piece is lower-cased
    and has all of its space characters removed. Pieces that end up empty are
    dropped. The items of all strings are returned in one flat list.
    """
    normalised_pieces = (
        piece.lower().replace(" ", "")
        for text in text_list
        for piece in text.split('|')
    )
    return [piece for piece in normalised_pieces if piece]

In [None]:
# Quick sanity check of clear_data: pieces are lower-cased, spaces are removed
# and empty pieces are dropped.
# Expected output: ['clearasday', 'important', 'notedandclear']
a = ['clear as day| iMportant| noted and clear| |']
print(clear_data(a))

In [None]:
def evaluate_model(model, dataloader, prompt_type, max_generation_length=64):
    """Generate answers for one prompt type and score them with F1.

    Args:
        model: Model exposing ``generate``, ``eval`` and ``train``.
        dataloader: Iterable of batches as produced from ``CustomDataset``.
        prompt_type: ``'target'`` or ``'agent'``; selects which prompt and
            label tensors of the batch are used.
        max_generation_length: Value passed to ``generate`` as ``max_length``.

    Returns:
        The value returned by ``f1_calculator`` for all evaluated samples.

    Raises:
        ValueError: if ``prompt_type`` is neither ``'target'`` nor ``'agent'``.
    """
    batch_keys = {
        'target': ('target_input_id', 'target_attention_mask', 'target_id'),
        'agent': ('agent_input_id', 'agent_attention_mask', 'agent_id'),
    }
    if prompt_type not in batch_keys:
        # Previously an unknown type surfaced later as an UnboundLocalError.
        raise ValueError(f"prompt_type must be 'target' or 'agent', got {prompt_type!r}")
    id_type, attention_type, output_type = batch_keys[prompt_type]

    # Remember the current mode so that evaluating in the middle of training
    # does not leave the model in evaluation mode afterwards.
    was_training = model.training
    model.eval()
    actual_list, prediction_list = [], []
    try:
        with torch.no_grad():
            for batch in dataloader:
                ids = batch[id_type]
                mask = batch[attention_type]
                output_id = batch[output_type]

                actuals = [
                    tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                    for g in output_id
                ]

                generated_output = model.generate(
                    input_ids=ids,
                    attention_mask=mask,
                    max_length=max_generation_length,
                )
                preds = [
                    tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True)
                    for g in generated_output
                ]

                # One list per sample. Pooling a whole batch into one list let a
                # prediction match the gold span of a different sample, so
                # scores computed that way are not comparable with these.
                actual_list.extend(clear_data([text]) for text in actuals)
                prediction_list.extend(clear_data([text]) for text in preds)
    finally:
        model.train(was_training)

    return f1_calculator(prediction_list, actual_list)

In [None]:
# Per-epoch validation F1 history; train_data appends one value per epoch to each list.
f1_scores_target, f1_scores_agent = [], []


def _mask_label_padding(label_ids):
    """Return a copy of ``label_ids`` with padding positions replaced by -100.

    -100 is the index the model's cross-entropy loss ignores, so padded label
    positions no longer contribute to the training loss.
    """
    return label_ids.masked_fill(label_ids == tokenizer.pad_token_id, -100)


#           data_loader:train,v_data_loader:validation
def train_data(data_loader, v_data_loader):
    """Fine-tune the global ``model`` for ``EPOCH`` epochs.

    For every batch the agent prompt and the target prompt are each run through
    the model, both losses are back-propagated and one optimizer step is taken.
    After each epoch the model is evaluated on ``v_data_loader`` for both prompt
    types; the scores are appended to ``f1_scores_target`` / ``f1_scores_agent``
    and printed together with the mean training loss of the epoch.

    Args:
        data_loader: Training batches (as produced from ``CustomDataset``).
        v_data_loader: Validation batches passed to ``evaluate_model``.
    """
    for epoch in range(EPOCH):
        # Evaluation at the end of an epoch may leave the model in eval mode,
        # so training mode is re-enabled at the start of every epoch rather
        # than once before the loop.
        model.train()
        losses = []
        for batch in data_loader:
            optimizer.zero_grad()

            agent_output = model(
                batch['agent_input_id'],
                attention_mask=batch['agent_attention_mask'],
                labels=_mask_label_padding(batch['agent_id']),
            )
            target_output = model(
                batch['target_input_id'],
                attention_mask=batch['target_attention_mask'],
                labels=_mask_label_padding(batch['target_id']),
            )

            agent_loss = agent_output.loss
            target_loss = target_output.loss
            losses.append(agent_loss.item())
            losses.append(target_loss.item())

            # Gradients of both tasks accumulate before the single step.
            agent_loss.backward()
            target_loss.backward()
            optimizer.step()

        f1_4_target = evaluate_model(model, v_data_loader, 'target')
        f1_4_agent = evaluate_model(model, v_data_loader, 'agent')

        f1_scores_target.append(f1_4_target)
        f1_scores_agent.append(f1_4_agent)

        # report the results of this epoch
        print(f'loss: {np.mean(losses)}, f1 for target:{f1_4_target}, f1 for agent:{f1_4_agent} \n end of epoch{epoch}. \n')

# By this cell, the MPQA/ORL data is ready to be combined with the clustered SRL data that is prepared in the cells below

### .
### .
### .
### .
## Choosing randomly out of SRL_without_anchors for calculating the results
### .
### .
### .
### .

In [None]:
srl = pd.read_csv(srl_data_link)

# Keep a row when at least one of ARG0 / ARG1 is filled; skip it when both are
# empty. read_csv represents empty cells as NaN and NaN is truthy in Python, so
# a plain `if value` test never skipped anything. Missing arguments are
# normalised to '' before testing.
# NOTE(review): rows without any argument are now really dropped, which lowers
# the per-cluster row counts available to the sampling step further down.
arguments = srl[['ARG0', 'ARG1']].fillna('')
has_argument = (arguments['ARG0'] != '') | (arguments['ARG1'] != '')

srl_df = (
    srl.loc[has_argument, ['Sentence', 'Predicate', 'ARG0', 'ARG1', 'Cluster_Number']]
    .rename(columns={
        'Sentence': 'sentences',
        'Predicate': 'verbs',
        'ARG0': 'agents',
        'ARG1': 'targets',
        'Cluster_Number': 'cluster_no',
    })
    .astype('object')
    .fillna('')  # a single missing argument becomes an empty label
    .reset_index(drop=True)
)
srl_df

In [None]:
# Turn every SRL row into an agent prompt and a target prompt with its label.
# This cell rebinds `srl_df`, so it must be re-run together with the cell above.
sentence_verb_pairs = list(zip(srl_df['sentences'], srl_df['verbs']))

# The agent prompt used to read "veerb"; the typo is fixed so both prompts use
# the same wording apart from the role they ask for.
agent_prompts = [
    f"sentence is: {sentence} this is verb: {verb} find agent for this verb in the sentence"
    for sentence, verb in sentence_verb_pairs
]
target_prompts = [
    f"sentence is: {sentence} this is verb: {verb} find target for this verb in the sentence"
    for sentence, verb in sentence_verb_pairs
]
agents = list(srl_df['agents'])
targets = list(srl_df['targets'])
cluster_no = list(srl_df['cluster_no'])

srl_df = pd.DataFrame({'agent_prompt': agent_prompts, 'agent': agents, 'target_prompt': target_prompts, 'target': targets, 'cluster_no':cluster_no})
srl_df

#### Randomly selecting a number of rows from this dataframe to be fed into the model

In [None]:
def random_sample_df(df, num_rows, SEED=None):
    """Return ``num_rows`` randomly chosen rows of ``df``.

    Args:
        df: Frame to sample from.
        num_rows: Number of rows to draw.
        SEED: Passed to ``DataFrame.sample`` as ``random_state``.

    Raises:
        ValueError: if ``num_rows`` exceeds the number of rows in ``df``.
    """
    available_rows = len(df)
    if available_rows < num_rows:
        raise ValueError(f"Requested {num_rows} rows, but the DataFrame only contains {len(df)} rows.")

    return df.sample(n=num_rows, random_state=SEED)

def random_sample_from_all_clusters(df, num_rows_per_cluster, cluster_column, SEED=None):
    """Draw the same number of random rows from every cluster of ``df``.

    Args:
        df: Frame to sample from.
        num_rows_per_cluster: Number of rows to draw from each cluster.
        cluster_column: Name of the column whose values define the clusters.
        SEED: Passed to ``DataFrame.sample`` as ``random_state`` for every cluster.

    Returns:
        The per-cluster samples concatenated in group order, with a fresh index.

    Raises:
        ValueError: if ``cluster_column`` is missing, or if a cluster holds
            fewer than ``num_rows_per_cluster`` rows.
    """
    if cluster_column not in df.columns:
        raise ValueError(f"Column '{cluster_column}' not found in DataFrame.")

    def _draw(cluster, group):
        # Refuse to sample a cluster that is too small rather than returning fewer rows.
        if len(group) < num_rows_per_cluster:
            raise ValueError(f"Cluster '{cluster}' has only {len(group)} rows, but {num_rows_per_cluster} were requested.")
        return group.sample(n=num_rows_per_cluster, random_state=SEED)

    per_cluster_samples = [_draw(cluster, group) for cluster, group in df.groupby(cluster_column)]

    return pd.concat(per_cluster_samples).reset_index(drop=True)


# Draw 400 SRL rows from every cluster, reproducibly (seeded with SEED).
srl_train_nk = random_sample_from_all_clusters(
    df=srl_df,
    num_rows_per_cluster=400,
    cluster_column='cluster_no',
    SEED=SEED,
)
srl_train_nk

In [None]:
def combine_and_shuffle(df1, df2, SEED=None):
    """
    Stack two DataFrames on top of each other and shuffle the rows.

    Args:
        df1 (pd.DataFrame): The first DataFrame.
        df2 (pd.DataFrame): The second DataFrame.
        SEED (int, optional): Random state used for the shuffle, for reproducibility.

    Returns:
        pd.DataFrame: A new DataFrame holding the rows of both inputs in
        shuffled order, with a fresh 0..n-1 index.
    """
    return (
        pd.concat([df1, df2], ignore_index=True)  # stack the rows
        .sample(frac=1, random_state=SEED)        # frac=1 keeps every row, in random order
        .reset_index(drop=True)
    )
# Mix the ORL training prompts with the sampled SRL prompts, then drop the
# columns the training dataset does not read.
orl_srl = combine_and_shuffle(train_df, srl_train_nk, SEED).drop(
    columns=['cluster_no', 'sentence', 'dse']
)
orl_srl

In [None]:
def _build_loader(frame, shuffle):
    """Wrap a prompt frame in a CustomDataset and a DataLoader using the notebook settings."""
    dataset = CustomDataset(frame, tokenizer, max_length4text= MAX_INPUT_LENGTH, max_length4label= MAX_LABEL_LENGTH)
    return dataset, DataLoader(dataset, batch_size= BATCH_SIZE, shuffle=shuffle)


# TRAIN: mixed ORL + SRL frame, reshuffled by the loader
train_dataset, train_data_loader = _build_loader(orl_srl, shuffle=True)

# EVALUATION: dev split, fixed order
val_dataset, val_data_loader = _build_loader(dev_df, shuffle=False)

# TEST: test split, fixed order
test_dataset, test_data_loader = _build_loader(test_df, shuffle=False)


In [None]:
# # # # MPQA-only variant: these loaders use train_df alone, so none of the SRL data cleaned by the previous blocks is used


# # Create a data loader for TRAIN dataframe 
# train_dataset = CustomDataset(train_df, tokenizer, max_length4text= MAX_INPUT_LENGTH, max_length4label= MAX_LABEL_LENGTH)
# train_data_loader = DataLoader(train_dataset, batch_size= BATCH_SIZE, shuffle=True)

# # Create a data loader for EVALUATION dataframe
# val_dataset = CustomDataset(dev_df, tokenizer, max_length4text= MAX_INPUT_LENGTH, max_length4label= MAX_LABEL_LENGTH)
# val_data_loader = DataLoader(val_dataset, batch_size= BATCH_SIZE, shuffle=False)

# # Create a data loader for TEST dataframe
# test_dataset = CustomDataset(test_df, tokenizer, max_length4text= MAX_INPUT_LENGTH, max_length4label= MAX_LABEL_LENGTH)
# test_data_loader = DataLoader(test_dataset, batch_size= BATCH_SIZE, shuffle=False)


In [None]:
# Run the full training loop; per-epoch validation F1 is printed and recorded.
train_data(data_loader=train_data_loader, v_data_loader=val_data_loader)

In [None]:
import matplotlib.pyplot as plt
# Plot F1 scores for target and agent.
# The x axis is derived from the recorded history instead of EPOCH: the lists
# hold one value per finished epoch, so their length differs from EPOCH when
# training was interrupted or run more than once, and plotting against
# range(EPOCH) then fails with a length mismatch.
epochs = range(len(f1_scores_target))

plt.figure(figsize=(10, 6))
plt.plot(epochs, f1_scores_target, label='F1 Target', marker='o')
plt.plot(range(len(f1_scores_agent)), f1_scores_agent, label='F1 Agent', marker='o')

plt.xlabel('Epochs')
plt.ylabel('F1 Score')
plt.title('F1 Scores for Target and Agent over Epochs')
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Final scores. Despite their names, both variables hold F1 scores.
# NOTE(review): this evaluates on val_data_loader; test_data_loader is built above
# but is not used in this cell. Confirm which split the reported numbers should use.
target_accuracy, agent_accuracy = (
    evaluate_model(model, val_data_loader, role) for role in ('target', 'agent')
)
print(f'f1 Agent: {agent_accuracy}. f1 Target: {target_accuracy}\n\n----------------------')

### MPQA folds 0 to 4
#### Average F1 score for agent and target
fold0_agent = 0.8197088465845465
fold0_target = 0.5834266517357223

fold1_agent = 0.7872340425531915
fold1_target = 0.5599104143337066

fold2_agent = 0.7849944008958566
fold2_target = 0.568868980963046

fold3_agent = 0.7950727883538634
fold3_target = 0.5789473684210527

fold4_agent = 0.7939529675251961
fold4_target = 0.571108622620380