In [1]:
! pip install transformers
! pip install torch



### Importing libraries

In [2]:
import torch
import numpy as np
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
import json

In [3]:
# General variables shared by the whole notebook.
EPOCH = 10  # number of passes over the training loader in the training loop
batch_size = 16  # batch size passed to both DataLoaders
max_input_length = 64  # tokenizer max_length (truncate/pad) for the input sentence
max_label_length = 8  # tokenizer max_length (truncate/pad) for each label string
t5_small = "google/flan-t5-small"  # checkpoint name passed to from_pretrained

In [4]:
# Load the tokenizer and the seq2seq model from the checkpoint named in `t5_small`.
# The model is moved to 'cuda', so this cell requires a CUDA-capable device.
tokenizer = T5Tokenizer.from_pretrained(t5_small)
model = T5ForConditionalGeneration.from_pretrained(t5_small).to('cuda')

Downloading tokenizer_config.json:   0%|          | 0.00/2.54k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/2.20k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading config.json:   0%|          | 0.00/1.40k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/308M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

### Reading the dataset and cleaning it

In [5]:
def get_data(address):
    """Read a JSON-lines annotation file into a DataFrame.

    Every non-blank line of the file is parsed as one JSON object that must
    provide the keys ``'sentences'`` and ``'orl'``.

    Parameters
    ----------
    address : str or path-like
        Location of the JSON-lines file.

    Returns
    -------
    pandas.DataFrame
        One row per record, with the columns ``sentence`` (the record's
        ``'sentences'`` joined with single spaces), ``orl`` (the record's
        ``'orl'`` value, unchanged) and ``sep_sent`` (the record's
        ``'sentences'`` value, unchanged).

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    KeyError
        If a record lacks ``'sentences'`` or ``'orl'``.
    """
    # Explicit encoding: the platform default is not guaranteed to be UTF-8.
    with open(address, encoding='utf-8') as file:
        # Blank lines (e.g. a trailing empty line) carry no record; skip them
        # instead of letting json.loads fail on an empty string.
        records = [json.loads(line) for line in file if line.strip()]

    return pd.DataFrame({
        'sentence': [' '.join(record['sentences']) for record in records],
        'orl': [record['orl'] for record in records],
        'sep_sent': [record['sentences'] for record in records],
    })

### Getting the training data into `df` and splitting it into agent, target and DSE columns

In [6]:
# Load the training split into `df`; the hard-coded path points at a Kaggle input dataset.
df = get_data("/kaggle/input/ds-json-format/json_format_dataset/0/aaai19srl.train0.conll.json")
# Bare last expression: display the loaded frame.
df

Unnamed: 0,sentence,orl,sep_sent
0,The Kimberley Provincial Hospital said it woul...,"[[6, 8, 0, 3, AGENT], [6, 8, 6, 8, DSE], [6, 8...","[The, Kimberley, Provincial, Hospital, said, i..."
1,Saeed said indications were that those tests w...,"[[1, 1, 0, 0, AGENT], [1, 1, 1, 1, DSE], [1, 1...","[Saeed, said, indications, were, that, those, ..."
2,He said it was his opinion that the patient --...,"[[4, 5, 0, 0, AGENT], [4, 5, 4, 5, DSE], [4, 5...","[He, said, it, was, his, opinion, that, the, p..."
3,The woman was admitted to the hospital on Satu...,"[[10, 10, 0, 1, AGENT], [10, 10, 10, 10, DSE],...","[The, woman, was, admitted, to, the, hospital,..."
4,`` Since our technical equipment is far from p...,"[[22, 22, 2, 4, TARGET], [22, 22, 10, 10, TARG...","[``, Since, our, technical, equipment, is, far..."
...,...,...,...
2444,Benjamin Franklin Federal Savings & Loan Assoc...,"[[9, 9, 8, 8, AGENT], [9, 9, 9, 9, DSE]]","[Benjamin, Franklin, Federal, Savings, &, Loan..."
2445,thrift said the restructuring should help it m...,"[[1, 1, 1, 1, DSE], [1, 1, 2, 3, TARGET]]","[thrift, said, the, restructuring, should, hel..."
2446,Details of the restructuring wo n't be made fi...,"[[11, 11, 10, 10, AGENT], [11, 11, 11, 11, DSE]]","[Details, of, the, restructuring, wo, n't, be,..."
2447,"Jay Stevens , an analyst with Dean Witter Reyn...","[[12, 12, 11, 11, AGENT], [12, 12, 12, 12, DSE]]","[Jay, Stevens, ,, an, analyst, with, Dean, Wit..."


In [7]:
# Function to organize the data into separate columns
def organize_data(atributs, sentence):
    """Collect the AGENT, DSE and TARGET spans of one sentence as strings.

    Each entry of ``atributs`` is a sequence whose last element is the role
    label. For ``'DSE'`` the inclusive token span is read from positions 0
    and 1 of the entry; for ``'AGENT'`` and ``'TARGET'`` it is read from
    positions 2 and 3. Entries with any other label are ignored.

    Parameters
    ----------
    atributs : iterable of sequences
        Role annotations of the sentence.
    sentence : list of str
        The sentence tokens the span indices refer to.

    Returns
    -------
    tuple of (str, str, str)
        ``(agent, dse, target)``; every span of a role is the space-joined
        tokens followed by ``'|'``, concatenated in annotation order. A role
        without spans yields an empty string.
    """
    agent_parts, dse_parts, target_parts = [], [], []
    for annotation in atributs:
        label = annotation[-1]
        # Pick where the span indices live and which role collects the text.
        if label == 'AGENT':
            offset, bucket = 2, agent_parts
        elif label == 'DSE':
            offset, bucket = 0, dse_parts
        elif label == 'TARGET':
            offset, bucket = 2, target_parts
        else:
            continue
        first = int(annotation[offset])
        # The stored end index is inclusive, hence the +1 for slicing.
        last = int(annotation[offset + 1] + 1)
        bucket.append(' '.join(sentence[first:last]) + '|')
    return ''.join(agent_parts), ''.join(dse_parts), ''.join(target_parts)

# Organize train data into different columns.
# organize_data runs once per row (paired positionally via zip, so the result
# does not depend on the frame's index labels) and each role is then written as
# a whole column instead of enlarging the frame cell by cell with df.loc[i, ...].
train_roles = [organize_data(orl, tokens) for orl, tokens in zip(df['orl'], df['sep_sent'])]
for role_pos, role_name in enumerate(('agent', 'dse', 'target')):
    df[role_name] = [roles[role_pos] for roles in train_roles]
df.head(10)

Unnamed: 0,sentence,orl,sep_sent,agent,dse,target
0,The Kimberley Provincial Hospital said it woul...,"[[6, 8, 0, 3, AGENT], [6, 8, 6, 8, DSE], [6, 8...","[The, Kimberley, Provincial, Hospital, said, i...",The Kimberley Provincial Hospital|,would probably know|,whether one of its patients had Congo Fever|
1,Saeed said indications were that those tests w...,"[[1, 1, 0, 0, AGENT], [1, 1, 1, 1, DSE], [1, 1...","[Saeed, said, indications, were, that, those, ...",Saeed|,said|,those tests|
2,He said it was his opinion that the patient --...,"[[4, 5, 0, 0, AGENT], [4, 5, 4, 5, DSE], [4, 5...","[He, said, it, was, his, opinion, that, the, p...",He|,his opinion|,the patient -- a woman|
3,The woman was admitted to the hospital on Satu...,"[[10, 10, 0, 1, AGENT], [10, 10, 10, 10, DSE],...","[The, woman, was, admitted, to, the, hospital,...",The woman|,complaining|,severe joint pains|
4,`` Since our technical equipment is far from p...,"[[22, 22, 2, 4, TARGET], [22, 22, 10, 10, TARG...","[``, Since, our, technical, equipment, is, far...",Nazarov|,said|,our technical equipment|we|
5,`` Our agency seriously needs equipment for de...,"[[3, 4, 3, 4, DSE], [3, 4, 5, 8, TARGET], [12,...","[``, Our, agency, seriously, needs, equipment,...",he|,seriously needs|said|,equipment for detecting drugs|Our agency|
6,"Paris , July 11 -LRB- CNA -RRB- -- Taiwan 's e...","[[39, 39, 8, 22, TARGET], [39, 39, 37, 38, AGE...","[Paris, ,, July, 11, -LRB-, CNA, -RRB-, --, Ta...",French weekly|,warned|,Taiwan 's economy will become totally dependen...
7,The Nouvel Observateur said that Beijing is un...,"[[3, 3, 0, 2, AGENT], [3, 3, 3, 3, DSE], [3, 3...","[The, Nouvel, Observateur, said, that, Beijing...",The Nouvel Observateur|Taiwan businessmen|,said|are eager|,Beijing|explore the mainland Chinese market|
8,The weekly said that mainland China has attrac...,"[[2, 2, 0, 1, AGENT], [2, 2, 2, 2, DSE], [2, 2...","[The, weekly, said, that, mainland, China, has...",The weekly|its own businessmen|,said|bowed to|pressure from|expected|,the Taipei authorities|lift restrictions on in...
9,"Barring any reversal to the trend , the weekly...","[[15, 15, 10, 13, AGENT], [15, 15, 15, 15, DSE...","[Barring, any, reversal, to, the, trend, ,, th...",U.S. magazine Business Weekly|,saying|,Beijing|


### Getting the evaluation data ready in `dev_df` and splitting the elements into columns

In [8]:
# Load the dev (evaluation) split into `dev_df`; the hard-coded path points at a Kaggle input dataset.
dev_df = get_data("/kaggle/input/ds-json-format/json_format_dataset/0/aaai19srl.dev0.conll.json")
# Bare last expression: display the loaded frame.
dev_df

Unnamed: 0,sentence,orl,sep_sent
0,The owner though that the animal was suffering...,"[[2, 2, 0, 1, AGENT], [2, 2, 2, 2, DSE], [2, 2...","[The, owner, though, that, the, animal, was, s..."
1,"The owner put down the animal , although the v...","[[10, 11, 8, 9, AGENT], [10, 11, 10, 11, DSE],...","[The, owner, put, down, the, animal, ,, althou..."
2,"GATUNA , Rwanda , July 6 -LRB- AFP -RRB- - Pre...","[[30, 32, 30, 32, DSE], [30, 32, 33, 35, TARGET]]","[GATUNA, ,, Rwanda, ,, July, 6, -LRB-, AFP, -R..."
3,"The formerly close allies fell out in 1999 , t...","[[4, 5, 0, 3, AGENT], [4, 5, 4, 5, DSE], [4, 5...","[The, formerly, close, allies, fell, out, in, ..."
4,"In March , Uganda declared Rwanda a hostile na...","[[4, 4, 3, 3, AGENT], [4, 4, 4, 4, DSE], [4, 4...","[In, March, ,, Uganda, declared, Rwanda, a, ho..."
...,...,...,...
1033,He said the independent power segment could gr...,"[[1, 1, 0, 0, AGENT], [1, 1, 1, 1, DSE], [1, 1...","[He, said, the, independent, power, segment, c..."
1034,Westinghouse also expects its international sa...,"[[2, 2, 0, 0, AGENT], [2, 2, 2, 2, DSE]]","[Westinghouse, also, expects, its, internation..."
1035,Analysts had expected Consolidated to post a s...,"[[2, 2, 0, 0, AGENT], [2, 2, 2, 2, DSE], [2, 2...","[Analysts, had, expected, Consolidated, to, po..."
1036,"They have to continue to tighten their belts ,...","[[10, 10, 0, 0, TARGET], [10, 10, 10, 10, DSE]...","[They, have, to, continue, to, tighten, their,..."


In [9]:
# Organize dev data into different columns.
# organize_data runs once per row (paired positionally via zip, so the result
# does not depend on the frame's index labels) and each role is then written as
# a whole column instead of enlarging the frame cell by cell with dev_df.loc[i, ...].
dev_roles = [organize_data(orl, tokens) for orl, tokens in zip(dev_df['orl'], dev_df['sep_sent'])]
for role_pos, role_name in enumerate(('agent', 'dse', 'target')):
    dev_df[role_name] = [roles[role_pos] for roles in dev_roles]
dev_df.head(10)

Unnamed: 0,sentence,orl,sep_sent,agent,dse,target
0,The owner though that the animal was suffering...,"[[2, 2, 0, 1, AGENT], [2, 2, 2, 2, DSE], [2, 2...","[The, owner, though, that, the, animal, was, s...",The owner|,though|,the animal|
1,"The owner put down the animal , although the v...","[[10, 11, 8, 9, AGENT], [10, 11, 10, 11, DSE],...","[The, owner, put, down, the, animal, ,, althou...",the vet|,had forbidden|,him to do so|
2,"GATUNA , Rwanda , July 6 -LRB- AFP -RRB- - Pre...","[[30, 32, 30, 32, DSE], [30, 32, 33, 35, TARGET]]","[GATUNA, ,, Rwanda, ,, July, 6, -LRB-, AFP, -R...",,soured relations between|,their neighbouring countries|
3,"The formerly close allies fell out in 1999 , t...","[[4, 5, 0, 3, AGENT], [4, 5, 4, 5, DSE], [4, 5...","[The, formerly, close, allies, fell, out, in, ...",The formerly close allies|The formerly close a...,fell out|mounting rivalry|,each|
4,"In March , Uganda declared Rwanda a hostile na...","[[4, 4, 3, 3, AGENT], [4, 4, 4, 4, DSE], [4, 4...","[In, March, ,, Uganda, declared, Rwanda, a, ho...",Uganda|,declared|alleged|support|,Rwanda|Kigali|a rival to Museveni in a preside...
5,The two countries have each accused the other ...,"[[3, 5, 3, 5, DSE], [3, 5, 6, 7, TARGET], [9, ...","[The, two, countries, have, each, accused, the...",,have each accused|backing|,the other|dissidents|
6,"According to military experts , it is possible...","[[0, 1, 0, 1, DSE], [0, 1, 2, 3, AGENT], [0, 1...","[According, to, military, experts, ,, it, is, ...",military experts|,According to|,clashes will resume between the Taleban and UI...
7,`` We hope that this will serve as an occasion...,"[[2, 2, 1, 1, AGENT], [2, 2, 2, 2, DSE], [2, 2...","[``, We, hope, that, this, will, serve, as, an...",We|,hope|,this will serve as an occasion for LG to make ...
8,The navy craft had approached the small boat t...,"[[8, 8, 8, 8, DSE], [8, 8, 9, 9, TARGET]]","[The, navy, craft, had, approached, the, small...",,thinking|,it|
9,Merchant vessels have been warned to operate s...,"[[2, 4, 0, 1, TARGET], [2, 4, 2, 4, DSE], [2, ...","[Merchant, vessels, have, been, warned, to, op...",,have been warned|,Merchant vessels|being detected by rebels|


In [13]:
# Prepend the task prompt to every input sentence of both splits.
# Each frame is rebuilt from its OWN 'sentence' column: the dev inputs must come
# from dev_df, not from the training frame. The prompt is only added when it is
# not already present, so re-running this cell does not stack prompts.
task_prefix = 'find expression of the sentence: '
for frame in (df, dev_df):
    frame['sentence'] = [
        text if text.startswith(task_prefix) else task_prefix + text
        for text in frame['sentence'].astype(str)
    ]

In [14]:
from torch.utils.data import DataLoader, Dataset
# Define a custom dataset
class CustomDataset(Dataset):
    """Dataset yielding the tokenized sentence, agent, dse and target of a row.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide the columns ``sentence``, ``agent``, ``dse`` and ``target``.
    tokenizer : callable
        Called as ``tokenizer(text, truncation=True, padding='max_length',
        max_length=..., return_tensors='pt')``. The result must support
        ``.to(device)`` and expose ``'input_ids'`` and ``'attention_mask'``.
    max_length4text : int
        ``max_length`` used when tokenizing the ``sentence`` column.
    max_length4label : int
        ``max_length`` used when tokenizing the three label columns.
    device : str or torch.device, optional
        Device every encoding is moved to. Defaults to ``"cuda"``.
    """

    def __init__(self, dataframe, tokenizer, max_length4text, max_length4label, device="cuda"):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length4text = max_length4text
        self.max_length4label = max_length4label
        self.device = device

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def _encode(self, text, max_length):
        """Tokenize one string to ``max_length`` and move the encoding to ``self.device``."""
        return self.tokenizer(
            text,
            truncation=True,
            padding='max_length',
            max_length=max_length,
            return_tensors='pt',
        ).to(self.device)

    def __getitem__(self, idx):
        """Return the encoded tensors of the row at *position* ``idx``.

        The result is a dict with the keys ``input_ids`` / ``attention_mask``
        (sentence) and ``<role>_id`` / ``<role>_mask`` for the roles agent,
        dse and target. The leading dimension of each tokenizer output is
        squeezed away.
        """
        # Positional lookup: ``idx`` is a position, so use .iloc rather than a
        # label lookup that only works when the index is 0..n-1.
        row = self.data.iloc[idx]
        text_encoding = self._encode(row['sentence'], self.max_length4text)
        agent_encoding = self._encode(row['agent'], self.max_length4label)
        dse_encoding = self._encode(row['dse'], self.max_length4label)
        target_encoding = self._encode(row['target'], self.max_length4label)
        # squeeze(0) drops only the leading dimension; a bare squeeze() would
        # also collapse a sequence dimension of length 1.
        return {
            #text
            'input_ids': text_encoding['input_ids'].squeeze(0),
            'attention_mask': text_encoding['attention_mask'].squeeze(0),
            #agent
            'agent_id': agent_encoding['input_ids'].squeeze(0),
            'agent_mask': agent_encoding['attention_mask'].squeeze(0),
            #dse
            'dse_id': dse_encoding['input_ids'].squeeze(0),
            'dse_mask': dse_encoding['attention_mask'].squeeze(0),
            #target
            'target_id': target_encoding['input_ids'].squeeze(0),
            'target_mask': target_encoding['attention_mask'].squeeze(0)
        }

In [15]:
# Create a data loader
# Create the data loaders
train_dataset = CustomDataset(df, tokenizer, max_length4text=max_input_length, max_length4label=max_label_length)
train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Validation must run on the held-out dev split, not on the training frame.
val_dataset = CustomDataset(dev_df, tokenizer, max_length4text=max_input_length, max_length4label=max_label_length)
# Shuffling has no benefit for evaluation; keep the dev order stable.
val_data_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

# Define the loss function and optimizer
# NOTE: the training loop uses the loss returned by the model (outputs.loss);
# loss_fn is kept so that any existing reference to it keeps working.
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

In [None]:
def cal_Trues(actual_list, prediction_list):
    """Placeholder: not implemented yet, the body is empty and returns None.

    NOTE(review): presumably intended to compare ``actual_list`` with
    ``prediction_list`` and count the correct predictions -- confirm the
    intended metric before implementing.
    """
    pass
    

In [None]:
def evaluate_model(model, dataloader):
    """Generate a prediction for every example and score it against the DSE label.

    For each batch the model generates from ``input_ids`` / ``attention_mask``;
    the generated sequences and the ``dse_id`` label ids are decoded with the
    notebook-level ``tokenizer`` and printed. The model is switched to eval
    mode for the duration of the call and put back into training mode
    afterwards if it was training before.

    Parameters
    ----------
    model : torch.nn.Module
        Model exposing ``generate(input_ids=..., attention_mask=...)``.
    dataloader : iterable of dict
        Batches whose values support ``.to(device)`` and that contain the
        keys ``input_ids``, ``attention_mask`` and ``dse_id``.

    Returns
    -------
    float
        Fraction of examples whose decoded prediction equals the decoded
        label exactly (after stripping surrounding whitespace); ``0.0`` when
        the dataloader yields no examples.
    """
    was_training = model.training
    model.eval()  # Set the model to evaluation mode
    # Follow the model's own device instead of assuming a fixed one.
    device = next(model.parameters()).device

    actual_list, prediction_list = [], []
    try:
        with torch.no_grad():
            for batch in dataloader:
                # Move data to the model's device
                batch = {key: value.to(device) for key, value in batch.items()}

                input_id = batch['input_ids']
                attention_mask = batch['attention_mask']
                target_id = batch['dse_id']

                # Inputs are padded to a fixed length, so the attention mask is
                # passed to keep the model from attending to padding.
                output = model.generate(input_ids=input_id, attention_mask=attention_mask)
                # Decode one string per generated sequence (iterating over
                # output[0] would decode the single tokens of the first sequence).
                preds = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in output]
                actuals = [tokenizer.decode(g, skip_special_tokens=True, clean_up_tokenization_spaces=True) for g in target_id]
                print(f'predicted: {preds},\n actuals: {actuals}, end of batch +++++++++ \n')
                actual_list.extend(actuals)
                prediction_list.extend(preds)
    finally:
        # Callers that evaluate between training epochs must get the model back
        # in the mode they left it in.
        if was_training:
            model.train()

    if not actual_list:
        return 0.0
    exact_matches = sum(
        pred.strip() == actual.strip()
        for pred, actual in zip(prediction_list, actual_list)
    )
    return exact_matches / len(actual_list)
evaluate_model(model, val_data_loader)

In [None]:
for epoch in range(EPOCH):
    # Re-enter training mode at the start of every epoch: evaluate_model below
    # switches the model to eval mode, which would otherwise leave dropout
    # disabled for all epochs after the first.
    model.train()
    losses = []
    print(epoch)
    for batch in train_data_loader:
        inputs = batch['input_ids']
        attention_mask = batch['attention_mask']
        dse_id = batch['dse_id']
        dse_mask = batch['dse_mask']
        # Label positions that are padding must not contribute to the loss;
        # -100 is the index the model's loss ignores.
        labels = dse_id.masked_fill(dse_mask == 0, -100)

        optimizer.zero_grad()
        outputs = model(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        losses.append(loss.item())
        loss.backward()
        optimizer.step()
    accuracy = evaluate_model(model, val_data_loader)
    print(np.mean(losses), accuracy)

In [None]:
# Run the model on a single hand-written example.
input_text = "find expression of the sentence: The Palestinians want nothing from Washington but to understand their cause and stand beside right and justice ."
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")

# Make sure dropout is disabled so the generated output is deterministic,
# regardless of the mode the previous cells left the model in.
model.eval()
outputs = model.generate(input_ids)
# Drop padding / end-of-sequence markers from the printed text, as the
# decode calls in evaluate_model do.
print(tokenizer.decode(outputs[0], skip_special_tokens=True))