In [1]:
! pip install transformers
! pip install torch



### importing libraries 

In [2]:
import torch
import numpy as np
import pandas as pd
from transformers import T5Tokenizer, T5ForConditionalGeneration
import json

In [3]:
tokenizer = T5Tokenizer.from_pretrained("t5-base")
model = T5ForConditionalGeneration.from_pretrained("t5-base").to('cuda')

Downloading (…)ve/main/spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading (…)neration_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

### reading dataset and cleaning it

In [4]:
lines = []
with open("/kaggle/input/ds-json-format/json_format_dataset/0/aaai19srl.train0.conll.json") as file:
    for line in file:
        x = json.loads(line)
        lines.append(x)

In [5]:
sentences, orl = [], []
for i in range(len(lines)):
    sentences.append(' '.join(lines[i]['sentences']))
    orl.append(lines[i]['orl'])
print(sentences[0])
print(orl[0])

The Kimberley Provincial Hospital said it would probably know by Tuesday whether one of its patients had Congo Fever .
[[6, 8, 0, 3, 'AGENT'], [6, 8, 6, 8, 'DSE'], [6, 8, 11, 18, 'TARGET']]


In [6]:
df = pd.DataFrame({'sentence': sentences, 'orl': orl})
df

Unnamed: 0,sentence,orl
0,The Kimberley Provincial Hospital said it woul...,"[[6, 8, 0, 3, AGENT], [6, 8, 6, 8, DSE], [6, 8..."
1,Saeed said indications were that those tests w...,"[[1, 1, 0, 0, AGENT], [1, 1, 1, 1, DSE], [1, 1..."
2,He said it was his opinion that the patient --...,"[[4, 5, 0, 0, AGENT], [4, 5, 4, 5, DSE], [4, 5..."
3,The woman was admitted to the hospital on Satu...,"[[10, 10, 0, 1, AGENT], [10, 10, 10, 10, DSE],..."
4,`` Since our technical equipment is far from p...,"[[22, 22, 2, 4, TARGET], [22, 22, 10, 10, TARG..."
...,...,...
2444,Benjamin Franklin Federal Savings & Loan Assoc...,"[[9, 9, 8, 8, AGENT], [9, 9, 9, 9, DSE]]"
2445,thrift said the restructuring should help it m...,"[[1, 1, 1, 1, DSE], [1, 1, 2, 3, TARGET]]"
2446,Details of the restructuring wo n't be made fi...,"[[11, 11, 10, 10, AGENT], [11, 11, 11, 11, DSE]]"
2447,"Jay Stevens , an analyst with Dean Witter Reyn...","[[12, 12, 11, 11, AGENT], [12, 12, 12, 12, DSE]]"


In [7]:
df['orl'][0][0][4]

'AGENT'

In [8]:
# Function to extract values from the list and handle missing values
def extract_values(row, index):
    if len(row) > index:
        return row[index]
    else:
        return None

# Create new columns
df['agent'] = df['orl'].apply(lambda x: extract_values(x, 0))
df['dse'] = df['orl'].apply(lambda x: extract_values(x, 1))
df['target'] = df['orl'].apply(lambda x: extract_values(x, 2))
df.head()

Unnamed: 0,sentence,orl,agent,dse,target
0,The Kimberley Provincial Hospital said it woul...,"[[6, 8, 0, 3, AGENT], [6, 8, 6, 8, DSE], [6, 8...","[6, 8, 0, 3, AGENT]","[6, 8, 6, 8, DSE]","[6, 8, 11, 18, TARGET]"
1,Saeed said indications were that those tests w...,"[[1, 1, 0, 0, AGENT], [1, 1, 1, 1, DSE], [1, 1...","[1, 1, 0, 0, AGENT]","[1, 1, 1, 1, DSE]","[1, 1, 5, 6, TARGET]"
2,He said it was his opinion that the patient --...,"[[4, 5, 0, 0, AGENT], [4, 5, 4, 5, DSE], [4, 5...","[4, 5, 0, 0, AGENT]","[4, 5, 4, 5, DSE]","[4, 5, 7, 11, TARGET]"
3,The woman was admitted to the hospital on Satu...,"[[10, 10, 0, 1, AGENT], [10, 10, 10, 10, DSE],...","[10, 10, 0, 1, AGENT]","[10, 10, 10, 10, DSE]","[10, 10, 12, 14, TARGET]"
4,`` Since our technical equipment is far from p...,"[[22, 22, 2, 4, TARGET], [22, 22, 10, 10, TARG...","[22, 22, 2, 4, TARGET]","[22, 22, 10, 10, TARGET]","[22, 22, 21, 21, AGENT]"


In [9]:
from torch.utils.data import DataLoader, Dataset
# Define a custom dataset
class CustomDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_length4text=64, max_length4label=8):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_length4text = max_length4text
        self.max_length4label = max_length4label

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        text = self.data['sentence'][idx]
        label = self.data['label'][idx]
        text_encoding = self.tokenizer(text, truncation=True, padding='max_length', max_length=self.max_length4text, return_tensors='pt').to("cuda")
        label_encoding = self.tokenizer(label, truncation=True, padding='max_length', max_length=self.max_length4label, return_tensors='pt').to("cuda")
        return {
            'input_ids': text_encoding['input_ids'].squeeze(),
            'attention_mask': text_encoding['attention_mask'].squeeze(),
            'input_id_labels': label_encoding['input_ids'].squeeze(),
            'attention_mask_labels': label_encoding['attention_mask'].squeeze()
        }