In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# !pip install datasets
!pip install tokenizers
# !pip install transformers

!pip install datasets transformers==4.28.0

Collecting tokenizers
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.8/3.8 MB[0m [31m36.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting huggingface_hub<0.18,>=0.16.4 (from tokenizers)
  Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.0/295.0 kB[0m [31m35.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: huggingface_hub, tokenizers
Successfully installed huggingface_hub-0.17.3 tokenizers-0.14.1
Collecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m8.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting transformers==4.28.0
  Downloading transformers-4.28.0-py3-none-any.whl (7.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.0/7.0 MB[0m [31m88.3 

In [3]:
from transformers import AutoTokenizer, AutoModelForTokenClassification
from transformers import pipeline
from transformers import DataCollatorWithPadding,AutoModelForSequenceClassification, Trainer, TrainingArguments,AutoTokenizer,AutoModel,AutoConfig, DataCollatorForTokenClassification
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import datasets
from datasets import load_dataset,Dataset,DatasetDict,ClassLabel,Sequence

In [4]:
id2label = {0: "O", 1: "B-Subject", 2: "I-Subject", 3: "B-Object", 4: "I-Object", 5: "B-Aspect", 6: "B-Aspect",
            7: "B-Predicate", 8: "I-Predicate"}

label2id = {"O": 0 , "B-Subject": 1, "I-Subject": 2 , "B-Object": 3, "I-Object": 4, "B-Aspect": 5, "B-Aspect": 6,
            "B-Predicate": 7, "I-Predicate": 8}

In [5]:
electra_tokenizer = AutoTokenizer.from_pretrained("NlpHUST/ner-vietnamese-electra-base")
electra_model = AutoModelForTokenClassification.from_pretrained("/content/drive/MyDrive/VLSP23/Data/HH/baseline_models/electra_new_data")

Downloading (…)okenizer_config.json:   0%|          | 0.00/383 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/411k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.40M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

In [6]:
phobert_tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base-v2")
phobert_model = AutoModelForTokenClassification.from_pretrained("/content/drive/MyDrive/VLSP23/Data/HH/baseline_models/phobert_new_data")

Downloading (…)lve/main/config.json:   0%|          | 0.00/678 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/895k [00:00<?, ?B/s]

Downloading (…)solve/main/bpe.codes:   0%|          | 0.00/1.14M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
multi_tokenizer = AutoTokenizer.from_pretrained("bert-base-multilingual-cased")
multi_model = AutoModelForTokenClassification.from_pretrained("/content/drive/MyDrive/VLSP23/Data/HH/baseline_models/multi_new_data")

Downloading (…)okenizer_config.json:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.96M [00:00<?, ?B/s]

Ensembling

In [8]:
def align_tokens(text, tokenizer):
  word_dict = {}
  text_list = text.split(" ")
  for i in range(len(text_list)):
    tokenized_word = tokenizer.tokenize(text_list[i])
    word_dict.update({i: []})
    for x in range(len(tokenized_word)):
      if i == 0:
        word_dict[i].append(x+1)
      else:
        word_dict[i].append(x + max(word_dict[i-1]) + 1)
  return word_dict

In [9]:
def reduce_logits_size(text, tokenizer, model):
  tensor_list = []
  word_dict = align_tokens(text, tokenizer)
  text_input = tokenizer(text, return_tensors="pt")
  with torch.no_grad():
    logits = model(**text_input).logits
  for i in word_dict:
    sum = torch.Tensor([0, 0, 0, 0, 0, 0, 0, 0, 0])
    for x in word_dict[i]:
      sum += logits[0][x]
      # print(logits[0][x])
      # print("\n")
    final_sum = sum / len(word_dict[i])
    tensor_list.append(final_sum)
  # tensor_list = torch.Tensor(empty_list)
  return torch.stack(tensor_list).unsqueeze(0)

In [10]:
sentence = "Cả hai chiếc flagship thế hệ mới nhất từ Xiaomi và Apple đều sở hữu thiết kế tuyệt đẹp và cuốn hút ."

In [11]:
reduce_logits_size(sentence, electra_tokenizer, electra_model).shape

torch.Size([1, 23, 9])

In [12]:
reduce_logits_size(sentence, phobert_tokenizer, phobert_model).shape

torch.Size([1, 23, 9])

In [13]:
reduce_logits_size(sentence, multi_tokenizer, multi_model).shape

torch.Size([1, 23, 9])

In [17]:
def combine_model_logits(text):
  electra_output = reduce_logits_size(text, electra_tokenizer, electra_model)
  phobert_output = reduce_logits_size(text, phobert_tokenizer, phobert_model)
  multi_output = reduce_logits_size(text, multi_tokenizer, multi_model)
  final_output = 0.3 * electra_output + 0.2 * phobert_output + 0.5 * multi_output
  # final_output = 0.5 * electra_output + 0.2 * phobert_output + 0.3 * multi_output
  # final_output = 0.5 * electra_output + 0.5 * phobert_output + 0 * multi_output

  return final_output
def infer_logits(text):
  combined_output = combine_model_logits(text)
  predictions = torch.argmax(combined_output, dim = 2)
  predicted_token_list = [id2label[t.item()] for t in predictions[0]]
  text_list = text.split(" ")
  outputs = []
  for i in range(len(text_list)):
    outputs.append({"text": text_list[i], "value": predicted_token_list[i]})
  return outputs

In [230]:
infer_logits("Ngược lại chiếc máy của Vivo có mặt lưng vát cong cho cảm giác ôm tay hơn khi cầm năm lâu dài .")

[{'text': 'Ngược', 'value': 'O'},
 {'text': 'lại', 'value': 'O'},
 {'text': 'chiếc', 'value': 'B-Subject'},
 {'text': 'máy', 'value': 'I-Subject'},
 {'text': 'của', 'value': 'I-Subject'},
 {'text': 'Vivo', 'value': 'I-Subject'},
 {'text': 'có', 'value': 'O'},
 {'text': 'mặt', 'value': 'O'},
 {'text': 'lưng', 'value': 'O'},
 {'text': 'vát', 'value': 'O'},
 {'text': 'cong', 'value': 'O'},
 {'text': 'cho', 'value': 'O'},
 {'text': 'cảm', 'value': 'O'},
 {'text': 'giác', 'value': 'O'},
 {'text': 'ôm', 'value': 'B-Predicate'},
 {'text': 'tay', 'value': 'I-Predicate'},
 {'text': 'hơn', 'value': 'I-Predicate'},
 {'text': 'khi', 'value': 'O'},
 {'text': 'cầm', 'value': 'O'},
 {'text': 'năm', 'value': 'O'},
 {'text': 'lâu', 'value': 'O'},
 {'text': 'dài', 'value': 'O'},
 {'text': '.', 'value': 'O'}]

In [210]:
infer_logits("Chính vì vậy mà chất lượng hiển thị của màn hình Galaxy A31 tốt hơn Galaxy A12 , độ chi tiết và sắc nét cao hơn hẳn A12 .")

[{'text': 'Chính', 'value': 'O'},
 {'text': 'vì', 'value': 'O'},
 {'text': 'vậy', 'value': 'O'},
 {'text': 'mà', 'value': 'O'},
 {'text': 'chất', 'value': 'B-Aspect'},
 {'text': 'lượng', 'value': 'B-Aspect'},
 {'text': 'hiển', 'value': 'B-Aspect'},
 {'text': 'thị', 'value': 'B-Aspect'},
 {'text': 'của', 'value': 'O'},
 {'text': 'màn', 'value': 'B-Aspect'},
 {'text': 'hình', 'value': 'B-Aspect'},
 {'text': 'Galaxy', 'value': 'B-Subject'},
 {'text': 'A31', 'value': 'I-Subject'},
 {'text': 'tốt', 'value': 'B-Predicate'},
 {'text': 'hơn', 'value': 'I-Predicate'},
 {'text': 'Galaxy', 'value': 'B-Object'},
 {'text': 'A12', 'value': 'I-Object'},
 {'text': ',', 'value': 'O'},
 {'text': 'độ', 'value': 'B-Aspect'},
 {'text': 'chi', 'value': 'B-Aspect'},
 {'text': 'tiết', 'value': 'B-Aspect'},
 {'text': 'và', 'value': 'O'},
 {'text': 'sắc', 'value': 'B-Aspect'},
 {'text': 'nét', 'value': 'B-Aspect'},
 {'text': 'cao', 'value': 'B-Predicate'},
 {'text': 'hơn', 'value': 'I-Predicate'},
 {'text': 'hẳ

In [162]:
infer_logits("Theo đó , Galaxy A12 được hãng trang bị cụm camera hình vuông gọn gàng hơn so với hệ thống camera hình chữ nhật của Galaxy A31 .")

[{'text': 'Theo', 'value': 'O'},
 {'text': 'đó', 'value': 'O'},
 {'text': ',', 'value': 'O'},
 {'text': 'Galaxy', 'value': 'B-Subject'},
 {'text': 'A12', 'value': 'I-Subject'},
 {'text': 'được', 'value': 'O'},
 {'text': 'hãng', 'value': 'O'},
 {'text': 'trang', 'value': 'O'},
 {'text': 'bị', 'value': 'O'},
 {'text': 'cụm', 'value': 'B-Aspect'},
 {'text': 'camera', 'value': 'B-Aspect'},
 {'text': 'hình', 'value': 'B-Aspect'},
 {'text': 'vuông', 'value': 'B-Predicate'},
 {'text': 'gọn', 'value': 'I-Predicate'},
 {'text': 'gàng', 'value': 'I-Predicate'},
 {'text': 'hơn', 'value': 'I-Predicate'},
 {'text': 'so', 'value': 'O'},
 {'text': 'với', 'value': 'O'},
 {'text': 'hệ', 'value': 'B-Object'},
 {'text': 'thống', 'value': 'I-Object'},
 {'text': 'camera', 'value': 'I-Object'},
 {'text': 'hình', 'value': 'I-Object'},
 {'text': 'chữ', 'value': 'I-Object'},
 {'text': 'nhật', 'value': 'I-Object'},
 {'text': 'của', 'value': 'I-Object'},
 {'text': 'Galaxy', 'value': 'I-Object'},
 {'text': 'A31',

In [148]:
infer_logits("Theo đánh giá cá nhân , tôi vẫn thích camera trên iPhone 11 hơn , ít nhất nó có độ phân giải cao hơn ( 12MP ) .")

[{'text': 'Theo', 'value': 'O'},
 {'text': 'đánh', 'value': 'O'},
 {'text': 'giá', 'value': 'O'},
 {'text': 'cá', 'value': 'O'},
 {'text': 'nhân', 'value': 'O'},
 {'text': ',', 'value': 'O'},
 {'text': 'tôi', 'value': 'O'},
 {'text': 'vẫn', 'value': 'O'},
 {'text': 'thích', 'value': 'O'},
 {'text': 'camera', 'value': 'B-Aspect'},
 {'text': 'trên', 'value': 'O'},
 {'text': 'iPhone', 'value': 'B-Subject'},
 {'text': '11', 'value': 'I-Subject'},
 {'text': 'hơn', 'value': 'O'},
 {'text': ',', 'value': 'O'},
 {'text': 'ít', 'value': 'O'},
 {'text': 'nhất', 'value': 'O'},
 {'text': 'nó', 'value': 'B-Subject'},
 {'text': 'có', 'value': 'O'},
 {'text': 'độ', 'value': 'B-Aspect'},
 {'text': 'phân', 'value': 'B-Aspect'},
 {'text': 'giải', 'value': 'B-Aspect'},
 {'text': 'cao', 'value': 'B-Predicate'},
 {'text': 'hơn', 'value': 'I-Predicate'},
 {'text': '(', 'value': 'O'},
 {'text': '12MP', 'value': 'O'},
 {'text': ')', 'value': 'O'},
 {'text': '.', 'value': 'O'}]

In [65]:

infer_logits("Người em của nó lại sử dụng tấm nền LCD với khích thước 6.1 in , tương tự như iPhone Xr . Rõ ràng , khả năng hiện thì của chiếc iPhone đời mới không bằng mà còn ngốn pin hơn .")

[{'text': 'Người', 'value': 'B-Object'},
 {'text': 'em', 'value': 'I-Object'},
 {'text': 'của', 'value': 'I-Object'},
 {'text': 'nó', 'value': 'I-Object'},
 {'text': 'lại', 'value': 'O'},
 {'text': 'sử', 'value': 'O'},
 {'text': 'dụng', 'value': 'O'},
 {'text': 'tấm', 'value': 'B-Aspect'},
 {'text': 'nền', 'value': 'B-Aspect'},
 {'text': 'LCD', 'value': 'O'},
 {'text': 'với', 'value': 'O'},
 {'text': 'khích', 'value': 'O'},
 {'text': 'thước', 'value': 'O'},
 {'text': '6.1', 'value': 'O'},
 {'text': 'in', 'value': 'O'},
 {'text': ',', 'value': 'O'},
 {'text': 'tương', 'value': 'B-Predicate'},
 {'text': 'tự', 'value': 'I-Predicate'},
 {'text': 'như', 'value': 'O'},
 {'text': 'iPhone', 'value': 'B-Object'},
 {'text': 'Xr', 'value': 'I-Object'},
 {'text': '.', 'value': 'O'},
 {'text': 'Rõ', 'value': 'O'},
 {'text': 'ràng', 'value': 'O'},
 {'text': ',', 'value': 'O'},
 {'text': 'khả', 'value': 'B-Aspect'},
 {'text': 'năng', 'value': 'B-Aspect'},
 {'text': 'hiện', 'value': 'B-Aspect'},
 {'te

In [71]:
infer_logits("Vậy nên , iPhone 11 mạnh hơn hẳn iPhone Xs Max cho dù cả 2 cùng lượng RAM 4GB và sử dùng iOS cùng phiên bản .")

[{'text': 'Vậy', 'value': 'O'},
 {'text': 'nên', 'value': 'O'},
 {'text': ',', 'value': 'O'},
 {'text': 'iPhone', 'value': 'B-Subject'},
 {'text': '11', 'value': 'I-Subject'},
 {'text': 'mạnh', 'value': 'B-Predicate'},
 {'text': 'hơn', 'value': 'I-Predicate'},
 {'text': 'hẳn', 'value': 'I-Predicate'},
 {'text': 'iPhone', 'value': 'B-Object'},
 {'text': 'Xs', 'value': 'I-Object'},
 {'text': 'Max', 'value': 'I-Object'},
 {'text': 'cho', 'value': 'O'},
 {'text': 'dù', 'value': 'O'},
 {'text': 'cả', 'value': 'B-Object'},
 {'text': '2', 'value': 'I-Object'},
 {'text': 'cùng', 'value': 'B-Predicate'},
 {'text': 'lượng', 'value': 'O'},
 {'text': 'RAM', 'value': 'B-Aspect'},
 {'text': '4GB', 'value': 'B-Aspect'},
 {'text': 'và', 'value': 'O'},
 {'text': 'sử', 'value': 'O'},
 {'text': 'dùng', 'value': 'O'},
 {'text': 'iOS', 'value': 'O'},
 {'text': 'cùng', 'value': 'O'},
 {'text': 'phiên', 'value': 'O'},
 {'text': 'bản', 'value': 'O'},
 {'text': '.', 'value': 'O'}]

In [38]:
infer_logits("Vậy nên, iPhone Xs Max lại tạo được sự đẳng cấp hơn một chút.")

[{'text': 'Vậy', 'value': 'O'},
 {'text': 'nên,', 'value': 'O'},
 {'text': 'iPhone', 'value': 'B-Subject'},
 {'text': 'Xs', 'value': 'I-Subject'},
 {'text': 'Max', 'value': 'I-Subject'},
 {'text': 'lại', 'value': 'O'},
 {'text': 'tạo', 'value': 'O'},
 {'text': 'được', 'value': 'O'},
 {'text': 'sự', 'value': 'O'},
 {'text': 'đẳng', 'value': 'B-Predicate'},
 {'text': 'cấp', 'value': 'I-Predicate'},
 {'text': 'hơn', 'value': 'I-Predicate'},
 {'text': 'một', 'value': 'I-Predicate'},
 {'text': 'chút.', 'value': 'I-Predicate'}]

In [19]:
infer_logits("Thêm vào đó là GPU quá lỗi thời cho năm nay , trong khi các đối thủ đã có cho mình GPU tốt hơn nhiều với cùng mức giá .")

[{'text': 'Thêm', 'value': 'O'},
 {'text': 'vào', 'value': 'O'},
 {'text': 'đó', 'value': 'O'},
 {'text': 'là', 'value': 'O'},
 {'text': 'GPU', 'value': 'B-Aspect'},
 {'text': 'quá', 'value': 'O'},
 {'text': 'lỗi', 'value': 'O'},
 {'text': 'thời', 'value': 'I-Predicate'},
 {'text': 'cho', 'value': 'O'},
 {'text': 'năm', 'value': 'O'},
 {'text': 'nay', 'value': 'O'},
 {'text': ',', 'value': 'O'},
 {'text': 'trong', 'value': 'O'},
 {'text': 'khi', 'value': 'O'},
 {'text': 'các', 'value': 'B-Subject'},
 {'text': 'đối', 'value': 'I-Subject'},
 {'text': 'thủ', 'value': 'I-Subject'},
 {'text': 'đã', 'value': 'O'},
 {'text': 'có', 'value': 'O'},
 {'text': 'cho', 'value': 'O'},
 {'text': 'mình', 'value': 'O'},
 {'text': 'GPU', 'value': 'B-Aspect'},
 {'text': 'tốt', 'value': 'B-Predicate'},
 {'text': 'hơn', 'value': 'I-Predicate'},
 {'text': 'nhiều', 'value': 'I-Predicate'},
 {'text': 'với', 'value': 'O'},
 {'text': 'cùng', 'value': 'B-Predicate'},
 {'text': 'mức', 'value': 'B-Aspect'},
 {'text

In [12]:
from transformers import AutoTokenizer
tokenizer1 = AutoTokenizer.from_pretrained("vinai/phobert-base-v2", use_fast = True)
model1 = AutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/VLSP23/Data/HH/baseline_models/test_bert_task1')
nlp1 = pipeline("sentiment-analysis", model = model1, tokenizer = tokenizer1)

model3 = AutoModelForSequenceClassification.from_pretrained('/content/drive/MyDrive/VLSP23/Data/HH/baseline_models/task3_phobert_30epochs_newdata')
nlp3 = pipeline("sentiment-analysis", model = model3, tokenizer = tokenizer1)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
def post_process(tokens, entities):
    post_process_text = ""
    entity = []
    idx = 0
    for token in tokens:
        if not token.startswith("##"):
            post_process_text += " "
            entity.append(entities[idx])
        post_process_text += token.lstrip("##")
        idx += 1
    return post_process_text.lstrip(), entity

In [14]:
import json
def split_quintuple(input_quintuple):
    lists_to_split = ["subject", "object", "aspect", "predicate"]

    split_quintuples = {list_name: [] for list_name in lists_to_split}

    for list_name in lists_to_split:
        current_list = input_quintuple[list_name]
        current_group = []
        current_word = ""
        for item in current_list:
            index = int(item.split("&&")[0])

            if not current_group or index == current_group[-1] + 1:
                current_group.append(index)
            else:
                split_quintuples[list_name].append([s for s in input_quintuple[list_name] if int(s.split("&&")[0]) in current_group])
                current_group = [index]

        split_quintuples[list_name].append([s for s in input_quintuple[list_name] if int(s.split("&&")[0]) in current_group])

    max_groups = max(len(split_quintuples[list_name]) for list_name in lists_to_split)

    final_quintuples = []

    for group_index in range(max_groups):
        new_quintuple = input_quintuple.copy()
        for list_name in lists_to_split:
            split_group = split_quintuples[list_name][group_index] if group_index < len(split_quintuples[list_name]) else input_quintuple[list_name]
            new_quintuple[list_name] = split_group
        new_quintuple = json.dumps(new_quintuple, ensure_ascii=False)
        final_quintuples.append(new_quintuple)

    return final_quintuples

# Output

In [15]:
import os
input_directory = '/content/drive/MyDrive/VLSP23/Data/HH/baseline_models/submission/original public dataset/VLSP2023_ComOM_public_test_nolabel'
output_directory = '/content/drive/MyDrive/VLSP23/Data/HH/baseline_models/submission/output15'
for file_number in range(1, 25):
    input_file_name = f'dev_{str(file_number).zfill(4)}.txt'
    input_file_path = os.path.join(input_directory, input_file_name)
    output_file_path = os.path.join(output_directory, input_file_name)

    with open(output_file_path, 'w') as output_file:
        with open(input_file_path, 'r') as input_file:
            lines = input_file.readlines()
        input_sentence = []
        for line in lines:
            sentences = line.split('\t')
            if len(sentences) >= 2:
                output_file.write(sentences[1])
    with open(output_file_path, 'r') as input_file:
        lines = input_file.readlines()
    with open(output_file_path, 'w') as output_file:
        comparative = []
        task2 = []
        for line in lines:
            comparative.append(nlp1(line))
        for i, line in enumerate(lines):
            label = comparative[i][0]['label']
            if label == 'Comparative':
                task2.append(line)
        ner = []
        for sentence in task2:
            sentence = sentence.replace(' \u200b\u200b ', ' ')
            sentence = " ".join(sentence.split())
            ner.append(infer_logits(sentence))
        preference = []
        for sentence in task2:
            preference.append(nlp3(sentence))
        ner_post_processed = []
        for token_list in ner:
            tokens = [token['text'] for token in token_list]
            entities = [token['value'] for token in token_list]

            post_processed_text,entity = post_process(tokens,entities)
            post_processed_words = post_processed_text.split()

            result_list = [{'entity': entity[i], 'index': i + 1, 'word': post_processed_words[i]} for i in range(len(post_processed_words))]
            ner_post_processed.append(result_list)

        idx = 0
        for i, line in enumerate(lines):
            label = comparative[i][0]['label']
            if label == 'Comparative':
                quintuple = {"subject": [], "object": [], "aspect": [], "predicate": [], "label": ""}
                for tag in ner_post_processed[idx]:
                    entity = tag['entity']
                    index = tag['index']
                    word = tag['word']
                    quintuple['label'] = preference[idx][0]['label']
                    if entity in ['B-Subject', 'I-Subject']:
                        quintuple['subject'].append(f"{index}&&{word}")
                    elif entity in ['B-Object', 'I-Object']:
                        quintuple['object'].append(f"{index}&&{word}")
                    elif entity in ['B-Aspect', 'I-Aspect']:
                        quintuple['aspect'].append(f"{index}&&{word}")
                    elif entity in ['B-Predicate', 'I-Predicate']:
                        quintuple['predicate'].append(f"{index}&&{word}")
                final_quintuples = split_quintuple(quintuple)
                modified_line = f"{line}"

                for quintuple in final_quintuples:
                    modified_line += f"{quintuple}\n"

                modified_line += "\n"
                idx += 1
            else:
                modified_line = f"{line}\n"
            output_file.write(modified_line)