<a href="https://colab.research.google.com/github/hamzapervez062/Bakery-Shop_Project_Csharp/blob/master/t5_herarchical_(2).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
 #installation of libraries
 !pip install -q transformers datasets
 !pip install -q evaluate
 !pip install -q rouge_score
 !pip install -q bert-score
 !pip install -q contractions

In [None]:
import json
import torch
from tqdm import tqdm
import pandas as pd
import re
import contractions
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM


In [None]:
# Device configuration: run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Display the selected device to confirm which backend was picked.
device

device(type='cuda')

In [None]:
# Read the segmented dataset (UTF-8 encoded JSON) into `data`.
with open("/content/regusum_dataset_segmented_final.json", "r", encoding="utf-8") as f:
    raw_json = f.read()
    data = json.loads(raw_json)

In [None]:
# Build a DataFrame from the loaded JSON records and report how many rows it has.
original_df = pd.DataFrame(data)
print(f"Loaded {len(original_df)} samples.")

Loaded 354 samples.


In [None]:
# Work on a copy so the unmodified `original_df` stays available for reference.
df = original_df.copy()
df.head()

Unnamed: 0,ID,gold_summary,Agency ID,Title,Topics,original_content,Sections
0,IRS-2008-0041-0003,This document contains corrections to Treasury...,IRS,"Federal Register, Volume 85 Issue 113 (Thursda...","Income taxes, Reporting and recordkeeping requ...",Background\n\n The final regulations (TD 93...,"[{'header': 'Introduction', 'text': 'Backgroun..."
1,IRS-2008-0053-0009,This document contains a correction to Treasur...,IRS,"Federal Register, Volume 88 Issue 146 (Tuesday...","Income taxes, Reporting and recordkeeping requ...",Background\n\n The final regulations (TD 95...,"[{'header': 'Introduction', 'text': 'Backgroun..."
2,IRS-2008-0092-0007,This document contains a correction to a Treas...,IRS,"Federal Register, Volume 85 Issue 162 (Thursda...","Income taxes, Reporting and recordkeeping requ...",Background\n\n The final regulations (TD 96...,"[{'header': 'Introduction', 'text': 'Backgroun..."
3,IRS-2011-0050-0006,This document contains corrections to Treasury...,IRS,"Federal Register, Volume 85 Issue 46 (Monday, ...","Income taxes, Reporting and recordkeeping requ...",Background\n\n The final regulations (TD 96...,"[{'header': 'Introduction', 'text': 'Backgroun..."
4,IRS-2014-0001-0014,This document contains corrections to Treasury...,IRS,"Federal Register, Volume 85 Issue 88 (Wednesda...","Income taxes, Reporting and recordkeeping requ...",Background\n\n The final regulations (TD 98...,"[{'header': 'Introduction', 'text': 'Backgroun..."


In [None]:
# Keep only rows whose 'original_content' is neither missing (NaN) nor an empty string
df = df.dropna(subset=['original_content'])
df = df[df['original_content'] != '']
df.shape

(353, 7)

In [None]:
# Keep only the columns needed downstream: document ID, source text and reference summary.
df = df[["ID", "original_content", "gold_summary"]]

In [None]:
# Lowercase all of the text.
def lowercase_text(text):
    """Return ``text`` converted to lowercase."""
    lowered = text.lower()
    return lowered

In [None]:
def remove_url(text):
    """Return ``text`` with every ``http(s)://...`` or ``www....`` URL deleted.

    A URL runs up to the next whitespace character, so punctuation directly
    attached to it is removed as well.
    """
    url_regex = r'https?://\S+|www\.\S+'
    return re.sub(url_regex, '', text)

In [None]:
# Expand shortened word combinations, e.g. i'm --> i am
def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    return contractions.fix(text)

In [None]:
# Replace newlines with spaces and trim the ends
def clean_text(text):
    """Return ``text`` with each newline turned into a space, then stripped.

    Consecutive newlines become consecutive spaces; they are not collapsed here.
    """
    lines = text.split("\n")
    return " ".join(lines).strip()

In [None]:
# Clean up dot leaders and repeated whitespace
def dot_cleanings(text):
    """Replace runs of three or more dots with a space and collapse whitespace.

    Any run of two or more whitespace characters becomes a single space, and
    the result is stripped at both ends.
    """
    without_dot_runs = re.sub(r'\.{3,}', ' ', text)
    return re.sub(r'\s{2,}', ' ', without_dot_runs).strip()


In [None]:
# Remove quote-like and slash characters: ', ", `, / and \
def cleaned_text(text):
    """Strip quotes, backticks and slashes from ``text`` and normalise whitespace.

    After the characters are deleted, every whitespace run is collapsed to a
    single space and the result is stripped at both ends.
    """
    without_specials = re.sub(r"[\'\"`/\\]", "", text)
    collapsed = re.sub(r"\s+", " ", without_specials)
    return collapsed.strip()

In [None]:
# Remove page markers such as "\n\n[[Page 2334]]\n\n".
df["original_content"] = df["original_content"].str.replace(r'\n\n\[\[Page \d+\]\]\n\n', ' ', regex=True)
# Remove asterisks (together with the whitespace around them) and runs of two or more dashes.
# Each match is replaced by a single space rather than an empty string: deleting the match
# outright would fuse the words on either side (e.g. "end.\n* * * * *\nnext" -> "end.next").
# The extra spaces this leaves behind are collapsed later by dot_cleanings.
df["original_content"] = df["original_content"].str.replace(r"\s*\*\s*", " ", regex=True).str.strip()
df["original_content"] = df["original_content"].str.replace(r'-{2,}', ' ', regex=True).str.strip()

In [None]:
# Run every text-normalisation step over the original_content column, in this order.
for cleaning_step in (lowercase_text, remove_url, expand_contractions, clean_text, dot_cleanings):
    df.original_content = df.original_content.apply(cleaning_step)

In [None]:
# Normalise the gold_summary column with the cleaning steps below, in this order
# (dot_cleanings is not applied to the summaries).
for cleaning_step in (lowercase_text, remove_url, expand_contractions, clean_text):
    df.gold_summary = df.gold_summary.apply(cleaning_step)

In [None]:
# Delete every forward slash from original_content (plain literal match), then trim.
# NOTE(review): this also joins tokens such as "and/or" into "andor" - confirm that is intended.
slash_free = df["original_content"].str.replace("/", "", regex=False)
df["original_content"] = slash_free.str.strip()

In [None]:
# Preview the first rows after cleaning.
df.head()

Unnamed: 0,ID,original_content,gold_summary
0,IRS-2008-0041-0003,background the final regulations (td 9394) tha...,this document contains corrections to treasury...
1,IRS-2008-0053-0009,background the final regulations (td 9515) tha...,this document contains a correction to treasur...
2,IRS-2008-0092-0007,background the final regulations (td 9614) tha...,this document contains a correction to a treas...
3,IRS-2011-0050-0006,background the final regulations (td 9630) tha...,this document contains corrections to treasury...
4,IRS-2014-0001-0014,background the final regulations (td 9806) tha...,this document contains corrections to treasury...


In [None]:
# Load the pretrained "t5-large" tokenizer and seq2seq model; the model is moved to `device`.
model_namet5 = "t5-large"
t5_tokenizer = AutoTokenizer.from_pretrained(model_namet5)
t5_model= AutoModelForSeq2SeqLM.from_pretrained(model_namet5).to(device)


In [None]:
# Token length of a text, measured with the T5 tokenizer
def get_token_length(text):
    """Return how many tokens ``t5_tokenizer.tokenize`` produces for ``text``."""
    return len(t5_tokenizer.tokenize(text))

In [None]:
# Find the IDs whose original_content is longer than 20000 tokens, printing each
# one as it is found. `leng` collects the token count of every document.
exceeding_ids = []
leng = []
for idx, row in df.iterrows():
    length = get_token_length(row['original_content'])
    leng.append(length)

    # Documents at or below the threshold need no further handling here.
    if not length > 20000:
        continue

    exceeding_ids.append(row['ID'])
    print(f"ID {row['ID']} has {length} tokens.")

ID IRS-2015-0030-0227 has 57638 tokens.
ID IRS-2016-0007-0008 has 23174 tokens.
ID IRS-2016-0010-0035 has 51567 tokens.
ID IRS-2016-0040-0003 has 30306 tokens.
ID IRS-2016-0047-0004 has 45942 tokens.
ID IRS-2017-0002-0013 has 23355 tokens.
ID IRS-2017-0005-0005 has 34654 tokens.
ID IRS-2018-0011-0036 has 39050 tokens.
ID IRS-2018-0027-0009 has 31439 tokens.
ID IRS-2019-0003-0004 has 22547 tokens.
ID IRS-2019-0004-0125 has 273511 tokens.
ID IRS-2019-0005-0013 has 103738 tokens.
ID IRS-2019-0012-0052 has 136028 tokens.
ID IRS-2019-0021-0012 has 42677 tokens.
ID IRS-2019-0022-0142 has 241024 tokens.
ID IRS-2019-0023-0027 has 64172 tokens.
ID IRS-2019-0027-0022 has 53230 tokens.
ID IRS-2019-0028-0011 has 86098 tokens.
ID IRS-2019-0029-0034 has 55804 tokens.
ID IRS-2019-0033-0020 has 57414 tokens.
ID IRS-2019-0034-0028 has 108986 tokens.
ID IRS-2019-0038-0018 has 88505 tokens.
ID IRS-2019-0041-0008 has 63841 tokens.
ID IRS-2019-0044-0010 has 27957 tokens.
ID IRS-2019-0049-0019 has 25654 tok

In [None]:
# Tokenise every document and keep the count in a new `token_length` column.
df['token_length'] = df['original_content'].map(get_token_length)

# One boolean mask drives both outputs: the IDs of the over-long documents ...
is_too_long = df['token_length'] > 20000
exceeding_ids = df.loc[is_too_long, 'ID'].tolist()

print("IDs with text longer than 20000 tokens:", exceeding_ids)

# ... and an independent copy of the rows with at most 20000 tokens.
df_less_than_20000 = df[~is_too_long].copy()

print(f"Rows with ≤20000 tokens: {len(df_less_than_20000)}")


IDs with text longer than 20000 tokens: ['IRS-2015-0030-0227', 'IRS-2016-0007-0008', 'IRS-2016-0010-0035', 'IRS-2016-0040-0003', 'IRS-2016-0047-0004', 'IRS-2017-0002-0013', 'IRS-2017-0005-0005', 'IRS-2018-0011-0036', 'IRS-2018-0027-0009', 'IRS-2019-0003-0004', 'IRS-2019-0004-0125', 'IRS-2019-0005-0013', 'IRS-2019-0012-0052', 'IRS-2019-0021-0012', 'IRS-2019-0022-0142', 'IRS-2019-0023-0027', 'IRS-2019-0027-0022', 'IRS-2019-0028-0011', 'IRS-2019-0029-0034', 'IRS-2019-0033-0020', 'IRS-2019-0034-0028', 'IRS-2019-0038-0018', 'IRS-2019-0041-0008', 'IRS-2019-0044-0010', 'IRS-2019-0049-0019', 'IRS-2019-0050-0057', 'IRS-2019-0054-0015', 'IRS-2019-0057-0017', 'IRS-2019-0059-0008', 'IRS-2020-0001-0009', 'IRS-2020-0002-0018', 'IRS-2020-0006-0021', 'IRS-2020-0008-0032', 'IRS-2020-0013-0091', 'IRS-2020-0017-0017', 'IRS-2020-0018-0023', 'IRS-2020-0019-0017', 'IRS-2020-0020-0011', 'IRS-2020-0023-0002', 'IRS-2020-0026-0005', 'IRS-2020-0028-0013', 'IRS-2020-0030-0020', 'IRS-2020-0033-0003', 'IRS-2020-003

In [None]:
# Number of documents whose token length exceeds 20000
len(exceeding_ids )

154

In [None]:
# Token length of a gold summary, measured with the T5 tokenizer
def get_token_length_gold_summary(gold_summary):
    """Return the number of T5 tokens in ``gold_summary``.

    This delegates to ``get_token_length`` so that documents and summaries are
    always counted by the same code path instead of two duplicated bodies.
    """
    return get_token_length(gold_summary)

In [None]:
# Store the token length of every gold summary in a new `token_gold_summary` column.
gold_summaries = df_less_than_20000['gold_summary']
df_less_than_20000['token_gold_summary'] = gold_summaries.map(get_token_length_gold_summary)

In [None]:
# Preview the first ten retained rows, including both token-count columns.
df_less_than_20000.head(10)

Unnamed: 0,ID,original_content,gold_summary,token_length,token_gold_summary
0,IRS-2008-0041-0003,background the final regulations (td 9394) tha...,this document contains corrections to treasury...,781,126
1,IRS-2008-0053-0009,background the final regulations (td 9515) tha...,this document contains a correction to treasur...,700,155
2,IRS-2008-0092-0007,background the final regulations (td 9614) tha...,this document contains a correction to a treas...,1104,124
3,IRS-2011-0050-0006,background the final regulations (td 9630) tha...,this document contains corrections to treasury...,217,97
4,IRS-2014-0001-0014,background the final regulations (td 9806) tha...,this document contains corrections to treasury...,269,132
5,IRS-2014-0030-0004,authority this document amends the income tax ...,this document contains final regulations relat...,14034,91
8,IRS-2016-0007-0009,background the final regulations (td 9981) tha...,this document contains a correction to treasur...,208,120
9,IRS-2016-0007-0010,background the final regulations (td 9981) tha...,this document corrects the correction to treas...,171,122
11,IRS-2016-0038-0007,background this document contains amendments t...,this document contains the final regulations t...,8973,74
13,IRS-2016-0044-0011,background section 401(a)(11) of the internal ...,this document sets forth final regulations pro...,17591,136


**Chunking method: overlapping token windows with a dynamically allocated summary length per chunk**

In [None]:
# Create overlapping chunks of a text, measured in tokenizer tokens.

def split_text_t5(text, chunk_size=500, chunk_overlap=200, tokenizer=None):
    """Split ``text`` into overlapping chunks based on token count.

    The text is encoded without special tokens, cut into windows of at most
    ``chunk_size`` token ids that each start ``chunk_size - chunk_overlap``
    ids after the previous one, and every window is decoded back to text.

    Args:
        text: The text to split.
        chunk_size: Maximum number of tokens per chunk; must be positive.
        chunk_overlap: Number of tokens shared by consecutive chunks; must
            satisfy ``0 <= chunk_overlap < chunk_size``.
        tokenizer: Object providing ``encode(text, add_special_tokens=False)``
            and ``decode(ids, skip_special_tokens=True)``. Defaults to the
            notebook-level ``t5_tokenizer``.

    Returns:
        A list of chunk strings in document order; empty when ``text``
        encodes to no tokens.

    Raises:
        ValueError: If ``chunk_size`` or ``chunk_overlap`` is out of range.
            Without this check the window would never advance and the loop
            would not terminate.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if not 0 <= chunk_overlap < chunk_size:
        raise ValueError(
            f"chunk_overlap must be in [0, chunk_size), got {chunk_overlap} "
            f"for chunk_size={chunk_size}"
        )

    if tokenizer is None:
        tokenizer = t5_tokenizer

    token_ids = tokenizer.encode(text, add_special_tokens=False)
    stride = chunk_size - chunk_overlap  # how far the window moves each step

    chunks = []
    for start in range(0, len(token_ids), stride):
        window = token_ids[start:start + chunk_size]
        chunks.append(tokenizer.decode(window, skip_special_tokens=True))

        # Once a window reaches the last token, any further window would only
        # repeat tokens that are already covered.
        if start + chunk_size >= len(token_ids):
            break

    return chunks

In [None]:
# Preview the last five retained rows.
df_less_than_20000.tail(5)

Unnamed: 0,ID,original_content,gold_summary,token_length,token_gold_summary
345,SEC-2024-1184-0001,the commission is adopting new 17 cfr 270.3c-7...,the securities and exchange commission (``comm...,6822,146
348,SEC-2024-1493-0001,"we are adopting an updated filer manual, volum...",the securities and exchange commission (``comm...,2170,129
349,SEC-2024-1627-0001,the commission is amending the following rules...,the securities and exchange commission (``comm...,18893,102
351,SEC-2024-1650-0001,the commission is adopting technical amendment...,the securities and exchange commission (``comm...,1432,293
352,SEC-2024-1821-0001,i. background the commission is amending the i...,the securities and exchange commission (``sec'...,1938,55


In [None]:
# Maps each document ID to its text chunks and the token length of its gold summary.
final_chunks = {}
# One generated summary per document, in the iteration order of `final_chunks`.
generated_summaries = []

# Step 1: Chunk the documents using T5 tokenization
for _, row in tqdm(df_less_than_20000.iterrows(), total=len(df_less_than_20000)):

    input_text = row['original_content']
    gold_summary = row['gold_summary']  # The gold summary text
    doc_id = row["ID"]

    # Count tokens using T5 tokenizer
    total_tokens_input = len(t5_tokenizer.encode(input_text, add_special_tokens=False))
    gold_summary_tokens = len(t5_tokenizer.encode(gold_summary, add_special_tokens=False))

    if total_tokens_input < 512:
        # Short documents are summarised as a single chunk.
        split_chunks = [input_text]
    else:
        # Longer documents are split into overlapping 500-token windows.
        split_chunks = split_text_t5(input_text, chunk_size=500, chunk_overlap=200)
        print(f"Splitting ID {doc_id} into {len(split_chunks)} chunks (T5-based)")

    final_chunks[doc_id] = {
        "chunks": split_chunks,
        "gold_length": gold_summary_tokens
    }

# Step 2: Summarize each chunk with proportional length allocation
# The loop variable is named `doc_info` (not `data`) so that the dataset loaded
# earlier into `data` is not overwritten by this cell.
for doc_id, doc_info in tqdm(final_chunks.items()):
    doc_chunks = doc_info["chunks"]
    gold_length = doc_info["gold_length"]  # T5 token length of gold summary
    num_chunks = len(doc_chunks)

    # Allocate the gold-summary token budget evenly across the chunks. The floor
    # of 15 tokens per chunk means a document with many chunks can end up with a
    # combined summary longer than the gold summary.
    chunk_target_len = max(15, gold_length // num_chunks)

    print(f"Processing document with ID: {doc_id} - {num_chunks} chunks ")

    predicted_chunks = []
    for chunk in doc_chunks:
        # Tokenize. truncation=True is required for max_length to take effect;
        # it keeps the input within 512 tokens even after the "summarize: "
        # prefix is added to a chunk that is already close to the limit.
        encoded = t5_tokenizer(
            "summarize: " + chunk,
            return_tensors="pt",
            truncation=True,
            max_length=512 # T5 limited to 512 tokens
        )
        # Use the notebook's `device` (the one the model was moved to) instead
        # of a hard-coded "cuda", so the cell also runs on CPU-only machines.
        input_ids = encoded.input_ids.to(device)
        attention_mask = encoded.attention_mask.to(device)

        # Summary generation
        output_ids = t5_model.generate(
            input_ids,
            attention_mask=attention_mask,
            max_length=chunk_target_len,
        )
        # Decode into text
        summary = t5_tokenizer.decode(output_ids[0], skip_special_tokens=True)
        predicted_chunks.append(summary)

    # Concatenate chunk summaries for final prediction
    predicted_summary = " ".join(predicted_chunks)
    generated_summaries.append(predicted_summary)


  3%|▎         | 6/199 [00:00<00:03, 50.65it/s]

Splitting ID IRS-2008-0041-0003 into 2 chunks (T5-based)
Splitting ID IRS-2008-0053-0009 into 2 chunks (T5-based)
Splitting ID IRS-2008-0092-0007 into 4 chunks (T5-based)
Splitting ID IRS-2014-0030-0004 into 47 chunks (T5-based)
Splitting ID IRS-2016-0038-0007 into 30 chunks (T5-based)


  9%|▊         | 17/199 [00:00<00:04, 40.14it/s]

Splitting ID IRS-2016-0044-0011 into 58 chunks (T5-based)
Splitting ID IRS-2016-0054-0015 into 25 chunks (T5-based)
Splitting ID IRS-2017-0005-0007 into 2 chunks (T5-based)
Splitting ID IRS-2017-0010-0003 into 23 chunks (T5-based)
Splitting ID IRS-2017-0017-0003 into 5 chunks (T5-based)
Splitting ID IRS-2018-0008-0019 into 8 chunks (T5-based)
Splitting ID IRS-2018-0011-0037 into 3 chunks (T5-based)
Splitting ID IRS-2018-0040-0051 into 2 chunks (T5-based)


 16%|█▌        | 32/199 [00:00<00:03, 43.46it/s]

Splitting ID IRS-2019-0006-0007 into 29 chunks (T5-based)
Splitting ID IRS-2019-0009-0020 into 45 chunks (T5-based)
Splitting ID IRS-2019-0012-0053 into 4 chunks (T5-based)
Splitting ID IRS-2019-0012-0055 into 3 chunks (T5-based)
Splitting ID IRS-2019-0012-0056 into 2 chunks (T5-based)
Splitting ID IRS-2019-0017-0003 into 12 chunks (T5-based)
Splitting ID IRS-2019-0022-0144 into 26 chunks (T5-based)


 22%|██▏       | 44/199 [00:00<00:02, 56.06it/s]

Splitting ID IRS-2019-0022-0146 into 4 chunks (T5-based)
Splitting ID IRS-2019-0022-0147 into 2 chunks (T5-based)
Splitting ID IRS-2019-0023-0028 into 2 chunks (T5-based)
Splitting ID IRS-2019-0025-0003 into 19 chunks (T5-based)
Splitting ID IRS-2019-0027-0024 into 2 chunks (T5-based)
Splitting ID IRS-2019-0027-0025 into 2 chunks (T5-based)
Splitting ID IRS-2019-0028-0012 into 10 chunks (T5-based)
Splitting ID IRS-2019-0034-0030 into 4 chunks (T5-based)
Splitting ID IRS-2019-0036-0002 into 26 chunks (T5-based)


 25%|██▌       | 50/199 [00:01<00:04, 36.80it/s]

Splitting ID IRS-2019-0039-8391 into 56 chunks (T5-based)
Splitting ID IRS-2019-0043-0007 into 44 chunks (T5-based)
Splitting ID IRS-2019-0044-0012 into 19 chunks (T5-based)
Splitting ID IRS-2019-0054-0011 into 2 chunks (T5-based)
Splitting ID IRS-2019-0056-0047 into 38 chunks (T5-based)


 28%|██▊       | 55/199 [00:01<00:04, 34.37it/s]

Splitting ID IRS-2019-0058-0003 into 10 chunks (T5-based)
Splitting ID IRS-2020-0002-0019 into 2 chunks (T5-based)
Splitting ID IRS-2020-0003-0009 into 47 chunks (T5-based)
Splitting ID IRS-2020-0005-0023 into 13 chunks (T5-based)
Splitting ID IRS-2020-0006-0022 into 4 chunks (T5-based)


 30%|███       | 60/199 [00:01<00:04, 33.04it/s]

Splitting ID IRS-2020-0007-0014 into 37 chunks (T5-based)
Splitting ID IRS-2020-0010-0005 into 12 chunks (T5-based)
Splitting ID IRS-2020-0011-0003 into 16 chunks (T5-based)
Splitting ID IRS-2020-0012-0006 into 14 chunks (T5-based)
Splitting ID IRS-2020-0013-0093 into 2 chunks (T5-based)
Splitting ID IRS-2020-0014-0005 into 13 chunks (T5-based)
Splitting ID IRS-2020-0018-0024 into 2 chunks (T5-based)
Splitting ID IRS-2020-0020-0002 into 38 chunks (T5-based)


 36%|███▌      | 72/199 [00:01<00:03, 38.26it/s]

Splitting ID IRS-2020-0020-0010 into 2 chunks (T5-based)
Splitting ID IRS-2020-0020-0012 into 33 chunks (T5-based)
Splitting ID IRS-2020-0024-0022 into 4 chunks (T5-based)
Splitting ID IRS-2020-0025-0001 into 26 chunks (T5-based)
Splitting ID IRS-2020-0028-0015 into 3 chunks (T5-based)
Splitting ID IRS-2020-0030-0021 into 10 chunks (T5-based)


 42%|████▏     | 83/199 [00:02<00:02, 40.55it/s]

Splitting ID IRS-2020-0031-0005 into 32 chunks (T5-based)
Splitting ID IRS-2020-0040-0038 into 4 chunks (T5-based)
Splitting ID IRS-2020-0040-0039 into 13 chunks (T5-based)
Splitting ID IRS-2020-0043-0001 into 19 chunks (T5-based)
Splitting ID IRS-2021-0001-0009 into 16 chunks (T5-based)
Splitting ID IRS-2021-0002-0011 into 27 chunks (T5-based)
Splitting ID IRS-2021-0003-0014 into 11 chunks (T5-based)


 44%|████▍     | 88/199 [00:02<00:03, 35.82it/s]

Splitting ID IRS-2021-0012-0002 into 31 chunks (T5-based)
Splitting ID IRS-2021-0012-0004 into 41 chunks (T5-based)
Splitting ID IRS-2021-0014-0006 into 6 chunks (T5-based)
Splitting ID IRS-2021-0017-0012 into 36 chunks (T5-based)


 49%|████▊     | 97/199 [00:02<00:03, 31.73it/s]

Splitting ID IRS-2022-0002-0001 into 45 chunks (T5-based)
Splitting ID IRS-2022-0004-0012 into 22 chunks (T5-based)
Splitting ID IRS-2022-0009-0008 into 47 chunks (T5-based)


 54%|█████▍    | 108/199 [00:02<00:02, 40.75it/s]

Splitting ID IRS-2022-0016-0001 into 30 chunks (T5-based)
Splitting ID IRS-2022-0019-0008 into 17 chunks (T5-based)
Splitting ID IRS-2022-0026-0004 into 2 chunks (T5-based)
Splitting ID IRS-2022-0035-0002 into 8 chunks (T5-based)
Splitting ID IRS-2023-0012-0059 into 3 chunks (T5-based)
Splitting ID IRS-2023-0016-0017 into 32 chunks (T5-based)


 59%|█████▉    | 118/199 [00:03<00:02, 37.35it/s]

Splitting ID IRS-2023-0021-0007 into 60 chunks (T5-based)
Splitting ID IRS-2023-0027-0020 into 2 chunks (T5-based)
Splitting ID IRS-2023-0028-0085 into 4 chunks (T5-based)
Splitting ID IRS-2023-0029-0159 into 4 chunks (T5-based)
Splitting ID IRS-2023-0031-0005 into 24 chunks (T5-based)
Splitting ID IRS-2023-0041-44836 into 3 chunks (T5-based)
Splitting ID IRS-2023-0042-0349 into 8 chunks (T5-based)
Splitting ID IRS-2023-0044-0003 into 7 chunks (T5-based)
Splitting ID IRS-2023-0045-0004 into 22 chunks (T5-based)
Splitting ID IRS-2023-0046-0002 into 17 chunks (T5-based)


 61%|██████▏   | 122/199 [00:03<00:02, 28.87it/s]

Splitting ID IRS-2023-0047-0004 into 64 chunks (T5-based)
Splitting ID IRS-2023-0049-0003 into 16 chunks (T5-based)
Splitting ID IRS-2023-0050-0006 into 17 chunks (T5-based)
Splitting ID IRS-2023-0056-0011 into 3 chunks (T5-based)
Splitting ID IRS-2023-0063-0198 into 6 chunks (T5-based)


 63%|██████▎   | 126/199 [00:03<00:02, 27.54it/s]

Splitting ID IRS-2024-0010-0017 into 60 chunks (T5-based)
Splitting ID IRS-2024-0013-0020 into 36 chunks (T5-based)
Splitting ID IRS-2024-0018-0008 into 34 chunks (T5-based)
Splitting ID IRS-2024-0048-0012 into 26 chunks (T5-based)


 66%|██████▋   | 132/199 [00:03<00:02, 23.00it/s]

Splitting ID IRS_FRDOC_0001-1901 into 3 chunks (T5-based)
Splitting ID IRS_FRDOC_0001-2112 into 45 chunks (T5-based)


 70%|██████▉   | 139/199 [00:04<00:02, 22.73it/s]

Splitting ID IRS_FRDOC_0001-2208 into 66 chunks (T5-based)
Splitting ID IRS_FRDOC_0001-2211 into 4 chunks (T5-based)
Splitting ID IRS_FRDOC_0001-2267 into 23 chunks (T5-based)
Splitting ID SEC-2020-0216-0001 into 6 chunks (T5-based)
Splitting ID SEC-2020-0258-0001 into 12 chunks (T5-based)
Splitting ID SEC-2020-0294-0001 into 5 chunks (T5-based)
Splitting ID SEC-2020-0329-0001 into 41 chunks (T5-based)


 73%|███████▎  | 146/199 [00:04<00:01, 29.53it/s]

Splitting ID SEC-2020-0469-0001 into 23 chunks (T5-based)
Splitting ID SEC-2020-0865-0001 into 2 chunks (T5-based)
Splitting ID SEC-2020-1189-0001 into 3 chunks (T5-based)
Splitting ID SEC-2020-1400-0001 into 37 chunks (T5-based)
Splitting ID SEC-2020-1411-0001 into 11 chunks (T5-based)
Splitting ID SEC-2020-1527-0001 into 3 chunks (T5-based)
Splitting ID SEC-2020-1598-0001 into 2 chunks (T5-based)
Splitting ID SEC-2020-1748-0001 into 9 chunks (T5-based)


 76%|███████▌  | 151/199 [00:04<00:01, 29.72it/s]

Splitting ID SEC-2020-1930-0001 into 39 chunks (T5-based)
Splitting ID SEC-2020-2079-0001 into 9 chunks (T5-based)
Splitting ID SEC-2021-0141-0001 into 36 chunks (T5-based)
Splitting ID SEC-2021-0157-0001 into 25 chunks (T5-based)


 81%|████████  | 161/199 [00:04<00:01, 31.21it/s]

Splitting ID SEC-2021-0159-0001 into 38 chunks (T5-based)
Splitting ID SEC-2021-0281-0001 into 25 chunks (T5-based)
Splitting ID SEC-2021-0687-0001 into 17 chunks (T5-based)
Splitting ID SEC-2021-0830-0001 into 9 chunks (T5-based)
Splitting ID SEC-2021-1031-0001 into 11 chunks (T5-based)
Splitting ID SEC-2021-1101-0001 into 7 chunks (T5-based)
Splitting ID SEC-2021-1148-0001 into 5 chunks (T5-based)
Splitting ID SEC-2021-1385-0001 into 14 chunks (T5-based)


 86%|████████▋ | 172/199 [00:05<00:00, 38.65it/s]

Splitting ID SEC-2021-1588-0001 into 15 chunks (T5-based)
Splitting ID SEC-2021-1767-0001 into 14 chunks (T5-based)
Splitting ID SEC-2022-0013-0001 into 11 chunks (T5-based)
Splitting ID SEC-2022-0211-0001 into 2 chunks (T5-based)
Splitting ID SEC-2022-0491-0001 into 11 chunks (T5-based)
Splitting ID SEC-2022-0531-0001 into 17 chunks (T5-based)
Splitting ID SEC-2022-0564-0001 into 11 chunks (T5-based)
Splitting ID SEC-2022-0954-0001 into 10 chunks (T5-based)


 91%|█████████▏| 182/199 [00:05<00:00, 38.17it/s]

Splitting ID SEC-2022-1241-0001 into 26 chunks (T5-based)
Splitting ID SEC-2022-1344-0001 into 7 chunks (T5-based)
Splitting ID SEC-2022-1630-0001 into 6 chunks (T5-based)
Splitting ID SEC-2023-0079-0001 into 14 chunks (T5-based)
Splitting ID SEC-2023-0226-0001 into 28 chunks (T5-based)
Splitting ID SEC-2023-0437-0001 into 8 chunks (T5-based)
Splitting ID SEC-2023-0559-0001 into 16 chunks (T5-based)
Splitting ID SEC-2023-0605-0001 into 15 chunks (T5-based)


 94%|█████████▍| 187/199 [00:05<00:00, 35.77it/s]

Splitting ID SEC-2023-0741-0001 into 9 chunks (T5-based)
Splitting ID SEC-2023-1048-0001 into 40 chunks (T5-based)
Splitting ID SEC-2023-1103-0001 into 9 chunks (T5-based)
Splitting ID SEC-2024-0089-0001 into 10 chunks (T5-based)
Splitting ID SEC-2024-0496-0001 into 30 chunks (T5-based)


 96%|█████████▌| 191/199 [00:05<00:00, 35.04it/s]

Splitting ID SEC-2024-0510-0001 into 6 chunks (T5-based)
Splitting ID SEC-2024-0531-0001 into 10 chunks (T5-based)
Splitting ID SEC-2024-1076-0001 into 7 chunks (T5-based)
Splitting ID SEC-2024-1184-0001 into 23 chunks (T5-based)
Splitting ID SEC-2024-1493-0001 into 7 chunks (T5-based)


100%|██████████| 199/199 [00:05<00:00, 34.26it/s]


Splitting ID SEC-2024-1627-0001 into 63 chunks (T5-based)
Splitting ID SEC-2024-1650-0001 into 5 chunks (T5-based)
Splitting ID SEC-2024-1821-0001 into 6 chunks (T5-based)


  0%|          | 0/199 [00:00<?, ?it/s]

Processing document with ID: IRS-2008-0041-0003 - 2 chunks 


  1%|          | 1/199 [00:04<13:13,  4.01s/it]

Processing document with ID: IRS-2008-0053-0009 - 2 chunks 


  1%|          | 2/199 [00:08<14:43,  4.48s/it]

Processing document with ID: IRS-2008-0092-0007 - 4 chunks 


  2%|▏         | 3/199 [00:12<14:04,  4.31s/it]

Processing document with ID: IRS-2011-0050-0006 - 1 chunks 


  2%|▏         | 4/199 [00:14<11:05,  3.41s/it]

Processing document with ID: IRS-2014-0001-0014 - 1 chunks 


  3%|▎         | 5/199 [00:17<09:45,  3.02s/it]

Processing document with ID: IRS-2014-0030-0004 - 47 chunks 


  3%|▎         | 6/199 [00:44<36:35, 11.37s/it]

Processing document with ID: IRS-2016-0007-0009 - 1 chunks 


  4%|▎         | 7/199 [00:46<26:24,  8.26s/it]

Processing document with ID: IRS-2016-0007-0010 - 1 chunks 


  4%|▍         | 8/199 [00:49<20:22,  6.40s/it]

Processing document with ID: IRS-2016-0038-0007 - 30 chunks 


  5%|▍         | 9/199 [01:07<31:54, 10.08s/it]

Processing document with ID: IRS-2016-0044-0011 - 58 chunks 


  5%|▌         | 10/199 [01:41<55:34, 17.64s/it]

Processing document with ID: IRS-2016-0053-0005 - 1 chunks 


  6%|▌         | 11/199 [01:44<40:53, 13.05s/it]

Processing document with ID: IRS-2016-0053-0006 - 1 chunks 


  6%|▌         | 12/199 [01:46<30:15,  9.71s/it]

Processing document with ID: IRS-2016-0054-0015 - 25 chunks 


  7%|▋         | 13/199 [02:01<34:41, 11.19s/it]

Processing document with ID: IRS-2017-0002-0014 - 1 chunks 


  7%|▋         | 14/199 [02:04<27:08,  8.80s/it]

Processing document with ID: IRS-2017-0005-0006 - 1 chunks 


  8%|▊         | 15/199 [02:06<20:42,  6.75s/it]

Processing document with ID: IRS-2017-0005-0007 - 2 chunks 


  8%|▊         | 16/199 [02:10<18:03,  5.92s/it]

Processing document with ID: IRS-2017-0010-0003 - 23 chunks 


  9%|▊         | 17/199 [02:23<24:52,  8.20s/it]

Processing document with ID: IRS-2017-0017-0003 - 5 chunks 


  9%|▉         | 18/199 [02:28<21:10,  7.02s/it]

Processing document with ID: IRS-2018-0008-0019 - 8 chunks 


 10%|▉         | 19/199 [02:38<24:15,  8.08s/it]

Processing document with ID: IRS-2018-0011-0037 - 3 chunks 


 10%|█         | 20/199 [02:44<21:52,  7.33s/it]

Processing document with ID: IRS-2018-0027-0010 - 1 chunks 


 11%|█         | 21/199 [02:46<16:57,  5.72s/it]

Processing document with ID: IRS-2018-0027-0011 - 1 chunks 


 11%|█         | 22/199 [02:48<13:43,  4.65s/it]

Processing document with ID: IRS-2018-0037-0020 - 1 chunks 


 12%|█▏        | 23/199 [02:50<11:20,  3.87s/it]

Processing document with ID: IRS-2018-0040-0051 - 2 chunks 


 12%|█▏        | 24/199 [02:54<10:56,  3.75s/it]

Processing document with ID: IRS-2019-0005-0014 - 1 chunks 


 13%|█▎        | 25/199 [02:55<08:56,  3.08s/it]

Processing document with ID: IRS-2019-0006-0007 - 29 chunks 


 13%|█▎        | 26/199 [03:12<21:00,  7.28s/it]

Processing document with ID: IRS-2019-0009-0020 - 45 chunks 


 14%|█▎        | 27/199 [03:39<37:50, 13.20s/it]

Processing document with ID: IRS-2019-0012-0053 - 4 chunks 


 14%|█▍        | 28/199 [03:43<29:50, 10.47s/it]

Processing document with ID: IRS-2019-0012-0055 - 3 chunks 


 15%|█▍        | 29/199 [03:48<24:53,  8.78s/it]

Processing document with ID: IRS-2019-0012-0056 - 2 chunks 


 15%|█▌        | 30/199 [03:53<21:35,  7.67s/it]

Processing document with ID: IRS-2019-0017-0003 - 12 chunks 


 16%|█▌        | 31/199 [04:00<20:38,  7.37s/it]

Processing document with ID: IRS-2019-0022-0144 - 26 chunks 


 16%|█▌        | 32/199 [04:15<27:21,  9.83s/it]

Processing document with ID: IRS-2019-0022-0146 - 4 chunks 


 17%|█▋        | 33/199 [04:19<22:15,  8.05s/it]

Processing document with ID: IRS-2019-0022-0147 - 2 chunks 


 17%|█▋        | 34/199 [04:23<18:15,  6.64s/it]

Processing document with ID: IRS-2019-0023-0028 - 2 chunks 


 18%|█▊        | 35/199 [04:26<15:26,  5.65s/it]

Processing document with ID: IRS-2019-0025-0003 - 19 chunks 


 18%|█▊        | 36/199 [04:37<19:57,  7.35s/it]

Processing document with ID: IRS-2019-0027-0024 - 2 chunks 


 19%|█▊        | 37/199 [04:42<17:46,  6.59s/it]

Processing document with ID: IRS-2019-0027-0025 - 2 chunks 


 19%|█▉        | 38/199 [04:46<15:44,  5.87s/it]

Processing document with ID: IRS-2019-0028-0012 - 10 chunks 


 20%|█▉        | 39/199 [04:52<15:52,  5.95s/it]

Processing document with ID: IRS-2019-0029-0035 - 1 chunks 


 20%|██        | 40/199 [04:55<13:11,  4.98s/it]

Processing document with ID: IRS-2019-0034-0029 - 1 chunks 


 21%|██        | 41/199 [04:58<11:39,  4.43s/it]

Processing document with ID: IRS-2019-0034-0030 - 4 chunks 


 21%|██        | 42/199 [05:03<12:08,  4.64s/it]

Processing document with ID: IRS-2019-0034-0031 - 1 chunks 


 22%|██▏       | 43/199 [05:07<10:59,  4.23s/it]

Processing document with ID: IRS-2019-0036-0002 - 26 chunks 


 22%|██▏       | 44/199 [05:22<19:27,  7.53s/it]

Processing document with ID: IRS-2019-0039-8391 - 56 chunks 


 23%|██▎       | 45/199 [05:55<39:14, 15.29s/it]

Processing document with ID: IRS-2019-0043-0007 - 44 chunks 


 23%|██▎       | 46/199 [06:21<47:13, 18.52s/it]

Processing document with ID: IRS-2019-0044-0012 - 19 chunks 


 24%|██▎       | 47/199 [06:33<41:32, 16.39s/it]

Processing document with ID: IRS-2019-0054-0011 - 2 chunks 


 24%|██▍       | 48/199 [06:36<31:38, 12.57s/it]

Processing document with ID: IRS-2019-0054-0012 - 1 chunks 


 25%|██▍       | 49/199 [06:38<23:17,  9.32s/it]

Processing document with ID: IRS-2019-0056-0047 - 38 chunks 


 25%|██▌       | 50/199 [07:01<32:58, 13.28s/it]

Processing document with ID: IRS-2019-0058-0003 - 10 chunks 


 26%|██▌       | 51/199 [07:07<27:33, 11.17s/it]

Processing document with ID: IRS-2020-0002-0019 - 2 chunks 


 26%|██▌       | 52/199 [07:11<21:52,  8.93s/it]

Processing document with ID: IRS-2020-0003-0009 - 47 chunks 


 27%|██▋       | 53/199 [07:39<35:39, 14.65s/it]

Processing document with ID: IRS-2020-0005-0023 - 13 chunks 


 27%|██▋       | 54/199 [07:46<30:14, 12.52s/it]

Processing document with ID: IRS-2020-0006-0022 - 4 chunks 


 28%|██▊       | 55/199 [07:50<23:55,  9.97s/it]

Processing document with ID: IRS-2020-0006-0023 - 1 chunks 


 28%|██▊       | 56/199 [07:52<18:10,  7.63s/it]

Processing document with ID: IRS-2020-0007-0014 - 37 chunks 


 29%|██▊       | 57/199 [08:14<27:58, 11.82s/it]

Processing document with ID: IRS-2020-0010-0005 - 12 chunks 


 29%|██▉       | 58/199 [08:21<24:24, 10.38s/it]

Processing document with ID: IRS-2020-0011-0003 - 16 chunks 


 30%|██▉       | 59/199 [08:31<23:38, 10.13s/it]

Processing document with ID: IRS-2020-0012-0006 - 14 chunks 


 30%|███       | 60/199 [08:39<22:22,  9.66s/it]

Processing document with ID: IRS-2020-0013-0093 - 2 chunks 


 31%|███       | 61/199 [08:42<17:13,  7.49s/it]

Processing document with ID: IRS-2020-0014-0005 - 13 chunks 


 31%|███       | 62/199 [08:49<17:03,  7.47s/it]

Processing document with ID: IRS-2020-0017-0018 - 1 chunks 


 32%|███▏      | 63/199 [08:52<13:51,  6.12s/it]

Processing document with ID: IRS-2020-0018-0024 - 2 chunks 


 32%|███▏      | 64/199 [08:55<11:35,  5.15s/it]

Processing document with ID: IRS-2020-0019-0018 - 1 chunks 


 33%|███▎      | 65/199 [08:56<08:58,  4.02s/it]

Processing document with ID: IRS-2020-0019-0019 - 1 chunks 


 33%|███▎      | 66/199 [08:58<07:32,  3.40s/it]

Processing document with ID: IRS-2020-0020-0002 - 38 chunks 


 34%|███▎      | 67/199 [09:21<20:17,  9.22s/it]

Processing document with ID: IRS-2020-0020-0010 - 2 chunks 


 34%|███▍      | 68/199 [09:25<16:43,  7.66s/it]

Processing document with ID: IRS-2020-0020-0012 - 33 chunks 


 35%|███▍      | 69/199 [09:45<24:28, 11.30s/it]

Processing document with ID: IRS-2020-0024-0021 - 1 chunks 


 35%|███▌      | 70/199 [09:46<17:45,  8.26s/it]

Processing document with ID: IRS-2020-0024-0022 - 4 chunks 


 36%|███▌      | 71/199 [09:49<14:31,  6.81s/it]

Processing document with ID: IRS-2020-0025-0001 - 26 chunks 


 36%|███▌      | 72/199 [10:05<20:04,  9.48s/it]

Processing document with ID: IRS-2020-0028-0015 - 3 chunks 


 37%|███▋      | 73/199 [10:12<18:29,  8.80s/it]

Processing document with ID: IRS-2020-0030-0021 - 10 chunks 


 37%|███▋      | 74/199 [10:19<16:55,  8.12s/it]

Processing document with ID: IRS-2020-0031-0005 - 32 chunks 


 38%|███▊      | 75/199 [10:38<23:20, 11.29s/it]

Processing document with ID: IRS-2020-0040-0038 - 4 chunks 


 38%|███▊      | 76/199 [10:44<20:16,  9.89s/it]

Processing document with ID: IRS-2020-0040-0039 - 13 chunks 


 39%|███▊      | 77/199 [10:54<20:04,  9.88s/it]

Processing document with ID: IRS-2020-0041-0008 - 1 chunks 


 39%|███▉      | 78/199 [10:56<15:03,  7.47s/it]

Processing document with ID: IRS-2020-0041-0009 - 1 chunks 


 40%|███▉      | 79/199 [10:58<11:39,  5.83s/it]

Processing document with ID: IRS-2020-0043-0001 - 19 chunks 


 40%|████      | 80/199 [11:09<14:42,  7.42s/it]

Processing document with ID: IRS-2021-0001-0009 - 16 chunks 


 41%|████      | 81/199 [11:18<15:49,  8.05s/it]

Processing document with ID: IRS-2021-0001-0010 - 1 chunks 


 41%|████      | 82/199 [11:20<12:07,  6.22s/it]

Processing document with ID: IRS-2021-0002-0011 - 27 chunks 


 42%|████▏     | 83/199 [11:36<17:36,  9.11s/it]

Processing document with ID: IRS-2021-0003-0014 - 11 chunks 


 42%|████▏     | 84/199 [11:44<16:43,  8.72s/it]

Processing document with ID: IRS-2021-0012-0002 - 31 chunks 


 43%|████▎     | 85/199 [12:03<22:05, 11.63s/it]

Processing document with ID: IRS-2021-0012-0003 - 1 chunks 


 43%|████▎     | 86/199 [12:05<16:44,  8.89s/it]

Processing document with ID: IRS-2021-0012-0004 - 41 chunks 


 44%|████▎     | 87/199 [12:29<25:00, 13.40s/it]

Processing document with ID: IRS-2021-0014-0006 - 6 chunks 


 44%|████▍     | 88/199 [12:35<20:38, 11.16s/it]

Processing document with ID: IRS-2021-0017-0012 - 36 chunks 


 45%|████▍     | 89/199 [12:56<25:55, 14.15s/it]

Processing document with ID: IRS-2021-0017-0013 - 1 chunks 


 45%|████▌     | 90/199 [12:58<19:13, 10.59s/it]

Processing document with ID: IRS-2022-0002-0001 - 45 chunks 


 46%|████▌     | 91/199 [13:25<27:32, 15.30s/it]

Processing document with ID: IRS-2022-0002-0002 - 1 chunks 


 46%|████▌     | 92/199 [13:27<20:12, 11.33s/it]

Processing document with ID: IRS-2022-0002-0003 - 1 chunks 


 47%|████▋     | 93/199 [13:29<15:13,  8.62s/it]

Processing document with ID: IRS-2022-0002-0004 - 1 chunks 


 47%|████▋     | 94/199 [13:31<11:24,  6.52s/it]

Processing document with ID: IRS-2022-0004-0012 - 22 chunks 


 48%|████▊     | 95/199 [13:43<14:32,  8.39s/it]

Processing document with ID: IRS-2022-0006-3890 - 1 chunks 


 48%|████▊     | 96/199 [13:45<11:12,  6.53s/it]

Processing document with ID: IRS-2022-0009-0008 - 47 chunks 


 49%|████▊     | 97/199 [14:13<21:47, 12.82s/it]

Processing document with ID: IRS-2022-0016-0001 - 30 chunks 


 49%|████▉     | 98/199 [14:30<23:54, 14.20s/it]

Processing document with ID: IRS-2022-0016-0002 - 1 chunks 


 50%|████▉     | 99/199 [14:32<17:26, 10.46s/it]

Processing document with ID: IRS-2022-0019-0008 - 17 chunks 


 50%|█████     | 100/199 [14:42<16:58, 10.28s/it]

Processing document with ID: IRS-2022-0019-0009 - 1 chunks 


 51%|█████     | 101/199 [14:44<12:55,  7.91s/it]

Processing document with ID: IRS-2022-0026-0003 - 1 chunks 


 51%|█████▏    | 102/199 [14:46<09:56,  6.15s/it]

Processing document with ID: IRS-2022-0026-0004 - 2 chunks 


 52%|█████▏    | 103/199 [14:50<08:40,  5.42s/it]

Processing document with ID: IRS-2022-0026-0005 - 1 chunks 


 52%|█████▏    | 104/199 [14:53<07:10,  4.53s/it]

Processing document with ID: IRS-2022-0035-0002 - 8 chunks 


 53%|█████▎    | 105/199 [14:57<07:10,  4.58s/it]

Processing document with ID: IRS-2022-0039-0013 - 1 chunks 


 53%|█████▎    | 106/199 [14:59<05:54,  3.82s/it]

Processing document with ID: IRS-2023-0012-0059 - 3 chunks 


 54%|█████▍    | 107/199 [15:03<05:40,  3.70s/it]

Processing document with ID: IRS-2023-0016-0017 - 32 chunks 


 54%|█████▍    | 108/199 [15:21<12:16,  8.09s/it]

Processing document with ID: IRS-2023-0021-0007 - 60 chunks 


 55%|█████▍    | 109/199 [15:56<24:10, 16.12s/it]

Processing document with ID: IRS-2023-0025-0081 - 1 chunks 


 55%|█████▌    | 110/199 [15:59<18:06, 12.21s/it]

Processing document with ID: IRS-2023-0027-0020 - 2 chunks 


 56%|█████▌    | 111/199 [16:03<14:05,  9.60s/it]

Processing document with ID: IRS-2023-0028-0085 - 4 chunks 


 56%|█████▋    | 112/199 [16:07<11:35,  7.99s/it]

Processing document with ID: IRS-2023-0029-0159 - 4 chunks 


 57%|█████▋    | 113/199 [16:12<10:23,  7.25s/it]

Processing document with ID: IRS-2023-0031-0005 - 24 chunks 


 57%|█████▋    | 114/199 [16:26<13:09,  9.28s/it]

Processing document with ID: IRS-2023-0041-44836 - 3 chunks 


 58%|█████▊    | 115/199 [16:29<10:07,  7.23s/it]

Processing document with ID: IRS-2023-0042-0349 - 8 chunks 


 58%|█████▊    | 116/199 [16:36<09:55,  7.18s/it]

Processing document with ID: IRS-2023-0044-0003 - 7 chunks 


 59%|█████▉    | 117/199 [16:40<08:33,  6.26s/it]

Processing document with ID: IRS-2023-0045-0004 - 22 chunks 


 59%|█████▉    | 118/199 [16:53<11:05,  8.22s/it]

Processing document with ID: IRS-2023-0046-0002 - 17 chunks 


 60%|█████▉    | 119/199 [17:03<11:37,  8.72s/it]

Processing document with ID: IRS-2023-0047-0004 - 64 chunks 


 60%|██████    | 120/199 [17:40<22:38, 17.19s/it]

Processing document with ID: IRS-2023-0049-0003 - 16 chunks 


 61%|██████    | 121/199 [17:49<19:18, 14.85s/it]

Processing document with ID: IRS-2023-0050-0006 - 17 chunks 


 61%|██████▏   | 122/199 [17:59<17:02, 13.27s/it]

Processing document with ID: IRS-2023-0056-0011 - 3 chunks 


 62%|██████▏   | 123/199 [18:05<14:13, 11.23s/it]

Processing document with ID: IRS-2023-0063-0198 - 6 chunks 


 62%|██████▏   | 124/199 [18:10<11:35,  9.27s/it]

Processing document with ID: IRS-2024-0010-0017 - 60 chunks 


 63%|██████▎   | 125/199 [18:45<20:53, 16.94s/it]

Processing document with ID: IRS-2024-0010-0018 - 1 chunks 


 63%|██████▎   | 126/199 [18:48<15:45, 12.95s/it]

Processing document with ID: IRS-2024-0013-0020 - 36 chunks 


 64%|██████▍   | 127/199 [19:09<18:16, 15.23s/it]

Processing document with ID: IRS-2024-0018-0008 - 34 chunks 


 64%|██████▍   | 128/199 [19:29<19:40, 16.62s/it]

Processing document with ID: IRS-2024-0048-0012 - 26 chunks 


 65%|██████▍   | 129/199 [19:44<18:48, 16.12s/it]

Processing document with ID: IRS_FRDOC_0001-1901 - 3 chunks 


 65%|██████▌   | 130/199 [19:48<14:37, 12.71s/it]

Processing document with ID: IRS_FRDOC_0001-1903 - 1 chunks 


 66%|██████▌   | 131/199 [19:50<10:45,  9.49s/it]

Processing document with ID: IRS_FRDOC_0001-2112 - 45 chunks 


 66%|██████▋   | 132/199 [20:16<16:05, 14.41s/it]

Processing document with ID: IRS_FRDOC_0001-2208 - 66 chunks 


 67%|██████▋   | 133/199 [20:55<23:43, 21.57s/it]

Processing document with ID: IRS_FRDOC_0001-2211 - 4 chunks 


 67%|██████▋   | 134/199 [21:00<18:00, 16.63s/it]

Processing document with ID: IRS_FRDOC_0001-2267 - 23 chunks 


 68%|██████▊   | 135/199 [21:13<16:39, 15.61s/it]

Processing document with ID: SEC-2020-0216-0001 - 6 chunks 


 68%|██████▊   | 136/199 [21:17<12:47, 12.18s/it]

Processing document with ID: SEC-2020-0258-0001 - 12 chunks 


 69%|██████▉   | 137/199 [21:24<11:01, 10.67s/it]

Processing document with ID: SEC-2020-0294-0001 - 5 chunks 


 69%|██████▉   | 138/199 [21:29<08:59,  8.85s/it]

Processing document with ID: SEC-2020-0329-0001 - 41 chunks 


 70%|██████▉   | 139/199 [21:53<13:22, 13.37s/it]

Processing document with ID: SEC-2020-0469-0001 - 23 chunks 


 70%|███████   | 140/199 [22:06<13:12, 13.42s/it]

Processing document with ID: SEC-2020-0551-0001 - 1 chunks 


 71%|███████   | 141/199 [22:08<09:34,  9.91s/it]

Processing document with ID: SEC-2020-0865-0001 - 2 chunks 


 71%|███████▏  | 142/199 [22:10<07:15,  7.64s/it]

Processing document with ID: SEC-2020-1024-0001 - 1 chunks 


 72%|███████▏  | 143/199 [22:12<05:34,  5.97s/it]

Processing document with ID: SEC-2020-1189-0001 - 3 chunks 


 72%|███████▏  | 144/199 [22:15<04:37,  5.05s/it]

Processing document with ID: SEC-2020-1381-0001 - 1 chunks 


 73%|███████▎  | 145/199 [22:18<03:50,  4.26s/it]

Processing document with ID: SEC-2020-1400-0001 - 37 chunks 


 73%|███████▎  | 146/199 [22:39<08:20,  9.45s/it]

Processing document with ID: SEC-2020-1411-0001 - 11 chunks 


 74%|███████▍  | 147/199 [22:46<07:22,  8.52s/it]

Processing document with ID: SEC-2020-1527-0001 - 3 chunks 


 74%|███████▍  | 148/199 [22:48<05:46,  6.80s/it]

Processing document with ID: SEC-2020-1598-0001 - 2 chunks 


 75%|███████▍  | 149/199 [22:50<04:28,  5.38s/it]

Processing document with ID: SEC-2020-1748-0001 - 9 chunks 


 75%|███████▌  | 150/199 [22:55<04:17,  5.25s/it]

Processing document with ID: SEC-2020-1930-0001 - 39 chunks 


 76%|███████▌  | 151/199 [23:18<08:19, 10.41s/it]

Processing document with ID: SEC-2020-2079-0001 - 9 chunks 


 76%|███████▋  | 152/199 [23:23<07:00,  8.94s/it]

Processing document with ID: SEC-2021-0141-0001 - 36 chunks 


 77%|███████▋  | 153/199 [23:44<09:30, 12.41s/it]

Processing document with ID: SEC-2021-0157-0001 - 25 chunks 


 77%|███████▋  | 154/199 [23:59<09:52, 13.16s/it]

Processing document with ID: SEC-2021-0159-0001 - 38 chunks 


 78%|███████▊  | 155/199 [24:20<11:31, 15.72s/it]

Processing document with ID: SEC-2021-0188-0001 - 1 chunks 


 78%|███████▊  | 156/199 [24:23<08:21, 11.66s/it]

Processing document with ID: SEC-2021-0281-0001 - 25 chunks 


 79%|███████▉  | 157/199 [24:37<08:43, 12.47s/it]

Processing document with ID: SEC-2021-0332-0001 - 1 chunks 


 79%|███████▉  | 158/199 [24:39<06:24,  9.37s/it]

Processing document with ID: SEC-2021-0687-0001 - 17 chunks 


 80%|███████▉  | 159/199 [24:49<06:20,  9.52s/it]

Processing document with ID: SEC-2021-0785-0001 - 1 chunks 


 80%|████████  | 160/199 [24:51<04:44,  7.29s/it]

Processing document with ID: SEC-2021-0830-0001 - 9 chunks 


 81%|████████  | 161/199 [24:56<04:12,  6.65s/it]

Processing document with ID: SEC-2021-1031-0001 - 11 chunks 


 81%|████████▏ | 162/199 [25:03<04:04,  6.60s/it]

Processing document with ID: SEC-2021-1101-0001 - 7 chunks 


 82%|████████▏ | 163/199 [25:07<03:28,  5.78s/it]

Processing document with ID: SEC-2021-1106-0001 - 1 chunks 


 82%|████████▏ | 164/199 [25:09<02:42,  4.65s/it]

Processing document with ID: SEC-2021-1148-0001 - 5 chunks 


 83%|████████▎ | 165/199 [25:14<02:40,  4.73s/it]

Processing document with ID: SEC-2021-1385-0001 - 14 chunks 


 83%|████████▎ | 166/199 [25:22<03:10,  5.77s/it]

Processing document with ID: SEC-2021-1588-0001 - 15 chunks 


 84%|████████▍ | 167/199 [25:30<03:28,  6.50s/it]

Processing document with ID: SEC-2021-1767-0001 - 14 chunks 


 84%|████████▍ | 168/199 [25:38<03:37,  7.00s/it]

Processing document with ID: SEC-2022-0013-0001 - 11 chunks 


 85%|████████▍ | 169/199 [25:44<03:22,  6.74s/it]

Processing document with ID: SEC-2022-0211-0001 - 2 chunks 


 85%|████████▌ | 170/199 [25:49<02:54,  6.03s/it]

Processing document with ID: SEC-2022-0491-0001 - 11 chunks 


 86%|████████▌ | 171/199 [25:55<02:48,  6.02s/it]

Processing document with ID: SEC-2022-0531-0001 - 17 chunks 


 86%|████████▋ | 172/199 [26:05<03:14,  7.19s/it]

Processing document with ID: SEC-2022-0564-0001 - 11 chunks 


 87%|████████▋ | 173/199 [26:11<03:02,  7.02s/it]

Processing document with ID: SEC-2022-0892-0001 - 1 chunks 


 87%|████████▋ | 174/199 [26:12<02:10,  5.21s/it]

Processing document with ID: SEC-2022-0954-0001 - 10 chunks 


 88%|████████▊ | 175/199 [26:18<02:07,  5.30s/it]

Processing document with ID: SEC-2022-1241-0001 - 26 chunks 


 88%|████████▊ | 176/199 [26:33<03:10,  8.27s/it]

Processing document with ID: SEC-2022-1344-0001 - 7 chunks 


 89%|████████▉ | 177/199 [26:37<02:35,  7.05s/it]

Processing document with ID: SEC-2022-1630-0001 - 6 chunks 


 89%|████████▉ | 178/199 [26:41<02:10,  6.21s/it]

Processing document with ID: SEC-2023-0079-0001 - 14 chunks 


 90%|████████▉ | 179/199 [26:50<02:16,  6.84s/it]

Processing document with ID: SEC-2023-0226-0001 - 28 chunks 


 90%|█████████ | 180/199 [27:06<03:01,  9.57s/it]

Processing document with ID: SEC-2023-0437-0001 - 8 chunks 


 91%|█████████ | 181/199 [27:10<02:26,  8.16s/it]

Processing document with ID: SEC-2023-0559-0001 - 16 chunks 


 91%|█████████▏| 182/199 [27:19<02:22,  8.39s/it]

Processing document with ID: SEC-2023-0605-0001 - 15 chunks 


 92%|█████████▏| 183/199 [27:28<02:15,  8.47s/it]

Processing document with ID: SEC-2023-0741-0001 - 9 chunks 


 92%|█████████▏| 184/199 [27:33<01:53,  7.58s/it]

Processing document with ID: SEC-2023-1048-0001 - 40 chunks 


 93%|█████████▎| 185/199 [27:57<02:50, 12.21s/it]

Processing document with ID: SEC-2023-1103-0001 - 9 chunks 


 93%|█████████▎| 186/199 [28:02<02:10, 10.05s/it]

Processing document with ID: SEC-2023-1226-0001 - 1 chunks 


 94%|█████████▍| 187/199 [28:05<01:35,  7.95s/it]

Processing document with ID: SEC-2024-0089-0001 - 10 chunks 


 94%|█████████▍| 188/199 [28:10<01:20,  7.33s/it]

Processing document with ID: SEC-2024-0496-0001 - 30 chunks 


 95%|█████████▍| 189/199 [28:28<01:43, 10.30s/it]

Processing document with ID: SEC-2024-0510-0001 - 6 chunks 


 95%|█████████▌| 190/199 [28:36<01:27,  9.68s/it]

Processing document with ID: SEC-2024-0531-0001 - 10 chunks 


 96%|█████████▌| 191/199 [28:41<01:07,  8.40s/it]

Processing document with ID: SEC-2024-0800-0001 - 1 chunks 


 96%|█████████▋| 192/199 [28:43<00:44,  6.41s/it]

Processing document with ID: SEC-2024-0849-0001 - 1 chunks 


 97%|█████████▋| 193/199 [28:45<00:30,  5.11s/it]

Processing document with ID: SEC-2024-1076-0001 - 7 chunks 


 97%|█████████▋| 194/199 [28:50<00:24,  4.88s/it]

Processing document with ID: SEC-2024-1184-0001 - 23 chunks 


 98%|█████████▊| 195/199 [29:03<00:29,  7.35s/it]

Processing document with ID: SEC-2024-1493-0001 - 7 chunks 


 98%|█████████▊| 196/199 [29:07<00:19,  6.54s/it]

Processing document with ID: SEC-2024-1627-0001 - 63 chunks 


 99%|█████████▉| 197/199 [29:44<00:30, 15.47s/it]

Processing document with ID: SEC-2024-1650-0001 - 5 chunks 


 99%|█████████▉| 198/199 [29:53<00:13, 13.54s/it]

Processing document with ID: SEC-2024-1821-0001 - 6 chunks 


100%|██████████| 199/199 [29:56<00:00,  9.03s/it]


In [None]:
# Sanity check: this list is attached as a column of df_less_than_20000 in a
# later cell, so its length has to equal the number of rows in that frame.
len(generated_summaries)

199

In [None]:
# Preview the first 10 rows before the generated summaries are attached.
df_less_than_20000.head(10)

Unnamed: 0,ID,original_content,gold_summary,token_length,token_gold_summary
0,IRS-2008-0041-0003,background the final regulations (td 9394) tha...,this document contains corrections to treasury...,781,126
1,IRS-2008-0053-0009,background the final regulations (td 9515) tha...,this document contains a correction to treasur...,700,155
2,IRS-2008-0092-0007,background the final regulations (td 9614) tha...,this document contains a correction to a treas...,1104,124
3,IRS-2011-0050-0006,background the final regulations (td 9630) tha...,this document contains corrections to treasury...,217,97
4,IRS-2014-0001-0014,background the final regulations (td 9806) tha...,this document contains corrections to treasury...,269,132
5,IRS-2014-0030-0004,authority this document amends the income tax ...,this document contains final regulations relat...,14034,91
8,IRS-2016-0007-0009,background the final regulations (td 9981) tha...,this document contains a correction to treasur...,208,120
9,IRS-2016-0007-0010,background the final regulations (td 9981) tha...,this document corrects the correction to treas...,171,122
11,IRS-2016-0038-0007,background this document contains amendments t...,this document contains the final regulations t...,8973,74
13,IRS-2016-0044-0011,background section 401(a)(11) of the internal ...,this document sets forth final regulations pro...,17591,136


In [None]:

# Attach the chunk-based summaries as a new column.
# NOTE(review): assumes generated_summaries is in the same row order as
# df_less_than_20000 — confirm against the generation loop.
df_less_than_20000['generated_summary'] = generated_summaries

In [None]:

# Token count of a single model-generated summary
def token_length_generated_summary(generated_summary):
    """Return how many tokens ``t5_tokenizer.tokenize`` yields for ``generated_summary``."""
    return len(t5_tokenizer.tokenize(generated_summary))
# Store the token count of every generated summary in its own column.
df_less_than_20000['tokens_generated_summary'] = df_less_than_20000['generated_summary'].apply(token_length_generated_summary)


In [None]:
# Preview again, now including generated_summary and tokens_generated_summary.
df_less_than_20000.head(10)

Unnamed: 0,ID,original_content,gold_summary,token_length,token_gold_summary,generated_summary,tokens_generated_summary
0,IRS-2008-0041-0003,background the final regulations (td 9394) tha...,this document contains corrections to treasury...,781,126,final regulations (td 9394) contained errors t...,113
1,IRS-2008-0053-0009,background the final regulations (td 9515) tha...,this document contains a correction to treasur...,700,155,treasury decision 9515 was corrected at 76 cfr...,132
2,IRS-2008-0092-0007,background the final regulations (td 9614) tha...,this document contains a correction to a treas...,1104,124,final regulations (td 9614) contained errors t...,119
3,IRS-2011-0050-0006,background the final regulations (td 9630) tha...,this document contains corrections to treasury...,217,97,final regulations (td 9630) contain an error t...,63
4,IRS-2014-0001-0014,background the final regulations (td 9806) tha...,this document contains corrections to treasury...,269,132,final regulations (td 9806) contained errors t...,57
5,IRS-2014-0030-0004,authority this document amends the income tax ...,this document contains final regulations relat...,14034,91,irs regulations amend section 752 of code rega...,651
8,IRS-2016-0007-0009,background the final regulations (td 9981) tha...,this document contains a correction to treasur...,208,120,final regulations (td 9981) that are subject o...,56
9,IRS-2016-0007-0010,background the final regulations (td 9981) tha...,this document corrects the correction to treas...,171,122,final regulations (td 9981) that are the subje...,74
11,IRS-2016-0038-0007,background this document contains amendments t...,this document contains the final regulations t...,8973,74,irs is required to charge user fees for proces...,417
13,IRS-2016-0044-0011,background section 401(a)(11) of the internal ...,this document sets forth final regulations pro...,17591,136,401(a)(11) of the internal revenue code provid...,804


In [None]:
# ROUGE of the chunk-based summaries against the gold summaries.
import evaluate

rouge_metric = evaluate.load("rouge")

# Hand the metric plain Python lists instead of pandas Series: the frame's index
# has gaps (rows were filtered out earlier), and lists pair predictions with
# references purely by position, avoiding any label-based indexing inside the
# metric code. This also matches how the BERTScore cells pass their inputs.
rouge_results = rouge_metric.compute(
    predictions=df_less_than_20000["generated_summary"].tolist(),
    references=df_less_than_20000["gold_summary"].tolist(),
    use_stemmer=True,
)
print(rouge_results)

{'rouge1': np.float64(0.3181544008513224), 'rouge2': np.float64(0.10093954087935489), 'rougeL': np.float64(0.18038433429135386), 'rougeLsum': np.float64(0.1803582756948826)}


In [None]:
# BERTScore of the chunk-based summaries against the gold summaries
# (relies on `evaluate` having been imported by the ROUGE cell).
bertscore_metric = evaluate.load('bertscore')

# Plain lists, paired by position
predictions = df_less_than_20000["generated_summary"].tolist()
references = df_less_than_20000["gold_summary"].tolist()

bert = bertscore_metric.compute(
    predictions=predictions,
    references=references,
    lang="en",
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Average of the F1 values returned by BERTScore for the chunk-based summaries.
sum(bert['f1']) / len(bert['f1'])

0.8182603072880501

**Hierarchical Method Implementation**

In [None]:
# Hierarchical (second) pass: feed each chunk-based summary back through T5 to
# obtain a shorter summary whose length is capped by the gold summary's length.
herarchical_summaries = []

for _, row in tqdm(df_less_than_20000.iterrows(), total=len(df_less_than_20000)):
    input_text = row['generated_summary']  # first-pass, chunk-based summary
    target_length = int(row['token_gold_summary'])  # per-row cap: token length of this row's gold summary

    # Tokenize with explicit truncation. Passing max_length without
    # truncation=True only truncated by fallback and emitted a warning.
    # Some first-pass summaries are longer than 512 tokens (see the
    # tokens_generated_summary column), so truncation does take effect.
    input_ids = t5_tokenizer(
        "summarize: " + input_text,
        return_tensors="pt",
        truncation=True,
        max_length=512,  # input limit used for T5 throughout this notebook
    ).input_ids.to(device)  # use the notebook-level device instead of hard-coding "cuda"
    # NOTE(review): assumes t5_model was moved to `device` when it was loaded — confirm.

    # Generate with a dynamic, per-row maximum output length
    output_ids = t5_model.generate(
        input_ids,
        max_length=target_length,
    )

    # Decode the generated ids back to text and collect the result in row order
    summary = t5_tokenizer.decode(output_ids[0], skip_special_tokens=True)
    herarchical_summaries.append(summary)




  0%|          | 0/199 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 199/199 [07:28<00:00,  2.25s/it]


In [None]:
# Sanity check: the loop appends one summary per row, so this should equal
# the number of rows in df_less_than_20000.
len(herarchical_summaries)

199

In [None]:
# Store the second-pass (hierarchical) summaries in their own column; the list
# was built in row order by the loop over df_less_than_20000.
df_less_than_20000['her_generated_summary'] = herarchical_summaries

In [None]:
# ROUGE of the hierarchical (second-pass) summaries against the gold summaries.
# NOTE: this rebinds `rouge_results`, replacing the chunk-based scores; the
# results-table cell reads whichever value was computed last.
import evaluate

rouge_metric = evaluate.load("rouge")

# Hand the metric plain Python lists instead of pandas Series: the frame's index
# has gaps (rows were filtered out earlier), and lists pair predictions with
# references purely by position, avoiding any label-based indexing inside the
# metric code. This also matches how the BERTScore cells pass their inputs.
rouge_results = rouge_metric.compute(
    predictions=df_less_than_20000["her_generated_summary"].tolist(),
    references=df_less_than_20000["gold_summary"].tolist(),
    use_stemmer=True,
)
print(rouge_results)


{'rouge1': np.float64(0.2520255670268635), 'rouge2': np.float64(0.0764821573448203), 'rougeL': np.float64(0.1620459789418949), 'rougeLsum': np.float64(0.1615695074446009)}


In [None]:
# BERTScore of the hierarchical summaries against the gold summaries
# (rebinds `predictions`, `references` and `bert` from the earlier BERTScore cell).
bertscore_metric = evaluate.load('bertscore')

# Plain lists, paired by position
predictions = df_less_than_20000["her_generated_summary"].tolist()
references = df_less_than_20000["gold_summary"].tolist()

bert = bertscore_metric.compute(
    predictions=predictions,
    references=references,
    lang="en",
)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Average of the BERTScore F1 values for the hierarchical summaries; kept in a
# variable because the results-table cell reads it.
bertt = sum(bert['f1']) / len(bert['f1'])

In [None]:
# Display the mean BERTScore F1 computed in the previous cell.
bertt

0.8191639661189899

In [None]:
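# NOTE(review): unlabeled value, presumably a mean BERTScore F1 from an earlier run — confirm or remove: 0.8049128425121307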

In [None]:

# One-row results table for the hierarchical experiment.
# `rouge_results` and `bertt` hold whatever the most recently executed metric
# cells produced, so run the hierarchical ROUGE and BERTScore cells immediately
# before this one to keep the table in sync with them.
rouge_dict = {
    "Model": "T5 (Pre-Trained)",
    "Experiment": "Herarchical",
    "ROUGE-1": rouge_results["rouge1"],
    "ROUGE-2": rouge_results["rouge2"],
    "ROUGE-L": rouge_results["rougeL"],
    # The value is the mean BERTScore F1; the previous label "Bart " named an
    # unrelated model and carried a trailing space.
    "BERTScore-F1": bertt,
}

# Build a DataFrame with a single row from the metrics dictionary
rouge_df = pd.DataFrame([rouge_dict])

rouge_df


Unnamed: 0,Model,Experiment,ROUGE-1,ROUGE-2,ROUGE-L,Bart
0,T5 (Pre-Trained),Herarchical,0.252997,0.076905,0.162117,0.819309
