In [18]:
import re

class TextPreprocessor:
    def __init__(self):
        pass

    def change_char(self, text: str) -> str:
        """
        Cleans up text by removing or replacing certain characters and patterns.
        
        Parameters:
        text (str): The text to clean up.

        Returns:
        str: The cleaned text.
        """
        # Custom cleaning logic
        text = re.sub(r"section (\d+)\.", r"section \1", text)
        text = re.sub(r"sec\.", r"sec", text)
        text = re.sub(r"p\.d\.", r"pd", text)
        text = re.sub(r"\bno\.\b", r"number", text)
        text = re.sub(r"\brtc\b", "regional trial court", text)
        text = re.sub(r"[(),'\"’”\[\]]", " ", text)
        text = re.sub(r"[“”]", " ", text)
        text = re.sub(r"\u2033", " ", text)  # Replace double prime symbol
        text = re.sub(r"\u2032", " ", text)  # Replace prime symbol
        text = re.sub(r"\bg\b", " ", text)
        text = re.sub(r"\br\b", " ", text)
        text = re.sub(r"([^\S\n]+)", " ", text)  # Replace multiple spaces except newlines
        return text.strip()


In [28]:
class TextTokenizer:
    def __init__(self):
        pass

    def tokenize_by_paragraph(self, text: str) -> dict:
        """
        Tokenizes the text into paragraphs and creates a dictionary where the key
        is the first 5 sentences of the paragraph and the value is the full paragraph.
        
        Parameters:
        text (str): The text to tokenize into paragraphs.
        
        Returns:
        dict: A dictionary with the first 5 sentences as keys and the full paragraphs as values.
        """
        # Split the text into paragraphs based on empty lines
        paragraphs = text.split("\n")
        
        # Dictionary to store the first 5 sentences as key and the paragraph as value
        paragraph_dict = {}
        
        for paragraph in paragraphs:
            # Remove extra spaces
            paragraph = paragraph.strip()
            if paragraph:
                # Split the paragraph into sentences using a regex for sentence end markers
                sentences = re.split(r'(?<=[.!?]) +', paragraph)
                
                # Key is the first 5 sentences, or all sentences if fewer than 5
                key = " ".join(sentences[:5])
                
                # Value is the entire paragraph
                paragraph_dict[key] = paragraph
        
        return paragraph_dict

In [20]:
with open('input.txt', 'r', encoding='utf-8') as file:
    raw_text = file.read()

In [21]:
# Create an instance of the TextPreprocessor
preprocessor = TextPreprocessor()

# Clean the text
cleaned_text = preprocessor.change_char(raw_text)


In [29]:
# Create an instance of the tokenizer
tokenizer = TextTokenizer()

# Tokenize the text into paragraphs
tokenized_paragraphs = tokenizer.tokenize_by_paragraph(cleaned_text)

In [30]:
# Output the result
for key, value in tokenized_paragraphs.items():
    print(f"Key: {key}")
    print(f"Value: {value}\n")

Key: G.R. No. 191970. April 24 2012
Value: G.R. No. 191970. April 24 2012

Key: 686 Phil. 563
Value: 686 Phil. 563

Key: EN BANC
Value: EN BANC

Key: ROMMEL APOLINARIO JALOSJOS PETITIONER VS. THE COMMISSION ON ELECTIONS AND DAN ERASMO SR. RESPONDENTS.
Value: ROMMEL APOLINARIO JALOSJOS PETITIONER VS. THE COMMISSION ON ELECTIONS AND DAN ERASMO SR. RESPONDENTS.

Key: DECISION
Value: DECISION

Key: ABAD J.:
Value: ABAD J.:

Key: This case is about the proof required to establish the domicile of a reinstated Filipino citizen who seeks election as governor of a province.
Value: This case is about the proof required to establish the domicile of a reinstated Filipino citizen who seeks election as governor of a province.

Key: The Facts and the Case
Value: The Facts and the Case

Key: Petitioner Rommel Jalosjos was born in Quezon City on October 26 1973. He migrated to Australia in 1981 when he was eight years old and there acquired Australian citizenship. On November 22 2008 at age 35 he decid

In [37]:
from transformers import BartForSequenceClassification, BartTokenizer
import torch

# Load the trained model and tokenizer from the saved directory
model_path = 'my_awesome_model/checkpoint-337' # Replace with your actual model path
model = BartForSequenceClassification.from_pretrained(model_path, ignore_mismatched_sizes=True)
tokenizer = BartTokenizer.from_pretrained(model_path)

Some weights of BartForSequenceClassification were not initialized from the model checkpoint at my_awesome_model/checkpoint-337 and are newly initialized because the shapes did not match:
- model.decoder.embed_positions.weight: found shape torch.Size([1026, 768]) in the checkpoint and torch.Size([130, 768]) in the model instantiated
- model.encoder.embed_positions.weight: found shape torch.Size([1026, 768]) in the checkpoint and torch.Size([130, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [76]:
import torch
from torch.nn.functional import softmax

# Set the model to evaluation mode
model.eval()

predicted_labels_dict = {}
previous_label = 'facts'  # To keep track of the previous label


for key, value in tokenized_paragraphs.items():
    # Tokenize the input
    inputs = tokenizer(key, return_tensors="pt", max_length=128)
    
    # Perform inference
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Get the predicted label (logits are raw predictions before softmax)
    logits = outputs.logits
    
    # Calculate softmax probabilities
    probabilities = softmax(logits, dim=-1)
    
    # Get the predicted class ID and its probability
    predicted_class_id = torch.argmax(probabilities, dim=-1).item()
    max_probability = probabilities[0][predicted_class_id].item()  # Get the max probability

    print(f'max probability: {max_probability}')
    # Check if the probability is below the threshold
    if max_probability < 0.4:
        predicted_label = previous_label  # Use the previous label if below threshold
        print(f"Label for '{value}' is below threshold; using previous label: {predicted_label}")
    else:
        print(f"Predicted label for '{value}': {predicted_label}")
        
    # Map the predicted class ID to the corresponding label
    id2label = model.config.id2label
    predicted_label = id2label[predicted_class_id]
    # Output the result
    # print(f"Predicted label for '{value}': {predicted_label}")

    # Store the predicted label in the dictionary
    predicted_labels_dict[key] = predicted_label
    previous_label = predicted_label  # Update previous label for the next iteration

max probability: 0.42346492409706116
Predicted label for 'G.R. No. 191970. April 24 2012': facts
max probability: 0.3536776602268219
Label for '686 Phil. 563' is below threshold; using previous label: facts
max probability: 0.43850308656692505
Predicted label for 'EN BANC': rulings
max probability: 0.374875009059906
Label for 'ROMMEL APOLINARIO JALOSJOS PETITIONER VS. THE COMMISSION ON ELECTIONS AND DAN ERASMO SR. RESPONDENTS.' is below threshold; using previous label: facts
max probability: 0.5026956796646118
Predicted label for 'DECISION': issues
max probability: 0.48749616742134094
Predicted label for 'ABAD J.:': facts
max probability: 0.48357927799224854
Predicted label for 'This case is about the proof required to establish the domicile of a reinstated Filipino citizen who seeks election as governor of a province.': facts
max probability: 0.3963860273361206
Label for 'The Facts and the Case' is below threshold; using previous label: issues
max probability: 0.7955788969993591
Predi

In [77]:
# Prepare the output segments
facts = []
issues = []
rulings = []

for key, label in predicted_labels_dict.items():
    if label == "facts":  # Adjust based on your label names
        facts.append(key)
    elif label == "issues":
        issues.append(key)
    elif label == "rulings":
        rulings.append(key)

# Write to a text file
with open("output_segments.txt", "w") as file:
    file.write("FACTS:\n")
    for fact in facts:
        file.write(f"{fact}\n")
    
    file.write("\nISSUES:\n")
    for issue in issues:
        file.write(f"{issue}\n")
    
    file.write("\nRULINGS:\n")
    for ruling in rulings:
        file.write(f"{ruling}\n")

print("Output segments written to 'output_segments.txt'.")


Output segments written to 'output_segments.txt'.
