In [None]:
%%capture
%pip install datasets

# Data Preparation
Here, we only want to extract the parts of the rulings that relate to the "reasons" of the decision.

In [None]:
import re
from datasets import load_dataset, Dataset

## Load Data
Let's load the raw data from HuggingFace.

In [None]:
# Fetch the "train" split of the court-rulings dataset from the Hugging Face Hub.
data = load_dataset(
    "istat-ai/court-rulings-coi",
    split="train",
)

# Keep a pandas view of the full table (the cleaned column is attached to it
# later) alongside the raw ruling texts that the extraction step iterates over.
df = data.to_pandas()
texts = data["Text"]

## Start/End Identifiers
We need to identify the expressions that signal the beginning and the end of the section of interest.

In [None]:
# Lower-case headings that mark where the "reasons" section of a ruling begins.
#
# Order matters: the extraction step tries these in list order and stops at the
# first one found anywhere in the text, so earlier entries take priority over
# later ones. Each entry starts with "\n" so it only matches at the start of a line.
# Every marker appears exactly once; a repeated entry could never be reached.
start_identifiers = [
    "\nmotivo della decisione",
    "\nmotivi della decisione",
    "\nmotivi di fatto e di diritto della decisione",
    "\ndiritto",
    "\nfatto e diritto",
    "\nesaminato in diritto",
    "\nd i r i t t o",
    "\nmotivazione",
    "\nmotivi in fatto ed in diritto della decisione",
    "\nconsiderato in diritto che",
    "\nragioni della decisione",
    "\nsvolgimento del processo e motivi della decisione",
    "\nconsiderato che",
    "\nmotivazioni della decisione",
    "\nconsiderato in diritto",
    "\nragioni del decidere",
    "\nconsiderato in fatto",
    "\nritenuto in diritto",
    "\nle ragioni della decisione",
    "\nritenuto in fatto e considerato in diritto",
    "\nf a t t o e d i r i t t o",
    # NOTE(review): "condiderato" looks like a misspelling of "considerato";
    # presumably kept on purpose to match a typo in the source rulings - confirm.
    "\ncondiderato in diritto"
]

# Lower-case markers for the start of the operative part ("P.Q.M." / "per questi
# motivi"), which is where the section of interest ends. The variants cover
# different spacing/punctuation of the abbreviation. As above, the list is tried
# in order and the first marker found wins.
end_identifiers = [
    "\np.q.m.",
    "p.q.m.",
    "pq.m.",
    "\np.q.m",
    "\np.q. m.",
    "\np. q. m.",
    "\npqm",
    "p . q. m.",
    "p. q. m .",
    "p.q m.",
    "p . q . m .",
    "\nper questi motivi"
]

## Clean the Texts
Now we can clean the texts to extract the sections of interest.

In [None]:
def _compile_markers(markers):
    """Turn literal markers into case-insensitive patterns, keeping list order."""
    return [re.compile(re.escape(marker), re.IGNORECASE) for marker in markers]


def _first_match(patterns, text):
    """Return the match for the first pattern (in list order) found in ``text``, else None."""
    for pattern in patterns:
        match = pattern.search(text)
        if match is not None:
            return match
    return None


def extract_reasons(text, start_patterns, end_patterns, placeholder="NO_TEXT"):
    """Extract the section of a ruling between a start marker and an end marker.

    The first start pattern (in list order) that occurs in ``text`` decides where
    the section begins: everything up to and including its first occurrence is
    dropped. The first end pattern (in list order) that occurs in the remainder
    decides where the section stops; if none occurs, the remainder is kept whole.

    Matching is done case-insensitively on the original string, so the offsets
    used for slicing always refer to ``text`` itself. Searching a lower-cased copy
    and slicing the original is unsafe because lower-casing can change the length
    of a string for some Unicode characters.

    Args:
        text: Raw ruling text. Non-string values (e.g. ``None``) are treated as
            having no extractable section.
        start_patterns: Compiled patterns for the start markers, in priority order.
        end_patterns: Compiled patterns for the end markers, in priority order.
        placeholder: Value returned when no non-empty section can be extracted.

    Returns:
        The stripped section, or ``placeholder`` when no start marker is found or
        the extracted section is empty/whitespace.
    """
    if not isinstance(text, str):
        return placeholder

    start_match = _first_match(start_patterns, text)
    if start_match is None:
        return placeholder

    # Keep only what follows the start heading.
    section = text[start_match.end():]

    # Cut at the end marker, if any; the search is restricted to the section
    # after the start heading.
    end_match = _first_match(end_patterns, section)
    if end_match is not None:
        section = section[:end_match.start()]

    section = section.strip()
    return section if section else placeholder


# Compile once, then produce one output per input text so that the result stays
# aligned row-by-row with the source dataset.
start_patterns = _compile_markers(start_identifiers)
end_patterns = _compile_markers(end_identifiers)
clean_texts = [extract_reasons(text, start_patterns, end_patterns) for text in texts]

Next, we apply a simple regex to remove page numbers, i.e. lines that contain only digits.

In [None]:
# A run of digits alone on its own line is treated as a page number; the whole
# "\n<digits>\n" span is replaced by a single space.
_PAGE_NUMBER_RE = re.compile(r'\n\d+\n')
clean_texts = [_PAGE_NUMBER_RE.sub(' ', ruling) for ruling in clean_texts]

## Export the Data
Now we add the clean texts to the df and turn it into a Dataset.

In [None]:
# Attach the cleaned texts as a "Clean_Text" column (one value per row, in the
# same order as the original texts), then convert the frame back to a Dataset.
df = df.assign(Clean_Text=clean_texts)
new_data = Dataset.from_pandas(df)

Finally, we push the updated dataset to the Hugging Face Hub.

In [None]:
# Upload the dataset, now including the cleaned-text column, to the Hub repository.
new_data.push_to_hub(
    "istat-ai/court-rulings-coi",
    commit_message="Add clean text",
)