In [1]:
%%capture
# Install the third-party packages used by this notebook into the kernel's environment (output is captured/silenced).
%pip install nltk datasets bertopic sentence-transformers

# BERTopic Analysis - Court Rulings (Conflict of Interests)

In [None]:
import pandas as pd
import nltk
from nltk.tokenize.punkt import PunktTrainer, PunktSentenceTokenizer
from nltk.corpus import stopwords
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

# Fetch the NLTK stop-word corpus and keep the Italian word list.
nltk.download("stopwords")
italian_stopwords = stopwords.words("italian")

## Data
Import the dataset from Hugging Face 🤗.

In [None]:
# Load the "train" split of the court-rulings dataset; later cells read its
# "Text" and "Clean_Text" columns.
data = load_dataset("istat-ai/court-rulings-coi", split="train")

## Sentence Tokenization
Next, we need to split the full rulings into individual sentences. To do this, we train a custom Punkt sentence tokenizer on the corpus.

In [4]:
# Build the training corpus for the sentence tokenizer from the rulings.
# The rulings are joined with a newline: concatenating them back-to-back would
# fuse the last token of one ruling with the first token of the next, feeding
# spurious tokens into the statistics Punkt learns. `str.join` also avoids the
# repeated re-allocation caused by `+=` inside a loop.
# NOTE(review): training reads the "Text" column while the tokenizer is later
# applied to "Clean_Text" — confirm that this difference is intended.
long_text = "\n".join(data["Text"])

# Train an unsupervised Punkt model on the corpus.
trainer = PunktTrainer()
# Per the NLTK docs, this treats every word pair whose first word ends in a
# period as a potential collocation.
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(long_text)

# Sentence tokenizer backed by the parameters learned above.
tokenizer = PunktSentenceTokenizer(trainer.get_params())

Now we can apply it to the texts.

In [5]:
# Split every ruling into sentences and remember, for each sentence, the
# positional index of the ruling it came from (the two lists stay aligned).
all_sentences = []
doc_ids = []

for ruling_idx, ruling_text in enumerate(data["Clean_Text"]):
    # Fragments of 10 characters or fewer are discarded.
    kept = [sent for sent in tokenizer.tokenize(ruling_text) if len(sent) > 10]
    all_sentences += kept
    doc_ids += [ruling_idx] * len(kept)

---

## Load BERTopic Model
Load the BERTopic model from Hugging Face.

In [None]:
# Set to True to merge the topic groups listed below after loading the model.
MERGE_TOPICS: bool = False

# Load the pre-trained topic model together with the embedding model it uses.
topic_model = BERTopic.load(
    "istat-ai/coi-bertopic",
    embedding_model="google/embeddinggemma-300m",
)

if MERGE_TOPICS:
    # Each inner list is one group of topic ids to merge.
    merge_groups = [[3, 5], [1, 4]]
    topic_model.merge_topics(docs=all_sentences, topics_to_merge=merge_groups)

# Display the topic overview as the cell output.
topic_model.get_topic_info()

Save the topic info df.

In [None]:
# Write the topic overview table to disk (row index omitted).
topic_model.get_topic_info().to_csv("coi_topics_info.csv", index=False)

### Re-map Topics to Sentences

In [16]:
# Per-sentence information table from the loaded model; only the topic id
# column is used downstream.
# NOTE(review): presumably `all_sentences` must match, in number and order, the
# documents the model was fitted on — confirm.
document_info_df = topic_model.get_document_info(all_sentences)
assigned_topics = document_info_df["Topic"].tolist()

Create a document topic df.

In [17]:
# One row per sentence: source ruling index, sentence text, assigned topic id.
document_topic_df = pd.DataFrame(
    {
        "Document_ID": doc_ids,
        "Document": all_sentences,
        "Assigned_Topic": assigned_topics,
    }
)

# One row per ruling: the list of topic ids of its sentences and how many
# sentences it contributed.
grouped_df = (
    document_topic_df
    .groupby("Document_ID")
    .agg(
        Assigned_Topics=("Assigned_Topic", list),
        sentence_count=("Document", "count"),
    )
    .reset_index()
)

Now let's save both `document_topic_df` and `grouped_df`.

In [20]:
# Persist the sentence-level and the ruling-level tables (row index omitted).
document_topic_df.to_csv("coi_document_topic_df.csv", index=False)
grouped_df.to_csv("coi_grouped_df.csv", index=False)

### Counts by Ruling
For each topic, count the number of rulings in which it appears at least once.

In [21]:
# Per-ruling tally of how many sentences were assigned to each topic.
counts = [Counter(topics) for topics in grouped_df["Assigned_Topics"]]
# Same tallies, rebuilt with topics inserted in descending order of frequency.
sorted_counts = [
    Counter(dict(sorted(per_ruling.items(), key=lambda pair: pair[1], reverse=True)))
    for per_ruling in counts
]

# For every topic (in the order of the topic info table), the number of
# rulings that contain at least one sentence assigned to it.
occurrences_in_docs = [
    sum(1 for per_ruling in counts if topic in per_ruling)
    for topic in topic_model.get_topic_info()["Topic"]
]

Export the CSV.

In [23]:
# Pair every count with the topic id it belongs to. A bare list would be
# exported as a single unlabeled column ("0"), leaving readers to guess that
# the row order matches the topic info table.
# `occurrences_in_docs` was built by iterating over this same "Topic" column,
# so the two sequences are aligned.
unique_ruling_counts = pd.DataFrame({
    "Topic": topic_model.get_topic_info()["Topic"].to_list(),
    "Ruling_Count": occurrences_in_docs,
})
unique_ruling_counts.to_csv("counts_by_ruling_unique.csv", index=False)

### Topic-Ruling Distribution
Create a dataframe of topic distributions per ruling.

In [25]:
# Expose each ruling's positional index as a join key; it is the same index
# that was recorded in `doc_ids` via `enumerate` when the sentences were split.
df = data.to_pandas()
df["Document_ID"] = df.index

# Attach the ruling metadata to the per-ruling topic table.
# The left side is restricted to its base columns so that re-running this cell
# is idempotent: merging into an already-merged `grouped_df` would otherwise
# create "_x"/"_y" suffixed duplicates and break the column selection that follows.
grouped_df = grouped_df[["Document_ID", "Assigned_Topics", "sentence_count"]].merge(
    df[["Document_ID", "Provision_Number", "Provision_Year", " URL"]],
    on="Document_ID",
    how="left",
)

Create the final ruling dataframe with info on the topic distribution per ruling.

In [26]:
# Select the export columns and give them their final names in one chained
# expression. `rename` returns a new frame, which avoids the in-place rename
# on a frame sliced from `grouped_df` (a pattern that can trigger
# SettingWithCopyWarning).
# The source column " URL" carries a leading space; it is normalised to "URL".
final_grouped_df = grouped_df[
    ["Provision_Number", "Provision_Year", " URL", "Assigned_Topics"]
].rename(
    columns={
        "Assigned_Topics": "Topics",
        "Provision_Number": "Number",
        "Provision_Year": "Year",
        " URL": "URL",
    }
)

Export the data.

In [None]:
# Write the per-ruling topic distribution table to disk (row index omitted).
final_grouped_df.to_csv("topic_ruling_dist.csv", index=False)