In [None]:
%%capture
# Install the third-party packages this notebook needs into the kernel's environment.
# NOTE(review): umap, hdbscan, pandas and scikit-learn are imported below but not listed
# here — presumably they arrive as dependencies of bertopic; confirm. Versions are unpinned.
%pip install nltk datasets bertopic sentence-transformers

# BERTopic Analysis - Court Rulings (Conflict of Interests)

In [None]:
import pandas as pd
import nltk
from nltk.tokenize.punkt import PunktTrainer, PunktSentenceTokenizer
from nltk.corpus import stopwords
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter

# Download the NLTK stopwords corpus and load its Italian word list; the list is
# handed to the CountVectorizer further down so stop words are excluded from topic terms.
nltk.download('stopwords')
italian_stopwords = stopwords.words('italian')

  $max \{ core_k(a), core_k(b), 1/\alpha d(a,b) \}$.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


## Data
Import the dataset from Hugging Face 🤗.

In [None]:
# Load the "train" split of the court-rulings dataset. The columns read later in this
# notebook are "Text", "Clean_Text", "Provision_Number", "Provision_Year" and " URL".
data = load_dataset("istat-ai/court-rulings-coi", split="train")

## Sentence Tokenization
Now we need to split the full rulings into individual sentences. To do this, we train a custom tokenizer on the rulings themselves.

In [None]:
# Train an unsupervised Punkt sentence tokenizer on the rulings themselves.
#
# The rulings are joined with a newline between them: gluing them back to back
# would fuse the last token of one ruling with the first token of the next and
# feed Punkt word types that never occur in the corpus.
# NOTE(review): training reads the "Text" column while the tokenizer is later
# applied to "Clean_Text" — confirm that this difference is intentional.
long_text = "\n".join(data["Text"])

trainer = PunktTrainer()
# Treat every word pair whose first word ends in a period as a collocation candidate.
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(long_text)

# Build the tokenizer from the parameters learned above.
tokenizer = PunktSentenceTokenizer(trainer.get_params())

Now we can apply it to the texts.

In [None]:
# Split every ruling into sentences and record, for each kept sentence,
# the position of the ruling it came from (doc_ids is parallel to all_sentences).
all_sentences = []
doc_ids = []

for ruling_idx, ruling_text in enumerate(data["Clean_Text"]):
    # Keep only sentences longer than 10 characters.
    kept = [sent for sent in tokenizer.tokenize(ruling_text) if len(sent) > 10]
    all_sentences += kept
    doc_ids += [ruling_idx] * len(kept)

---

## BERTopic Modeling
First, we compute the embeddings using a multilingual sentence transformer.

In [None]:
# Load the embedding model and encode every sentence once, in batches of 32.
# The resulting embeddings are passed to BERTopic's fit_transform below together
# with the sentences.
model = SentenceTransformer("google/embeddinggemma-300m")
embeddings = model.encode(all_sentences, show_progress_bar=True, batch_size=32)

Then, we define the UMAP, HDBSCAN, and Vectorizer models.

In [None]:
# UMAP is stochastic: without a fixed seed every run yields a different
# low-dimensional layout and therefore different clusters/topics, so the exported
# CSVs could not be reproduced. Fixing the seed makes "Restart & Run All" repeatable
# (at the cost of UMAP running without its parallel optimisations).
UMAP_RANDOM_STATE = 42

# Dimensionality reduction: project the sentence embeddings down to 5 components
# using cosine distance.
umap_model = UMAP(
    n_neighbors=35,
    min_dist=0.01,
    n_components=5,
    metric='cosine',
    random_state=UMAP_RANDOM_STATE,
)

# Clustering of the reduced embeddings; a cluster needs at least 300 sentences.
hdbscan_model = HDBSCAN(min_cluster_size=300, min_samples=1, cluster_selection_epsilon=0.01)

# Topic term extraction: unigrams and bigrams, with Italian stop words removed.
vectorizer_model = CountVectorizer(stop_words=italian_stopwords, ngram_range=(1, 2))

Finally, we can fit our BERTopic model.

In [None]:
# Assemble BERTopic from the components configured above.
topic_model = BERTopic(
    embedding_model=model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    verbose=True,
)

# Fit on the sentences, supplying the embeddings computed earlier;
# `topics` holds one topic id per sentence.
topics, probs = topic_model.fit_transform(all_sentences, embeddings=embeddings)

2026-02-01 10:57:37,790 - BERTopic - Dimensionality - Fitting the dimensionality reduction algorithm
2026-02-01 10:58:28,337 - BERTopic - Dimensionality - Completed ✓
2026-02-01 10:58:28,340 - BERTopic - Cluster - Start clustering the reduced embeddings
2026-02-01 10:58:34,930 - BERTopic - Cluster - Completed ✓
2026-02-01 10:58:34,953 - BERTopic - Representation - Fine-tuning topics using representation models.
2026-02-01 10:58:48,431 - BERTopic - Representation - Completed ✓


In [None]:
# Display the per-topic summary table as this cell's output.
topic_model.get_topic_info()

Save the topic info df.

In [None]:
# Snapshot of the topic summary before any outlier reassignment.
topic_info_df = topic_model.get_topic_info()
topic_info_df.to_csv("coi_topics_info.csv", index=False)

### Reduce Outliers

In [None]:
# Compute a revised topic assignment for the sentences using the "distributions" strategy ...
new_topics = topic_model.reduce_outliers(all_sentences, topics, strategy="distributions")
# ... then push the revised labels into the model, reusing the same vectorizer
# configuration. `topics` itself is left untouched.
topic_model.update_topics(all_sentences, topics=new_topics, vectorizer_model=vectorizer_model)

100%|██████████| 49/49 [00:21<00:00,  2.29it/s]


Save the updated topic info df.

In [None]:
# Snapshot of the topic summary after the model's topics were updated.
topic_info_no_outliers_df = topic_model.get_topic_info()
topic_info_no_outliers_df.to_csv("coi_topics_info_no_outliers.csv", index=False)

### Re-map Topics to Sentences

In [None]:
# Per-sentence table from the model; its "Topic" column gives the topic id of each sentence.
document_info_df = topic_model.get_document_info(all_sentences)
topic_column = document_info_df["Topic"]
assigned_topics = topic_column.tolist()

Create a document topic df.

In [None]:
# One row per sentence: the ruling it came from, its text and its topic id.
document_topic_df = pd.DataFrame(
    {
        "Document_ID": doc_ids,
        "Document": all_sentences,
        "Assigned_Topic": assigned_topics,
    }
)

# One row per ruling: the list of its sentences' topic ids and the number of sentences.
grouped_df = (
    document_topic_df
    .groupby("Document_ID")
    .agg(
        Assigned_Topics=("Assigned_Topic", list),
        sentence_count=("Document", "count"),
    )
    .reset_index()
)

Now let's save both `document_topic_df` and `grouped_df`.

In [None]:
# Write the sentence-level and the ruling-level tables to disk, without the index column.
for frame, csv_name in (
    (document_topic_df, "coi_document_topic_df.csv"),
    (grouped_df, "coi_grouped_df.csv"),
):
    frame.to_csv(csv_name, index=False)

### Counts by Ruling
For each topic, count the number of rulings in which it appears at least once.

In [None]:
# One Counter per ruling: topic id -> number of that ruling's sentences assigned to it.
counts = [Counter(topic_list) for topic_list in grouped_df['Assigned_Topics']]
# The same tallies with entries ordered from the most to the least frequent topic.
sorted_counts = [Counter(dict(tally.most_common())) for tally in counts]

# For every topic, in get_topic_info() row order, the number of rulings that
# contain at least one sentence assigned to it.
occurrences_in_docs = [
    sum(1 for tally in counts if topic_id in tally)
    for topic_id in topic_model.get_topic_info()["Topic"]
]

Export the CSV.

In [None]:
# Pair every count with the topic id it belongs to. A bare list of counts would be
# written as a single column headed "0", leaving the reader to guess which row is
# which topic. occurrences_in_docs was built by iterating get_topic_info()["Topic"],
# so the two sequences are aligned row by row.
unique_ruling_counts = pd.DataFrame({
    "Topic": topic_model.get_topic_info()["Topic"].to_list(),
    "Ruling_Count": occurrences_in_docs,
})
unique_ruling_counts.to_csv("counts_by_ruling_unique.csv", index=False)

### Topic-Ruling Distribution
Create a dataframe of topic distributions per ruling.

In [None]:
# Attach each ruling's identifying metadata to its list of topics.
# Document_ID is the row position in the dataset, which matches the enumerate()
# index recorded when the rulings were split into sentences.
df = data.to_pandas()
df["Document_ID"] = df.index

# " URL" (with the leading space) is the column name as it is used throughout this notebook.
ruling_meta_cols = ["Provision_Number", "Provision_Year", " URL"]

# Drop metadata left behind by a previous run of this cell before merging: merging
# twice would create suffixed *_x / *_y columns and break the cells that follow.
grouped_df = (
    grouped_df
    .drop(columns=ruling_meta_cols, errors="ignore")
    .merge(
        df[["Document_ID", *ruling_meta_cols]],
        on="Document_ID",
        how="left",
        validate="one_to_one",  # both sides hold one row per ruling
    )
)

Create the final ruling dataframe with info on the topic distribution per ruling.

In [None]:
# Select the export columns and give them their final names in a single chain.
# rename() returns a new frame, so the result is independent of grouped_df and no
# in-place mutation of a column slice (SettingWithCopyWarning) takes place.
final_grouped_df = grouped_df[
    ["Provision_Number", "Provision_Year", " URL", "Assigned_Topics"]
].rename(columns={
    "Assigned_Topics": "Topics",
    "Provision_Number": "Number",
    "Provision_Year": "Year",
    " URL": "URL",
})

Export the data.

In [None]:
# Write the per-ruling topic table to disk, without the index column.
final_grouped_df.to_csv("topic_ruling_dist.csv", index=False)

Export the model to the Hugging Face Hub.

In [None]:
# Opt-in switch: the upload below only runs when this is set to True.
PUSH_TO_HF: bool = False

if PUSH_TO_HF:
    # Publish the fitted model to the "istat-ai/coi-topic" repository as a private repo.
    # NOTE(review): presumably requires an authenticated Hugging Face session — confirm.
    topic_model.push_to_hf_hub("istat-ai/coi-topic", private=True)