In [None]:
%%capture
!pip install nltk datasets bertopic sentence-transformers

# BERTopic Analysis - Court Rulings (Conflict of Interests)

In [None]:
import pandas as pd
import nltk
from nltk.tokenize.punkt import PunktTrainer, PunktSentenceTokenizer
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from bertopic import BERTopic
from umap import UMAP
from hdbscan import HDBSCAN
from sklearn.feature_extraction.text import CountVectorizer
# Download the NLTK stop-word corpus so that stopwords.words() can load it.
nltk.download('stopwords')
from nltk.corpus import stopwords
# Italian stop-word list; it is passed to the CountVectorizer defined later on.
italian_stopwords = stopwords.words('italian')

## Data
Import the dataset from Hugging Face 🤗.

In [None]:
# Load the "train" split of the court-rulings dataset.
# Later cells read its "Text" and "Clean_Text" columns.
data = load_dataset("istat-ai/court-rulings-coi", split="train")

## Sentence Tokenization
Now, we need to split the full rulings into individual sentences. To do this, we train a custom Punkt sentence tokenizer on the corpus.

In [None]:
# Build the Punkt training corpus from all rulings. The documents are joined
# with a newline so that the last token of one ruling and the first token of
# the next one are not fused into a single bogus token, which would distort
# the statistics the trainer learns from.
# NOTE(review): training reads the "Text" column, while the tokenization cell
# applies the tokenizer to "Clean_Text" — confirm this is intentional.
long_text = "\n".join(data["Text"])

trainer = PunktTrainer()
# Treat every word pair whose first word ends in a period as a collocation
# candidate, not only pairs involving known abbreviations.
trainer.INCLUDE_ALL_COLLOCS = True
trainer.train(long_text)

# Sentence tokenizer backed by the parameters learned above.
tokenizer = PunktSentenceTokenizer(trainer.get_params())

Now we can apply it to the texts.

In [None]:
# Split every ruling into sentences and keep two parallel lists:
#   all_sentences[i] -> the i-th sentence over the whole corpus
#   doc_ids[i]       -> position (in `data`) of the ruling it came from
doc_ids, all_sentences = [], []

for ruling_idx, ruling_text in enumerate(data["Clean_Text"]):
    ruling_sentences = tokenizer.tokenize(ruling_text)
    all_sentences += ruling_sentences
    # Repeat the ruling index once per sentence to keep both lists aligned.
    doc_ids += [ruling_idx] * len(ruling_sentences)

---

## BERTopic Modeling
First, we compute the embeddings using a multilingual sentence transformer.

In [None]:
# Multilingual sentence-embedding model; the same instance is later handed to
# BERTopic as its embedding model.
model = SentenceTransformer("paraphrase-multilingual-MiniLM-L12-v2")
# Embed every sentence once up front so the result can be passed to
# fit_transform together with the sentences.
embeddings = model.encode(all_sentences, show_progress_bar=True)

Then, we define the UMAP, HDBSCAN, and Vectorizer models.

In [None]:
# Fixed seed for UMAP. Without it the low-dimensional projection — and hence
# the clusters, topics and exported CSVs — differs from run to run.
SEED = 42

# Dimensionality reduction applied to the sentence embeddings before clustering.
umap_model = UMAP(n_neighbors=15, min_dist=0.01, n_components=5, metric='cosine', random_state=SEED)
# Density-based clustering of the reduced embeddings; a cluster needs at least
# 200 sentences to become a topic.
hdbscan_model = HDBSCAN(min_cluster_size=200, min_samples=1, cluster_selection_epsilon=0.01)
# Bag-of-words used for the topic representations: Italian stop words removed,
# unigrams and bigrams considered.
vectorizer_model = CountVectorizer(stop_words=italian_stopwords, ngram_range=(1, 2))

Finally, we can fit our BERTopic model.

In [None]:
# Assemble BERTopic from the sub-models configured in the previous cells.
topic_model = BERTopic(
    verbose=True,
    embedding_model=model,
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
)

# Fit on the sentences, reusing the precomputed embeddings instead of
# encoding the corpus a second time.
topics, probs = topic_model.fit_transform(documents=all_sentences, embeddings=embeddings)

Save the topic info df.

In [None]:
# Export the per-topic summary table of the initial fit (before outlier
# reduction); the DataFrame index is not written.
topic_model.get_topic_info().to_csv("coi_topics_info.csv", index=False)

### Reduce Outliers

In [None]:
# Reassign outlier sentences to existing topics using the "distributions"
# strategy; `topics` itself is left untouched so the cell can be re-run.
new_topics = topic_model.reduce_outliers(all_sentences, topics, strategy="distributions")
# Recompute the topic representations for the new assignments.
# update_topics() falls back to a default CountVectorizer when none is given,
# which would silently drop the Italian stop-word filter and the bigram range,
# so the custom vectorizer is passed explicitly.
topic_model.update_topics(all_sentences, topics=new_topics, vectorizer_model=vectorizer_model)

Save the updated topic info df.

In [None]:
# Export the per-topic summary table again, now reflecting the topic
# assignments after outlier reduction; the DataFrame index is not written.
topic_model.get_topic_info().to_csv("coi_topics_info_no_outliers.csv", index=False)

### Re-map Topics to Sentences

In [None]:
# Per-sentence table from the fitted model; only its 'Topic' column is needed
# here, as a plain Python list aligned with `all_sentences`.
document_info_df = topic_model.get_document_info(all_sentences)
assigned_topics = document_info_df.loc[:, 'Topic'].tolist()

Create a document topic df.

In [None]:
# One row per sentence: source ruling, sentence text and its assigned topic.
document_topic_df = pd.DataFrame(
    {
        'Document_ID': doc_ids,
        'Document': all_sentences,
        'Assigned_Topic': assigned_topics,
    }
)

# One row per ruling: the list of topics of its sentences and how many
# sentences it contains.
grouped_df = (
    document_topic_df
    .groupby('Document_ID')
    .agg(
        Assigned_Topics=('Assigned_Topic', list),
        sentence_count=('Document', 'count'),
    )
    .reset_index()
)

Now let's save both `document_topic_df` and `grouped_df` to CSV.

In [None]:
# Sentence-level table: one row per sentence with its ruling ID and topic.
document_topic_df.to_csv("coi_document_topic_df.csv", index=False)
# Ruling-level table: one row per ruling with its topic list and sentence count.
grouped_df.to_csv("coi_grouped_df.csv", index=False)