In [1]:

from bertopic import BERTopic

In [2]:
import pandas as pd
import numpy as np

import torch
# cuML-powered dimensionality reduction & clustering
from cuml.cluster import HDBSCAN
from cuml.manifold import UMAP

In [3]:
import json
import os

folder_path = './Supreme Court Jurisprudence'


def load_case_texts(root):
    """Collect the ``text`` field of every ``*.json`` file found under ``root``.

    The directory tree is walked in sorted order so that the resulting rows
    (and therefore every positional index used later in the notebook) come out
    in the same order on every machine and every run.

    Parameters
    ----------
    root : str
        Directory to search recursively.

    Returns
    -------
    list of dict
        One ``{'text': ...}`` record per JSON file, in traversal order.
        Empty when ``root`` does not exist or holds no ``.json`` files.

    Raises
    ------
    KeyError
        If a JSON file has no ``'text'`` key.
    json.JSONDecodeError
        If a file is not valid JSON.
    """
    records = []
    for dirpath, dirnames, filenames in os.walk(root):
        # Sorting in place makes os.walk descend into sub-folders in a stable order.
        dirnames.sort()
        for file_name in sorted(filenames):
            if not file_name.endswith('.json'):
                continue
            file_path = os.path.join(dirpath, file_name)
            # JSON is UTF-8 by specification; do not depend on the platform's default encoding.
            with open(file_path, 'r', encoding='utf-8') as file:
                json_data = json.load(file)
            records.append({'text': json_data['text']})  # Keep only the text part
    return records


text_data = load_case_texts(folder_path)  # List of {'text': ...} dictionaries

# Convert the list of dictionaries to a DataFrame.
# Naming the column explicitly keeps 'text' present even when nothing was loaded.
text_df = pd.DataFrame(text_data, columns=['text'])

text_df.head()

Unnamed: 0,text
0,The administrative proceedings against Atty. L...
1,"In an affidavit-complaint, [1] dated May 22, 1..."
2,"In a sworn complaint, dated 15 September 1994,..."
3,In deciding these consolidated administrative ...
4,The instant administrative matter arose from a...


In [4]:
# Sanity-check the loaded frame: row count, columns, non-null counts and dtypes
text_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30912 entries, 0 to 30911
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    30912 non-null  object
dtypes: object(1)
memory usage: 241.6+ KB


In [9]:
from sklearn.feature_extraction.text import CountVectorizer
# Bag-of-words vectorizer passed to BERTopic: unigrams and bigrams, English stop words,
# with min_df=2 / max_df=0.95 document-frequency cut-offs.
# NOTE: these settings need to be handled carefully, because they seem to depend on the dataset size.
vectorizer_model = CountVectorizer(ngram_range=(1,2),stop_words="english",min_df=2,max_df=0.95)

In [10]:
# UMAP configuration handed to BERTopic; random_state is pinned for repeatable runs.
umap_model = UMAP(n_neighbors=15,
                  n_components=5,
                  min_dist=0.0,
                  metric='cosine',
                  random_state=100)

# HDBSCAN configuration handed to BERTopic.
hdbscan_model = HDBSCAN(min_samples=10, gen_min_span_tree=True, prediction_data=True)

# Initiate BERTopic
topic_model = BERTopic(
    umap_model=umap_model,
    hdbscan_model=hdbscan_model,
    vectorizer_model=vectorizer_model,
    language="english",
    calculate_probabilities=True,
)

# Run BERTopic model
# fit_transform is documented to take a list of strings, so hand it a plain list
# instead of a pandas Series (which only works while the index happens to be 0..n-1).
documents = text_df['text'].tolist()
topics, probabilities = topic_model.fit_transform(documents)

In [11]:
# Overview table of the fitted topics: id, document count, generated name,
# top terms and representative documents
topic_model.get_topic_info()

Unnamed: 0,Topic,Count,Name,Representation,Representative_Docs
0,-1,18725,-1_rtc_labor_sale_accused,"[rtc, labor, sale, accused, nlrc, lot, employe...",[There is no legal or equitable justification ...
1,0,1539,0_victim_accused_crime_appellant,"[victim, accused, crime, appellant, murder, ts...","[On November 13, 1992, at around 6:00 o'clock ..."
2,1,1391,1_complainant_respondent judge_oca_administrative,"[complainant, respondent judge, oca, administr...",[The relationship between a lawyer and a clien...
3,2,1335,2_rape_aaa_appellant_accusedappellant,"[rape, aaa, appellant, accusedappellant, accus...",[For review is the decision [1] of the Court o...
4,3,1198,3_drugs_seized_dangerous_shabu,"[drugs, seized, dangerous, shabu, police, drug...",[Before the Court is a Petition for Review on ...
...,...,...,...,...,...
103,102,5,102_hotel_gonzales_rooms_junta,"[hotel, gonzales, rooms, junta, strike, labor,...",[This is a special civil action for certiorari...
104,103,5,103_villamaria_albia_ng_bustamante,"[villamaria, albia, ng, bustamante, tauhan ng,...",[Before us is a petition for review of the dec...
105,104,5,104_ng_sihi_optimum_kasunduan,"[ng, sihi, optimum, kasunduan, lease, lukban, ...",[The Case Before the Court is a petition for r...
106,105,5,105_detainer_unlawful detainer_unlawful_tolerance,"[detainer, unlawful detainer, unlawful, tolera...",[Before the Court is a petition for review on ...


In [12]:
# Top terms of topic 0 as (term, weight) pairs
topic_model.get_topic(0)

[('victim', 0.010430178866298876),
 ('accused', 0.009654708335558697),
 ('crime', 0.009017644563271148),
 ('appellant', 0.00774799588162021),
 ('murder', 0.007099602354429298),
 ('tsn', 0.006555441267843883),
 ('death', 0.006225600016916859),
 ('accusedappellant', 0.006221569191586052),
 ('house', 0.0062018671478291255),
 ('prosecution', 0.005795277286361217)]

In [26]:
# Build the per-topic term bar chart once (the earlier duplicate call was
# discarded without being displayed, so it only cost time).
chart = topic_model.visualize_barchart()

# Write the chart as a html file
chart.write_html("supremecourt_barchart.html")

# Leave the figure as the cell's last expression so the notebook renders it
chart

In [27]:
# Build the term-rank chart once (the earlier duplicate call was discarded
# without being displayed, so it only cost time).
chart = topic_model.visualize_term_rank()

# Write the chart as a html file
chart.write_html("supremecourt_termrank.html")

# Leave the figure as the cell's last expression so the notebook renders it
chart

In [28]:
# Build the topic map once (the earlier duplicate call was discarded without
# being displayed, so it only cost time).
chart = topic_model.visualize_topics()

# Write the chart as a html file
chart.write_html("supremecourt_topics.html")

# Leave the figure as the cell's last expression so the notebook renders it
chart

In [29]:
# Build the hierarchy plot (limited to 12 topics) once; the earlier duplicate
# call was discarded without being displayed, so it only cost time.
chart = topic_model.visualize_hierarchy(top_n_topics=12)

# Write the chart as a html file
chart.write_html("supremecourt_hierarchy12.html")

# Leave the figure as the cell's last expression so the notebook renders it
chart

In [30]:
# Build the topic heatmap once (the earlier duplicate call was discarded
# without being displayed, so it only cost time).
chart = topic_model.visualize_heatmap()

# Write the chart as a html file
chart.write_html("supremecourt_heatmap.html")

# Leave the figure as the cell's last expression so the notebook renders it
chart

In [31]:
# Plot the stored topic probabilities of the document at index 1,
# passing 0.015 as the min_probability threshold
topic_model.visualize_distribution(topic_model.probabilities_[1], min_probability=0.015)

In [32]:
# Topic probability distribution of the document at index 0
chart = topic_model.visualize_distribution(topic_model.probabilities_[0])

# Write the chart as a html file
# (file name fixed: the original had a stray "(" where a separator was intended)
chart.write_html("supremecourt_topic_probability_distribution.html")

# Leave the figure as the cell's last expression so the notebook renders it
chart

In [33]:
# Raw probability array stored for the document at index 0
# (the same vector that is passed to visualize_distribution)
topic_model.probabilities_[0]

array([0.0038884 , 0.00758807, 0.00227496, 0.00203621, 0.00794579,
       0.00423755, 0.00712601, 0.00476983, 0.00375195, 0.00820925,
       0.0124694 , 0.00894703, 0.00509884, 0.02203508, 0.00669605,
       0.0050995 , 0.01040745, 0.0075761 , 0.00654317, 0.00325238,
       0.00504307, 0.00525552, 0.0083088 , 0.01040407, 0.02268064,
       0.00621816, 0.00258604, 0.00743793, 0.00562332, 0.00330505,
       0.00833757, 0.00594769, 0.00862767, 0.00562881, 0.00641926,
       0.00867791, 0.00620795, 0.01085359, 0.00754428, 0.01024834,
       0.00531578, 0.00691319, 0.00803421, 0.00456034, 0.00907692,
       0.02130449, 0.00204522, 0.00704337, 0.00645547, 0.00759338,
       0.00753079, 0.00472496, 0.00598627, 0.03427731, 0.00655834,
       0.00584048, 0.00623555, 0.01481764, 0.02177798, 0.00575694,
       0.012678  , 0.0073821 , 0.0088221 , 0.00648577, 0.01086549,
       0.00605854, 0.00863863, 0.00952421, 0.00993913, 0.00853328,
       0.00698004, 0.00978017, 0.00685289, 0.00692378, 0.00804

In [34]:
# Take a full slice of the topic assignments stored on the fitted model
topic_prediction = topic_model.topics_[:]

# Save the predictions in the dataframe
# (adds a 'topic_prediction' column to text_df, one value per row)
text_df['topic_prediction'] = topic_prediction

# Take a look at the data
text_df.head()

Unnamed: 0,text,topic_prediction
0,The administrative proceedings against Atty. L...,-1
1,"In an affidavit-complaint, [1] dated May 22, 1...",-1
2,"In a sworn complaint, dated 15 September 1994,...",1
3,In deciding these consolidated administrative ...,1
4,The instant administrative matter arose from a...,-1


In [35]:
# How many of the closest topics to retrieve for the query
num_of_topics = 3

# Free-text query to match against the fitted topics
new_text = "Andres and Priscilla robbed a bank and hurt some victims."

# Find topics
similar_topics, similarity = topic_model.find_topics(new_text, top_n=num_of_topics)

# Print results
print(f'The top {num_of_topics} similar topics are {similar_topics}, and the similarities are {np.round(similarity,2)}')

The top 3 similar topics are [0, 26, 101], and the similarities are [0.47 0.44 0.43]


In [36]:
# Show the (term, weight) pairs behind each of the closest topics.
# Iterating over the ids directly (capped at num_of_topics) avoids indexing past the
# end of similar_topics when fewer topics come back than were requested.
for topic_id in similar_topics[:num_of_topics]:
    print(f'The top keywords for topic {topic_id} are:')
    print(topic_model.get_topic(topic_id))

The top keywords for topic 0 are:
[('victim', 0.010430178866298876), ('accused', 0.009654708335558697), ('crime', 0.009017644563271148), ('appellant', 0.00774799588162021), ('murder', 0.007099602354429298), ('tsn', 0.006555441267843883), ('death', 0.006225600016916859), ('accusedappellant', 0.006221569191586052), ('house', 0.0062018671478291255), ('prosecution', 0.005795277286361217)]
The top keywords for topic 26 are:
[('kidnapping', 0.025711819364476866), ('ransom', 0.02382776532389467), ('detention', 0.01014100760492133), ('appellant', 0.01010412845648313), ('kidnapping ransom', 0.009637037315963118), ('accused', 0.008898808423802683), ('crime', 0.008704921443085972), ('appellants', 0.008385652539898699), ('car', 0.008026864940923356), ('victim', 0.007881708743769979)]
The top keywords for topic 101 are:
[('check', 0.03742709117895057), ('sakata', 0.028351453233192166), ('bodoy', 0.02733131278607298), ('checks', 0.023042020646872524), ('cashiers', 0.022686022610345906), ('pci bank',

In [24]:
# Save the topic model
topic_model.save("testing_topic_model")

# Load the topic model
my_model = BERTopic.load("testing_topic_model")



In [37]:
# Reload the model previously saved under "testing_topic_model".
# NOTE(review): if that artifact is pickle-based, only load files you produced yourself —
# unpickling can execute arbitrary code.
my_model = BERTopic.load("testing_topic_model")

In [38]:
# Persist the fitted model under "topic_model_supreme" using pickle serialization.
# NOTE(review): pickle artifacts run code on load and are presumably tied to the library
# versions used here — fine for local use; verify before sharing the file with others.
topic_model.save("topic_model_supreme", serialization="pickle")

