In [1]:
import pandas as pd
from gensim.models import LdaModel
from gensim.corpora import Dictionary
from gensim.utils import simple_preprocess
from nltk.corpus import stopwords
import string

# Load the CSV file
df = pd.read_csv("sectionized_data.csv")
df = df.dropna()

# Preprocessing function
def preprocess(text):
    text = text.lower()  # Convert text to lowercase
    text = text.translate(str.maketrans('', '', string.punctuation))  # Remove punctuation
    tokens = simple_preprocess(text, deacc=True)  # Tokenize and remove accents
    return tokens

# Tokenize and preprocess the text
df['tokens'] = df['Body'].apply(preprocess)

#print(df['tokens'])
# Create a dictionary representation of the documents
dictionary = Dictionary(df['tokens'])

# Bag-of-words representation of the documents
corpus = [dictionary.doc2bow(doc) for doc in df['tokens']]

In [None]:
from transformers import BertTokenizer, BertModel
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from gensim.models import CoherenceModel
import numpy as np
import torch

import warnings
# Ignore all warnings
warnings.filterwarnings("ignore")

# Load pre-trained BERT legal model and tokenizer
model_name = 'nlpaueb/legal-bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Filter out rows with empty token lists
df = df[df['tokens'].map(len) > 0]

# Encode your documents into BERT embeddings
documents = df['tokens']  # Your list of preprocessed text documents

# Encode each document
encoded_documents = [tokenizer.encode(doc, add_special_tokens=True, max_length=512, truncation=True, padding='max_length', return_tensors='pt') for doc in documents]

# Compute BERT embeddings for each document
with torch.no_grad():
    embeddings = []
    for encoded_doc in encoded_documents:
        outputs = model(encoded_doc)
        pooled_output = outputs[1]  # Take the pooled output
        embeddings.append(pooled_output.numpy())

# Flatten the embeddings
flat_embeddings = np.concatenate(embeddings, axis=0)

# Initialize variables to store optimal values
optimal_num_topics = 0
max_coherence_score = -1

# Range of possible numbers of topics to try
num_topics_range = range(2, 100)

# Iterate over different numbers of topics
for num_topics in num_topics_range:
    # Apply clustering (e.g., KMeans) to identify topics
    kmeans = KMeans(n_clusters=num_topics, random_state=42)
    topic_labels = kmeans.fit_predict(flat_embeddings)
    
    # Compute coherence score
    coherence_model = CoherenceModel(topics=documents, texts=documents, dictionary=dictionary, coherence='c_v')
    coherence_score = coherence_model.get_coherence()
    
    # Update optimal number of topics if coherence score is higher
    if coherence_score > max_coherence_score:
        optimal_num_topics = num_topics
        max_coherence_score = coherence_score

print("Optimal Number of Topics:", optimal_num_topics)
print("Max Coherence Score:", max_coherence_score)


  torch.utils._pytree._register_pytree_node(
We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


In [None]:
print('Coeherance Done!')

In [None]:
from transformers import BertTokenizer, BertModel
import torch
from sklearn.cluster import KMeans
from collections import Counter
import string
from nltk.corpus import stopwords

# Load pre-trained BERT Legal model and tokenizer
model_name = 'nlpaueb/legal-bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Load English stop words
stop_words = set(stopwords.words('english'))

# Encode your documents into BERT embeddings
documents = df['tokens']  # Your list of preprocessed text documents

# Encode each document
encoded_documents = [tokenizer.encode(doc, add_special_tokens=True, max_length=512, truncation=True, padding='max_length', return_tensors='pt') for doc in documents]

# Compute BERT embeddings for each document
with torch.no_grad():
    embeddings = []
    for encoded_doc in encoded_documents:
        outputs = model(encoded_doc)
        pooled_output = outputs[1]  # Take the pooled output
        embeddings.append(pooled_output.numpy())

# Flatten the embeddings
flat_embeddings = np.concatenate(embeddings, axis=0)

# Apply clustering (e.g., KMeans) to identify topics
num_topics = 2
kmeans = KMeans(n_clusters=num_topics, random_state=42)
topic_labels = kmeans.fit_predict(flat_embeddings)

# Print the assigned topic labels for each document
document_topic_labels = {}
for i, label in enumerate(topic_labels):
    print(f"Document {i}: Topic {label}")
    document_topic_labels["Document " + str(i)] = "Topic " + str(label)
    
#print(document_topic_labels)
# Collect documents assigned to each topic
topic_documents = [[] for _ in range(num_topics)]
for i, label in enumerate(topic_labels):
    topic_documents[label].append(documents[i])

# Tokenize the combined text of each topic and filter out stop words, punctuation, short words, and words containing '#'
topic_tokenized_texts = []
for docs in topic_documents:
    # Ensure that each element in docs is a string
    docs = [str(doc) for doc in docs]
    combined_text = ' '.join(docs)
    # Tokenize the text
    tokens = tokenizer.tokenize(combined_text)
    # Filter out stop words, punctuation, short words, and words containing '#'
    filtered_tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation and len(token) > 3 and '#' not in token]
    topic_tokenized_texts.append(filtered_tokens)

# Count the occurrences of each word in the tokenized text
word_distributions = [Counter(tokens) for tokens in topic_tokenized_texts]

# Calculate total number of words in each topic
topic_word_counts = [sum(word_distribution.values()) for word_distribution in word_distributions]

# Calculate the probability of each word in each topic and get the top 10 words
top_words_probability = []
for topic_idx, word_distribution in enumerate(word_distributions):
    top_words = word_distribution.most_common(10)
    topic_probability = {word: count / topic_word_counts[topic_idx] for word, count in top_words}
    top_words_probability.append(topic_probability)

# Create a dictionary to store the top words distribution for each topic
top_words_distribution_dict = {}
for topic_idx, topic_probability in enumerate(top_words_probability):
    top_words_distribution_dict[f"Topic {topic_idx + 1}"] = topic_probability

# Print the dictionary
print(top_words_distribution_dict)


In [None]:
import matplotlib.pyplot as plt

# Function to plot the word distribution for each topic
def plot_topic_word_distribution(top_words_distribution_dict):
    num_topics = len(top_words_distribution_dict)
    fig, axes = plt.subplots(nrows=num_topics, ncols=1, figsize=(10, 6*num_topics))

    for i, (topic, word_distribution) in enumerate(top_words_distribution_dict.items()):
        words = list(word_distribution.keys())
        probabilities = list(word_distribution.values())

        ax = axes[i] if num_topics > 1 else axes
        ax.barh(words, probabilities, color='skyblue')
        ax.set_title(f'{topic}')
        ax.set_xlabel('Probability')
        ax.set_ylabel('Word')

    plt.tight_layout()
    plt.show()

# Plot the topic word distribution
plot_topic_word_distribution(top_words_distribution_dict)


In [None]:
#Dominant topic distribution
from collections import defaultdict

# Initialize a dictionary to store the results
document_topic_distribution_dict = defaultdict(dict)

# Count the occurrences of each topic label
topic_counts = Counter(topic_labels)

# Normalize the counts to get probabilities
total_documents = len(documents)
topic_distribution = {topic: count / total_documents for topic, count in topic_counts.items()}

# Populate the dictionary with the topic distribution for each document
for i, label in enumerate(topic_labels):
    document_topic_distribution_dict[f"Document {i+1}"][f"Topic {label}"] = topic_distribution[label]

# Print the dictionary
for document, distribution in document_topic_distribution_dict.items():
    print(document, ":", distribution)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Convert the dictionary to a DataFrame for visualization
df_distribution = pd.DataFrame(document_topic_distribution_dict).T.fillna(0)

# Plot the heatmap
plt.figure(figsize=(10, 8))
sns.heatmap(df_distribution, cmap="YlGnBu", annot=True, fmt=".3f", cbar=False)
plt.title("Probability Distribution of Topics Across Documents")
plt.xlabel("Topic")
plt.ylabel("Document")
plt.show()


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

# Compute pairwise cosine similarity between document embeddings
def compute_coherence_score(embeddings, labels):
    num_topics = len(set(labels))
    coherence_scores = []
    
    for i in range(num_topics):
        # Find documents assigned to the current topic
        topic_documents = embeddings[labels == i]
        
        # Compute pairwise cosine similarity between documents in the same topic
        similarity_matrix = cosine_similarity(topic_documents)
        
        # Compute the average pairwise similarity for the topic
        average_similarity = similarity_matrix.mean()
        
        # Append the coherence score for the topic
        coherence_scores.append(average_similarity)
    
    # Compute the overall coherence score as the mean of the coherence scores for all topics
    overall_coherence_score = sum(coherence_scores) / num_topics
    return overall_coherence_score

# Compute coherence score
coherence_score = compute_coherence_score(flat_embeddings, topic_labels)
print("Coherence Score:", coherence_score)

In [None]:
# Dictionary mapping topic numbers to labels
topics_word_distribution = top_words_distribution_dict

topic_labels = {
   
    
}

# Replace topic numbers with labels
labeled_document_topic = {}

for docs, topic in document_topic_labels.items():
    labeled_document_topic[docs] = topic_labels[topic]

# Print the labeled topic distribution
print(labeled_document_topic)