# DS 542 Document Clustering

## Setup

### Imports

In [1]:
import json

import matplotlib.pyplot as plt
from find_dataset import locate
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from tqdm.auto import tqdm

# Directory holding the preprocessed JSON datasets. `locate` is a project helper
# imported from find_dataset; presumably it searches for a folder named
# "Datasets" and returns a path-like object -- TODO confirm against find_dataset.
DATA = locate("Datasets") / "processed"

# Echo the resolved location so a wrong working directory is obvious immediately.
print(f"Datasets: {DATA}")

Datasets: /home/cethan/GitHub/542-LegalContract-AI/Datasets/processed


  from .autonotebook import tqdm as notebook_tqdm


### Reading Data

In [2]:
# Load both preprocessed corpora. Later cells read the "id", "file_name" and
# "text" keys of every record in these lists.
# The encoding is pinned to UTF-8 (the JSON interchange encoding) so decoding
# does not depend on the platform's locale default.
with open(DATA / "contractnli.json", encoding="utf-8") as f:
    cnli = json.load(f)

with open(DATA / "cuad.json", encoding="utf-8") as f:
    cuad = json.load(f)

In [3]:
# Raw contract text of every record, kept in the same order as the source lists.
cuad_text = [record["text"] for record in cuad]
cnli_text = [record["text"] for record in cnli]

### Helper Functions

In [4]:
def plot_top_words(model, feature_names, n_top_words, title):
    """Plot the top words for each topic in the model.

    One horizontal bar chart is drawn per topic. Panels are laid out five per
    row, so a five-topic model yields a single 1x5 row and larger models get
    additional rows instead of overflowing the grid.

    Parameters
    ----------
    model : object
        The topic model (NMF, LDA, etc.) that contains the components_ attribute.
        Each row of components_ must support ``argsort`` and integer-array
        indexing (e.g. a NumPy array).
    feature_names : array-like
        The names of the features (words) corresponding to each component.
        Any integer-indexable sequence works (list or NumPy array).
    n_top_words : int
        Number of top words to display for each topic. Values <= 0 plot no words.
    title : str
        Title for the plot

    Returns
    -------
    None
        Displays a matplotlib figure with subplots for each topic. Nothing is
        drawn when the model has no topics.

    """
    n_topics = len(model.components_)
    if n_topics == 0:
        # Nothing to plot; avoid creating an empty figure.
        return

    n_cols = 5
    n_rows = -(-n_topics // n_cols)  # ceiling division
    # squeeze=False guarantees a 2-D array of axes regardless of grid shape.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(12, 5 * n_rows), sharex=True, squeeze=False)
    axes = axes.flatten()

    for topic_idx, topic in enumerate(model.components_):
        order = topic.argsort()
        # Explicit start index: a plain ``[-n_top_words:]`` would select *all*
        # words when n_top_words == 0.
        top_features_ind = order[max(len(order) - n_top_words, 0):]
        # Index element-wise so plain Python lists of names are accepted too.
        top_features = [feature_names[i] for i in top_features_ind]
        weights = topic[top_features_ind]

        ax = axes[topic_idx]
        ax.barh(top_features, weights, height=0.7)
        ax.set_title(f"Topic {topic_idx + 1}", fontdict={"fontsize": 14})
        ax.tick_params(axis="both", which="major", labelsize=12)
        for side in ["top", "right", "left"]:
            ax.spines[side].set_visible(False)

    # Hide grid cells that have no topic assigned to them.
    for spare_ax in axes[n_topics:]:
        spare_ax.set_visible(False)

    fig.suptitle(title, fontsize=14)
    plt.subplots_adjust(top=0.90, bottom=0.05, wspace=0.90, hspace=0.3)
    plt.show()

In [5]:
def classify_text(text, vectorizer, lda_model):
    """Infer the topic mixture of a single text with a fitted LDA pipeline.

    Parameters
    ----------
    text : str
        The text to classify
    vectorizer : CountVectorizer
        The fitted vectorizer used to turn the text into term counts
    lda_model : LatentDirichletAllocation
        The fitted LDA model

    Returns
    -------
    topic_probs : numpy array
        Probability distribution over topics
    dominant_topic : int
        The index of the topic with the highest probability

    """
    # The vectorizer works on collections, so wrap the text in a one-item list.
    term_counts = vectorizer.transform([text])

    # Only one document was submitted; take its row of the result.
    distribution = lda_model.transform(term_counts)[0]

    return distribution, distribution.argmax()

In [6]:
def create_process(
    dataset: list,
    n_features: int = 1000,
    n_components: int = 5,
    n_top_words: int = 10,
    batch_size: int = 128,
    init: str = "nndsvda",
):
    """Fit a bag-of-words vectorizer and an online LDA topic model on a corpus.

    Parameters
    ----------
    dataset : list
        The texts to fit on, one string per document.
    n_features : int
        Maximum vocabulary size kept by the vectorizer.
    n_components : int
        Number of LDA topics.
    n_top_words : int
        Unused by this function; accepted so existing callers keep working.
    batch_size : int
        Number of documents per online-learning mini-batch of the LDA model.
    init : str
        Unused by this function; accepted so existing callers keep working.
        NOTE(review): "nndsvda" looks like a leftover from an NMF variant -- confirm.

    Returns
    -------
    lda : LatentDirichletAllocation
        The fitted topic model.
    vectorizer : CountVectorizer
        The fitted vectorizer (English stop words removed, terms present in
        more than 95% of the documents dropped).

    """
    vectorizer = CountVectorizer(max_df=0.95, min_df=1, max_features=n_features, stop_words="english")
    # Learn the vocabulary and build the count matrix in a single pass
    # instead of tokenising the corpus twice.
    tf = vectorizer.fit_transform(dataset)

    lda = LatentDirichletAllocation(
        n_components=n_components,
        max_iter=10,
        learning_method="online",
        learning_offset=50.0,
        # Forward the caller's value; it was previously accepted but ignored.
        batch_size=batch_size,
        random_state=542,  # fixed seed for reproducible topics
        n_jobs=-1,
    )

    lda.fit(tf)

    return lda, vectorizer

## Vectorizer and LDA

In [7]:
# Hyperparameters shared by the corpus-level model below and by the
# per-document models fitted in the next cell.
n_features = 1000
n_components = 5
# NOTE(review): n_top_words and init are forwarded to create_process, whose body
# never reads them; "nndsvda" looks like an NMF initialisation option -- confirm
# whether these two settings can be dropped.
n_top_words = 10
batch_size = 128
init = "nndsvda"

# Fit one vectorizer + LDA model on the combined CUAD and ContractNLI texts;
# it provides the document-level topic labels.
doc_lda, doc_vec = create_process(
    cuad_text + cnli_text,
    n_features=n_features,
    n_components=n_components,
    n_top_words=n_top_words,
    batch_size=batch_size,
    init=init,
)

In [8]:
# One output record per document: a document-level topic label from the
# corpus-wide model plus a topic label for every non-blank line.
full_process = []

for doc in tqdm(cuad + cnli):
    # Only the dominant topic is stored; the probability vector is discarded.
    _, doc_dominant_topic = classify_text(doc["text"], doc_vec, doc_lda)

    temp = {
        "id": doc["id"],
        "file_name": doc["file_name"],
        "full_text": doc["text"],
        "full_text_topic_num": n_components,
        "full_text_topic_label": int(doc_dominant_topic),
        "lines": [],
        "lines_topic_num": n_components,
        "line_topic_label": [],
    }

    # Keep only lines that contain something other than whitespace.
    doc_lines = [line for line in doc["text"].split("\n") if line.strip()]

    # Zero or one usable line: there is nothing to fit a per-document model on.
    # A single line gets label 0; an empty document keeps both lists empty
    # rather than crashing in the vectorizer on an empty corpus.
    if len(doc_lines) <= 1:
        temp["lines"].extend(doc_lines)
        temp["line_topic_label"].extend([0] * len(doc_lines))
        full_process.append(temp)
        continue

    # Fit a separate model on this document's lines. The resulting line labels
    # index the topics of this document's own model, not those of doc_lda.
    temp_lda, temp_vec = create_process(
        doc_lines,
        n_features=n_features,
        n_components=n_components,
        n_top_words=n_top_words,
        batch_size=batch_size,
        init=init,
    )

    for line in doc_lines:
        _, line_dominant_topic = classify_text(line, temp_vec, temp_lda)

        temp["lines"].append(line)
        temp["line_topic_label"].append(int(line_dominant_topic))

    full_process.append(temp)

100%|██████████| 1117/1117 [26:53<00:00,  1.44s/it] 


In [9]:
# Persist every processed record next to the input datasets.
complete_path = DATA / "complete.json"
with open(complete_path, "w") as out_file:
    json.dump(full_process, out_file, indent=4)