In [3]:
### static variables
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.preprocessing import normalize
import os
import numpy as np
from typing import List
import pandas as pd
import ast

# Column-name lists for the document tables. None of them is referenced in the
# visible cells; presumably they mirror schemas used by the `utils` helpers
# (TODO confirm).

# Columns of the base documents table.
COLUMNS_DOCS = [
    "doc_id", "language", "domain", "content",
    "company_name", "court_name", "hospital_patient_name",
]

# Same columns as COLUMNS_DOCS followed by a single back-reference column.
COLUMNS_DOCS_MANIPULATED_TEXTUAL = COLUMNS_DOCS + ["original_doc_id"]

# Variant whose entity and back-reference columns use plural names.
COLUMNS_DOCS_MANIPULATED_TABULAR = [
    "doc_id", "language", "domain", "content",
    "company_names", "court_names", "hospital_patient_names",
    "original_doc_ids",
]

In [None]:
from utils import io_helpers

# Load the document table via the project helper and pull out the raw texts.
docs_df = io_helpers.get_documents()
docs_list = docs_df["content"].tolist()

# TF-IDF features: English stop words removed, terms appearing in more than
# 95% of the documents dropped, no lower document-frequency cut-off (min_df=1).
tfidf_vectorizer = TfidfVectorizer(
    stop_words="english",
    max_df=0.95,
    min_df=1,
)
tfidf_features = tfidf_vectorizer.fit_transform(docs_list)

# Vocabulary terms in feature-column order; its length is the vocabulary size.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"Size of vocabulary: {len(tfidf_feature_names)}")

Size of vocabulary: 6956


In [None]:
# ## NMF (perfect conditions)
# nmf = NMF(n_components=108, init="nndsvda", max_iter=400)
# nmf = nmf.fit(tfidf_features[:108])
# nmf_data = nmf.transform(tfidf_features)
# nmf_data_normalised = normalize(nmf_data, norm="l1", axis=1)

In [5]:
### NMF (only no. of topics known)
# Factorise the TF-IDF matrix into 108 topics. `random_state` is pinned so that
# a Restart & Run All reproduces the same factorisation (the fit was previously
# unseeded).
nmf = NMF(n_components=108, init="nndsvda", max_iter=400, random_state=42)
nmf_data = nmf.fit_transform(tfidf_features)
# L1-normalise every document row so its topic weights sum to 1 (all-zero rows
# are left as zeros by `normalize`).
nmf_data_normalised = normalize(nmf_data, norm="l1", axis=1)

In [14]:
### LSA
# Truncated SVD of the TF-IDF matrix with 108 components. The default solver is
# randomized, so `random_state` is pinned to make the projection reproducible.
lsa = TruncatedSVD(n_components=108, random_state=42)
lsa_data = lsa.fit_transform(tfidf_features)
# L2-normalise every document row to unit length. Entries may be negative, so
# these rows are NOT probability distributions.
lsa_data_normalised = normalize(lsa_data, norm="l2", axis=1)

In [15]:
### LDA
# LDA with 108 topics. The model is randomly initialised, so `random_state` is
# pinned to make the document-topic matrix reproducible across kernel restarts.
lda = LatentDirichletAllocation(n_components=108, random_state=42)
# NOTE(review): passing `normalize=True` explicitly requires a scikit-learn
# release whose `fit_transform` accepts that keyword — confirm the pinned version.
lda_data_normalised = lda.fit_transform(tfidf_features, normalize=True)

In [11]:
### helper functions (1/2)
def calc_topics(row, top_k=10):
    """Pick the topic indices assigned to one document row.

    Rows whose ``original_doc_ids`` is a list with more than one entry get
    their ``top_k`` highest-weighted topics (strongest first); every other row
    gets only its single highest-weighted topic.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            ``"original_doc_ids"`` and ``"doc_vector"``.
        top_k: Number of topics returned for rows with several original
            documents. Defaults to 10, the previously hard-coded value.

    Returns:
        list[int]: Topic indices as plain Python ints.

    Raises:
        ValueError: If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        # A slice of `[-0:]` would silently return *all* topics.
        raise ValueError("top_k must be at least 1")

    source_ids = row["original_doc_ids"]
    if isinstance(source_ids, list) and len(source_ids) > 1:
        # argsort is ascending: take the tail and reverse it for strongest-first.
        return np.argsort(row["doc_vector"])[-top_k:][::-1].tolist()

    # int(...) keeps the element type consistent with the `.tolist()` branch.
    return [int(np.argmax(row["doc_vector"]))]


def calc_topics_for_cumulative_threshold(row, threshold=0.9):
    """Select the strongest topics until their cumulative weight reaches ``threshold``.

    Topics are ranked by descending weight and accumulated; the result contains
    every topic up to and including the first one at which the running sum is
    greater than or equal to ``threshold``. If the threshold is never reached,
    all topics are returned. An empty vector yields an empty list.

    Args:
        row: 1-D topic-weight vector for one document (NumPy array or any
            sequence accepted by ``np.asarray``).
        threshold: Cumulative weight to reach. Defaults to 0.9.

    Returns:
        list[int]: Selected topic indices, strongest first.
    """
    # Accept plain lists as well as arrays; fancy indexing below needs an array.
    weights = np.asarray(row)

    # Rank topics from highest to lowest weight.
    sorted_indices = np.argsort(weights)[::-1]
    cumulative = np.cumsum(weights[sorted_indices])

    # First position where the running sum reaches the threshold. An explicit
    # search (rather than a binary search) stays well-defined even when negative
    # weights make the running sum non-monotonic.
    reached = np.flatnonzero(cumulative >= threshold)
    cutoff = reached[0] if reached.size else cumulative.size

    # Keep everything up to and including the cutoff position.
    return sorted_indices[: cutoff + 1].tolist()

In [9]:
### helper functions (2/2)
def calc_topic_hitrate(row, reference_docs=None):
    """Fraction of a row's original documents that share at least one topic with it.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) providing
            ``"original_doc_ids"`` and ``"topics"``.
        reference_docs: DataFrame with ``"doc_id"`` and ``"topics"`` columns in
            which the original documents are looked up. When omitted, the
            notebook-global ``docs`` frame is used, so that frame must exist
            before the function is called that way.

    Returns:
        ``None`` when ``original_doc_ids`` is not a list, NaN when the list is
        empty, otherwise the mean of the per-original-document hit flags.

    Raises:
        IndexError: If an id in ``original_doc_ids`` has no matching row in the
            reference frame.
    """
    if not isinstance(row["original_doc_ids"], list):
        return None

    original_doc_ids: List[int] = row["original_doc_ids"]
    if not original_doc_ids:
        # Same value np.mean([]) would give, without its RuntimeWarning.
        return np.nan

    if reference_docs is None:
        reference_docs = docs

    # Loop-invariant work hoisted out of the per-id loop.
    topics_row = set(row["topics"])
    reference_ids = reference_docs["doc_id"].astype(int)

    hits = []
    for doc_id in original_doc_ids:
        # First matching row wins; `.iloc[0]` raises IndexError when none match.
        original_row = reference_docs.loc[reference_ids == int(doc_id)].iloc[0]
        hits.append(not topics_row.isdisjoint(original_row["topics"]))

    return np.mean(hits)

In [66]:
### evaluate method
import importlib
from utils import evaluation

# Re-import the module so edits to it are picked up without restarting the kernel.
importlib.reload(evaluation)

# Choose which model's document-topic matrix is evaluated (currently LDA).
transformed_data = lda_data_normalised.copy()

# Work on a copy so the loaded document table stays untouched; each row gets
# its own topic vector.
docs = docs_df.copy()
docs["doc_vector"] = list(transformed_data)

# Assign topics: strongest topics until their cumulative weight reaches 0.55.
docs["topics"] = docs["doc_vector"].apply(calc_topics_for_cumulative_threshold, args=(0.55,))

# docs["len(topics)"] = docs["topics"].apply(len)
# docs["topic_hitrate"] = docs.apply(calc_topic_hitrate, axis=1)
# docs["num_non-zeros_in_vector"] = docs["doc_vector"].apply(lambda v: sum(i > 0 for i in v))

# print(f"Avg. number of topics: {round(docs["len(topics)"].mean(), 2)}")
# docs["doc_vector"] = docs["doc_vector"].apply(lambda v: np.sort(v)[::-1]).apply(lambda v: [round(i, 4) for i in v])

docs, recall = evaluation.evaluate_clusters(docs, "topics")
# Keep only the rows that reference at least one original document.
filtered_docs = docs.loc[docs["original_doc_ids"].apply(lambda x: len(x) > 0)]
print(f"Recall: {round(recall * 100, 2)} %")
# NOTE(review): `[:-3]` assumes the tabular documents are exactly the last three
# rows of `filtered_docs` — confirm, or filter on an explicit column instead.
# Single quotes inside the replacement fields keep these f-strings valid on
# Python versions older than 3.12; the printed text is unchanged.
print(f"Recall on non-tabular docs: {round(filtered_docs['topics_hitrate'][:-3].mean() * 100, 2)} %")
print(f"Average comparisons to make: {evaluation.count_avg_related_docs(docs, 'topics'):.2f}")
filtered_docs

Recall: 100.0 %
Recall on non-tabular docs: 100.0 %
Average comparisons to make: 49.00


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,doc_id,domain,content,original_doc_ids,doc_vector,topics,topics_hitrate
0,100046,Finance,"JetWing Aviation, established on April 15, 200...",[46],"[0.0011499778990178046, 0.0011499778990178046,...","[78, 17]",1.0
1,100071,Finance,"ABC Education Corporation, a publicly listed c...",[71],"[0.001030695364096049, 0.001030695364096049, 0...","[78, 71]",1.0
2,100208,Medical,Hospitalization Record\n\nBasic Information:\n...,[208],"[0.0006295532498659552, 0.0006295532498659552,...","[3, 21]",1.0
3,100119,Law,"**RIVERTON, HAMILTON COURT CRIMINAL JUDGMENT**...",[119],"[0.000918519715422666, 0.000918519715422666, 0...","[35, 71]",1.0
4,100123,Law,**Sterling Quarryville Court**\n\n**Sterling Q...,[123],"[0.0008129004361594237, 0.0008129004361594237,...",[71],1.0
...,...,...,...,...,...,...,...
28,400116,Law,In a significant legal proceeding at the Cedar...,[116],"[0.0008738349685771486, 0.0008738349685771486,...","[7, 71]",1.0
29,400059,Finance,"Retail Emporium, a well-established retail gia...",[59],"[0.0008184300650146137, 0.0008184300650146137,...","[54, 78]",1.0
0,300002,Finance,Month and year when Entertainment Enterprises ...,"[41, 44, 46, 49, 54, 55, 62, 67, 69, 71]","[0.0010194274652340425, 0.0010194274652340425,...","[70, 78]",1.0
1,300003,Law,Amount embezzled by N. Adams in July 2022 acco...,"[111, 113, 119, 122, 123, 124, 129, 131, 135, ...","[0.0016409168834667714, 0.0016409168834667714,...","[71, 9, 102]",1.0
