In [3]:
### static variables
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.preprocessing import normalize
import os
import numpy as np
from typing import List
import pandas as pd
import ast

# Base column names of the document table.
COLUMNS_DOCS = [
    "doc_id", "language", "domain", "content",
    "company_name", "court_name", "hospital_patient_name",
]

# Same columns as COLUMNS_DOCS, followed by a single "original_doc_id" column.
COLUMNS_DOCS_MANIPULATED_TEXTUAL = COLUMNS_DOCS + ["original_doc_id"]

# Variant whose entity / origin columns carry plural names.
COLUMNS_DOCS_MANIPULATED_TABULAR = [
    "doc_id", "language", "domain", "content",
    "company_names", "court_names", "hospital_patient_names",
    "original_doc_ids",
]

In [None]:
from utils import io_helpers

# Load the documents via the project helper (presumably a pandas DataFrame —
# it is indexed by column and copied with .copy() further down) and pull the
# raw text of the "content" column into a plain Python list.
docs_df = io_helpers.get_documents()
docs_list = docs_df["content"].to_list()


# Fit TF-IDF on every document. English stop words are removed and terms that
# occur in more than 95 % of the documents are dropped (max_df=0.95);
# min_df=1 keeps every remaining term, however rare.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=1, stop_words="english")
# Document-term matrix: one row per entry of docs_list, one column per term.
tfidf_features = tfidf_vectorizer.fit_transform(docs_list)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"Size of vocabulary: {len(tfidf_feature_names)}")

Size of vocabulary: 6956


In [None]:
# ## NMF (perfect conditions)
# nmf = NMF(n_components=108, init="nndsvda", max_iter=400)
# nmf = nmf.fit(tfidf_features[:108])
# nmf_data = nmf.transform(tfidf_features)
# nmf_data_normalised = normalize(nmf_data, norm="l1", axis=1)

In [5]:
### NMF (only no. of topics known)
N_TOPICS = 108  # number of NMF components (topics) to extract
RANDOM_STATE = 42  # fixed seed: the "nndsvda" init relies on a randomised SVD

# Factorise the TF-IDF matrix; each row of nmf_data holds one document's
# non-negative weights over the N_TOPICS components.
nmf = NMF(
    n_components=N_TOPICS,
    init="nndsvda",
    max_iter=400,
    random_state=RANDOM_STATE,
)
nmf_data = nmf.fit_transform(tfidf_features)

# L1-normalise every row so its weights sum to 1 (all-zero rows stay zero).
nmf_data_normalised = normalize(nmf_data, norm="l1", axis=1)

In [6]:
### LSA
# lsa = TruncatedSVD(n_components=108)
# lsa_data = lsa.fit_transform(tfidf_features)
# lsa_data_normalised = normalize(lsa_data, norm="l2", axis=1)

In [7]:
### LDA
# lda = LatentDirichletAllocation(n_components=108)
# lda_data_normalised = lda.fit_transform(tfidf_features, normalize=True)

In [11]:
### helper functions (1/2)
def calc_topics(row, top_k=10):
    """Pick the topic indices assigned to one document row.

    Rows whose ``original_doc_ids`` is a list with more than one entry receive
    the ``top_k`` highest-weighted indices of ``doc_vector`` (strongest
    first). Every other row receives only the single strongest index.

    Args:
        row: Mapping-like row (e.g. a DataFrame row) providing the keys
            ``"original_doc_ids"`` and ``"doc_vector"``.
        top_k: Number of indices returned for multi-source rows. Defaults to
            10, the value that used to be hard-coded.

    Returns:
        list[int]: Topic indices as plain Python ints in both branches.
    """
    doc_vector = np.asarray(row["doc_vector"])
    original_doc_ids = row["original_doc_ids"]

    if isinstance(original_doc_ids, list) and len(original_doc_ids) > 1:
        # Reverse first, then slice: identical to [-top_k:][::-1] for
        # top_k >= 1, but top_k == 0 yields [] instead of the whole ranking.
        return np.argsort(doc_vector)[::-1][:top_k].tolist()

    # int(...) keeps the element type consistent with the .tolist() branch
    # (np.argmax alone would return np.int64).
    return [int(np.argmax(doc_vector))]


def calc_topics_for_cumulative_threshold(row, threshold=0.9):
    """Select the strongest topics until their summed weight reaches ``threshold``.

    The weights are ranked from largest to smallest and accumulated; the
    result contains every index up to and including the first one at which
    the running sum is greater than or equal to ``threshold``.

    If the running sum never reaches ``threshold`` (for example an all-zero
    vector), every index is returned.

    Args:
        row: 1-D sequence of topic weights (NumPy array, list, ...).
        threshold: Cumulative weight to cover. Defaults to 0.9.

    Returns:
        list[int]: Selected topic indices, strongest first.
    """
    # Accept plain lists as well: fancy indexing below requires an ndarray.
    weights = np.asarray(row)

    # Indices ordered from the largest weight to the smallest.
    sorted_indices = np.argsort(weights)[::-1]

    # Running total of the weights in that order.
    cumulative = np.cumsum(weights[sorted_indices])

    # First position whose running total is >= threshold (side="left").
    cutoff = np.searchsorted(cumulative, threshold)

    # Keep everything up to and including that position; slicing past the
    # end is harmless when the threshold is never reached.
    return sorted_indices[: cutoff + 1].tolist()

In [9]:
### helper functions (2/2)
def calc_topic_hitrate(row):
    """Fraction of a row's original documents that share a topic with it.

    For every id in ``row["original_doc_ids"]`` the matching document is
    looked up in the notebook-level ``docs`` frame (by integer ``doc_id``);
    it counts as a hit when its ``topics`` overlap with ``row["topics"]``.

    Args:
        row: Mapping-like row providing ``"original_doc_ids"`` and
            ``"topics"``.

    Returns:
        ``None`` when ``original_doc_ids`` is not a list, ``nan`` when the
        list is empty, otherwise the mean of the per-original hit flags
        (a value between 0 and 1).

    Raises:
        IndexError: If an original id has no matching row in ``docs``.
    """
    original_doc_ids = row["original_doc_ids"]
    if not isinstance(original_doc_ids, list):
        return None
    if not original_doc_ids:
        # np.mean([]) yields the same NaN but emits a "Mean of empty slice"
        # RuntimeWarning; short-circuit to keep the output clean.
        return np.nan

    # Loop-invariant work, done once instead of once per original id.
    topics_row = set(row["topics"])
    doc_ids = docs["doc_id"].astype(int)

    hits = []
    for original_id in original_doc_ids:
        original_row = docs.loc[doc_ids == int(original_id)].iloc[0]
        hits.append(not topics_row.isdisjoint(original_row["topics"]))

    return np.mean(hits)

In [13]:
### evaluate method
import importlib
from utils import evaluation

# Pick up edits made to the evaluation module without restarting the kernel.
importlib.reload(evaluation)

transformed_data = nmf_data_normalised

# Work on a copy so docs_df stays untouched and this cell can be re-run.
docs = docs_df.copy()
docs["doc_vector"] = list(transformed_data)
# Assign each document the strongest topics covering 95 % of its weight.
docs["topics"] = docs["doc_vector"].apply(calc_topics_for_cumulative_threshold, args=(0.95,))

# docs["len(topics)"] = docs["topics"].apply(len)
# docs["topic_hitrate"] = docs.apply(calc_topic_hitrate, axis=1)
# docs["num_non-zeros_in_vector"] = docs["doc_vector"].apply(lambda v: sum(i > 0 for i in v))

# print(f"Avg. number of topics: {round(docs['len(topics)'].mean(), 2)}")
# docs["doc_vector"] = docs["doc_vector"].apply(lambda v: np.sort(v)[::-1]).apply(lambda v: [round(i, 4) for i in v])

docs, recall = evaluation.evaluate_clusters(docs, "topics")
filtered_docs = docs.loc[docs["original_doc_ids"].notna()]

# NOTE(review): the slice below presumably drops the tabular documents,
# assumed to be the last rows of the frame — confirm the count and the row
# order against io_helpers.get_documents().
N_TRAILING_TABULAR_DOCS = 3

# Values are computed outside the f-strings: reusing double quotes inside a
# double-quoted f-string is a SyntaxError before Python 3.12.
print(f"Recall: {round(recall * 100, 2)} %")
non_tabular_recall = filtered_docs["topics_hitrate"][:-N_TRAILING_TABULAR_DOCS].mean()
print(f"Recall on non-tabular docs: {round(non_tabular_recall * 100, 2)} %")
avg_comparisons = evaluation.count_avg_related_docs(docs, "topics")
print(f"Average comparisons to make: {avg_comparisons:.2f}")
filtered_docs

Recall: 62.22 %
Recall on non-tabular docs: 91.67 %
Average comparisons to make: 13.72


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,doc_id,domain,content,original_doc_ids,doc_vector,topics,topics_hitrate
0,40,Finance,Acme Government Solutions is a government indu...,[],"[0.0, 0.028452418256950295, 0.0, 0.0, 0.009268...","[18, 88, 93, 84, 17, 98, 79, 60, 44, 34, 1, 76...",
1,41,Finance,Entertainment Enterprises Inc. is an entertain...,[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 6.0549910910951...",[11],
2,42,Finance,"Advanced Manufacturing Solutions Inc., establi...",[],"[0.0, 0.00963491193391321, 0.00073682970161823...","[36, 107, 77, 44, 89, 31, 34, 76, 88, 22, 73, ...",
3,43,Finance,"EcoGuard Solutions, established on April 15, 2...",[],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[98],
4,44,Finance,"Green Fields Agriculture Ltd., established on ...",[],"[0.0, 0.013276668909498765, 0.0, 0.0, 0.841487...","[4, 77, 42, 51, 1, 63, 23]",
...,...,...,...,...,...,...,...
28,400116,Law,In a significant legal proceeding at the Cedar...,[116],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.004125167297731591...","[13, 33, 43, 40, 104, 58, 96]",1.0
29,400059,Finance,"Retail Emporium, a well-established retail gia...",[59],"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[58],0.0
0,300002,Finance,Month and year when Entertainment Enterprises ...,"[41, 44, 46, 49, 54, 55, 62, 67, 69, 71]","[0.0, 0.0, 0.0, 0.0, 0.00018629520278044213, 0...",[96],0.0
1,300003,Law,Amount embezzled by N. Adams in July 2022 acco...,"[111, 113, 119, 122, 123, 124, 129, 131, 135, ...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[10, 28]",0.1
