In [1]:
### static variables

COLUMNS_DOCS = [
    "doc_id",
    "language",
    "domain",
    "content",
    "company_name",
    "court_name",
    "hospital_patient_name",
]

COLUMNS_DOCS_MANIPULATED_TEXTUAL = [
    *COLUMNS_DOCS,
    "original_doc_id",
]

COLUMNS_DOCS_MANIPULATED_TABULAR = [
    "doc_id",
    "language",
    "domain",
    "content",
    "company_names",
    "court_names",
    "hospital_patient_names",
    "original_doc_ids",
]

In [2]:
### helper functions

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.preprocessing import normalize
import os
import numpy as np
from typing import List
import pandas as pd
import ast


def get_documents() -> pd.DataFrame:
    docs_original = pd.read_csv("data/DRAGONball/en/docs.csv", usecols=["doc_id", "domain", "content"])
    docs_manipulated_single_textual = pd.read_csv(
        "data/additional_data/docs/textual_manipulations_result.csv",
        usecols=["doc_id", "domain", "content", "original_doc_id"],
        dtype={"original_doc_id": "Int64"},
    )
    docs_manipulated_single_tabular = pd.read_csv(
        "data/additional_data/docs/tabular_manipulations_result.csv",
        usecols=["doc_id", "domain", "content", "original_doc_ids"],
        converters={"original_doc_ids": ast.literal_eval},
    )
    docs_manipulated_multi_textual = pd.read_csv(
        "data/additional_data/docs/multi_textual_manipulations.csv",
        usecols=["doc_id", "domain", "content", "original_doc_id"],
        dtype={"original_doc_id": "Int64"},
    )
    print(f"# original docs: {len(docs_original)}")
    print(f"# manipulated textual docs: {len(docs_manipulated_single_textual)}")
    print(f"# manipulated tabular docs: {len(docs_manipulated_single_tabular)}")
    print(f"# manipulated textual multi docs: {len(docs_manipulated_multi_textual)}")

    return pd.concat(
        [
            docs_original,
            docs_manipulated_single_textual,
            docs_manipulated_multi_textual,
            docs_manipulated_single_tabular,
        ],
        sort=False,
    )

In [None]:
docs_df = get_documents()
docs_list = docs_df["content"].to_list()


tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=1, stop_words="english")
tfidf_features = tfidf_vectorizer.fit_transform(docs_list)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"Size of vocabulary: {len(tfidf_feature_names)}")

ValueError: Usecols do not match columns, columns expected but not found: ['original_doc_id']

In [None]:
### NMF (perfect condictions)
# nmf = NMF(n_components=108, init="nndsvda", max_iter=400)
# nmf = nmf.fit(tfidf_features[:108])
# nmf_data = nmf.transform(tfidf_features)
# nmf_data_normalised = normalize(nmf_data, norm="l1", axis=1)

In [None]:
### NMF (only no. of topics known)
nmf = NMF(n_components=108, init="nndsvda", max_iter=400)
nmf_data = nmf.fit_transform(tfidf_features)
nmf_data_normalised = normalize(nmf_data, norm="l1", axis=1)

In [None]:
### LSA
# lsa = TruncatedSVD(n_components=108)
# lsa_data = lsa.fit_transform(tfidf_features)
# lsa_data_normalised = normalize(lsa_data, norm="l2", axis=1)

In [None]:
### LDA
# lda = LatentDirichletAllocation(n_components=108)
# lda_data_normalised = lda.fit_transform(tfidf_features, normalize=True)

In [None]:
### helper functions (1/2)
def merge_original_ids(row):
    if isinstance(row["original_doc_ids_tmp"], list):
        return row["original_doc_ids_tmp"]
    elif pd.notna(row["original_doc_id_tmp"]):
        return [row["original_doc_id_tmp"]]
    else:
        return pd.NA


def calc_topics(row):
    if isinstance(row["original_doc_ids"], list):
        if len(row["original_doc_ids"]) > 1:
            return np.argsort(row["doc_vector"])[-10:][::-1].tolist()
    return [np.argmax(row["doc_vector"])]


def calc_topics_for_cumulative_threshold(row, threshold=0.9):
    sorted_indices = np.argsort(row)[::-1]

    # Sort the probabilities accordingly
    sorted_probs = row[sorted_indices]

    # Compute cumulative sum
    cumulative = np.cumsum(sorted_probs)

    # Find the cutoff index where cumulative sum first exceeds threshold
    cutoff = np.searchsorted(cumulative, threshold)

    # Select the indices up to and including that point
    selected_indices = sorted_indices[: cutoff + 1]

    return selected_indices.tolist()

In [None]:
### helper functions (2/2)
def calc_topic_hitrate(row):
    if not isinstance(row["original_doc_ids"], list):
        return None

    original_doc_ids: List[int] = row["original_doc_ids"]

    res = []

    for id in original_doc_ids:
        topics_row = set(row["topics"])
        original_row = docs.loc[docs["doc_id"].astype(int) == int(id)].iloc[0]
        topics_original = set(original_row["topics"])
        res.append(len(topics_row.intersection(topics_original)) > 0)

    return np.mean(res)

In [None]:
### evaluate method
import importlib
from utils import evaluation

importlib.reload(evaluation)

transformed_data = nmf_data_normalised

docs = pd.DataFrame(
    {
        "doc_id": docs_df["doc_id"].to_list(),
        "original_doc_id_tmp": docs_df["original_doc_id"].to_list(),
        "original_doc_ids_tmp": docs_df["original_doc_ids"].to_list(),
        "doc_vector": list([doc for doc in transformed_data]),
    }
)

docs["original_doc_ids"] = docs.apply(merge_original_ids, axis=1)
docs = docs.drop(["original_doc_id_tmp", "original_doc_ids_tmp"], axis=1)
docs["topics"] = docs["doc_vector"].apply(calc_topics_for_cumulative_threshold, args=(0.95,))

# docs["len(topics)"] = docs["topics"].apply(len)
# docs["topic_hitrate"] = docs.apply(calc_topic_hitrate, axis=1)
# docs["num_non-zeros_in_vector"] = docs["doc_vector"].apply(lambda v: sum(i > 0 for i in v))

# print(f"Avg. number of topics: {round(docs["len(topics)"].mean(), 2)}")
# docs["doc_vector"] = docs["doc_vector"].apply(lambda v: np.sort(v)[::-1]).apply(lambda v: [round(i, 4) for i in v])


docs, recall = evaluation.evaluate_clusters(docs, "topics")
filtered_docs = docs.loc[docs["original_doc_ids"].notna()]
print(f"Recall: {round(recall * 100, 2)} %")
print(f"Recall on non-tabular docs: {round(filtered_docs["topics_hitrate"][:-3].mean() * 100, 2)} %")
print(f"Average comparisons to make: {evaluation.count_avg_related_docs(docs, "topics"):.2f}")
filtered_docs

Recall: 56.67 %
Recall on non-tabular docs: 85.0 %
Average comparisons to make: 13.87


Unnamed: 0,doc_id,doc_vector,original_doc_ids,topics,topics_hitrate
108,100134,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.000...",[134],[9],1.0
109,100136,"[0.0, 0.0020720391791662052, 0.004018328717750...",[136],"[5, 11, 104, 94, 98, 9, 30, 78, 89]",1.0
110,100139,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[139],[11],1.0
111,100046,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 8.170...",[46],[23],1.0
112,100047,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[47],[10],1.0
...,...,...,...,...,...
166,400110,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.000...",[110],"[45, 105, 54, 103]",1.0
167,400116,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[116],"[70, 36, 105, 54]",1.0
168,300001,"[0.0, 0.0, 0.0, 0.0, 0.004799967831828898, 0.0...","[46, 47, 52, 59, 66, 71, 72, 77, 78, 79]",[26],0.0
169,300002,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[134, 136, 139, 112, 114, 115, 119, 123, 125, ...",[37],0.0
