In [2]:
### static variables

# Column names of the original documents table.
COLUMNS_DOCS = ["doc_id", "language", "domain", "content"] + [
    "company_name",
    "court_name",
    "hospital_patient_name",
]

# Textually manipulated documents: same columns plus the id of the source document.
COLUMNS_DOCS_MANIPULATED_TEXTUAL = COLUMNS_DOCS + ["original_doc_id"]

# Tabular manipulated documents: the first four columns are shared with
# COLUMNS_DOCS, the remaining ones use plural names.
COLUMNS_DOCS_MANIPULATED_TABULAR = COLUMNS_DOCS[:4] + [
    "company_names",
    "court_names",
    "hospital_patient_names",
    "original_doc_ids",
]

In [4]:
### helper functions

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.preprocessing import normalize
import os
import numpy as np
from typing import List
import pandas as pd
import ast


def get_documents(data_dir: str = "data") -> pd.DataFrame:
    """Load the original and the manipulated documents into a single DataFrame.

    Three CSV files below ``data_dir`` are read:

    * ``DRAGONball/en/docs.csv`` (columns ``doc_id``, ``domain``, ``content``)
    * ``additional_data/docs/textual_manipulations_result.csv``
      (additionally ``original_doc_id``, read as nullable ``Int64``)
    * ``additional_data/docs/tabular_manipulations_result.csv``
      (additionally ``original_doc_ids``, parsed from its literal text form)

    The number of rows of each source is printed.

    Args:
        data_dir: Root directory of the data files. The default ``"data"``
            resolves to the same relative paths that were previously hard-coded.

    Returns:
        The three frames stacked in the order original, textual, tabular.
        Columns that a source does not provide are left missing for its rows.
        The result carries a fresh ``0..n-1`` index so that row labels are
        unique (plain concatenation would repeat each source's own labels).
    """
    docs_original = pd.read_csv(
        os.path.join(data_dir, "DRAGONball", "en", "docs.csv"),
        usecols=["doc_id", "domain", "content"],
    )
    docs_manipulated_textual = pd.read_csv(
        os.path.join(data_dir, "additional_data", "docs", "textual_manipulations_result.csv"),
        usecols=["doc_id", "domain", "content", "original_doc_id"],
        dtype={"original_doc_id": "Int64"},
    )
    docs_manipulated_tabular = pd.read_csv(
        os.path.join(data_dir, "additional_data", "docs", "tabular_manipulations_result.csv"),
        usecols=["doc_id", "domain", "content", "original_doc_ids"],
        # literal_eval only accepts Python literals, so the cell text is parsed
        # (e.g. into a list) without executing arbitrary code.
        converters={"original_doc_ids": ast.literal_eval},
    )
    print(f"# original docs: {len(docs_original)}")
    print(f"# manipulated textual docs: {len(docs_manipulated_textual)}")
    print(f"# manipulated tabular docs: {len(docs_manipulated_tabular)}")

    return pd.concat(
        [docs_original, docs_manipulated_textual, docs_manipulated_tabular],
        sort=False,
        ignore_index=True,
    )

In [5]:
# Load all documents (originals first, then the manipulated variants, in the
# order get_documents() concatenates them) and keep the raw texts as a list.
docs_df = get_documents()
docs_list = docs_df["content"].to_list()


# Build the TF-IDF document-term matrix over all documents. English stop words
# are removed and max_df=0.95 / min_df=1 bound the document frequency of terms.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=1, stop_words="english")
tfidf_features = tfidf_vectorizer.fit_transform(docs_list)
# Vocabulary terms of the fitted vectorizer; used later to label topic columns.
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"Size of vocabulary: {len(tfidf_feature_names)}")

# original docs: 108
# manipulated textual docs: 30
# manipulated tabular docs: 3
Size of vocabulary: 6791


In [None]:
### Spacy load
import spacy

# Load the spaCy pipeline that is used for named-entity recognition below.
nlp = spacy.load("en_core_web_sm")

# Entity labels to look up; printed purely as a reference for choosing which
# entity types to keep in the NER step.
labels = [
    "CARDINAL",
    "DATE",
    "EVENT",
    "FAC",
    "GPE",
    "LANGUAGE",
    "LAW",
    "LOC",
    "MONEY",
    "NORP",
    "ORDINAL",
    "ORG",
    "PERCENT",
    "PERSON",
    "PRODUCT",
    "QUANTITY",
    "TIME",
    "WORK_OF_ART",
]
# spacy.explain returns the human-readable description of a label.
for label in labels:
    print(f"{label}: {spacy.explain(label)}")

CARDINAL: Numerals that do not fall under another type
DATE: Absolute or relative dates or periods
EVENT: Named hurricanes, battles, wars, sports events, etc.
FAC: Buildings, airports, highways, bridges, etc.
GPE: Countries, cities, states
LANGUAGE: Any named language
LAW: Named documents made into laws.
LOC: Non-GPE locations, mountain ranges, bodies of water
MONEY: Monetary values, including unit
NORP: Nationalities or religious or political groups
ORDINAL: "first", "second", etc.
ORG: Companies, agencies, institutions, etc.
PERCENT: Percentage, including "%"
PERSON: People, including fictional
PRODUCT: Objects, vehicles, foods, etc. (not services)
QUANTITY: Measurements, as of weight or distance
TIME: Times smaller than a day
WORK_OF_ART: Titles of books, songs, etc.


In [None]:
### NER
from collections import Counter
from sklearn.feature_extraction import DictVectorizer

# Extract named entities from each document and turn them into a sparse
# document x entity count matrix.

# Entity types that are kept as features. A broader alternative is
# {"PERSON", "ORG", "GPE", "PRODUCT", "EVENT", "LAW", "LOC", "WORK_OF_ART"}.
NER_LABELS = {"PERSON", "ORG", "GPE", "PRODUCT"}
# Upper bound on the number of distinct entities kept per document.
MAX_ENTITIES_PER_DOC = 100

entity_lists = []
# nlp.pipe processes the texts as a stream in batches and yields the parsed
# documents in input order, which avoids one full pipeline call per document.
for spacy_doc in nlp.pipe(docs_list):
    entities = [
        # Lower-case and collapse every whitespace run (including line breaks
        # inside an entity span) to one space. Simply deleting "\n" would fuse
        # the neighbouring words into a single, spurious token.
        " ".join(ent.text.lower().split())
        for ent in spacy_doc.ents
        if ent.label_ in NER_LABELS
    ]
    # Keep only the most frequent entities of the document, with their counts.
    entity_lists.append(Counter(dict(Counter(entities).most_common(MAX_ENTITIES_PER_DOC))))

# One column per distinct entity string; values are the per-document counts.
entity_vectorizer = DictVectorizer()
X_entities = entity_vectorizer.fit_transform(entity_lists)
entity_feature_names = entity_vectorizer.get_feature_names_out()

In [191]:
### Combine TF-IDF with NER
from scipy.sparse import hstack

# Column order is TF-IDF terms first, then entity counts; any list of feature
# names for X_combined has to be concatenated in the same order.
# NOTE(review): the entity columns hold raw counts while the TF-IDF columns are
# weighted values, so entity features may dominate the factorisation — confirm
# that this is intended.
X_combined = hstack([tfidf_features, X_entities])

In [192]:
### NMF
# Factorise the combined TF-IDF + entity matrix into 80 non-negative topics.
# A fixed random_state makes the randomised parts of the initialisation/solver
# repeatable, so topic indices stay stable across kernel restarts.
nmf = NMF(n_components=80, init="nndsvda", max_iter=400, random_state=42)
# Alternative kept for reference: fit on TF-IDF only, leaving out the last
# three rows, and then transform all rows.
# nmf = nmf.fit(tfidf_features[:-3])
# nmf_data = nmf.transform(tfidf_features)
nmf_data = nmf.fit_transform(X_combined)
# L1-normalise each row so that the topic weights of a document sum to 1
# (rows that are entirely zero stay zero).
nmf_data_normalised = normalize(nmf_data, norm="l1", axis=1)

In [193]:
def print_top_terms_per_topic(nmf_model, feature_names, n_top_words=10):
    """Print the highest-weighted feature names of every topic, one line each.

    Args:
        nmf_model: Fitted model exposing ``components_`` (one row of feature
            weights per topic).
        feature_names: Sequence of names indexed like the columns of
            ``components_``.
        n_top_words: Number of names to show per topic.
    """
    for topic_number, weights in enumerate(nmf_model.components_, start=1):
        # Feature positions ordered from largest to smallest weight.
        descending = weights.argsort()[::-1]
        joined_terms = " | ".join(feature_names[pos] for pos in descending[:n_top_words])
        print(f"Topic #{topic_number}: {joined_terms}")


# The feature names are concatenated TF-IDF terms first, entity names second,
# matching the column order of the matrix the NMF model was fitted on.
print_top_terms_per_topic(nmf, list(tfidf_feature_names) + list(entity_feature_names), n_top_words=10)

Topic #1: educorp | ai | johnson | alpha schools | collins | global education solutions | lisa collins | project smartlearn | steven johnson | the city of education
Topic #2: jetwing aviation | jetwing aviation's | aerotech avionics | miami | skyflight airlines | project skylink | florida | united states | aviation | jetwing
Topic #3: mediacorp | big pictures | big pictures' | digital media solutions | project vision | whistleblower protection program | nyse | united states | new york | mediacorp
Topic #4: elevate retail inc. | elevate retail | ar | sustainable packaging program | manhattan | sustainability task force | logistics solutions inc. | elevate retail's | the united states | sustainable fashion organizations
Topic #5: innovatetech | ipo | inc. | john smith | nasdaq | artificial intelligence (ai | socialconnect, inc. | project ampere | code of ethics and conduct | initial public offering
Topic #6: energen solutions ltd | energen solutions ltd's | energen solutions ltd. | cfo |

In [194]:
### LSA
# Latent semantic analysis: truncated SVD of the TF-IDF matrix.
# A fixed random_state makes the (by default randomised) SVD solver repeatable.
# NOTE(review): unlike the NMF step, this is fitted on the TF-IDF features only
# (no entity columns) — confirm that this is intended.
lsa = TruncatedSVD(n_components=108, random_state=42)
lsa_data = lsa.fit_transform(tfidf_features)
# Scale every document vector to unit L2 length.
lsa_data_normalised = normalize(lsa_data, norm="l2", axis=1)

In [195]:
### LDA
# Latent Dirichlet allocation on the TF-IDF matrix. A fixed random_state makes
# the randomly initialised fit repeatable across kernel restarts.
lda = LatentDirichletAllocation(n_components=108, random_state=42)
# NOTE(review): the `normalize` keyword of fit_transform presumably requires a
# recent scikit-learn release — confirm the minimum supported version.
lda_data_normalised = lda.fit_transform(tfidf_features, normalize=True)

In [5]:
import kagglehub

# Download latest version of the sentence-encoder model; the call returns the
# local location of the downloaded model files.
path = kagglehub.model_download("google/universal-sentence-encoder/tensorFlow2/cmlm-en-base")

# NOTE(review): `path` is only printed here and is not handed to the Top2Vec
# call that follows — confirm whether the downloaded encoder should be used.
print("Path to model files:", path)

Path to model files: /Users/leon/.cache/kagglehub/models/google/universal-sentence-encoder/tensorFlow2/cmlm-en-base/1


In [None]:
from top2vec import Top2Vec

# Train a Top2Vec model on the raw document texts. Only the documents are
# passed, so every other setting is left at the library default.
model = Top2Vec(documents=docs_list)

2025-05-18 17:25:55,950 - top2vec - INFO - Pre-processing documents for training
2025-05-18 17:25:56,568 - top2vec - INFO - Creating joint document/word embedding
2025-05-18 17:26:01,688 - top2vec - INFO - Creating lower dimension embedding of documents
2025-05-18 17:26:11,220 - top2vec - INFO - Finding dense areas of documents
2025-05-18 17:26:11,240 - top2vec - INFO - Finding topics


In [196]:
### helper functions (1/2)
def merge_original_ids(row):
    """Unify the two "original id" columns of a row into one list-valued field.

    Args:
        row: Mapping/Series with the keys ``original_doc_ids_tmp`` (a list for
            rows that have several originals) and ``original_doc_id_tmp`` (a
            single id or a missing value).

    Returns:
        The list from ``original_doc_ids_tmp`` when it is a list, otherwise a
        one-element list holding ``original_doc_id_tmp`` when that is present,
        otherwise ``pd.NA``.
    """
    multiple_ids = row["original_doc_ids_tmp"]
    if isinstance(multiple_ids, list):
        return multiple_ids

    single_id = row["original_doc_id_tmp"]
    return [single_id] if pd.notna(single_id) else pd.NA


def calc_topics(row, max_topics=10):
    """Pick the topic indices that represent a document.

    Args:
        row: Mapping/Series with ``doc_vector`` (array-like of topic weights)
            and ``original_doc_ids`` (a list of ids, or a non-list value).
        max_topics: How many of the strongest topics to return for documents
            that stem from more than one original. Defaults to 10.

    Returns:
        A list of plain ``int`` topic indices. Documents whose
        ``original_doc_ids`` is a list with more than one entry get their
        ``max_topics`` highest-weighted topics in descending order of weight;
        every other document gets only its single strongest topic.
    """
    doc_vector = np.asarray(row["doc_vector"])
    original_doc_ids = row["original_doc_ids"]

    if isinstance(original_doc_ids, list) and len(original_doc_ids) > 1:
        # Reverse first, then truncate: also behaves correctly for max_topics == 0.
        return np.argsort(doc_vector)[::-1][:max_topics].tolist()

    # int() keeps the element type consistent with the .tolist() branch above.
    return [int(np.argmax(doc_vector))]


def calc_topics_for_cumulative_threshold(row, threshold=0.9):
    """Select the strongest topics until their cumulative weight reaches a threshold.

    Args:
        row: Array-like of topic weights for one document (NumPy array, list, ...).
        threshold: Cumulative weight at which the selection stops.

    Returns:
        NumPy array of topic indices, ordered from highest to lowest weight,
        ending with the first topic at which the running sum is greater than or
        equal to ``threshold``. If the total weight never reaches the
        threshold, all topic indices are returned.
    """
    # Accept any array-like; fancy indexing below needs a NumPy array.
    weights = np.asarray(row)

    # Topic indices ordered from highest to lowest weight.
    sorted_indices = np.argsort(weights)[::-1]

    # Running total of the weights in that order.
    cumulative = np.cumsum(weights[sorted_indices])

    # With the default side="left", searchsorted gives the first position whose
    # running total is >= threshold (or len(cumulative) if none is).
    cutoff = np.searchsorted(cumulative, threshold)

    # Keep everything up to and including that position; slicing past the end
    # simply yields all indices.
    return sorted_indices[: cutoff + 1]

In [197]:
### helper functions (2/2)
def calc_topic_hitrate(row, reference_docs=None):
    """Fraction of a document's originals that share at least one topic with it.

    Args:
        row: Mapping/Series with ``original_doc_ids`` (list of ids, or a
            non-list value for documents without originals) and ``topics``
            (iterable of topic indices of this document).
        reference_docs: DataFrame with the columns ``doc_id`` and ``topics`` in
            which the originals are looked up. Defaults to the module-level
            ``docs`` frame.

    Returns:
        ``None`` when ``original_doc_ids`` is not a list or is empty; otherwise
        the mean over all originals of "this document and the original have a
        topic in common" (a value between 0 and 1).

    Raises:
        IndexError: If an original id is not present in ``reference_docs``.
    """
    original_doc_ids = row["original_doc_ids"]
    if not isinstance(original_doc_ids, list) or not original_doc_ids:
        # No originals to compare against (also avoids taking the mean of nothing).
        return None

    if reference_docs is None:
        reference_docs = docs

    # Both are independent of the individual original, so compute them once.
    row_topics = set(row["topics"])
    reference_ids = reference_docs["doc_id"].astype(int)

    hits = []
    for original_id in original_doc_ids:
        matches = reference_docs.loc[reference_ids == int(original_id), "topics"]
        if matches.empty:
            raise IndexError(f"original doc_id {original_id} not found in the reference documents")
        # A hit means the topic sets overlap in at least one topic.
        hits.append(not row_topics.isdisjoint(matches.iloc[0]))

    return np.mean(hits)

In [198]:
### evaluate method
# Document-topic matrix to evaluate; swap in another method's output here.
transformed_data = nmf_data_normalised

# One row per document: its id, the two raw "original id" columns and its
# topic vector (one row of the document-topic matrix).
docs = pd.DataFrame(
    {
        "doc_id": docs_df["doc_id"].to_list(),
        "original_doc_id_tmp": docs_df["original_doc_id"].to_list(),
        "original_doc_ids_tmp": docs_df["original_doc_ids"].to_list(),
        "doc_vector": list(transformed_data),
    }
)

# Collapse the two temporary id columns into one list-valued column.
docs["original_doc_ids"] = docs.apply(merge_original_ids, axis=1)
docs = docs.drop(["original_doc_id_tmp", "original_doc_ids_tmp"], axis=1)


# Topics per document: strongest topics up to a cumulative weight of 0.95.
docs["topics"] = docs["doc_vector"].apply(calc_topics_for_cumulative_threshold, args=(0.95,))
docs["len(topics)"] = docs["topics"].apply(len)
# Must run after "topics" exists, because the hit rate looks up the topics of
# the original documents in this same frame.
docs["topic_hitrate"] = docs.apply(calc_topic_hitrate, axis=1)
docs["num_non-zeros_in_vector"] = docs["doc_vector"].apply(lambda v: int(np.count_nonzero(v > 0)))

# Single quotes inside the f-string keep this line valid before Python 3.12.
print(f"Avg. number of topics: {round(docs['len(topics)'].mean(), 2)}")
# Display only: from here on "doc_vector" is sorted descending and rounded, so
# its positions no longer correspond to topic indices.
docs["doc_vector"] = docs["doc_vector"].apply(lambda v: np.sort(v)[::-1]).apply(lambda v: [round(i, 4) for i in v])

# Show only the manipulated documents (those that reference an original).
docs.loc[docs["original_doc_ids"].notna()]

Avg. number of topics: 2.13


Unnamed: 0,doc_id,doc_vector,original_doc_ids,topics,len(topics),topic_hitrate,num_non-zeros_in_vector
108,100134,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[134],[18],1,1.0,26
109,100136,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[136],[23],1,1.0,18
110,100139,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[139],[21],1,1.0,10
111,100046,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[46],[1],1,1.0,10
112,100047,"[0.9855, 0.0077, 0.0025, 0.0016, 0.001, 0.0006...",[47],[17],1,1.0,14
113,100179,"[0.973, 0.0249, 0.0014, 0.0006, 0.0001, 0.0, 0...",[179],[79],1,1.0,5
114,100052,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[52],[0],1,1.0,7
115,100181,"[0.9671, 0.0328, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0,...",[181],[73],1,1.0,10
116,100059,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[59],[11],1,1.0,3
117,100066,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",[66],[9],1,1.0,10
