In [2]:
### static variables

# Column layout of the original documents table.
COLUMNS_DOCS = ["doc_id", "language", "domain", "content", "company_name", "court_name", "hospital_patient_name"]

# Textually manipulated documents: same columns plus the id of the single source document.
COLUMNS_DOCS_MANIPULATED_TEXTUAL = COLUMNS_DOCS + ["original_doc_id"]

# Tabular manipulated documents: the entity and source-id columns use plural names.
COLUMNS_DOCS_MANIPULATED_TABULAR = (
    ["doc_id", "language", "domain", "content"]
    + ["company_names", "court_names", "hospital_patient_names"]
    + ["original_doc_ids"]
)

In [3]:
### helper functions

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import os
import numpy as np
from typing import List
import pandas as pd
import ast


def get_documents() -> pd.DataFrame:
    """Load the original and manipulated documents and stack them into one frame.

    Reads three CSV files relative to the current working directory:

    * ``data/DRAGONball/en/docs.csv`` (columns ``doc_id``, ``domain``, ``content``)
    * ``data/additional_data/docs/textual_manipulations_result.csv``
      (additionally ``original_doc_id``, read as nullable ``Int64``)
    * ``data/additional_data/docs/tabular_manipulations_result.csv``
      (additionally ``original_doc_ids``, parsed with ``ast.literal_eval``)

    The number of rows of each source is printed as a side effect.

    Returns:
        The three frames concatenated in the order original, textual, tabular, with a
        fresh ``0..n-1`` index. Columns missing from a source are filled with missing
        values by ``pd.concat``.
    """
    docs_original = pd.read_csv("data/DRAGONball/en/docs.csv", usecols=["doc_id", "domain", "content"])
    docs_manipulated_textual = pd.read_csv(
        "data/additional_data/docs/textual_manipulations_result.csv",
        usecols=["doc_id", "domain", "content", "original_doc_id"],
        dtype={"original_doc_id": "Int64"},
    )
    docs_manipulated_tabular = pd.read_csv(
        "data/additional_data/docs/tabular_manipulations_result.csv",
        usecols=["doc_id", "domain", "content", "original_doc_ids"],
        # The column is stored as text; literal_eval turns each cell back into a Python literal.
        converters={"original_doc_ids": ast.literal_eval},
    )
    print(f"# original docs: {len(docs_original)}")
    print(f"# manipulated textual docs: {len(docs_manipulated_textual)}")
    print(f"# manipulated tabular docs: {len(docs_manipulated_tabular)}")

    # ignore_index=True: each source frame carries its own 0..n-1 index, so without it the
    # combined frame would contain duplicate index labels and label-based lookups would be ambiguous.
    return pd.concat(
        [docs_original, docs_manipulated_textual, docs_manipulated_tabular],
        sort=False,
        ignore_index=True,
    )

In [4]:
# Load every document (original + manipulated) and keep the raw texts in row order.
docs_df = get_documents()
docs_list = docs_df["content"].to_list()

# Number of NMF components (topics) to extract.
# NOTE(review): 108 matches the number of original docs printed when this notebook was
# last run - presumably one topic per original document; confirm before changing the corpus.
N_TOPICS = 108
# Fixed seed so that topic indices are stable across "Restart & Run All"; NMF accepts
# random_state to control its randomised steps, and without it runs may differ.
NMF_RANDOM_STATE = 42

# TF-IDF over the whole corpus: English stop words removed, terms occurring in more than
# 95% of the documents dropped.
tfidf_vectorizer = TfidfVectorizer(max_df=0.95, min_df=1, stop_words="english")
tfidf_features = tfidf_vectorizer.fit_transform(docs_list)
tfidf_feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"Size of vocabulary: {len(tfidf_feature_names)}")

# Factorise the TF-IDF matrix; row i of transformed_data holds the topic weights of document i.
nmf = NMF(n_components=N_TOPICS, init=None, max_iter=400, random_state=NMF_RANDOM_STATE)
transformed_data = nmf.fit_transform(tfidf_features)

# original docs: 108
# manipulated textual docs: 30
# manipulated tabular docs: 3
Size of vocabulary: 6791


In [10]:
# One row per document: its id, the two raw "source document" columns (merged into a single
# column further down) and its row of the NMF output.
docs = pd.DataFrame(
    {
        "doc_id": docs_df["doc_id"].tolist(),
        "original_doc_id_tmp": docs_df["original_doc_id"].tolist(),
        "original_doc_ids_tmp": docs_df["original_doc_ids"].tolist(),
        # Iterating the 2-D result yields one 1-D weight vector per document.
        "doc_vector": list(transformed_data),
    }
)


def merge_original_ids(row):
    """Combine the two temporary source-id fields of a row into one value.

    Args:
        row: Mapping-like row providing "original_doc_ids_tmp" and "original_doc_id_tmp".

    Returns:
        The "original_doc_ids_tmp" value itself when it is a list; otherwise a one-element
        list holding "original_doc_id_tmp" when that value is not missing; otherwise ``pd.NA``.
    """
    many_ids = row["original_doc_ids_tmp"]
    if isinstance(many_ids, list):
        return many_ids

    single_id = row["original_doc_id_tmp"]
    return [single_id] if pd.notna(single_id) else pd.NA


def calc_max_topics(row, top_k: int = 10):
    """Return the indices of the strongest component(s) of ``row["doc_vector"]``.

    Rows whose "original_doc_ids" is a list with more than one entry get the ``top_k``
    highest-weighted indices, strongest first. Every other row gets a one-element list
    holding the index of the single largest component.

    Args:
        row: Mapping-like row providing "original_doc_ids" and "doc_vector".
        top_k: Number of indices returned for multi-source rows. Defaults to 10, the
            value that used to be hard-coded.

    Returns:
        A list of plain Python ints.
    """
    source_ids = row["original_doc_ids"]
    if isinstance(source_ids, list) and len(source_ids) > 1:
        # argsort is ascending, so reverse first and then keep the head; this also avoids
        # the "[-0:] selects everything" pitfall of slicing from the end.
        # NOTE(review): when fewer than top_k components are non-zero, the remaining slots
        # are filled with zero-weight indices in argsort order - confirm this is intended.
        return np.argsort(row["doc_vector"])[::-1][:top_k].tolist()
    # int(...) so that both branches yield plain ints instead of a numpy scalar here.
    return [int(np.argmax(row["doc_vector"]))]


# Fold the two temporary id columns into a single "original_doc_ids" column, then drop them.
docs = docs.assign(original_doc_ids=docs.apply(merge_original_ids, axis=1)).drop(
    columns=["original_doc_id_tmp", "original_doc_ids_tmp"]
)
# Strongest topic index/indices per document (needs the merged id column from the step above).
docs = docs.assign(max_topics=docs.apply(calc_max_topics, axis=1))

docs

Unnamed: 0,doc_id,doc_vector,original_doc_ids,max_topics
0,40,"[0.0, 4.39169452731574e-06, 0.0, 0.0, 0.0, 0.0...",,[43]
1,41,"[8.27025394173132e-08, 0.0, 0.0, 0.0, 0.0, 7.2...",,[31]
2,42,"[4.0572547720270834e-08, 0.0, 0.0, 0.0, 0.0, 0...",,[105]
3,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 2.591977125729418e-0...",,[72]
4,44,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.002398445858402711...",,[25]
...,...,...,...,...
136,100125,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.000309989180716783...",[125],[51]
137,100127,"[1.4773751654683805e-06, 0.0, 1.05291937734843...",[127],[15]
138,300001,"[0.0, 4.766330141227765e-08, 0.0, 0.0, 1.52213...","[46, 47, 52, 59, 66, 71, 72, 77, 78, 79]","[87, 42, 4, 22, 73, 37, 9, 79, 1, 102]"
139,300002,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[134, 136, 139, 112, 114, 115, 119, 123, 125, ...","[80, 34, 105, 107, 103, 102, 101, 100, 99, 98]"


In [None]:
def calc_same_topic(row):
    """Share of a row's "max_topics" that coincide with a top topic of its source documents.

    For every id in the row's "original_doc_ids" the first matching row of the module-level
    ``docs`` frame is looked up by "doc_id", and the first entry of its "max_topics" is taken
    as that source document's topic.

    Args:
        row: Mapping-like row providing "original_doc_ids" and "max_topics".

    Returns:
        ``None`` when "original_doc_ids" is not a list; otherwise the mean of the per-topic
        membership flags, as computed by ``np.mean``.
    """
    source_ids = row["original_doc_ids"]
    if not isinstance(source_ids, list):
        return None

    # Top topic of each source document, in the order the source ids are listed.
    source_topics = []
    for source_id in source_ids:
        source_row = docs[docs["doc_id"] == source_id].iloc[0]
        source_topics.append(int(source_row["max_topics"][0]))

    hits = [topic in source_topics for topic in row["max_topics"]]
    return np.mean(hits)


# Score every row; rows without source documents get None from calc_same_topic.
docs["is_same_topic"] = docs.apply(calc_same_topic, axis=1)

# Show only the rows that have source documents. This used to be a bare expression in the
# middle of the cell, whose value the notebook discards (only the last expression of a cell
# is rendered). display() is the builtin IPython provides in notebook sessions.
display(docs.loc[docs["original_doc_ids"].notna()])

docs

Unnamed: 0,doc_id,doc_vector,original_doc_ids,max_topics,is_same_topic
0,40,"[0.0, 4.39169452731574e-06, 0.0, 0.0, 0.0, 0.0...",,[43],
1,41,"[8.27025394173132e-08, 0.0, 0.0, 0.0, 0.0, 7.2...",,[31],
2,42,"[4.0572547720270834e-08, 0.0, 0.0, 0.0, 0.0, 0...",,[105],
3,43,"[0.0, 0.0, 0.0, 0.0, 0.0, 2.591977125729418e-0...",,[72],
4,44,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.002398445858402711...",,[25],
...,...,...,...,...,...
136,100125,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.000309989180716783...",[125],[51],1.0
137,100127,"[1.4773751654683805e-06, 0.0, 1.05291937734843...",[127],[15],1.0
138,300001,"[0.0, 4.766330141227765e-08, 0.0, 0.0, 1.52213...","[46, 47, 52, 59, 66, 71, 72, 77, 78, 79]","[87, 42, 4, 22, 73, 37, 9, 79, 1, 102]",0.4
139,300002,"[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...","[134, 136, 139, 112, 114, 115, 119, 123, 125, ...","[80, 34, 105, 107, 103, 102, 101, 100, 99, 98]",0.0
