In [69]:
### helper functions

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF, TruncatedSVD, LatentDirichletAllocation
from sklearn.preprocessing import normalize
import os
import numpy as np
from typing import List
import pandas as pd
import ast
import csv


def _load_textual_manipulations(path: str) -> pd.DataFrame:
    """Load one textual-manipulation CSV and normalise its origin column.

    The file stores a single, possibly missing, ``original_doc_id`` per row.
    It is converted to a list (one element, or empty when missing) and the
    column is renamed to ``original_doc_ids`` so the frame has the same
    column name as the tabular manipulations file.
    """
    docs = pd.read_csv(
        path,
        usecols=["doc_id", "domain", "content", "original_doc_id"],
        # Nullable integer dtype keeps ids integral even when some are missing.
        dtype={"original_doc_id": "Int64"},
    )
    docs["original_doc_id"] = docs["original_doc_id"].apply(lambda i: [i] if pd.notna(i) else [])
    return docs.rename(columns={"original_doc_id": "original_doc_ids"})


def get_documents(print_info: bool = False) -> pd.DataFrame:
    """Load the original documents and all manipulated variants into one frame.

    Parameters:
    - print_info: when truthy, print the number of rows read from each source.

    Returns:
    - DataFrame with the rows of the four sources stacked in the order
      original, single textual, multi textual, tabular. The original
      documents file is read without an ``original_doc_ids`` column, so that
      column is NaN for those rows. Each source keeps its own row labels, so
      index values repeat across sources.
    """
    docs_original = pd.read_csv("data/DRAGONball/en/docs.csv", usecols=["doc_id", "domain", "content"])
    docs_manipulated_single_textual = _load_textual_manipulations(
        "data/additional_data/docs/textual_manipulations_result.csv"
    )

    # The tabular file already stores a list literal per row; parse it back into a list.
    docs_manipulated_single_tabular = pd.read_csv(
        "data/additional_data/docs/tabular_manipulations_result.csv",
        usecols=["doc_id", "domain", "content", "original_doc_ids"],
        converters={"original_doc_ids": ast.literal_eval},
    )

    docs_manipulated_multi_textual = _load_textual_manipulations(
        "data/additional_data/docs/multi_textual_manipulations.csv"
    )

    if print_info:
        print(f"# original docs: {len(docs_original)}")
        print(f"# manipulated textual docs: {len(docs_manipulated_single_textual)}")
        print(f"# manipulated tabular docs: {len(docs_manipulated_single_tabular)}")
        print(f"# manipulated textual multi docs: {len(docs_manipulated_multi_textual)}")

    return pd.concat(
        [
            docs_original,
            docs_manipulated_single_textual,
            docs_manipulated_multi_textual,
            docs_manipulated_single_tabular,
        ],
        sort=False,
    )


# Build the combined corpus and print how many rows each source contributed.
documents = get_documents(print_info=True)

# original docs: 108
# manipulated textual docs: 30
# manipulated tabular docs: 3
# manipulated textual multi docs: 30


In [45]:
### Get embeddings
from openai import OpenAI, RateLimitError
from dotenv import load_dotenv
from tenacity import retry, retry_if_exception_type, wait_random, stop_after_attempt

# Load environment variables from the ".env" file in the current user's home
# directory, so the notebook is not tied to one machine's absolute path.
# NOTE(review): presumably this file provides the OpenAI API key — confirm.
load_dotenv(os.path.expanduser("~/.env"))
client = OpenAI()

# CSV file the embeddings are appended to and later read back from,
# and the column order used when writing rows to it.
FILEPATH = "data/additional_data/docs/embeddings.csv"
FIELDNAMES = ["doc_id", "content", "embedding"]


def get_embeddings(input: str) -> List[float]:
    """Request an embedding for ``input`` from the module-level OpenAI client.

    Sends the text to the "text-embedding-3-small" model, prints two debug
    lines (before the request and after, with the reported token usage) and
    returns the embedding of the first item in the response.
    """
    print("DEBUG: Getting embedding from OpenAI")
    response = client.embeddings.create(
        input=input,
        model="text-embedding-3-small",
        encoding_format="float",
    )
    tokens_used = response.usage.total_tokens
    print(f"DEBUG: Got embedding. Usage: {tokens_used} total tokens.")
    first_item = response.data[0]
    return first_item.embedding

@retry(
    retry=retry_if_exception_type(RateLimitError),
    wait=wait_random(min=30, max=60),
    stop=stop_after_attempt(6),
)
def get_and_save_emebeddings(row) -> None:
    """Embed one document row and append the result to the embeddings CSV.

    Parameters:
    - row: mapping-like row providing "doc_id" and "content".

    The CSV at FILEPATH acts as a cache: if the row's doc_id is already listed
    there, nothing is requested or written. Otherwise the embedding is fetched
    via get_embeddings and appended as a new row. The whole call is retried on
    RateLimitError (up to 6 attempts, with a random 30-60 wait between them,
    as configured in the decorator).
    """
    doc_id = row["doc_id"]

    # On a fresh run the cache file may not exist (or be empty); start it with a
    # header row so it can be read back by column name here and later on.
    if not os.path.exists(FILEPATH) or os.path.getsize(FILEPATH) == 0:
        with open(FILEPATH, "w", newline="", encoding="utf-8") as f:
            csv.DictWriter(f, fieldnames=FIELDNAMES).writeheader()
        docs_processed = []
    else:
        docs_processed = pd.read_csv(FILEPATH, usecols=["doc_id"])["doc_id"].to_list()

    if doc_id in docs_processed:
        # doc_id is bound to a local first: quoting row["doc_id"] inside a
        # double-quoted f-string is a syntax error before Python 3.12.
        print(f"Doc with ID '{doc_id}' has been processed before.")
        return

    embedding = get_embeddings(row["content"])
    # Explicit UTF-8 matches the default encoding pd.read_csv uses to read the file back.
    with open(FILEPATH, "a", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=FIELDNAMES)
        writer.writerow({
            "doc_id": doc_id,
            "content": row["content"],
            "embedding": embedding
        })


In [46]:
# Call get_and_save_emebeddings once per document row (axis=1 passes each row).
# Rows whose doc_id is already in the embeddings CSV are skipped by that function.
# The result is bound to `_`, presumably only to keep the cell from displaying it.
_ = documents.apply(get_and_save_emebeddings, axis=1)

print("Done")

Doc with ID '40' has been processed before.
Doc with ID '41' has been processed before.
Doc with ID '42' has been processed before.
Doc with ID '43' has been processed before.
Doc with ID '44' has been processed before.
DEBUG: Getting embedding from OpenAI
DEBUG: Got embedding. Usage: 1664 total tokens.
DEBUG: Getting embedding from OpenAI
DEBUG: Got embedding. Usage: 2776 total tokens.
DEBUG: Getting embedding from OpenAI
DEBUG: Got embedding. Usage: 1749 total tokens.
DEBUG: Getting embedding from OpenAI
DEBUG: Got embedding. Usage: 2033 total tokens.
DEBUG: Getting embedding from OpenAI
DEBUG: Got embedding. Usage: 1704 total tokens.
DEBUG: Getting embedding from OpenAI
DEBUG: Got embedding. Usage: 1958 total tokens.
DEBUG: Getting embedding from OpenAI
DEBUG: Got embedding. Usage: 2672 total tokens.
DEBUG: Getting embedding from OpenAI
DEBUG: Got embedding. Usage: 2242 total tokens.
DEBUG: Getting embedding from OpenAI
DEBUG: Got embedding. Usage: 1714 total tokens.
DEBUG: Getting 

In [70]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans


def kmeans_cluster_embeddings(df: pd.DataFrame, n_clusters: int = 108) -> pd.DataFrame:
    """
    Cluster the rows of a DataFrame by their embedding vectors using KMeans.

    Parameters:
    - df: pandas DataFrame with an 'embedding' column holding one vector per row.
    - n_clusters: int, how many clusters KMeans should form (default is 108).

    Returns:
    - df: the same DataFrame object that was passed in, now carrying a
      'cluster' column with the label assigned to each row.
    """
    # Stack the per-row vectors into a single 2-D matrix for KMeans.
    embedding_matrix = np.vstack(df["embedding"].values)

    # Fixed random_state so repeated runs use the same seed.
    model = KMeans(n_clusters=n_clusters, random_state=42)

    # Fit and label in one step; the labels are written onto the input frame.
    df["cluster"] = model.fit_predict(embedding_matrix)
    return df

In [71]:
def calc_cluster_hitrate(row, docs=None):
    """Share of a row's original documents that landed in the row's cluster.

    Parameters:
    - row: mapping-like row providing "original_doc_ids" and "cluster".
    - docs: DataFrame with "doc_id" and "cluster" columns used to look up the
      originals. Defaults to the notebook-level `documents` frame.

    Returns:
    - None when "original_doc_ids" is not a list or is an empty list (there is
      nothing to compare against); otherwise the mean of the per-original
      matches, i.e. a value between 0.0 and 1.0.

    Raises:
    - IndexError: if one of the original ids has no row in `docs`.
    """
    original_ids = row["original_doc_ids"]
    # An empty list is treated like "no originals": np.mean([]) would give NaN
    # and emit a RuntimeWarning.
    if not isinstance(original_ids, list) or not original_ids:
        return None

    if docs is None:
        docs = documents

    # The id column is cast once per call instead of once per original id.
    doc_ids = docs["doc_id"].astype(int)

    hits = []
    for original_id in original_ids:
        matches = docs.loc[doc_ids == int(original_id)]
        if matches.empty:
            raise IndexError(f"original doc_id {original_id} not found in documents")
        # If several rows share the id, the first one is used.
        hits.append(row["cluster"] == matches.iloc[0]["cluster"])

    return np.mean(hits)

In [75]:
# Reload the corpus (without printing the per-source counts).
documents = get_documents()
# Read the stored embeddings; the "embedding" column is stored as text and
# parsed back into a Python object via ast.literal_eval.
doc_embeddings = pd.read_csv(FILEPATH, usecols=["doc_id", "embedding"], converters={"embedding": ast.literal_eval})
# Left join on doc_id keeps every document row.
# NOTE(review): a document without a stored embedding would get NaN here, which
# presumably breaks the np.vstack in kmeans_cluster_embeddings — confirm the CSV is complete.
documents = documents.join(other=doc_embeddings.set_index("doc_id"), on="doc_id", how="left")

# Order matters: calc_cluster_hitrate reads the module-level `documents`, so the
# "cluster" column must already be on `documents` before the apply below runs.
documents = kmeans_cluster_embeddings(df=documents)
documents["cluster_hitrate"] = documents.apply(calc_cluster_hitrate, axis=1)

# Keep only rows for which a hit rate was computed, and display them.
documents_filtered = documents.loc[documents["cluster_hitrate"].notna()]
documents_filtered

Unnamed: 0,doc_id,domain,content,original_doc_ids,embedding,cluster,cluster_hitrate
0,100134,Law,"**IN THE DANBURY, PINEHURST COURT**\n\n**CRIMI...",[134],"[0.015011516, 0.03634062, 0.0030247627, 0.0489...",0,1.0
1,100136,Law,"**Upton, Georgetown, Court**\n\n*Criminal Divi...",[136],"[0.03939665, 0.058982093, 0.020784838, 0.05212...",38,1.0
2,100139,Law,"Glenwood, Quailwood Court\n9th Judicial Circui...",[139],"[0.031122264, 0.0073628416, 0.042542476, 0.039...",11,1.0
3,100046,Finance,"JetWing Aviation, established on April 15, 200...",[46],"[-0.009702898, 0.0164151, 0.06840374, 0.018056...",17,1.0
4,100047,Finance,CleanCo Housekeeping Services is a housekeepin...,[47],"[0.05300756, 0.0015094227, 0.03901874, 0.04113...",25,1.0
...,...,...,...,...,...,...,...
28,400110,Law,In a significant ruling that has reverberated ...,[110],"[0.04030493, 0.02091089, -0.0038811294, 0.0674...",43,0.0
29,400116,Law,In a dramatic courtroom showdown that has rive...,[116],"[0.058137763, -0.011234419, 0.0049859015, 0.04...",51,0.0
0,300001,Finance,Changes that occurred in senior management of ...,"[46, 47, 52, 59, 66, 71, 72, 77, 78, 79]","[-0.004240031, -0.007925675, 0.018725948, 0.03...",65,0.0
1,300002,Law,Chief judge according to the court judgment of...,"[134, 136, 139, 112, 114, 115, 119, 123, 125, ...","[-0.0014971758, 0.015988875, 0.023612805, 0.02...",24,0.0
