In [1]:
import json
import os
import re
import pandas as pd
from tqdm import tqdm

from utils.utils import compute_german_confidence

tqdm.pandas()

# Preprocessing.ipynb

This dataset contains the code to extract the prompts from Document Chat's ChatMessage table and to bring them into the correct form

In [None]:
data = pd.read_csv("document_chat_chatmessage_table.csv")  # Original dataset omitted from this repository for data security reasons
print(data.shape)

In [11]:
data['sentAt'] = pd.to_datetime(data['sentAt'])
cutoff_date = '2025-03-12'
df = data[data["sentAt"] > pd.Timestamp(cutoff_date)].sort_values("sentAt")

pattern = r"(?<=spaces/)[^/]+(?=/)"

results = []

for (user, context), group in df.groupby(['userID', 'contextID']):
    reset_flag = True
    group = group.reset_index(drop=True)

    if user == 70:
        # Skipping GenAI Sandbox User --> Customer Data
        continue

    for i, row in group.iterrows():
        if row['messageType'] in ('RESET_HISTORY', 'DOCUPLOAD'):
            # Mark that the next USER message is the first of a new session.
            reset_flag = True
        elif row['messageType'] == 'USER' and reset_flag:
            # This is the first prompt after a reset.
            answer = None
            chunks = None
            # Find the next LLM answer
            for j in range(i + 1, len(group)):
                if group.loc[j, 'messageType'] == 'LLM':
                    answer = group.loc[j, 'messageText']
                    chunks = group.loc[j, 'relevantNodes']
                    break
            if not chunks or chunks == 'null' or not isinstance(chunks, str):
                continue

            chunks_json = json.loads(json.loads(chunks))
            top_k = len(chunks_json)
            documents = tuple(set([str(chunk.get("documentID")) + chunk.get("documentName") for chunk in chunks_json]))
            sources = tuple(set(chunk.get("source") for chunk in chunks_json))
            if sources and sources[0] == "CONFLUENCE":
                # Extract confluence Space
                origins = {re.search(pattern, chunk.get("origin")).group() for chunk in chunks_json if re.search(pattern, chunk.get("origin"))}
                sources = (f"CONFLUENCE-{'-'.join(origins)}",)

            skip_prompt = False

            # Check document presence
            for chunk in chunks_json:
                blob_id = str(chunk.get("documentID")) + chunk.get("documentName")
                if chunk.get("source") not in ("SHAREPOINT", "CONFLUENCE", "INNOVATION_FACTORY"):
                    if os.path.exists(f"../documents/{blob_id}"):
                        # print(f"Found: {blob_id}")
                        pass
                    else:
                        print(f"[{user}/{context}/{i}] NOT FOUND: {blob_id}")
                        skip_prompt = True

            if skip_prompt:
                continue

            results.append({
                'userID': user,
                'contextID': context,
                'prompt': row['messageText'],
                'answer': answer,
                'chunks': chunks,
                'documents': documents,
                'sources': sources,
                'llm': row["usedLLM"],
                'embed_model': "multilingual-e5-large",
                'top_k': top_k,
                'chunk_size': 256,
                'language': "DE" if compute_german_confidence(row['messageText']) > 0.5 else "EN",
            })
            # Only capture the first USER prompt after the reset
            reset_flag = False

new_df = pd.DataFrame(results)

print("Shape before duplicate removal: ", new_df.shape)
df_no_duplicates = new_df.drop_duplicates(subset=['prompt', 'documents', 'userID'])
print("Shape after duplicate removal: ", df_no_duplicates.shape)

df_no_duplicates.to_csv(f"prompts-since-{cutoff_date}.csv", index=False)

[10/550/5] NOT FOUND: 247598Basel III (1).pdf
[10/550/5] NOT FOUND: 247598Basel III (1).pdf
[10/550/5] NOT FOUND: 247598Basel III (1).pdf
[10/550/5] NOT FOUND: 247598Basel III (1).pdf
[10/550/5] NOT FOUND: 247598Basel III (1).pdf
[10/550/5] NOT FOUND: 247598Basel III (1).pdf
[10/550/5] NOT FOUND: 247598Basel III (1).pdf
[10/550/5] NOT FOUND: 247598Basel III (1).pdf
[10/550/5] NOT FOUND: 247598Basel III (1).pdf
[10/550/5] NOT FOUND: 247598Basel III (1).pdf
[10/550/5] NOT FOUND: 247598Basel III (1).pdf
[10/550/5] NOT FOUND: 247598Basel III (1).pdf
[10/550/10] NOT FOUND: 247598Basel III (1).pdf
[10/550/10] NOT FOUND: 247598Basel III (1).pdf
[10/550/10] NOT FOUND: 247598Basel III (1).pdf
[10/550/10] NOT FOUND: 247598Basel III (1).pdf
[10/550/10] NOT FOUND: 247598Basel III (1).pdf
[10/550/10] NOT FOUND: 247598Basel III (1).pdf
[10/550/10] NOT FOUND: 247598Basel III (1).pdf
[10/550/10] NOT FOUND: 247598Basel III (1).pdf
[10/550/10] NOT FOUND: 247598Basel III (1).pdf
[10/550/10] NOT FOUND: 24