In [4]:
import json
import random
import pandas as pd
from pathlib import Path

# --- CONFIG ---
# Paths are relative to the notebook's working directory.
DATA_DIR = Path("CANDOR-corpus-backbiter")
CONV_FILE = DATA_DIR / "conversations.json"  # single JSON object; its keys are used as conversation ids
UTTER_FILE = DATA_DIR / "utterances.jsonl"   # one JSON record per line
SEED = 31415
N = 10  # number of conversations to sample

# Seed the global RNG so the sample below is identical on every fresh run.
random.seed(SEED)

# --- LOAD ---
# Explicit UTF-8: without it open() decodes with the platform's locale
# encoding, which can corrupt or reject non-ASCII transcript text.
with open(CONV_FILE, "r", encoding="utf-8") as f:
    conversations = json.load(f)

# Parse one record per line; whitespace-only lines are skipped because
# json.loads would raise on them.
with open(UTTER_FILE, "r", encoding="utf-8") as f:
    utterances = [json.loads(line) for line in f if line.strip()]
utter_df = pd.DataFrame(utterances)

# --- SAMPLE ---
conv_ids = list(conversations.keys())
# Clamp the sample size: random.sample raises ValueError when asked for more
# items than the population holds. For N <= len(conv_ids) the draw is unchanged.
sampled_convs = random.sample(conv_ids, min(N, len(conv_ids)))
sampled_convs

['bc427c9e-28a4-47fd-bfca-a5f09dd13840',
 'fc70af18-cb2f-4cb6-a7a0-7a6809511e3f',
 '5ad3e585-7fe0-484e-9fc2-8b552af24e02',
 '8ee0f515-ff66-4189-bd7a-bde430931fe9',
 '0e772ef8-9014-48a8-8eb9-762ef55b4fe9',
 'ae393cc1-fa62-4505-8c2f-3d4b404eebd3',
 'a8855c03-359f-42d9-af04-e459f9547107',
 '863be6a8-2c9b-4cfb-868a-71bcd4c54802',
 '2bc13e09-30ac-4bd6-941c-24036501b329',
 'e3717831-71e4-47c8-8069-e4efc1e0e8c4']

In [15]:
# Pick which sampled conversation to inspect.
cid = sampled_convs[0]  # change 0 → 1, 2, … for each conversation
conv_meta = conversations[cid]
# Keep only the utterance rows whose conversation_id matches the chosen one.
subset = utter_df.loc[utter_df["conversation_id"].eq(cid)]

# Header: an 80-character rule above and below the conversation id.
banner = "=" * 80
print("\n" + banner)
print("Conversation — ID: " + f"{cid}")
print(banner + "\n")

# --- Print all metadata cleanly ---
print("=== Conversation Metadata ===")
for key, value in conv_meta.items():
    # Scalar entries fit on one line.
    if not isinstance(value, dict):
        print(f"{key}: {value}")
        continue
    # Dict entries are expanded one level, one indented bullet per sub-key.
    print(f"{key}:")
    for subkey, subval in value.items():
        print(f"  - {subkey}: {subval}")

# --- Print transcript ---
print("=== Conversation Transcript ===")

# Maps each speaker key seen so far to its short display label.
# Re-created every time this cell runs, so labels restart at "A".
speaker_map = {}
def label_for(speaker_id):
    """Return a short, stable display label for a speaker.

    Labels are handed out in order of first appearance: "A".."Z" for the
    first 26 distinct speakers, then "X1", "X2", ... for any further ones.
    Repeated calls with the same speaker return the same label.

    Missing ids (None, the empty string, or a float NaN) all share the
    single key "unknown" and therefore a single label.

    Args:
        speaker_id: hashable speaker identifier, or a missing-value marker.

    Returns:
        The label string recorded in ``speaker_map`` for this speaker.
    """
    # NaN is truthy and never equal to itself, so distinct NaN objects would
    # each miss the dict lookup and be given a fresh label. Fold NaN into the
    # shared "unknown" bucket instead.
    is_nan = isinstance(speaker_id, float) and speaker_id != speaker_id
    if is_nan or speaker_id in (None, ""):
        key = "unknown"
    else:
        key = speaker_id
    if key not in speaker_map:
        idx = len(speaker_map)
        if idx < 26:
            letter = chr(ord('A') + idx)
        else:
            # Past "Z": continue as X1, X2, ... (idx 26 -> "X1").
            letter = f"X{idx-25}"
        speaker_map[key] = letter
    return speaker_map[key]

# Print every utterance of the selected conversation, numbered from 1,
# in the order the rows appear in `subset`.
for turn_idx, row in enumerate((r for _, r in subset.iterrows()), start=1):
    # Prefer "speaker_id", then "speaker"; use "unknown" if both are absent or falsy.
    speaker_id = row.get("speaker_id")
    if not speaker_id:
        speaker_id = row.get("speaker")
    if not speaker_id:
        speaker_id = "unknown"
    text = row.get("text", "")
    print("{:02d} - {}: {}".format(turn_idx, label_for(speaker_id), text))



Conversation — ID: bc427c9e-28a4-47fd-bfca-a5f09dd13840

=== Conversation Metadata ===
meta:
  - partner_id: {'5dab7683c514d8001730ae4e': '5f4e7f5ac0126206d331b7b2', '5f4e7f5ac0126206d331b7b2': '5dab7683c514d8001730ae4e'}
  - date: <##bin{1208}&&@**>
  - survey_duration_in_seconds: {'5dab7683c514d8001730ae4e': 3320, '5f4e7f5ac0126206d331b7b2': 3526}
  - time_zone: {'5dab7683c514d8001730ae4e': 8.0, '5f4e7f5ac0126206d331b7b2': 5.0}
  - pre_affect: {'5dab7683c514d8001730ae4e': 6.0, '5f4e7f5ac0126206d331b7b2': 6.0}
  - pre_arousal: {'5dab7683c514d8001730ae4e': 5.0, '5f4e7f5ac0126206d331b7b2': 5.0}
  - technical_quality: {'5dab7683c514d8001730ae4e': 1.0, '5f4e7f5ac0126206d331b7b2': 1.0}
  - conv_length: {'5dab7683c514d8001730ae4e': 30.0, '5f4e7f5ac0126206d331b7b2': 31.0}
  - affect: {'5dab7683c514d8001730ae4e': 7.0, '5f4e7f5ac0126206d331b7b2': 7.0}
  - arousal: {'5dab7683c514d8001730ae4e': 6.0, '5f4e7f5ac0126206d331b7b2': 7.0}
  - overall_affect: {'5dab7683c514d8001730ae4e': 9.0, '5f4e7f5a