# Coordination Examples Sampler

Use this notebook to pull low-, medium-, and high-coordination example snippets from the CANDOR corpus for the Google Form activity. Each snippet is a window of up to 20 turns taken from the middle of a conversation.

In [1]:
import math
from pathlib import Path

import numpy as np
import pandas as pd

from IPython.display import display

try:
    from convokit import Corpus, Coordination
except ModuleNotFoundError as exc:
    raise ModuleNotFoundError(
        "Convokit is required for this notebook. Install with `pip install convokit`."
    ) from exc

from tqdm.auto import tqdm

pd.set_option('display.max_columns', 120)




TransformerDecoderModel requires ML dependencies. Run 'pip install convokit[llm]' to install them.
TransformerEncoderModel requires ML dependencies. Run 'pip install convokit[llm]' to install them.
An error occurred: No module named 'torch'




In [2]:
# The corpus is expected in a 'CANDOR-corpus-backbiter' folder inside the
# current working directory; fail early with a clear message if it is absent.
DATA_DIR = Path.cwd().joinpath('CANDOR-corpus-backbiter').resolve()

if not DATA_DIR.exists():
    raise FileNotFoundError(f'Corpus directory not found: {DATA_DIR}')

corpus = Corpus(filename=str(DATA_DIR))

# Report how much was loaded as a quick sanity check.
convo_total = len(corpus.conversations)
utt_total = len(corpus.utterances)
print(f"{convo_total} conversations; {utt_total} utterances loaded.")


1656 conversations; 557864 utterances loaded.


In [3]:
# Clear reply_to pointers whose parent utterance is not present in the corpus,
# keeping a record of every pointer that was cleared.
invalid_reply_to = []
# Materialise the iterator first because utterances are modified inside the loop.
for utt in list(corpus.iter_utterances()):
    parent_id = utt.reply_to
    # Test for "no parent" explicitly instead of relying on truthiness: ids in
    # this corpus appear to be numeric, so 0 is a legitimate parent id that
    # must still be validated rather than silently skipped.
    has_no_parent = parent_id is None or parent_id == ''
    if has_no_parent or parent_id in corpus.utterances:
        continue
    invalid_reply_to.append({'utterance_id': utt.id, 'reply_to': parent_id})
    corpus.utterances[utt.id].reply_to = None

print(f"Cleared {len(invalid_reply_to)} invalid reply_to references.")
if invalid_reply_to:
    display(pd.DataFrame(invalid_reply_to).head())


Cleared 0 invalid reply_to references.


In [4]:
# Build the ConvoKit Coordination transformer, configured with the attribute
# name 'coord'.
coord = Coordination(coordination_attribute_name='coord')
# Fit and apply the transformer to the loaded corpus; the return value is not
# used here. NOTE(review): presumably this annotates `corpus` in place, as the
# message below states -- confirm against the ConvoKit documentation.
coord.fit_transform(corpus)
print('Coordination scores stored in speaker metadata.')


Coordination scores stored in speaker metadata.


In [5]:
def get_speaker_id(utt):
    """Return the id of the utterance's author, or None if it has none.

    The `speaker` attribute is checked first and `user` second; the first
    one that exists, is not None, and carries a non-None `id` wins.
    """
    for owner_attr in ('speaker', 'user'):
        owner = getattr(utt, owner_attr, None)
        if owner is None:
            continue
        owner_id = getattr(owner, 'id', None)
        if owner_id is not None:
            return owner_id
    return None

# Flatten every utterance in the corpus into one record per row so the raw
# material can be inspected as a DataFrame.
utterance_rows = [
    {
        'utterance_id': utt.id,
        'conversation_id': utt.conversation_id,
        'speaker_id': get_speaker_id(utt),
        'text': getattr(utt, 'text', '').strip(),
        'reply_to': utt.reply_to,
    }
    for utt in corpus.iter_utterances()
]

utterance_df = pd.DataFrame(utterance_rows)
utterance_df.head()


Unnamed: 0,utterance_id,conversation_id,speaker_id,text,reply_to
0,0,0020a0c5-1658-4747-99c1-2839e736b481,5fa072f4f4aa580b63834357,"Mhm. Mhm. Just, mm. And Uh huh, mm. Mhm. Mhm. ...",
1,1,0020a0c5-1658-4747-99c1-2839e736b481,5a73899f9cdd1800017786f0,hey I'm gone.,0.0
2,2,0020a0c5-1658-4747-99c1-2839e736b481,5fa072f4f4aa580b63834357,"good, how are you? Yeah. Yeah, so this will be...",1.0
3,3,0020a0c5-1658-4747-99c1-2839e736b481,5a73899f9cdd1800017786f0,yeah I've done a few,2.0
4,4,0020a0c5-1658-4747-99c1-2839e736b481,5fa072f4f4aa580b63834357,I,3.0


In [6]:
# Marker names whose coordination scores are averaged when the caller does
# not supply its own set.
MARKERS_OF_INTEREST = {'adverb', 'quant', 'ppron', 'ipron'}


def summarize_conversation_coordination(corpus, coord_transformer, conversation, markers=None):
    """Average the coordination scores restricted to a single conversation.

    Parameters
    ----------
    corpus
        Corpus object handed through to ``coord_transformer.summarize``.
    coord_transformer
        Object exposing ``summarize(corpus, speaker_selector=...,
        target_selector=..., utterance_thresh_func=..., summary_report=...)``.
    conversation
        Object exposing ``iter_utterances()``.
    markers
        Marker names to keep; a falsy value falls back to
        ``MARKERS_OF_INTEREST``.

    Returns
    -------
    dict or None
        ``None`` when the conversation has fewer than four non-blank
        utterances, fewer than two identifiable speakers, or no usable
        score. Otherwise a dict with keys ``overall_avg_coord`` (mean of the
        per-marker means), ``marker_means``, ``n_utterances`` and
        ``participant_ids`` (sorted).
    """
    if not markers:
        markers = MARKERS_OF_INTEREST

    # Only utterances with non-blank text take part in the summary.
    spoken = []
    for utt in conversation.iter_utterances():
        if getattr(utt, 'text', '').strip():
            spoken.append(utt)
    if len(spoken) < 4:
        return None

    participant_ids = set()
    for utt in spoken:
        speaker_id = getattr(getattr(utt, 'speaker', None), 'id', None)
        if speaker_id is not None:
            participant_ids.add(speaker_id)
    if len(participant_ids) < 2:
        return None

    allowed_ids = {utt.id for utt in spoken}

    def is_participant(speaker):
        # Used for both the speaker and the target side of an exchange.
        return getattr(speaker, 'id', None) in participant_ids

    def both_in_conversation(speaker_utt, target_utt):
        # Keep an exchange only if the replying utterance, and the utterance
        # replied to when there is one, belong to this conversation.
        if speaker_utt.id not in allowed_ids:
            return False
        return target_utt is None or target_utt.id in allowed_ids

    def as_score(stat):
        # Accept a bare number or a dict carrying a 'coordination' entry;
        # anything else, and NaN, counts as "no score".
        if isinstance(stat, dict):
            raw = stat.get('coordination')
        elif isinstance(stat, (int, float, np.floating, np.integer)):
            raw = stat
        else:
            return None
        if raw is None:
            return None
        number = float(raw)
        return None if np.isnan(number) else number

    summary = coord_transformer.summarize(
        corpus,
        speaker_selector=is_participant,
        target_selector=is_participant,
        utterance_thresh_func=both_in_conversation,
        summary_report=False,
    )

    # Collect the usable scores per marker across all speakers in the summary.
    scores_by_marker = {}
    for speaker_scores in summary.values():
        if not isinstance(speaker_scores, dict):
            continue
        for marker, stat in speaker_scores.items():
            if markers and marker not in markers:
                continue
            marker_name = str(marker)
            if marker_name.startswith('num_') or marker_name.endswith('_count'):
                continue
            score = as_score(stat)
            if score is not None:
                scores_by_marker.setdefault(marker, []).append(score)

    if not scores_by_marker:
        return None

    marker_means = {}
    for marker, scores in scores_by_marker.items():
        marker_means[marker] = float(np.nanmean(scores))
    overall_avg = float(np.nanmean(list(marker_means.values())))

    return {
        'overall_avg_coord': overall_avg,
        'marker_means': marker_means,
        'n_utterances': len(spoken),
        'participant_ids': sorted(participant_ids),
    }


In [7]:
# Score every conversation; conversations for which no summary could be
# computed (summary is None) are left out of the table.
conversation_records = []
progress = tqdm(list(corpus.iter_conversations()), desc='Scoring conversations')
for conversation in progress:
    summary = summarize_conversation_coordination(corpus, coord, conversation)
    if summary is not None:
        conversation_records.append({
            'conversation_id': conversation.id,
            'overall_avg_coord': summary['overall_avg_coord'],
            'n_utterances': summary['n_utterances'],
            'participant_ids': summary['participant_ids'],
            # One column per marker, prefixed so they group together.
            **{f'marker_{marker}': value for marker, value in summary['marker_means'].items()},
        })

# Sorted ascending, so the lowest-coordination conversations come first.
coord_summary_df = (
    pd.DataFrame(conversation_records)
    .sort_values('overall_avg_coord')
    .reset_index(drop=True)
)

print(f"{len(coord_summary_df)} conversations with coordination scores.")
coord_summary_df.head()


Scoring conversations:   0%|          | 0/1656 [00:00<?, ?it/s]

1655 conversations with coordination scores.


Unnamed: 0,conversation_id,overall_avg_coord,n_utterances,participant_ids,marker_adverb,marker_ppron,marker_ipron,marker_quant
0,9d3808dd-d448-42cc-930f-1dd15b383f23,-0.095701,263,"[5d6807f9778de5001a120d9a, 5e11882114ae638303c...",-0.115473,-0.138962,-0.029228,-0.099142
1,2f9de920-25a3-4bb4-b7a8-eb83e5d0cfe2,-0.082674,175,"[5dd28beafce6062a4f5221b3, 5f9b44bb3e13ec35fc8...",-0.059527,-0.066298,-0.005287,-0.199582
2,19313347-d014-4c87-bc6c-9823d2a64bc5,-0.066578,443,"[5dd61a909ca8a35c14d8d610, 5e11882114ae638303c...",-0.1199,-0.065345,-0.002369,-0.078699
3,cc7a5173-9c79-4773-83dc-0d9177bcc524,-0.065806,479,"[5db6f5c8c5ffeb000ac7cfb2, 5efe907ba8384039f55...",-0.079053,-0.053035,-0.048471,-0.082665
4,4a104536-429e-495f-a03a-7aa87467db1e,-0.065614,90,"[5d36600685d1d50001affacb, 5eb10e1b6e2f921d6d6...",-0.108127,-0.09051,0.017388,-0.081206


In [8]:
# Summary statistics of the per-conversation average coordination score.
coord_summary_df['overall_avg_coord'].describe()


count    1655.000000
mean        0.028867
std         0.032869
min        -0.095701
25%         0.007845
50%         0.030127
75%         0.050833
max         0.202897
Name: overall_avg_coord, dtype: float64

In [21]:
def fetch_conversation(corpus, convo_id):
    """Look up a conversation by id.

    Uses ``corpus.get_conversation`` when the corpus has that attribute and
    falls back to indexing ``corpus.conversations`` otherwise.
    """
    try:
        getter = corpus.get_conversation
    except AttributeError:
        return corpus.conversations[convo_id]
    return getter(convo_id)


def conversation_snippet(conversation, max_turns=20, min_turns=4):
    """Return a window of turns centred on the middle of a conversation.

    Utterances with blank text are ignored. If fewer than ``min_turns``
    remain, an empty list is returned. If at most ``max_turns`` remain, all
    of them are used; otherwise a ``max_turns``-long slice around the
    midpoint is taken.

    Each returned item is a dict with keys ``utterance_id``, ``speaker_id``
    ('UNKNOWN' when the speaker id is missing or falsy), ``speaker_label``
    ('Speaker 1', 'Speaker 2', ... in order of first appearance within the
    window) and ``text`` (stripped).
    """
    eligible = []
    for utt in conversation.iter_utterances():
        if getattr(utt, 'text', '').strip():
            eligible.append(utt)

    total = len(eligible)
    if total < min_turns:
        return []

    if total <= max_turns:
        window = eligible
    else:
        start = max(0, total // 2 - max_turns // 2)
        stop = start + max_turns
        if stop > total:
            # Slide the window back so it ends on the last utterance.
            stop = total
            start = max(0, stop - max_turns)
        window = eligible[start:stop]

    labels = {}
    rows = []
    for utt in window:
        speaker_id = getattr(getattr(utt, 'speaker', None), 'id', None) or 'UNKNOWN'
        # setdefault only stores the new label the first time a speaker is seen.
        label = labels.setdefault(speaker_id, f'Speaker {len(labels) + 1}')
        rows.append({
            'utterance_id': utt.id,
            'speaker_id': speaker_id,
            'speaker_label': label,
            'text': getattr(utt, 'text', '').strip(),
        })
    return rows


In [22]:
# Conversations eligible for sampling: a computed score and at least 4 utterances.
usable = coord_summary_df.dropna(subset=['overall_avg_coord'])
usable = usable[usable['n_utterances'] >= 4].reset_index(drop=True)

if usable.empty:
    raise ValueError('No conversations with computed coordination scores.')

# Quantile bands that define the three pools:
#   low    -> at or below the 25th percentile
#   medium -> between the 45th and 55th percentiles
#   high   -> between the 75th and 95th percentiles (the top 5% is left out)
coord_scores = usable['overall_avg_coord']
q_low = coord_scores.quantile(0.25)
q_med_low = coord_scores.quantile(0.45)
q_med_high = coord_scores.quantile(0.55)
q_high = coord_scores.quantile(0.75)
q_high_upper = coord_scores.quantile(0.95)

low_pool = usable[coord_scores <= q_low]
medium_pool = usable[(coord_scores >= q_med_low) & (coord_scores <= q_med_high)]
high_pool = usable[(coord_scores >= q_high) & (coord_scores <= q_high_upper)]

# Fallbacks guard against empty pools (e.g., limited corpus coverage)
if low_pool.empty:
    low_pool = usable.nsmallest(10, 'overall_avg_coord')
if medium_pool.empty:
    # Positional slice around the middle row of `usable`.
    midpoint = len(usable) // 2
    medium_pool = usable.iloc[max(0, midpoint - 5): midpoint + 5]
if high_pool.empty:
    high_pool = usable[coord_scores >= q_high]
if high_pool.empty:
    high_pool = usable.nlargest(10, 'overall_avg_coord')

# Seed the generator so the selected examples are the same on every
# Restart & Run All. Change the value to draw a different set of examples,
# or set it to None to get fresh, non-reproducible picks on each run.
SAMPLE_SEED = 0
rng = np.random.default_rng(SAMPLE_SEED)


def pick_random(pool):
    """Return one uniformly chosen row of `pool` as a plain dict.

    Raises ValueError if `pool` has no rows.
    """
    if len(pool) == 0:
        raise ValueError('Cannot sample an example from an empty pool.')
    idx = int(rng.integers(len(pool)))
    return pool.iloc[idx].to_dict()


low_example = pick_random(low_pool)
medium_example = pick_random(medium_pool)
high_example = pick_random(high_pool)

selected_meta = pd.DataFrame([
    {'coordination_level': 'Low', **low_example},
    {'coordination_level': 'Medium', **medium_example},
    {'coordination_level': 'High', **high_example},
])

selected_meta[['coordination_level', 'conversation_id', 'overall_avg_coord', 'n_utterances', 'participant_ids']]



Unnamed: 0,coordination_level,conversation_id,overall_avg_coord,n_utterances,participant_ids
0,Low,c0c54a77-1d33-41a4-8e13-92a4840e82b8,-0.009871,629,"[5c12f106c4b80e000192c472, 5ee70f2038ee524987f..."
1,Medium,11cb78ed-49fb-4634-8a7a-3c59109563b5,0.030127,506,"[5dcfe5ec41fb3f0de0080f1a, 5f499b471c384e50e42..."
2,High,0dd022be-f343-4afb-b170-0dc8003078c7,0.061064,188,"[56df9a836b4093000bd896c7, 5eee2f5b6a22952a2bc..."


In [23]:
def display_transcript(level_label, convo_id):
    """Print a header and show the snippet of one conversation as a table.

    The snippet comes from ``conversation_snippet`` with its default
    arguments; when it is empty, a one-line notice is printed instead.
    Only the speaker label and text columns are displayed.
    """
    turns = conversation_snippet(fetch_conversation(corpus, convo_id))
    if turns:
        print(f'=== {level_label} coordination example — Conversation {convo_id} ===')
        display(pd.DataFrame(turns)[['speaker_label', 'text']])
    else:
        print(f'{level_label}: Conversation {convo_id} does not have enough eligible turns.')


# Show the three selected examples in Low -> Medium -> High order.
for level_label, example in (
    ('Low', low_example),
    ('Medium', medium_example),
    ('High', high_example),
):
    display_transcript(level_label, example['conversation_id'])


=== Low coordination example — Conversation c0c54a77-1d33-41a4-8e13-92a4840e82b8 ===


Unnamed: 0,speaker_label,text
0,Speaker 1,The customer in 95% of the time is an idiot. Y...
1,Speaker 2,The i. d.
2,Speaker 1,cripple.
3,Speaker 2,10 whatever. Uh you know what I mean? But yeah...
4,Speaker 1,Look
5,Speaker 2,anyway. And uh and then there's others just like
6,Speaker 1,"there,"
7,Speaker 2,"we read reviews, me and my husband about a pro..."
8,Speaker 1,"No, sir."
9,Speaker 2,"and they may say, well I didn't get around to ..."


=== Medium coordination example — Conversation 11cb78ed-49fb-4634-8a7a-3c59109563b5 ===


Unnamed: 0,speaker_label,text
0,Speaker 1,"so many eggs and they worked, I mean they had ..."
1,Speaker 2,man.
2,Speaker 1,"they always worked really hard on the place, t..."
3,Speaker 2,Yeah.
4,Speaker 1,"You know, there's just a lot of things that yo..."
5,Speaker 2,yeah that makes sense. So you live there too when
6,Speaker 1,I lived there. We very first got married. We l...
7,Speaker 2,oh that's cool that's like what we're doing ri...
8,Speaker 1,it's very good for families.
9,Speaker 2,but if I'm like like half an acre or three qua...


=== High coordination example — Conversation 0dd022be-f343-4afb-b170-0dc8003078c7 ===


Unnamed: 0,speaker_label,text
0,Speaker 1,Yeah. There's nobody really doing anything abo...
1,Speaker 2,No.
2,Speaker 1,as far as I. Uh
3,Speaker 2,"And like, when I was like starting up an under..."
4,Speaker 1,my God. Yeah. That's so awful. Uh It's part of...
5,Speaker 2,I
6,Speaker 1,have
7,Speaker 2,know.
8,Speaker 1,"students down to government student loans, but..."
9,Speaker 2,"Yeah, I chose to start paying it off. Like, as..."


In [24]:
def formatted_lines(snippet):
    """Render each snippet row as a 'label: text' string, preserving order."""
    lines = []
    for row in snippet:
        lines.append('{}: {}'.format(row['speaker_label'], row['text']))
    return lines

# Build the copy-paste-ready lines for each coordination level, keyed by level.
examples_by_level = {
    'Low': low_example,
    'Medium': medium_example,
    'High': high_example,
}

snippets_for_form = {}
for label, meta in examples_by_level.items():
    conversation = fetch_conversation(corpus, meta['conversation_id'])
    snippet = conversation_snippet(conversation)
    snippets_for_form[label] = formatted_lines(snippet)

snippets_for_form


{'Low': ['Speaker 1: The customer in 95% of the time is an idiot. Yeah,',
  'Speaker 2: The i. d.',
  'Speaker 1: cripple.',
  "Speaker 2: 10 whatever. Uh you know what I mean? But yeah they you know you give them an inch and they'll take a mile and um there's some that you know they're honest and they really mean what they're you know saying and they they've done everything to just to live it. Uh I think of the word I'm trying to say but um you know to not have a problem. And they do",
  'Speaker 1: Look',
  "Speaker 2: anyway. And uh and then there's others just like",
  'Speaker 1: there,',
  'Speaker 2: we read reviews, me and my husband about a product',
  'Speaker 1: No, sir.',
  "Speaker 2: and they may say, well I didn't get around to open it until after six months",
  'Speaker 1: What?',
  "Speaker 2: and it's broken and now I had, they won't take it back, who buys something? It doesn't look it over and use it in six months.",
  "Speaker 1: as soon as I get something in, the f