In [None]:
import json
import pandas as pd
from pathlib import Path
from extract_conversations import BurrConversationExtractor
import matplotlib.pyplot as plt
import seaborn as sns

# Apply seaborn's "whitegrid" style before any figures are created below.
sns.set_style("whitegrid")

## 1. Initialize the Extractor

In [None]:
# Build the extractor, pointing it at the Burr storage directory "~/.burr".
extractor = BurrConversationExtractor(burr_storage_dir="~/.burr")

# Report each project together with the number of applications it holds.
projects = extractor.list_projects()
print("Found {} projects:".format(len(projects)))
for project_name in projects:
    app_count = len(extractor.list_applications(project_name))
    print("  - {}: {} applications".format(project_name, app_count))

## 2. Extract All Conversations

In [None]:
# Pull every conversation from every project in a single call.
conversations = extractor.extract_all_conversations()

print("\nExtracted {} total conversations".format(len(conversations)))

# Add up the per-conversation turn counts for a quick size check.
turn_total = 0
for conversation in conversations:
    turn_total += conversation['total_turns']
print("Total turns across all conversations: {}".format(turn_total))

## 3. Explore a Single Conversation

In [None]:
# Print a readable preview of the first conversation (skipped when none exist).
if conversations:
    conv = conversations[0]
    print("Project: {}".format(conv['project_id']))
    print("App ID: {}".format(conv['app_id']))
    print("Total Turns: {}".format(conv['total_turns']))
    print("Total Actions: {}".format(conv['metadata']['total_actions']))
    print("\n" + 60 * "=")

    # Walk the turns in order, printing every message under its turn header.
    for turn in conv['turns']:
        print("\nTurn {}:".format(turn['turn_number']))
        print(40 * "-")
        for msg in turn['messages']:
            role = msg['role'].upper()
            # Long messages are cut to 200 characters and marked with an ellipsis.
            text = msg['content']
            if len(text) > 200:
                text = text[:200] + "..."
            print("\n[{}]: {}".format(role, text))

## 4. Convert to DataFrame for Analysis

In [None]:
# Flatten the nested conversations into one row per turn for easier analysis.
#
# The column list is fixed up front so the DataFrame always has the same
# schema: with no conversations it is empty but still has these columns, and a
# role that never appears in any turn yields an all-NaN column rather than a
# missing one (which would raise KeyError wherever the column is read).
turn_columns = [
    'conversation_id',
    'project_id',
    'turn_number',
    'num_messages_in_turn',
    'user_message',
    'user_message_length',
    'assistant_message',
    'assistant_message_length',
]

rows = []

for conv in conversations:
    for turn in conv['turns']:
        turn_data = {
            'conversation_id': conv['app_id'],
            'project_id': conv['project_id'],
            'turn_number': turn['turn_number'],
            'num_messages_in_turn': len(turn['messages'])
        }

        # Record the user and assistant messages of this turn. If a turn holds
        # several messages with the same role, the last one wins; messages with
        # any other role are only counted in num_messages_in_turn.
        for msg in turn['messages']:
            if msg['role'] == 'user':
                turn_data['user_message'] = msg['content']
                turn_data['user_message_length'] = len(msg['content'])
            elif msg['role'] == 'assistant':
                turn_data['assistant_message'] = msg['content']
                turn_data['assistant_message_length'] = len(msg['content'])

        rows.append(turn_data)

# Keys absent from a row (e.g. a turn without an assistant reply) become NaN.
df = pd.DataFrame(rows, columns=turn_columns)
print(f"Created DataFrame with {len(df)} turns")
df.head(10)

## 5. Conversation Statistics

In [None]:
# Print headline statistics computed from the per-turn DataFrame `df`.
print("=" * 60)
print("CONVERSATION STATISTICS")
print("=" * 60)

# An empty frame may have no columns at all, so only touch 'conversation_id'
# when there is at least one row.
n_conversations = 0 if df.empty else df['conversation_id'].nunique()

if n_conversations == 0:
    # Avoids the division by zero (and possible KeyError) in the average below.
    print("\nNo conversation turns available; nothing to summarize.")
else:
    print(f"\nTotal Conversations: {n_conversations}")
    print(f"Total Turns: {len(df)}")
    print(f"\nAverage turns per conversation: {len(df) / n_conversations:.2f}")
    print("\nMessage Length Statistics:")
    for label, column in (
        ("User", "user_message_length"),
        ("Assistant", "assistant_message_length"),
    ):
        print(f"  {label} messages:")
        # The length column is missing or all-NaN when no turn contained a
        # message with this role; report that instead of failing or printing nan.
        if column in df.columns and df[column].notna().any():
            print(f"    - Mean: {df[column].mean():.0f} chars")
            print(f"    - Median: {df[column].median():.0f} chars")
        else:
            print("    - No messages with this role")

## 6. Visualizations

In [None]:
# Four-panel overview of the per-turn DataFrame `df`.
if df.empty:
    # With no turns the column lookups / plotting calls below can raise, so
    # skip the figure entirely.
    print("No conversation turns available; skipping visualizations.")
else:
    length_columns = ['user_message_length', 'assistant_message_length']
    # reindex() guarantees both length columns exist: a role that never
    # appeared in any turn becomes an all-NaN column instead of a KeyError.
    plot_df = df.reindex(
        columns=['conversation_id', 'project_id', 'turn_number'] + length_columns
    )

    fig, axes = plt.subplots(2, 2, figsize=(14, 10))

    # 1. Turns per conversation (taken as the highest turn number seen)
    turns_per_conv = plot_df.groupby('conversation_id')['turn_number'].max()
    axes[0, 0].hist(turns_per_conv, bins=20, edgecolor='black', alpha=0.7)
    axes[0, 0].set_xlabel('Number of Turns')
    axes[0, 0].set_ylabel('Number of Conversations')
    axes[0, 0].set_title('Distribution of Turns per Conversation')

    # 2. Message length distribution
    # Turns lacking a user or assistant message carry NaN lengths; drop them
    # explicitly so the histogram never has to auto-range over NaN values.
    axes[0, 1].hist(plot_df['user_message_length'].dropna(), bins=30, alpha=0.5,
                    label='User', edgecolor='black')
    axes[0, 1].hist(plot_df['assistant_message_length'].dropna(), bins=30, alpha=0.5,
                    label='Assistant', edgecolor='black')
    axes[0, 1].set_xlabel('Message Length (characters)')
    axes[0, 1].set_ylabel('Frequency')
    axes[0, 1].set_title('Message Length Distribution')
    axes[0, 1].legend()

    # 3. Turn progression: mean message length at each turn number
    turn_stats = plot_df.groupby('turn_number').agg({
        'user_message_length': 'mean',
        'assistant_message_length': 'mean'
    })
    turn_stats.plot(ax=axes[1, 0], marker='o')
    axes[1, 0].set_xlabel('Turn Number')
    axes[1, 0].set_ylabel('Average Message Length')
    axes[1, 0].set_title('Message Length by Turn Number')
    # Legend labels follow the column order of the aggregation above.
    axes[1, 0].legend(['User', 'Assistant'])

    # 4. Conversations by project
    project_counts = plot_df.groupby('project_id')['conversation_id'].nunique()
    axes[1, 1].bar(range(len(project_counts)), project_counts.values, alpha=0.7, edgecolor='black')
    axes[1, 1].set_xlabel('Project')
    axes[1, 1].set_ylabel('Number of Conversations')
    axes[1, 1].set_title('Conversations per Project')
    axes[1, 1].set_xticks(range(len(project_counts)))
    # Project names longer than 20 characters are shortened for the tick labels.
    axes[1, 1].set_xticklabels(
        [p[:20] + '...' if len(p) > 20 else p for p in project_counts.index],
        rotation=45, ha='right'
    )

    plt.tight_layout()
    plt.show()

## 7. Extract Actions Timeline

In [None]:
# Show the action sequence of the first conversation with per-action durations.
if conversations:
    conv = conversations[0]

    print("Action Timeline for First Conversation:")
    print("=" * 60)

    actions_df = pd.DataFrame(conv['actions'])

    if not actions_df.empty:
        # Calculate duration for each action
        if 'start_time' in actions_df.columns and 'end_time' in actions_df.columns:
            actions_df['start_dt'] = pd.to_datetime(actions_df['start_time'])
            actions_df['end_dt'] = pd.to_datetime(actions_df['end_time'])
            actions_df['duration_ms'] = (actions_df['end_dt'] - actions_df['start_dt']).dt.total_seconds() * 1000
        else:
            # Without both timestamps no duration can be derived. Still create
            # the column (as NaN) because it is selected and aggregated below;
            # leaving it out would raise KeyError.
            actions_df['duration_ms'] = float('nan')

        display(actions_df[['sequence_id', 'action', 'duration_ms']].head(20))

        # Action summary: how often each action ran and its mean duration
        print("\nAction Summary:")
        action_summary = actions_df.groupby('action').agg({
            'sequence_id': 'count',
            'duration_ms': 'mean'
        }).rename(columns={'sequence_id': 'count', 'duration_ms': 'avg_duration_ms'})
        display(action_summary.sort_values('count', ascending=False))

## 8. Save Processed Data

In [None]:
# Persist the extracted conversations as JSON through the extractor.
output_file = "extracted_conversations.json"
extractor.save_to_json(conversations, output_file)

# Write the per-turn DataFrame to CSV as well, leaving out the index column.
csv_file = "conversation_turns.csv"
df.to_csv(csv_file, index=False)
print("✓ Saved conversation turns to {}".format(csv_file))

## 9. Export for Multi-Turn Evaluation

Format the data specifically for evaluation purposes.

In [None]:
# Reshape every conversation into the evaluation layout: one record per
# conversation holding a list of turns, each with a user input, an assistant
# response and a (currently always empty) context list.
eval_data = []

for conv in conversations:
    eval_turns = []

    for turn in conv["turns"]:
        eval_turn = {
            "turn_number": turn["turn_number"],
            "user_input": None,
            "assistant_response": None,
            "context": []
        }

        # Only user/assistant messages are exported; a later message with the
        # same role replaces an earlier one within the turn.
        for msg in turn["messages"]:
            role = msg["role"]
            if role in ("user", "assistant"):
                slot = "user_input" if role == "user" else "assistant_response"
                eval_turn[slot] = {"text": msg["content"], "type": msg["type"]}

        eval_turns.append(eval_turn)

    eval_data.append({
        "conversation_id": conv["app_id"],
        "project_id": conv["project_id"],
        "metadata": conv["metadata"],
        "turns": eval_turns
    })

# Write the evaluation records to disk as indented JSON.
with open("conversations_for_evaluation.json", "w") as f:
    json.dump(eval_data, f, indent=2)

print("✓ Exported {} conversations in evaluation format".format(len(eval_data)))
print("  File: conversations_for_evaluation.json")

# Preview the first 500 characters of the first exported conversation.
if eval_data:
    print("\nExample conversation structure:")
    preview = json.dumps(eval_data[0], indent=2)
    print(preview[:500] + "...")

## 10. Multi-Turn Metrics

Calculate metrics useful for multi-turn evaluation.

In [None]:
# Build one metrics record per conversation: size counters, average message
# length per role, and a per-turn label ("multi_message" when a turn holds
# more than two messages, otherwise "standard").
metrics = []

for conv in conversations:
    meta = conv["metadata"]
    messages_per_turn = [turn["messages"] for turn in conv["turns"]]

    turn_types = [
        "multi_message" if len(messages) > 2 else "standard"
        for messages in messages_per_turn
    ]
    user_lengths = [
        len(msg["content"])
        for messages in messages_per_turn
        for msg in messages
        if msg["role"] == "user"
    ]
    assistant_lengths = [
        len(msg["content"])
        for messages in messages_per_turn
        for msg in messages
        if msg["role"] == "assistant"
    ]

    metrics.append({
        "conversation_id": conv["app_id"],
        "total_turns": conv["total_turns"],
        "total_messages": meta["total_messages"],
        "total_actions": meta["total_actions"],
        # max(..., 1) keeps the ratio defined for conversations without turns.
        "actions_per_turn": meta["total_actions"] / max(conv["total_turns"], 1),
        # Averages stay at 0 when the conversation has no message of that role.
        "avg_user_message_length": (
            sum(user_lengths) / len(user_lengths) if user_lengths else 0
        ),
        "avg_assistant_message_length": (
            sum(assistant_lengths) / len(assistant_lengths) if assistant_lengths else 0
        ),
        "turn_types": turn_types
    })

metrics_df = pd.DataFrame(metrics)
print("Multi-Turn Evaluation Metrics:")
display(metrics_df)

# Store the metrics table as CSV, without the index column.
metrics_df.to_csv("conversation_metrics.csv", index=False)
print("\n✓ Saved metrics to conversation_metrics.csv")