# UBS Client Conversation Analysis - SwissAIHacks25

This notebook analyzes the UBS synthetic call transcripts dataset to understand the types of tasks extracted from client conversations and provides comprehensive visualizations of the data patterns.

## Project Overview

The SwissAIHacks25 project tackles the challenge: **"From Talk to Task: Insights from Client Conversations"**. The goal is to use Generative AI to analyze transcripts of client conversations and automatically extract actionable insights such as client requests and action items.

### Key Components:
- **Dataset**: UBS synthetic call transcripts with ground truth task extractions
- **Tasks**: Various banking operations like KYC updates, contact info changes, investment requests
- **Languages**: Multilingual support (German, English, etc.)
- **Models**: Comparison of different AI models for task extraction

## 1. Import Required Libraries

In [2]:
# Import Required Libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import json
import os
from pathlib import Path
from collections import Counter, defaultdict
import re
from typing import List, Dict, Any
import warnings
warnings.filterwarnings('ignore')

# Set up plotting style (applies to every figure created later in the notebook)
plt.style.use('default')
sns.set_palette("husl")
plt.rcParams['figure.figsize'] = (12, 8)
plt.rcParams['font.size'] = 10

# Report library versions so the environment can be reproduced.
# Each line names the library it reports; the old "Python version" label
# actually printed the NumPy and Pandas versions.
print("Libraries imported successfully!")
print(f"NumPy version: {np.__version__}")
print(f"Pandas version: {pd.__version__}")
print(f"Matplotlib version: {plt.matplotlib.__version__}")
print(f"Seaborn version: {sns.__version__}")

ModuleNotFoundError: No module named 'pandas'

## 2. Load and Explore Repository Data

In [None]:
# Dataset root plus one sub-directory per split (train / test / validation)
data_dir = Path("data/ubs_synthetic_call_transcripts_dataset")
train_dir, test_dir, validation_dir = (
    data_dir / split_folder for split_folder in ("train", "test", "validation")
)

# Function to load data from each split
def load_dataset_split(split_dir: Path) -> Dict[str, Any]:
    """Load all JSON task files and TXT transcripts from one split directory.

    Args:
        split_dir: Directory containing ``*.json`` (ground-truth tasks) and
            ``*.txt`` (conversation transcript) files.

    Returns:
        A dict with four keys:
          * ``json_files`` / ``txt_files`` - the discovered paths, sorted by name
          * ``tasks`` - one ``{"file_id", "tasks"}`` entry per JSON file that parsed
          * ``conversations`` - one ``{"file_id", "content"}`` entry per readable TXT file

        ``file_id`` is the file name without its extension. Files that cannot
        be read or decoded are reported on stdout and skipped. A missing
        directory is reported and yields empty lists for all four keys.
    """
    data = {
        "json_files": [],
        "txt_files": [],
        "conversations": [],
        "tasks": []
    }

    # Without this check a wrong path silently produces empty DataFrames downstream.
    if not split_dir.is_dir():
        print(f"Warning: dataset directory not found: {split_dir}")
        return data

    # Sorted so load order (and every downstream row order) does not depend on
    # the order in which the filesystem happens to enumerate files.
    data["json_files"] = sorted(split_dir.glob("*.json"))
    data["txt_files"] = sorted(split_dir.glob("*.txt"))

    # Load JSON files (ground truth tasks)
    for json_file in data["json_files"]:
        try:
            with open(json_file, 'r', encoding='utf-8') as f:
                tasks_data = json.load(f)
        except (OSError, ValueError) as e:
            # ValueError covers json.JSONDecodeError and UnicodeDecodeError.
            print(f"Error loading {json_file}: {e}")
            continue
        data["tasks"].append({
            "file_id": json_file.stem,
            "tasks": tasks_data
        })

    # Load TXT files (conversation transcripts)
    for txt_file in data["txt_files"]:
        try:
            with open(txt_file, 'r', encoding='utf-8') as f:
                content = f.read()
        except (OSError, ValueError) as e:
            print(f"Error loading {txt_file}: {e}")
            continue
        data["conversations"].append({
            "file_id": txt_file.stem,
            "content": content
        })

    return data

# Load the three splits in train / test / validation order
print("Loading dataset splits...")
train_data, test_data, validation_data = (
    load_dataset_split(folder) for folder in (train_dir, test_dir, validation_dir)
)
loaded_splits = (("Train", train_data), ("Test", test_data), ("Validation", validation_data))

# Per-split file counts
print(f"\nDataset Statistics:")
for label, payload in loaded_splits:
    print(f"{label}: {len(payload['tasks'])} task files, {len(payload['conversations'])} conversation files")

# Count the file ids that appear both as a task file and as a transcript
print(f"\nFile matching verification:")
for label, payload in loaded_splits:
    ids_with_tasks = {entry['file_id'] for entry in payload['tasks']}
    ids_with_text = {entry['file_id'] for entry in payload['conversations']}
    paired = len(ids_with_tasks.intersection(ids_with_text))
    print(f"{label}: {paired} files have both tasks and conversations")

## 3. Data Preprocessing and Cleaning

In [None]:
# Function to clean and process conversation data
def clean_conversation_text(text: str) -> Dict[str, Any]:
    """Strip the leading disclaimer from a transcript and summarise what remains.

    Lines are stripped and blank lines dropped. Everything before the first
    line containing "Speaker" is treated as disclaimer and discarded. Kept
    lines that start with ``Speaker <digits>:`` are also recorded as turns.

    Returns a dict with the joined text (``clean_text``), the parsed turns
    (``speakers``), and word / character / distinct-speaker / line counts.
    """
    non_empty = [entry for entry in (raw.strip() for raw in text.split('\n')) if entry]

    # Index of the first line mentioning "Speaker"; if none exists, nothing is kept.
    first_turn = next(
        (idx for idx, entry in enumerate(non_empty) if "Speaker" in entry),
        len(non_empty),
    )
    kept = non_empty[first_turn:]

    turns = []
    for entry in kept:
        hit = re.match(r'Speaker (\d+):', entry)
        if hit is None:
            continue
        turns.append({
            'speaker_id': hit.group(1),
            'content': entry[hit.end():].strip()
        })

    joined = ' '.join(kept)
    distinct_speakers = {turn['speaker_id'] for turn in turns}

    return {
        'clean_text': joined,
        'speakers': turns,
        'word_count': len(joined.split()),
        'char_count': len(joined),
        'speaker_count': len(distinct_speakers),
        'line_count': len(kept)
    }

# Process all conversations
def process_all_conversations(data_splits):
    """Clean every transcript in every split and tag it with its origin.

    ``data_splits`` maps a split name to a dict that has a 'conversations'
    list of ``{'file_id', 'content'}`` entries. Each cleaned record gets
    'file_id' and 'split' added. A transcript that raises during processing
    is reported on stdout and left out of the result.
    """
    records = []

    for split_label, bundle in data_splits.items():
        for transcript in bundle['conversations']:
            try:
                record = clean_conversation_text(transcript['content'])
                record.update(file_id=transcript['file_id'], split=split_label)
                records.append(record)
            except Exception as err:
                print(f"Error processing conversation {transcript['file_id']}: {err}")

    return records

# Clean all transcripts, keyed by lowercase split name
print("Processing conversations...")
data_splits = {"train": train_data, "test": test_data, "validation": validation_data}
processed_conversations = process_all_conversations(data_splits)

# One row per conversation
conv_df = pd.DataFrame(processed_conversations)
print(f"Processed {conv_df.shape[0]} conversations")
print(f"Columns: {list(conv_df.columns)}")

# Summary statistics of the numeric conversation metrics
print(f"\nConversation Statistics:")
metric_summary = conv_df[['word_count', 'char_count', 'speaker_count', 'line_count']].describe()
print(metric_summary)

## 4. Task Type Classification and Analysis

In [None]:
# Process and analyze task data
def process_all_tasks(data_splits):
    """Flatten the ground-truth tasks of all splits into one list of row dicts.

    Args:
        data_splits: Mapping of split name to a dict with a 'tasks' list whose
            entries look like ``{'file_id': ..., 'tasks': [task, ...]}``.

    Returns:
        A list with one dict per task, holding 'file_id', 'split',
        'task_type', 'has_parameters', 'parameter_count' and 'parameters'.
        A file whose task list is empty contributes exactly one placeholder
        row with ``task_type == 'no_tasks'`` so the file stays visible.

    A task whose 'task_type' is missing or null is recorded as 'unknown'.
    Missing or null 'parameters' are recorded as an empty dict, so a JSON
    ``null`` no longer crashes ``len()`` here or ``.lower()`` downstream.
    """
    all_tasks = []

    for split_name, split_data in data_splits.items():
        for task_data in split_data['tasks']:
            file_id = task_data['file_id']
            tasks = task_data['tasks']

            # Handle empty task lists with a single placeholder row
            if not tasks:
                all_tasks.append({
                    'file_id': file_id,
                    'split': split_name,
                    'task_type': 'no_tasks',
                    'has_parameters': False,
                    'parameter_count': 0,
                    'parameters': {}
                })
                continue

            for task in tasks:
                # `or` also replaces an explicit JSON null, which .get()'s default does not.
                task_type = task.get('task_type') or 'unknown'
                parameters = task.get('parameters') or {}

                all_tasks.append({
                    'file_id': file_id,
                    'split': split_name,
                    'task_type': task_type,
                    'has_parameters': len(parameters) > 0,
                    'parameter_count': len(parameters),
                    'parameters': parameters
                })

    return all_tasks

# Process all tasks
print("Processing tasks...")
all_tasks = process_all_tasks(data_splits)
tasks_df = pd.DataFrame(all_tasks)

# Rows with task_type == 'no_tasks' are placeholders (one per file whose task
# list is empty), not real tasks, so they are excluded from the task totals.
is_placeholder = tasks_df['task_type'] == 'no_tasks'
print(f"Total tasks extracted: {(~is_placeholder).sum()}")
print(f"Unique task types: {tasks_df.loc[~is_placeholder, 'task_type'].nunique()}")
print(f"Files with no tasks: {is_placeholder.sum()}")

# Analyze task types (the 'no_tasks' placeholder is kept here so the later
# charts can show how many files are empty)
task_type_counts = tasks_df['task_type'].value_counts()
print(f"\nTop 10 Task Types:")
print(task_type_counts.head(10))

# Create task categories
def categorize_task_type(task_type: str) -> str:
    """Map a specific task type onto a broad category.

    Matching is a case-insensitive substring test against an ordered list of
    keyword groups; the first group that matches wins. The exact string
    'no_tasks' maps to 'No Tasks'; anything else falls back to 'Other'.
    """
    lowered = task_type.lower()

    # Order matters: earlier rules take priority when several keywords occur.
    keyword_rules = (
        (('kyc',), 'KYC Updates'),
        (('contact',), 'Contact Information'),
        (('investment', 'portfolio'), 'Investment Services'),
        (('account', 'banking'), 'Account Management'),
        (('document',), 'Document Services'),
        (('advisory', 'consultation'), 'Advisory Services'),
    )
    for keywords, category in keyword_rules:
        if any(keyword in lowered for keyword in keywords):
            return category

    return 'No Tasks' if task_type == 'no_tasks' else 'Other'

# Attach the broad category to every task row, then tally rows per category
tasks_df['task_category'] = tasks_df['task_type'].apply(categorize_task_type)
category_counts = tasks_df['task_category'].value_counts()

# Display task category distribution
print(f"\nTask Category Distribution:")
print(category_counts)

## 5. Create Distribution Plots for Task Types

In [None]:
# Create comprehensive task distribution visualizations.
# Four panels: category bar chart, category pie chart, top-15 task types,
# and categories stacked per dataset split.

# Set up the plotting style
plt.style.use('default')
sns.set_palette("husl")

# 1. Task Category Distribution - Horizontal Bar Chart
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))

# Task categories horizontal bar chart
category_counts.plot(kind='barh', ax=ax1, color=sns.color_palette("viridis", len(category_counts)))
ax1.set_title('Distribution of Task Categories', fontsize=14, fontweight='bold')
ax1.set_xlabel('Number of Tasks')
ax1.grid(axis='x', alpha=0.3)

# Add value labels on bars (offset by 1% of the longest bar so text clears the bar end)
for i, v in enumerate(category_counts.values):
    ax1.text(v + max(category_counts.values) * 0.01, i, str(v), 
             va='center', fontweight='bold')

# 2. Task Category Distribution - Pie Chart
wedges, texts, autotexts = ax2.pie(category_counts.values, labels=category_counts.index, 
                                   autopct='%1.1f%%', startangle=90,
                                   colors=sns.color_palette("husl", len(category_counts)))
ax2.set_title('Task Categories - Percentage Distribution', fontsize=14, fontweight='bold')

# Improve text readability
for autotext in autotexts:
    autotext.set_color('white')
    autotext.set_fontweight('bold')

# 3. Top 15 Specific Task Types
top_15_tasks = task_type_counts.head(15)
bars = ax3.bar(range(len(top_15_tasks)), top_15_tasks.values, 
               color=sns.color_palette("coolwarm", len(top_15_tasks)))
ax3.set_title('Top 15 Specific Task Types', fontsize=14, fontweight='bold')
ax3.set_xlabel('Task Types')
ax3.set_ylabel('Frequency')
ax3.set_xticks(range(len(top_15_tasks)))
ax3.set_xticklabels(top_15_tasks.index, rotation=45, ha='right')
ax3.grid(axis='y', alpha=0.3)

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    ax3.text(bar.get_x() + bar.get_width()/2., height + max(top_15_tasks.values) * 0.01,
             f'{int(height)}', ha='center', va='bottom', fontweight='bold')

# 4. Distribution across dataset splits
split_category_crosstab = pd.crosstab(tasks_df['split'], tasks_df['task_category'])
split_category_crosstab.plot(kind='bar', stacked=True, ax=ax4, 
                            color=sns.color_palette("Set3", len(category_counts)))
ax4.set_title('Task Categories by Dataset Split', fontsize=14, fontweight='bold')
ax4.set_xlabel('Dataset Split')
ax4.set_ylabel('Number of Tasks')
ax4.legend(title='Task Category', bbox_to_anchor=(1.05, 1), loc='upper left')
ax4.set_xticklabels(ax4.get_xticklabels(), rotation=0)
ax4.grid(axis='y', alpha=0.3)

plt.tight_layout()
plt.show()

# Print summary statistics.
# Rows with task_type == 'no_tasks' are placeholders for files without tasks;
# counting them as tasks would inflate the total and pull the parameter
# average towards zero, so task-level figures use real tasks only.
real_tasks = tasks_df[tasks_df['task_type'] != 'no_tasks']
print(f"\n{'='*60}")
print("TASK DISTRIBUTION SUMMARY")
print(f"{'='*60}")
print(f"Total tasks analyzed: {len(real_tasks):,}")
print(f"Unique task types: {real_tasks['task_type'].nunique()}")
print(f"Most common task category: {category_counts.index[0]} ({category_counts.iloc[0]} tasks)")
print(f"Files with no tasks: {(tasks_df['task_type'] == 'no_tasks').sum()}")
# The value counts task rows, not files, so the label says "Tasks".
print(f"Tasks with parameters: {real_tasks['has_parameters'].sum()}")
print(f"Average parameters per task: {real_tasks['parameter_count'].mean():.2f}")

## 6. Generate Conversation Analysis Charts

In [None]:
# Conversation-level charts: length histogram, speaker counts,
# length per split, and characters vs. words.
fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax1, ax2), (ax3, ax4) = axes

word_counts = conv_df['word_count']
char_counts = conv_df['char_count']
speakers_per_conv = conv_df['speaker_count']

# 1. Word count distribution with mean / median markers
ax1.hist(word_counts, bins=30, alpha=0.7, color='skyblue', edgecolor='black')
for marker_value, marker_colour, marker_name in (
    (word_counts.mean(), 'red', 'Mean'),
    (word_counts.median(), 'green', 'Median'),
):
    ax1.axvline(marker_value, color=marker_colour, linestyle='--',
                label=f'{marker_name}: {marker_value:.0f}')
ax1.set_title('Distribution of Conversation Word Counts', fontsize=14, fontweight='bold')
ax1.set_xlabel('Word Count')
ax1.set_ylabel('Frequency')
ax1.legend()
ax1.grid(alpha=0.3)

# 2. How many conversations have 1, 2, 3, ... distinct speakers
speaker_counts = speakers_per_conv.value_counts().sort_index()
bars2 = ax2.bar(speaker_counts.index, speaker_counts.values,
                color=sns.color_palette("coolwarm", len(speaker_counts)))
ax2.set_title('Distribution of Speaker Counts per Conversation', fontsize=14, fontweight='bold')
ax2.set_xlabel('Number of Speakers')
ax2.set_ylabel('Number of Conversations')
ax2.grid(axis='y', alpha=0.3)

# Value label just above each bar
for rect in bars2:
    bar_top = rect.get_height()
    ax2.text(rect.get_x() + rect.get_width()/2., bar_top + max(speaker_counts.values) * 0.01,
             f'{int(bar_top)}', ha='center', va='bottom', fontweight='bold')

# 3. Conversation length by dataset split
conv_df.boxplot(column='word_count', by='split', ax=ax3)
ax3.set_title('Conversation Length Distribution by Dataset Split', fontsize=14, fontweight='bold')
ax3.set_xlabel('Dataset Split')
ax3.set_ylabel('Word Count')
ax3.grid(alpha=0.3)
plt.suptitle('')  # pandas' grouped boxplot adds a figure-level title; blank it

# 4. Character count vs word count, coloured by speaker count
scatter = ax4.scatter(word_counts, char_counts,
                      c=speakers_per_conv, cmap='viridis', alpha=0.6)
ax4.set_title('Character Count vs Word Count (colored by Speaker Count)',
              fontsize=14, fontweight='bold')
ax4.set_xlabel('Word Count')
ax4.set_ylabel('Character Count')
plt.colorbar(scatter, ax=ax4, label='Number of Speakers')
ax4.grid(alpha=0.3)

# Least-squares straight line through the scatter
trend_coeffs = np.polyfit(word_counts, char_counts, 1)
ax4.plot(word_counts, np.polyval(trend_coeffs, word_counts), "r--", alpha=0.8,
         label=f'Trend line (slope: {trend_coeffs[0]:.2f})')
ax4.legend()

plt.tight_layout()
plt.show()

# Text summary of the same metrics
banner = '=' * 60
print(f"\n{banner}")
print("CONVERSATION ANALYSIS SUMMARY")
print(f"{banner}")
print(f"Total conversations: {len(conv_df):,}")
print(f"Average word count: {word_counts.mean():.1f}")
print(f"Average character count: {char_counts.mean():.1f}")
print(f"Average speakers per conversation: {speakers_per_conv.mean():.1f}")
print(f"Most common speaker count: {speakers_per_conv.mode().iloc[0]}")
print(f"\nWord count statistics:")
print(word_counts.describe())

## 7. Visualize Task Complexity Metrics

In [None]:
# Merge conversation and task data for complexity analysis.
# The join key is (file_id, split): a file_id that repeats in another split
# must not pick up that split's conversation, and the task's own split label
# is kept. A vectorised left merge replaces the row-by-row iterrows() loop.
conversation_metric_columns = ['word_count', 'char_count', 'speaker_count']
conv_metrics_df = (
    conv_df[['file_id', 'split'] + conversation_metric_columns]
    .drop_duplicates(subset=['file_id', 'split'])  # guarantees at most one match per task row
)

# Create enhanced dataframe: every task row plus its conversation's metrics
enhanced_tasks_df = tasks_df.merge(conv_metrics_df, on=['file_id', 'split'], how='left')

# Tasks without a matching transcript get 0 for all conversation metrics
# (downstream code filters on word_count > 0 to exclude them).
enhanced_tasks_df[conversation_metric_columns] = (
    enhanced_tasks_df[conversation_metric_columns].fillna(0).astype(int)
)

# Calculate task complexity metrics
def calculate_task_complexity(row):
    """Calculate a complexity score based on various factors"""
    complexity_score = 0
    
    # Parameter complexity (0-3 points)
    if row['parameter_count'] == 0:
        complexity_score += 0
    elif row['parameter_count'] <= 2:
        complexity_score += 1
    elif row['parameter_count'] <= 4:
        complexity_score += 2
    else:
        complexity_score += 3
    
    # Conversation length complexity (0-2 points)
    word_count = row.get('word_count', 0)
    if word_count > 200:
        complexity_score += 2
    elif word_count > 100:
        complexity_score += 1
    
    # Multi-speaker complexity (0-1 point)
    speaker_count = row.get('speaker_count', 0)
    if speaker_count > 2:
        complexity_score += 1
    
    return complexity_score

# Score every task row, then chart the complexity metrics
enhanced_tasks_df['complexity_score'] = enhanced_tasks_df.apply(calculate_task_complexity, axis=1)

fig, axes = plt.subplots(2, 2, figsize=(16, 12))
(ax1, ax2), (ax3, ax4) = axes

# 1. Parameter counts per task category
enhanced_tasks_df.boxplot(column='parameter_count', by='task_category', ax=ax1)
ax1.set_title('Parameter Count Distribution by Task Category', fontsize=14, fontweight='bold')
ax1.set_xlabel('Task Category')
ax1.set_ylabel('Number of Parameters')
ax1.tick_params(axis='x', rotation=45)
plt.suptitle('')  # pandas' grouped boxplot adds a figure-level title; blank it

# 2. How many tasks fall on each complexity score
complexity_counts = enhanced_tasks_df['complexity_score'].value_counts().sort_index()
bars3 = ax2.bar(complexity_counts.index, complexity_counts.values,
                color=sns.color_palette("rocket", len(complexity_counts)))
ax2.set_title('Task Complexity Score Distribution', fontsize=14, fontweight='bold')
ax2.set_xlabel('Complexity Score (0-6)')
ax2.set_ylabel('Number of Tasks')
ax2.grid(axis='y', alpha=0.3)

# Value label just above each bar
for rect in bars3:
    bar_top = rect.get_height()
    ax2.text(rect.get_x() + rect.get_width()/2., bar_top + max(complexity_counts.values) * 0.01,
             f'{int(bar_top)}', ha='center', va='bottom', fontweight='bold')

# 3. Conversation length vs complexity, restricted to tasks that have a transcript
has_transcript = enhanced_tasks_df['word_count'] > 0
tasks_with_transcript = enhanced_tasks_df[has_transcript]
scatter2 = ax3.scatter(tasks_with_transcript['word_count'], tasks_with_transcript['complexity_score'],
                       c=tasks_with_transcript['parameter_count'], cmap='plasma', alpha=0.6, s=30)
ax3.set_title('Conversation Length vs Task Complexity', fontsize=14, fontweight='bold')
ax3.set_xlabel('Conversation Word Count')
ax3.set_ylabel('Task Complexity Score')
plt.colorbar(scatter2, ax=ax3, label='Parameter Count')
ax3.grid(alpha=0.3)

# 4. Violin plot per category, only for categories with at least 10 rows
category_sizes = enhanced_tasks_df['task_category'].value_counts()
well_populated = category_sizes[category_sizes >= 10].index
violin_rows = enhanced_tasks_df[enhanced_tasks_df['task_category'].isin(well_populated)]

if violin_rows.empty:
    ax4.text(0.5, 0.5, 'Insufficient data for violin plot',
             transform=ax4.transAxes, ha='center', va='center', fontsize=12)
    ax4.set_title('Task Complexity Distribution by Category', fontsize=14, fontweight='bold')
else:
    sns.violinplot(data=violin_rows, x='task_category', y='complexity_score', ax=ax4)
    ax4.set_title('Task Complexity Distribution by Category (Violin Plot)', fontsize=14, fontweight='bold')
    ax4.set_xlabel('Task Category')
    ax4.set_ylabel('Complexity Score')
    ax4.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Text summary of the complexity scores
mean_complexity_per_category = enhanced_tasks_df.groupby('task_category')['complexity_score'].mean()
banner = '=' * 60
print(f"\n{banner}")
print("TASK COMPLEXITY ANALYSIS")
print(f"{banner}")
print(f"Average complexity score: {enhanced_tasks_df['complexity_score'].mean():.2f}")
print(f"Most complex task category: {mean_complexity_per_category.idxmax()}")
print(f"Simplest task category: {mean_complexity_per_category.idxmin()}")
print(f"\nComplexity score breakdown:")
total_rows = len(enhanced_tasks_df)
for score, count in complexity_counts.items():
    percentage = count / total_rows * 100
    print(f"  Score {score}: {count} tasks ({percentage:.1f}%)")

# Average complexity per category, highest first
print(f"\nAverage complexity by task category:")
complexity_by_category = mean_complexity_per_category.sort_values(ascending=False)
for category, avg_complexity in complexity_by_category.items():
    print(f"  {category}: {avg_complexity:.2f} (n={category_sizes[category]})")

## 8. Create Correlation Heatmaps

In [None]:
# Create correlation analysis
# Select numeric columns for correlation analysis
numeric_columns = ['word_count', 'char_count', 'speaker_count', 'parameter_count', 'complexity_score']
correlation_data = enhanced_tasks_df[numeric_columns].fillna(0)

# Calculate correlation matrix (pairwise Pearson correlation, pandas default)
correlation_matrix = correlation_data.corr()

# Create correlation heatmaps
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))

# 1. Main correlation heatmap.
# The mask hides the diagonal and upper triangle, which only repeat the lower one.
mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))
sns.heatmap(correlation_matrix, mask=mask, annot=True, cmap='coolwarm', center=0,
            square=True, ax=ax1, cbar_kws={'label': 'Correlation Coefficient'})
ax1.set_title('Correlation Matrix - Task and Conversation Metrics', 
              fontsize=14, fontweight='bold')

# 2. Task category vs metrics heatmap
# Mean of every metric per task category
category_metrics = enhanced_tasks_df.groupby('task_category')[numeric_columns].mean()

# Standardise each metric column (z-score) so metrics on different scales are
# comparable in one heatmap. This is done in pandas rather than through a
# mid-notebook third-party import: population standard deviation (ddof=0),
# and a zero-variance column is divided by 1 so it maps to 0 instead of NaN.
column_std = category_metrics.std(ddof=0).replace(0, 1)
category_metrics_normalized = (category_metrics - category_metrics.mean()) / column_std

sns.heatmap(category_metrics_normalized.T, annot=True, cmap='viridis', 
            cbar_kws={'label': 'Standardized Score'}, ax=ax2)
ax2.set_title('Task Categories vs Metrics (Standardized)', 
              fontsize=14, fontweight='bold')
ax2.set_xlabel('Task Category')
ax2.set_ylabel('Metrics')

plt.tight_layout()
plt.show()

# Additional correlation analysis
print(f"\n{'='*60}")
print("CORRELATION ANALYSIS")
print(f"{'='*60}")
print("Strong correlations (|r| > 0.5):")
# Visit each unordered pair of metrics once (j > i)
for i in range(len(correlation_matrix.columns)):
    for j in range(i+1, len(correlation_matrix.columns)):
        corr_value = correlation_matrix.iloc[i, j]
        if abs(corr_value) > 0.5:
            var1 = correlation_matrix.columns[i]
            var2 = correlation_matrix.columns[j]
            print(f"  {var1} ↔ {var2}: {corr_value:.3f}")

print(f"\nKey insights:")
print(f"  • Word count and character count correlation: {correlation_matrix.loc['word_count', 'char_count']:.3f}")
# complexity_score is computed from parameter_count, word_count and
# speaker_count, so its correlation with them is partly by construction.
print(f"  • Complexity score and parameter count correlation: {correlation_matrix.loc['complexity_score', 'parameter_count']:.3f}")
print(f"  • Speaker count and word count correlation: {correlation_matrix.loc['speaker_count', 'word_count']:.3f}")

# Show category-specific statistics
print(f"\nTask Category Statistics:")
print("=" * 40)
for category in category_metrics.index:
    n_tasks = (enhanced_tasks_df['task_category'] == category).sum()
    avg_complexity = category_metrics.loc[category, 'complexity_score']
    avg_params = category_metrics.loc[category, 'parameter_count']
    avg_words = category_metrics.loc[category, 'word_count']
    
    print(f"{category}:")
    print(f"  Tasks: {n_tasks}")
    print(f"  Avg Complexity: {avg_complexity:.2f}")
    print(f"  Avg Parameters: {avg_params:.2f}")
    print(f"  Avg Word Count: {avg_words:.0f}")
    print()

## 9. Generate Summary Statistics Dashboard

In [None]:
# Create comprehensive summary dashboard
fig = plt.figure(figsize=(20, 16))

# Create a grid layout for the dashboard (4 rows x 4 columns)
gs = fig.add_gridspec(4, 4, hspace=0.3, wspace=0.3)

# Rows with task_type == 'no_tasks' are placeholders for files without tasks;
# task-level totals below are computed on real tasks only.
real_tasks = tasks_df[tasks_df['task_type'] != 'no_tasks']

# 1. Dataset overview (top left): share of task files per split
ax1 = fig.add_subplot(gs[0, 0])
dataset_sizes = [len(train_data['tasks']), len(test_data['tasks']), len(validation_data['tasks'])]
colors1 = ['#FF6B6B', '#4ECDC4', '#45B7D1']
wedges, texts, autotexts = ax1.pie(dataset_sizes, labels=['Train', 'Test', 'Validation'], 
                                   autopct='%1.1f%%', colors=colors1, startangle=90)
ax1.set_title('Dataset Split Distribution', fontweight='bold')

# 2. Task category summary (top middle-left)
ax2 = fig.add_subplot(gs[0, 1])
top_5_categories = category_counts.head(5)
# Palette sized to the number of bars actually drawn (there may be fewer than 5 categories)
bars2 = ax2.barh(range(len(top_5_categories)), top_5_categories.values,
                 color=sns.color_palette("viridis", len(top_5_categories)))
ax2.set_yticks(range(len(top_5_categories)))
ax2.set_yticklabels(top_5_categories.index)
ax2.set_title('Top 5 Task Categories', fontweight='bold')
ax2.set_xlabel('Count')
for i, v in enumerate(top_5_categories.values):
    ax2.text(v + max(top_5_categories.values) * 0.02, i, str(v), va='center')

# 3. Conversation metrics (top middle-right).
# Characters are divided by 10 and speakers multiplied by 50 so the three
# bars share one axis; the tick labels state the scaling.
ax3 = fig.add_subplot(gs[0, 2])
metrics_data = [conv_df['word_count'].mean(), conv_df['char_count'].mean()/10, 
                conv_df['speaker_count'].mean()*50]
metrics_labels = ['Avg Words', 'Avg Chars/10', 'Avg Speakers×50']
bars3 = ax3.bar(metrics_labels, metrics_data, color=['#FF9F40', '#36A2EB', '#4BC0C0'])
ax3.set_title('Conversation Metrics', fontweight='bold')
ax3.set_ylabel('Value')
for bar in bars3:
    height = bar.get_height()
    ax3.text(bar.get_x() + bar.get_width()/2., height,
             f'{height:.0f}', ha='center', va='bottom')

# 4. Key statistics (top right)
ax4 = fig.add_subplot(gs[0, 3])
ax4.axis('off')
stats_text = f"""
KEY STATISTICS

Total Files: {len(conv_df):,}
Total Tasks: {len(real_tasks):,}
Unique Task Types: {real_tasks['task_type'].nunique()}

Tasks with Parameters: {real_tasks['has_parameters'].sum():,}
Empty Task Files: {(tasks_df['task_type'] == 'no_tasks').sum():,}

Avg Words/Conversation: {conv_df['word_count'].mean():.0f}
Max Word Count: {conv_df['word_count'].max():,}
Min Word Count: {conv_df['word_count'].min():,}
"""
ax4.text(0.05, 0.95, stats_text, transform=ax4.transAxes, fontsize=10,
         verticalalignment='top', bbox=dict(boxstyle="round,pad=0.3", facecolor="lightgray"))

# 5. Word count distribution (second row, span 2 columns).
# Each artist carries its own label: legend(['Mean', ...]) pairs labels with
# artists purely by position, so the labels can end up on the wrong artists.
ax5 = fig.add_subplot(gs[1, :2])
ax5.hist(conv_df['word_count'], bins=30, alpha=0.7, color='lightcoral', edgecolor='black',
         label='Distribution')
ax5.axvline(conv_df['word_count'].mean(), color='red', linestyle='--', linewidth=2, label='Mean')
ax5.axvline(conv_df['word_count'].median(), color='blue', linestyle='--', linewidth=2, label='Median')
ax5.set_title('Word Count Distribution Across All Conversations', fontweight='bold')
ax5.set_xlabel('Word Count')
ax5.set_ylabel('Frequency')
ax5.legend()
ax5.grid(alpha=0.3)

# 6. Complexity distribution (second row, right side).
# Sorted ascending, so the last entry is the most complex category.
ax6 = fig.add_subplot(gs[1, 2:])
complexity_by_category = enhanced_tasks_df.groupby('task_category')['complexity_score'].mean().sort_values()
bars6 = ax6.bar(range(len(complexity_by_category)), complexity_by_category.values, 
                color=sns.color_palette("rocket", len(complexity_by_category)))
ax6.set_title('Average Task Complexity by Category', fontweight='bold')
ax6.set_xticks(range(len(complexity_by_category)))
ax6.set_xticklabels(complexity_by_category.index, rotation=45, ha='right')
ax6.set_ylabel('Average Complexity Score')
ax6.grid(axis='y', alpha=0.3)

# 7. Task distribution across splits (third row, left)
ax7 = fig.add_subplot(gs[2, :2])
split_counts = enhanced_tasks_df['split'].value_counts()
bars7 = ax7.bar(split_counts.index, split_counts.values, color=['#FF6B6B', '#4ECDC4', '#45B7D1'])
ax7.set_title('Task Distribution Across Dataset Splits', fontweight='bold')
ax7.set_ylabel('Number of Tasks')
for bar in bars7:
    height = bar.get_height()
    ax7.text(bar.get_x() + bar.get_width()/2., height,
             f'{int(height)}', ha='center', va='bottom', fontweight='bold')

# 8. Parameter analysis (third row, right)
ax8 = fig.add_subplot(gs[2, 2:])
param_counts = enhanced_tasks_df['parameter_count'].value_counts().sort_index()
bars8 = ax8.bar(param_counts.index, param_counts.values, color='skyblue', edgecolor='navy')
ax8.set_title('Distribution of Parameter Counts per Task', fontweight='bold')
ax8.set_xlabel('Number of Parameters')
ax8.set_ylabel('Number of Tasks')
ax8.grid(axis='y', alpha=0.3)

# 9. Language analysis (bottom row - analyze text patterns)
ax9 = fig.add_subplot(gs[3, :2])
# Simple language detection based on common words
def detect_language_simple(text):
    """Guess whether a text is German or English from common function words.

    The text is lower-cased and split into word tokens. Each language scores
    one point per listed word that occurs as a whole token, so 'a' no longer
    matches every text containing the letter "a", nor 'die' the word "died".

    Returns:
        'German' or 'English' for the higher score, 'Mixed/Other' on a tie
        (including texts that contain none of the listed words).
    """
    german_words = {'der', 'die', 'das', 'und', 'ich', 'sie', 'ist', 'mit', 'zu', 'von', 'auf'}
    english_words = {'the', 'and', 'to', 'of', 'a', 'in', 'for', 'is', 'on', 'that', 'by'}

    # Whole-word tokens; a set because each listed word counts at most once.
    tokens = set(re.findall(r"\w+", text.lower()))
    german_score = len(german_words & tokens)
    english_score = len(english_words & tokens)

    if german_score > english_score:
        return 'German'
    elif english_score > german_score:
        return 'English'
    else:
        return 'Mixed/Other'

# Apply language detection to a sample of conversations.
# A fixed seed keeps the sampled rows, and therefore the chart and the text
# below, identical across re-runs.
sample_convs = conv_df.sample(min(200, len(conv_df)), random_state=42)
sample_convs = sample_convs.assign(
    detected_language=sample_convs['clean_text'].apply(detect_language_simple)
)
lang_counts = sample_convs['detected_language'].value_counts()

bars9 = ax9.bar(lang_counts.index, lang_counts.values, color=['#FFA07A', '#98D8C8', '#F7DC6F'])
ax9.set_title(f'Detected Languages (Sample of {len(sample_convs)} conversations)', fontweight='bold')
ax9.set_ylabel('Number of Conversations')
for bar in bars9:
    height = bar.get_height()
    ax9.text(bar.get_x() + bar.get_width()/2., height,
             f'{int(height)}', ha='center', va='bottom', fontweight='bold')

# Figures quoted in the text panels are derived from the data, not hard-coded.
# Rows with task_type == 'no_tasks' are placeholders (one per file without
# tasks): they count as empty files, never as tasks.
is_placeholder = tasks_df['task_type'] == 'no_tasks'
real_tasks = tasks_df[~is_placeholder]
no_task_files = is_placeholder.sum()
task_file_count = len(tasks_df[['split', 'file_id']].drop_duplicates())
dominant_language = lang_counts.idxmax() if not lang_counts.empty else 'n/a'
detected_languages = ', '.join(lang_counts.index) if not lang_counts.empty else 'n/a'
min_words = conv_df['word_count'].min()
max_words = conv_df['word_count'].max()

# 10. Model performance insights (bottom right)
ax10 = fig.add_subplot(gs[3, 2:])
ax10.axis('off')
insights_text = f"""
MODEL INSIGHTS & RECOMMENDATIONS

🎯 TASK EXTRACTION CHALLENGES:
• {no_task_files}/{task_file_count} conversations have no extractable tasks
• Avg {real_tasks['parameter_count'].mean():.1f} parameters per task
• Most complex category: {complexity_by_category.index[-1]}

📊 DATA CHARACTERISTICS:
• Multilingual dataset ({dominant_language} dominant in sample)
• Variable conversation lengths ({min_words:,}-{max_words:,} words)
• {conv_df['speaker_count'].mode().iloc[0]}-speaker conversations most common

🔧 MODEL RECOMMENDATIONS:
• Focus on {category_counts.index[0]} tasks (most frequent)
• Handle empty task cases ({no_task_files} cases)
• Consider conversation length in complexity scoring
• Multilingual model required for production

📈 EVALUATION METRICS:
• Test on {len(test_data['tasks'])} files
• Validate on {len(validation_data['tasks'])} files
• Consider task-specific F1 scores
"""
ax10.text(0.02, 0.98, insights_text, transform=ax10.transAxes, fontsize=9,
          verticalalignment='top', bbox=dict(boxstyle="round,pad=0.3", facecolor="lightyellow"))

plt.suptitle('UBS Client Conversation Analysis - Complete Dashboard', fontsize=18, fontweight='bold', y=0.98)
plt.tight_layout()
plt.show()

# Final summary
print(f"\n{'='*80}")
print("🎯 FINAL ANALYSIS SUMMARY - SWISSAIHACKS25")
print(f"{'='*80}")
print(f"📊 DATASET OVERVIEW:")
print(f"   • Total conversations analyzed: {len(conv_df):,}")
print(f"   • Total tasks extracted: {len(real_tasks):,}")
print(f"   • Unique task types identified: {real_tasks['task_type'].nunique()}")
print(f"   • Dataset splits: {len(train_data['tasks'])} train, {len(test_data['tasks'])} test, {len(validation_data['tasks'])} validation")
print(f"\n🏆 KEY FINDINGS:")
print(f"   • Most common task category: {category_counts.index[0]} ({category_counts.iloc[0]} instances)")
print(f"   • Average conversation length: {conv_df['word_count'].mean():.0f} words")
print(f"   • {no_task_files} conversations have no extractable tasks")
print(f"   • Tasks with parameters: {real_tasks['has_parameters'].sum()}/{len(real_tasks)} ({real_tasks['has_parameters'].mean()*100:.1f}%)")
print(f"\n🎯 MODEL DEVELOPMENT INSIGHTS:")
print(f"   • Focus areas: {', '.join(category_counts.head(3).index)}")
print(f"   • Complexity range: 0-6 (avg: {enhanced_tasks_df['complexity_score'].mean():.2f})")
print(f"   • Multilingual support needed (detected in sample: {detected_languages})")
print(f"   • Consider conversation length in task extraction performance")
print(f"{'='*80}")