In [None]:
## Compare User Categorization Methods

Now let's explore how different methods of categorizing users by usage level (high, medium, low, or none) compare with each other. The CohortAnalyzer can now categorize users using five different approaches:

1. **Combined** (default): Based on both ideas and steps counts
2. **By Ideas**: Based only on number of ideas created
3. **By Steps**: Based only on number of steps completed
4. **By Completion**: Based on framework completion percentage
5. **By Interactions**: Based on total interactions (ideas + steps)

Let's visualize how these different categorization methods compare:

```python
# Extract usage cohort data
usage_cohorts = results.get('usage_cohorts', {})
method_comparison = usage_cohorts.get('method_comparison', {})
categorization_methods = usage_cohorts.get('categorization_methods', [])

# Display names for the known categorization methods. Shared by the chart's
# x-axis and the agreement-rate printout so both use the same wording.
METHOD_DISPLAY_NAMES = {
    'usage_level': 'Combined',
    'usage_by_ideas': 'By Ideas',
    'usage_by_steps': 'By Steps',
    'usage_by_completion': 'By Completion',
    'usage_by_interactions': 'By Interactions',
}


def format_method_label(method):
    """Return a readable display name for a categorization method key.

    Known keys are looked up in METHOD_DISPLAY_NAMES; any other key is derived
    by dropping 'usage_', replacing underscores with spaces and title-casing.
    """
    fallback = method.replace('usage_', '').replace('_', ' ').title()
    return METHOD_DISPLAY_NAMES.get(method, fallback)


if method_comparison and categorization_methods:
    # Create a figure for method distribution comparison
    plt.figure(figsize=(12, 8))

    # Format method names for display
    method_labels = [format_method_label(method) for method in categorization_methods]

    # Extract distribution data
    distributions = method_comparison.get('method_distributions', {})

    def level_percentages(level):
        """Return the percentage of users at `level` for each method, in order."""
        # A method without a distribution entry contributes 0 instead of
        # raising KeyError.
        # NOTE(review): values are assumed to be fractions in [0, 1], hence
        # the * 100 — confirm against CohortAnalyzer's output.
        return [
            (distributions.get(method) or {}).get(level, 0) * 100
            for method in categorization_methods
        ]

    # Extract values for each level
    high_values = level_percentages('high')
    medium_values = level_percentages('medium')
    low_values = level_percentages('low')
    none_values = level_percentages('none')

    # One entry per stacked segment, bottom to top:
    # (values, legend label, bar color, label text color).
    # The text color is chosen for contrast with the bar color rather than
    # from the segment size, so labels stay readable on every segment.
    segments = [
        (high_values, 'High', 'darkgreen', 'white'),
        (medium_values, 'Medium', 'yellowgreen', 'black'),
        (low_values, 'Low', 'gold', 'black'),
        (none_values, 'None', 'lightcoral', 'black'),
    ]

    # Create stacked bar chart
    bar_width = 0.6
    bottom_values = np.zeros(len(method_labels))

    for values, legend_label, bar_color, text_color in segments:
        bars = plt.bar(method_labels, values, bar_width, label=legend_label,
                       bottom=bottom_values, color=bar_color)

        # Add percentage labels, centered vertically inside each segment
        for bar, value, base in zip(bars, values, bottom_values):
            if value > 5:  # Only show label if segment is large enough
                plt.text(bar.get_x() + bar.get_width() / 2,
                         base + value / 2,
                         f"{value:.1f}%",
                         ha='center', va='center', color=text_color,
                         fontweight='bold')

        # The next segment starts where this one ends
        bottom_values = bottom_values + values

    # Add title and labels
    plt.title('User Distribution by Categorization Method', fontsize=16)
    plt.ylabel('Percentage of Users (%)', fontsize=12)
    plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.08), ncol=4)

    # Leave room below the axes for the legend
    plt.tight_layout(rect=[0, 0.08, 1, 0.98])
    plt.show()

    # Display agreement rates between methods, highest agreement first
    agreement_rates = method_comparison.get('agreement_rates', {})
    print("\nAgreement Rates Between Categorization Methods:")
    for pair, rate in sorted(agreement_rates.items(), key=lambda x: x[1], reverse=True):
        # NOTE(review): keys are assumed to look like '<method1>_vs_<method2>'.
        # partition() never raises on unpacking, unlike split() with a key
        # that lacks the separator or contains it more than once.
        method1, separator, method2 = pair.partition('_vs_')
        if separator:
            pair_label = f"{format_method_label(method1)} vs {format_method_label(method2)}"
        else:
            pair_label = format_method_label(pair)
        print(f"{pair_label}: {rate*100:.1f}% agreement")

else:
    print("No method comparison data available")
```

Let's also look at how the distribution changes across cohorts under a single categorization method. The cell below uses framework completion; change `target_method` to visualize a different method:

```python
# Extract usage by time cohort data
usage_by_time = usage_cohorts.get('usage_by_time_cohort', {})

if usage_by_time:
    # Select a specific categorization method to visualize
    target_method = 'usage_by_completion'  # This focuses on framework completion

    cohort_names = sorted(usage_by_time.keys())

    # Prepare data for visualization. All of these lists are filled in
    # lockstep, one entry per cohort that actually has data for target_method,
    # so the bar labels, bar heights and tool-version annotations always line up.
    x_labels = []
    high_pct = []
    medium_pct = []
    low_pct = []
    none_pct = []
    tool_versions = []

    for cohort in cohort_names:
        cohort_data = usage_by_time[cohort]
        categories = cohort_data.get('categories') or {}
        if target_method not in categories:
            # Skip cohorts without this categorization entirely; previously
            # they still got an x-axis label, which made the label list
            # longer than the value lists and broke the plot.
            continue

        # Get percentages
        # NOTE(review): values are assumed to be fractions in [0, 1], hence
        # the * 100 — confirm against CohortAnalyzer's output.
        percentages = categories[target_method].get('percentages', {})

        x_labels.append(cohort.replace('_', ' ').title())
        high_pct.append(percentages.get('high', 0) * 100)
        medium_pct.append(percentages.get('medium', 0) * 100)
        low_pct.append(percentages.get('low', 0) * 100)
        none_pct.append(percentages.get('none', 0) * 100)

        # Get tool version
        tool_versions.append(cohort_data.get('tool_version', 'unknown'))

    if not x_labels:
        print(f"No '{target_method}' data available for any cohort")
    else:
        # Create stacked bar chart
        plt.figure(figsize=(12, 8))

        # Define colors for usage levels
        colors = {
            'high': 'darkgreen',
            'medium': 'yellowgreen',
            'low': 'gold',
            'none': 'lightcoral'
        }

        # One entry per stacked segment, bottom to top:
        # (values, legend label, bar color, label text color).
        # The text color is chosen for contrast with the bar color rather
        # than from the segment size, so labels stay readable on every segment.
        segments = [
            (high_pct, 'High Completion', colors['high'], 'white'),
            (medium_pct, 'Medium Completion', colors['medium'], 'black'),
            (low_pct, 'Low Completion', colors['low'], 'black'),
            (none_pct, 'No Completion', colors['none'], 'black'),
        ]

        bottom_values = np.zeros(len(x_labels))

        for values, legend_label, bar_color, text_color in segments:
            bars = plt.bar(x_labels, values, bottom=bottom_values,
                           label=legend_label, color=bar_color)

            # Add percentage labels, centered vertically inside each segment
            for bar, value, base in zip(bars, values, bottom_values):
                if value > 5:  # Only show label if segment is large enough
                    plt.text(bar.get_x() + bar.get_width() / 2,
                             base + value / 2,
                             f'{value:.1f}%',
                             ha='center', va='center',
                             color=text_color,
                             fontweight='bold')

            # The next segment starts where this one ends
            bottom_values = bottom_values + values

        # Add title and labels
        plt.title('User Engagement by Framework Completion Across Cohorts', fontsize=16)
        plt.ylabel('Percentage of Users (%)', fontsize=12)
        plt.legend(loc='upper center', bbox_to_anchor=(0.5, -0.08), ncol=4)

        # Add tool version annotations; bar i of a categorical axis sits at x = i
        for i, version in enumerate(tool_versions):
            plt.annotate(f"Tool: {version}",
                         xy=(i, 5),
                         xytext=(0, -30),
                         textcoords='offset points',
                         ha='center')

        # Leave room below the axes for the legend
        plt.tight_layout(rect=[0, 0.08, 1, 0.98])
        plt.show()
else:
    print("No usage by time cohort data available")
```