# Exploratory Analysis of Anthropic Economic Index

This notebook provides an initial exploration of the Anthropic Economic Index dataset, which offers insights into how AI is being incorporated into real-world tasks across different occupations.

## Setup and Data Loading

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# --- Notebook-wide presentation settings ---

# Plotting: apply the matplotlib style sheet first and the seaborn palette
# second (same order as before, so the palette call runs last).
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('colorblind')

# pandas rendering: no cap on displayed columns, at most 100 rows,
# and a display width of 1000 characters.
pd.options.display.max_columns = None
pd.options.display.max_rows = 100
pd.options.display.width = 1000

In [None]:
# Relative directory that holds the raw input files.
DATA_DIR = '../data/raw/'

# Read each input file into its own DataFrame. Every file is comma-separated
# except the cluster-level dataset, which is tab-separated.
auto_aug_task = pd.read_csv(os.path.join(DATA_DIR, 'automation_vs_augmentation_by_task.csv'))
soc_structure = pd.read_csv(os.path.join(DATA_DIR, 'SOC_Structure.csv'))
onet_tasks = pd.read_csv(os.path.join(DATA_DIR, 'onet_task_statements.csv'))
thinking_fractions = pd.read_csv(os.path.join(DATA_DIR, 'task_thinking_fractions.csv'))
cluster_data = pd.read_csv(os.path.join(DATA_DIR, 'cluster_level_dataset.tsv'), delimiter='\t')

## Exploring the Datasets
Let's first get a sense of what each dataset contains.

### Automation vs. Augmentation by Task

In [None]:
# Print the (rows, columns) dimensions, then render the first five rows.
print(f"Shape: {auto_aug_task.shape}")
auto_aug_task.head(n=5)

In [None]:
# Descriptive statistics for the numeric columns, with the quartiles
# (pandas' default percentiles) spelled out explicitly.
auto_aug_task.describe(percentiles=[0.25, 0.5, 0.75])

### SOC Structure

In [None]:
# Print the (rows, columns) dimensions, then render the first five rows.
print(f"Shape: {soc_structure.shape}")
soc_structure.head(n=5)

### O*NET Task Statements

In [None]:
# Print the (rows, columns) dimensions, then render the first five rows.
print(f"Shape: {onet_tasks.shape}")
onet_tasks.head(n=5)

### Task Thinking Fractions

In [None]:
# Print the (rows, columns) dimensions, then render the first five rows.
print(f"Shape: {thinking_fractions.shape}")
thinking_fractions.head(n=5)

### Cluster Level Data

In [None]:
# Print the (rows, columns) dimensions, then render the first five rows.
print(f"Shape: {cluster_data.shape}")
cluster_data.head(n=5)

## Initial Visualizations
Let's create some initial visualizations to understand the data better.

### Distribution of Automation vs. Augmentation Patterns

In [None]:
# Column-wise mean of each collaboration-pattern column, taken over all rows.
pattern_means = auto_aug_task.loc[
    :,
    ['directive', 'feedback_loop', 'validation',
     'task_iteration', 'learning', 'filtered'],
].mean(axis=0)

# Vertical bar chart: one bar per pattern column, bar height = its mean.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x=pattern_means.index, y=pattern_means.to_numpy(), ax=ax)
ax.set_title('Average Distribution of Collaboration Patterns Across All Tasks', fontsize=14)
ax.set_xlabel('Collaboration Pattern', fontsize=12)
ax.set_ylabel('Mean Value', fontsize=12)
# Slant the category labels by 45 degrees.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()

### Top Tasks by Automation Potential

In [None]:
# Automation score = directive + feedback_loop, stored as a new column on
# auto_aug_task so it stays available to later cells.
auto_aug_task['automation_score'] = auto_aug_task['directive'].add(
    auto_aug_task['feedback_loop']
)

# The ten rows with the highest automation score.
top_automation = auto_aug_task.nlargest(n=10, columns='automation_score')

# Horizontal bar chart: one bar per task name, bar length = automation score.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=top_automation, x='automation_score', y='task_name',
            orient='h', ax=ax)
ax.set_title('Top 10 Tasks by Automation Potential', fontsize=14)
ax.set_xlabel('Automation Score (directive + feedback_loop)', fontsize=12)
ax.set_ylabel('Task Name', fontsize=12)
fig.tight_layout()
plt.show()

### Top Tasks by Augmentation Potential

In [None]:
# Augmentation score = validation + task_iteration + learning (added left to
# right), stored as a new column on auto_aug_task for later cells.
auto_aug_task['augmentation_score'] = (
    auto_aug_task['validation']
    .add(auto_aug_task['task_iteration'])
    .add(auto_aug_task['learning'])
)

# The ten rows with the highest augmentation score.
top_augmentation = auto_aug_task.nlargest(n=10, columns='augmentation_score')

# Horizontal bar chart: one bar per task name, bar length = augmentation score.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=top_augmentation, x='augmentation_score', y='task_name',
            orient='h', ax=ax)
ax.set_title('Top 10 Tasks by Augmentation Potential', fontsize=14)
ax.set_xlabel('Augmentation Score (validation + task_iteration + learning)', fontsize=12)
ax.set_ylabel('Task Name', fontsize=12)
fig.tight_layout()
plt.show()

### Distribution of Thinking Fractions

In [None]:
# Histogram (30 bins) of the thinking_fraction column with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(thinking_fractions['thinking_fraction'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Thinking Fractions Across Tasks', fontsize=14)
ax.set_xlabel('Thinking Fraction', fontsize=12)
ax.set_ylabel('Count', fontsize=12)
fig.tight_layout()
plt.show()

### Top Tasks Requiring Extended Thinking

In [None]:
# The ten rows with the largest thinking_fraction values.
top_thinking = thinking_fractions.nlargest(n=10, columns='thinking_fraction')

# Horizontal bar chart: one bar per task name, bar length = thinking fraction.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=top_thinking, x='thinking_fraction', y='task_name',
            orient='h', ax=ax)
ax.set_title('Top 10 Tasks Requiring Extended Thinking', fontsize=14)
ax.set_xlabel('Thinking Fraction', fontsize=12)
ax.set_ylabel('Task Name', fontsize=12)
fig.tight_layout()
plt.show()

## Cluster Analysis
Let's explore the hierarchical clusters in the dataset.

In [None]:
# Number of rows per distinct `cluster_name_2` value, most frequent first.
# NOTE(review): this column is treated as the top-level cluster label here;
# confirm against the dataset documentation.
cluster_level2_counts = cluster_data['cluster_name_2'].value_counts()

# Horizontal bar chart: one bar per cluster name, bar length = row count.
fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(x=cluster_level2_counts.to_numpy(), y=cluster_level2_counts.index, ax=ax)
ax.set_title('Distribution of Top-Level Clusters', fontsize=14)
ax.set_xlabel('Count', fontsize=12)
ax.set_ylabel('Cluster Name', fontsize=12)
fig.tight_layout()
plt.show()

## Next Steps
Based on this initial exploration, we can pursue several directions for further analysis:

1. Analyze the relationship between occupations and automation/augmentation patterns
2. Explore correlations between extended thinking usage and task types
3. Conduct detailed cluster analysis to understand task relationships
4. Create visualizations to show the economic impact by sector

These analyses will be pursued in subsequent notebooks.