<a href="https://colab.research.google.com/github/leosammallahti/AnalysisCoLab/blob/main/Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import json
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import warnings
warnings.filterwarnings('ignore')

# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Load the enriched RCT dataset
data_path = '/content/drive/MyDrive/AEA_RCT_Parsed/structured_studies_full_v2.csv'
df = pd.read_csv(data_path, low_memory=False)

# Display basic info
print(f"Dataset loaded: {len(df)} studies")
print(f"Columns available: {df.columns.tolist()[:10]}... [{len(df.columns)} total columns]")

# Filter out studies with inadequate abstracts (as noted in your pipeline)
df['abstract_length'] = df['abstract'].fillna('').str.len()
df_clean = df[df['abstract_length'] >= 100].copy()
print(f"\nAfter filtering short/missing abstracts: {len(df_clean)} studies remain")
print(f"Removed {len(df) - len(df_clean)} studies with inadequate abstracts")

# Check the AI-generated fields we'll analyze
ai_fields = ['tldr', 'intervention_type', 'outcome_extracted', 'population_extracted', 'country_extracted']
print("\n=== AI-Generated Fields Coverage ===")
for field in ai_fields:
    coverage = df_clean[field].notna().sum() / len(df_clean) * 100
    print(f"{field}: {coverage:.1f}% complete")

print("\nDataset ready for Solution Families clustering!")

Mounted at /content/drive
Dataset loaded: 2289 studies
Columns available: ['rct_id', 'title', 'final_has_findings', 'final_findings_snippet', 'final_findings_source_type', 'country', 'start_date', 'end_date', 'primary_investigator_name', 'origin_url']... [122 total columns]

After filtering short/missing abstracts: 2105 studies remain
Removed 184 studies with inadequate abstracts

=== AI-Generated Fields Coverage ===
tldr: 100.0% complete
intervention_type: 99.8% complete
outcome_extracted: 99.8% complete
population_extracted: 99.5% complete
country_extracted: 68.3% complete

Dataset ready for Solution Families clustering!


In [3]:
# Prepare combined text for clustering based on intervention characteristics
print("=== Preparing Text for Solution Families Clustering ===\n")

# Combine relevant AI-generated fields to capture intervention essence
df_clean['intervention_essence'] = (
    df_clean['intervention_type'].fillna('') + ' | ' +
    df_clean['tldr'].fillna('') + ' | ' +
    df_clean['outcome_extracted'].fillna('') + ' | ' +
    df_clean['population_extracted'].fillna('')
)

# Remove empty rows
df_analysis = df_clean[df_clean['intervention_essence'].str.len() > 50].copy()
print(f"Studies with sufficient text for analysis: {len(df_analysis)}")

# Create TF-IDF vectors for clustering
print("\nCreating TF-IDF vectors...")
vectorizer = TfidfVectorizer(
    max_features=200,
    stop_words='english',
    min_df=3,
    max_df=0.8,
    ngram_range=(1, 2)
)

intervention_vectors = vectorizer.fit_transform(df_analysis['intervention_essence'])
print(f"Vector shape: {intervention_vectors.shape}")

# Get top features for understanding what drives the clustering
feature_names = vectorizer.get_feature_names_out()
print(f"\nTop 20 most distinctive terms across all interventions:")
print(list(feature_names[:20]))

print("\n✓ Text data prepared for clustering!")

=== Preparing Text for Solution Families Clustering ===

Studies with sufficient text for analysis: 2104

Creating TF-IDF vectors...
Vector shape: (2104, 200)

Top 20 most distinctive terms across all interventions:
['academic', 'adoption', 'adult', 'adults', 'agricultural', 'attitudes', 'based', 'behavior', 'behavioral', 'belief', 'beliefs', 'benefits', 'bias', 'boost', 'boosts', 'business', 'cash', 'change', 'child', 'children']

✓ Text data prepared for clustering!


In [4]:
# Find optimal number of clusters using elbow method
from sklearn.metrics import silhouette_score

print("=== Finding Optimal Number of Solution Families ===\n")

# Test different numbers of clusters
inertias = []
silhouette_scores = []
K_range = range(5, 16)

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
    kmeans.fit(intervention_vectors)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(intervention_vectors, kmeans.labels_))
    print(f"k={k}: Silhouette Score = {silhouette_scores[-1]:.3f}")

# Find best k based on silhouette score
best_k = K_range[np.argmax(silhouette_scores)]
print(f"\n✓ Optimal number of Solution Families: {best_k}")

# Perform final clustering with optimal k
print(f"\n=== Creating {best_k} Solution Families ===")
final_kmeans = KMeans(n_clusters=best_k, random_state=42, n_init=20)
df_analysis['solution_family'] = final_kmeans.fit_predict(intervention_vectors)

# Show distribution
family_counts = df_analysis['solution_family'].value_counts().sort_index()
print("\nCluster sizes:")
for cluster_id, count in family_counts.items():
    print(f"  Family {cluster_id}: {count} studies ({count/len(df_analysis)*100:.1f}%)")

print("\n✓ Solution Families created! Ready for AI naming and analysis.")

=== Finding Optimal Number of Solution Families ===

k=5: Silhouette Score = 0.034
k=6: Silhouette Score = 0.038
k=7: Silhouette Score = 0.039
k=8: Silhouette Score = 0.042
k=9: Silhouette Score = 0.044
k=10: Silhouette Score = 0.047
k=11: Silhouette Score = 0.047
k=12: Silhouette Score = 0.045
k=13: Silhouette Score = 0.048
k=14: Silhouette Score = 0.050
k=15: Silhouette Score = 0.054

✓ Optimal number of Solution Families: 15

=== Creating 15 Solution Families ===

Cluster sizes:
  Family 0: 128 studies (6.1%)
  Family 1: 252 studies (12.0%)
  Family 2: 122 studies (5.8%)
  Family 3: 63 studies (3.0%)
  Family 4: 414 studies (19.7%)
  Family 5: 42 studies (2.0%)
  Family 6: 97 studies (4.6%)
  Family 7: 143 studies (6.8%)
  Family 8: 144 studies (6.8%)
  Family 9: 85 studies (4.0%)
  Family 10: 141 studies (6.7%)
  Family 11: 47 studies (2.2%)
  Family 12: 102 studies (4.8%)
  Family 13: 194 studies (9.2%)
  Family 14: 130 studies (6.2%)

✓ Solution Families created! Ready for AI nam

In [7]:
from together import Together
from google.colab import userdata

# Initialize Together AI client
api_key = userdata.get('TOGETHER_API_KEY')
client = Together(api_key=api_key)

print("=== Analyzing Solution Families with Kimi-K2-Instruct ===\n")

# Function to get top terms for each cluster
def get_cluster_top_terms(cluster_id, n_terms=15):
    cluster_docs = df_analysis[df_analysis['solution_family'] == cluster_id]['intervention_essence']

    # Create TF-IDF for this cluster
    cluster_vectorizer = TfidfVectorizer(max_features=n_terms, stop_words='english', ngram_range=(1,2))
    try:
        cluster_vectorizer.fit(cluster_docs)
        return list(cluster_vectorizer.get_feature_names_out())
    except:
        return []

# Function to get representative studies
def get_cluster_samples(cluster_id, n_samples=3):
    cluster_df = df_analysis[df_analysis['solution_family'] == cluster_id]
    samples = cluster_df.sample(min(n_samples, len(cluster_df)), random_state=42)
    sample_list = []
    for _, row in samples.iterrows():
        sample_list.append({
            'title': row['title'],
            'intervention': row['intervention_type'],
            'summary': row['tldr'][:200]  # Truncate for brevity
        })
    return sample_list

# Analyze first 3 families as a test
solution_families = {}

for cluster_id in range(3):  # Start with first 3 to test
    print(f"\n--- Analyzing Solution Family {cluster_id} ---")

    # Get cluster characteristics
    cluster_size = len(df_analysis[df_analysis['solution_family'] == cluster_id])
    top_terms = get_cluster_top_terms(cluster_id)
    samples = get_cluster_samples(cluster_id)

    print(f"Size: {cluster_size} studies")
    print(f"Key terms: {', '.join(top_terms[:8])}")

    # Create prompt for AI
    prompt = f"""Analyze this cluster of development economics interventions and provide:
1. A short, catchy name (2-4 words) that captures the essence
2. A one-sentence description of the theory of change
3. Key mechanism: how these interventions work

Cluster characteristics:
- Size: {cluster_size} studies
- Top terms: {', '.join(top_terms)}
- Sample studies:
{chr(10).join([f"  • {s['title']}: {s['intervention']}" for s in samples[:2]])}

Respond in this exact format:
NAME: [name]
DESCRIPTION: [one sentence]
MECHANISM: [one sentence how it works]"""

    # Get AI response
    try:
        response = client.chat.completions.create(
            model="moonshotai/Kimi-K2-Instruct",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.6,
            max_tokens=200
        )

        analysis = response.choices[0].message.content
        print(f"\n{analysis}")

        solution_families[cluster_id] = {
            'size': cluster_size,
            'top_terms': top_terms[:8],
            'analysis': analysis
        }

    except Exception as e:
        print(f"Error analyzing cluster {cluster_id}: {e}")

print("\n✓ Initial analysis complete! Ready to analyze remaining families.")

=== Analyzing Solution Families with Kimi-K2-Instruct ===


--- Analyzing Solution Family 0 ---
Size: 128 studies
Key terms: business, financial, firm, firms, management, performance, practices, productivity

NAME: Productivity Bootcamps  
DESCRIPTION: Coach micro-entrepreneurs and their workers in modern management, finance, and peer-driven motivation so that better practices lift firm performance.  
MECHANISM: Intensive group training plus follow-up peer interaction transmits codified business know-how and social accountability, which owners and workers adopt, raising productivity and profits.

--- Analyzing Solution Family 1 ---
Size: 252 studies
Key terms: academic, college, education, educational, high, learning, outcomes, performance

NAME: Elite Teacher Pipeline
DESCRIPTION: Recruit and fast-track high-achieving graduates into underserved classrooms to raise student learning outcomes.
MECHANISM: Select top college grads, give them short, intensive training plus ongoing coaching,

In [6]:
# Install Together AI package
!pip install together -q

print("✓ Together AI package installed successfully")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/103.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.3/45.3 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m5.5 MB/s[0m eta [36m0:00:00[0m
[?25h✓ Together AI package installed successfully


In [8]:
print("=== Examining Actual Cluster Composition ===\n")

# Let's look at Family 1 (the 252-study cluster) in detail
cluster_id = 1
cluster_df = df_analysis[df_analysis['solution_family'] == cluster_id]

print(f"SOLUTION FAMILY {cluster_id}: {len(cluster_df)} studies\n")

# Analyze intervention types
print("Top 10 Intervention Types in this cluster:")
intervention_counts = cluster_df['intervention_type'].value_counts().head(10)
for intervention, count in intervention_counts.items():
    print(f"  {count:3d} studies ({count/len(cluster_df)*100:4.1f}%): {intervention[:80]}")

print("\n" + "="*50)
print("Sample of actual study titles (randomly selected):")
print("="*50)
sample_titles = cluster_df.sample(min(10, len(cluster_df)), random_state=42)['title'].tolist()
for i, title in enumerate(sample_titles, 1):
    print(f"{i:2d}. {title[:100]}")

print("\n" + "="*50)
print("Most common keywords in TLDRs:")
print("="*50)

# Get word frequency from TLDRs
from collections import Counter
import re

all_text = ' '.join(cluster_df['tldr'].fillna('').tolist()).lower()
words = re.findall(r'\b[a-z]{4,}\b', all_text)  # Words 4+ chars
word_freq = Counter(words)

# Remove common words
common_words = {'study', 'intervention', 'impact', 'results', 'effects', 'group', 'treatment', 'control', 'randomized', 'trial', 'find', 'using', 'assess', 'evaluate'}
filtered_words = {w: c for w, c in word_freq.items() if w not in common_words}

top_words = sorted(filtered_words.items(), key=lambda x: x[1], reverse=True)[:20]
for word, freq in top_words:
    print(f"  {word}: {freq} occurrences")

=== Examining Actual Cluster Composition ===

SOLUTION FAMILY 1: 252 studies

Top 10 Intervention Types in this cluster:
    1 studies ( 0.4%): mentoring; career guidance; professional development; educational intervention
    1 studies ( 0.4%): goal setting; financial incentives; educational incentives; performance-based re
    1 studies ( 0.4%): curriculum intervention; socio-emotional skills; grit; deep practice; self-regul
    1 studies ( 0.4%): growth mindset intervention, educational intervention, psychological interventio
    1 studies ( 0.4%): pre-service teacher training, classroom transformation, pedagogical methods, pra
    1 studies ( 0.4%): conditional cash transfers; loss aversion; psychological incentives; behavioral 
    1 studies ( 0.4%): fellowship program; teaching assistants; STEM education; scripted lesson plans; 
    1 studies ( 0.4%): consultancy, coaching, peer coaching, teacher training, skill development, educa
    1 studies ( 0.4%): diagnostic feedback; capac

In [9]:
from together import Together
from google.colab import userdata
import time

# Initialize Together AI client
api_key = userdata.get('TOGETHER_API_KEY')
client = Together(api_key=api_key)

print("=== Comprehensive Solution Family Analysis with Kimi-K2-Instruct ===\n")

solution_families_analysis = {}

# Analyze each cluster with ALL its studies
for cluster_id in range(15):
    cluster_df = df_analysis[df_analysis['solution_family'] == cluster_id]
    cluster_size = len(cluster_df)

    print(f"\n{'='*60}")
    print(f"Analyzing Family {cluster_id} ({cluster_size} studies)...")
    print(f"{'='*60}")

    # Prepare all studies data for this cluster
    studies_list = []
    for idx, row in cluster_df.iterrows():
        study_info = f"• {row['tldr'][:150] if pd.notna(row['tldr']) else 'No summary'} | Intervention: {row['intervention_type'][:50] if pd.notna(row['intervention_type']) else 'N/A'} | Outcome: {row['outcome_extracted'][:50] if pd.notna(row['outcome_extracted']) else 'N/A'}"
        studies_list.append(study_info)

    all_studies_text = "\n".join(studies_list)

    # Create comprehensive prompt
    prompt = f"""You are analyzing a cluster of {cluster_size} development economics RCTs. Read ALL the studies below and identify the common patterns, theories of change, and mechanisms that unite them.

FULL LIST OF STUDIES IN THIS CLUSTER:
{all_studies_text}

Based on analyzing ALL {cluster_size} studies above (not just a sample), provide:

1. SHORT NAME: A catchy 2-4 word name that captures what unites these studies
2. ONE-LINE SUMMARY: One sentence describing the common approach
3. THEORY OF CHANGE: A paragraph explaining the underlying mechanism and logic
4. KEY PATTERNS: What patterns do you see across the studies?
5. DIVERSITY NOTE: If this cluster contains diverse interventions, explain the main sub-types

Format your response exactly like this:
SHORT NAME: [name]
ONE-LINE SUMMARY: [one sentence]
THEORY OF CHANGE: [paragraph]
KEY PATTERNS: [patterns observed]
DIVERSITY NOTE: [sub-types if applicable]"""

    # Get AI response
    try:
        response = client.chat.completions.create(
            model="moonshotai/Kimi-K2-Instruct",
            messages=[{"role": "user", "content": prompt}],
            temperature=0.6,
            max_tokens=500
        )

        analysis = response.choices[0].message.content

        # Parse and display results
        print(f"\n{analysis}")

        # Store results
        solution_families_analysis[cluster_id] = {
            'size': cluster_size,
            'analysis': analysis,
            'studies': cluster_df[['title', 'tldr']].head(5).to_dict('records')  # Keep 5 examples
        }

        # Rate limiting
        time.sleep(1)

    except Exception as e:
        print(f"Error analyzing cluster {cluster_id}: {e}")
        solution_families_analysis[cluster_id] = {
            'size': cluster_size,
            'error': str(e)
        }

print("\n" + "="*60)
print("✓ Complete analysis finished! All clusters analyzed with their FULL content.")
print(f"Total studies analyzed: {len(df_analysis)}")
print(f"Total API cost estimate: ~${len(df_analysis) * 130 * 1.3 / 1_000_000:.2f}")

=== Comprehensive Solution Family Analysis with Kimi-K2-Instruct ===


Analyzing Family 0 (128 studies)...

SHORT NAME: Productivity Boosters
ONE-LINE SUMMARY: RCTs that test small, low-cost levers—training, incentives, technology, monitoring, or managerial tweaks—to raise worker and firm performance.
THEORY OF CHANGE: Small frictions (information gaps, weak incentives, poor management, limited skills, psychological stress, or missing technology) keep productivity below potential.  Delivering targeted, bite-sized fixes—whether a mobile app, a two-hour workshop, performance pay, or a feedback text—removes these frictions, raises marginal returns to effort or capital, and triggers self-reinforcing improvements in profits, output, or well-being without large new investments.
KEY PATTERNS: 1) Training is everywhere (finance, soft skills, leadership, technical, export, AI, stress-management, empathy, cybersecurity). 2) Incentives come in many forms—cash, bonuses, piece-rates, tournament pri