<a href="https://colab.research.google.com/github/leosammallahti/AnalysisCoLab/blob/main/Visualisations1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Import necessary library
import pandas as pd

# Load the CONSOLIDATED CLEAN dataset (not the raw one!)
df = pd.read_csv('/content/drive/MyDrive/AEA_RCT_Parsed/consolidated_rct_dataset_clean.csv')

# Check the shape and columns
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("\n" + "="*50 + "\n")

# Define the AI-generated keyword columns
ai_keyword_columns = ['keywords_sector', 'keywords_mechanisms',
                      'keywords_implementation', 'keywords_context']

# Create a boolean mask for studies with AI-generated keywords.
# A study has AI keywords if AT LEAST ONE of the AI keyword columns holds a
# real value, i.e. the cell is neither null nor a blank/whitespace-only string.
# The check is done per cell and OR-ed across columns, so a blank value in one
# column cannot veto genuine keywords stored in another column.
has_ai_keywords_mask = pd.Series(False, index=df.index)
for col in ai_keyword_columns:
    # notna() must be tested separately: astype(str) would turn NaN into the
    # non-empty text 'nan' and make a missing cell look populated.
    column_has_value = df[col].notna() & (df[col].astype(str).str.strip() != '')
    has_ai_keywords_mask = has_ai_keywords_mask | column_has_value

# Split the dataset into two groups
studies_with_ai_keywords = df[has_ai_keywords_mask].copy()
studies_without_ai_keywords = df[~has_ai_keywords_mask].copy()

# Display counts for verification
print(f"Total studies: {len(df)}")
print(f"Studies WITH AI-generated keywords: {len(studies_with_ai_keywords)}")
print(f"Studies WITHOUT AI-generated keywords: {len(studies_without_ai_keywords)}")

# Save the two files
studies_with_ai_keywords.to_csv('/content/drive/MyDrive/AEA_RCT_Parsed/studies_with_ai_keywords.csv',
                                index=False)
studies_without_ai_keywords.to_csv('/content/drive/MyDrive/AEA_RCT_Parsed/studies_without_ai_keywords.csv',
                                   index=False)

print("\nFiles saved successfully:")
print("1. studies_with_ai_keywords.csv")
print("2. studies_without_ai_keywords.csv")

# Display sample data to verify the split
print("\n--- Sample of studies WITH AI keywords ---")
print(studies_with_ai_keywords[['rct_id', 'Title'] + ai_keyword_columns].head(3))

print("\n--- Sample of studies WITHOUT AI keywords ---")
print(studies_without_ai_keywords[['rct_id', 'Title'] + ai_keyword_columns].head(3))

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Dataset shape: (2281, 21)
Columns: ['Title', 'Country', 'Subregion', 'Abstract', 'Description_of_intervention', 'Primary_outcomes', 'Secondary_outcomes', 'Population', 'Search_vector', 'keywords', 'keywords_additional', 'jel_codes', 'keywords_sector', 'keywords_mechanisms', 'keywords_implementation', 'keywords_context', 'Researchers', 'Researcher_affiliation', 'Year', 'Findings', 'rct_id']


Total studies: 2281
Studies WITH AI-generated keywords: 2117
Studies WITHOUT AI-generated keywords: 164

Files saved successfully:
1. studies_with_ai_keywords.csv
2. studies_without_ai_keywords.csv

--- Sample of studies WITH AI keywords ---
   rct_id                                              Title  \
0  1784.0            Two Approaches to Community Development   
1  1795.0  Consumer Response to New Retirement Income Pro...   
2  1814.0  Bank-Insured RoSCA for Microfin

To activate TogetherAI using your secret key, you need to retrieve the key from Colab secrets and set it as an environment variable or pass it directly to the TogetherAI client.

In [None]:
# Import necessary library
import os
from google.colab import userdata

# Retrieve the API key from Colab secrets
together_api_key = userdata.get('TOGETHER_API_KEY')

# Stop here with a clear message if no usable key came back. Without this
# check a None value makes the os.environ assignment below fail with an
# unhelpful TypeError (environment values must be str), and an empty string
# would be stored silently while the success message is still printed.
if not together_api_key:
    raise ValueError(
        "Colab secret 'TOGETHER_API_KEY' is missing or empty; "
        "add it to the Colab secrets for this notebook and re-run this cell."
    )

# Set the API key as an environment variable
os.environ['TOGETHER_API_KEY'] = together_api_key

# Now you can use the TogetherAI API by importing the necessary library and initializing the client
# For example, if you are using the `together` Python package:
# from together import Together
# client = Together(api_key=together_api_key)
# Or if you are using a library that reads from the environment variable:
# from some_together_library import Client
# client = Client() # It should pick up the API key from the environment variable

print("TogetherAI API key retrieved and set up.")
# Note: Replace the example client initialization with the actual library you are using

TogetherAI API key retrieved and set up.


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import re
from collections import Counter, defaultdict
import json
import ast
import warnings
warnings.filterwarnings('ignore')

# For TogetherAI integration
import together
import requests
from typing import List, Dict, Tuple

# Initialize TogetherAI (you'll need to set your API key)
# together.api_key = "YOUR_API_KEY_HERE"

# Load the data
# This CSV is the "with AI keywords" split written by the first cell of this
# notebook, so that cell must have been run (and Google Drive mounted at
# /content/drive) before this line can succeed.
df = pd.read_csv('/content/drive/MyDrive/AEA_RCT_Parsed/studies_with_ai_keywords.csv')

# =====================================
# ANALYSIS 1: NATURAL TOPIC CLUSTERING
# =====================================

def cluster_studies_by_embeddings(df, n_clusters_range=(5, 20)):
    """
    Cluster studies with K-Means to discover natural research communities.

    Feature vectors are taken from the ``Search_vector`` column when it is
    present; each value is parsed as comma-separated floats, optionally wrapped
    in square brackets. Rows whose vector is missing, malformed, non-finite or
    of a different length than the most common length get an all-zero vector
    of that length. If the column is absent, or no value in it can be parsed,
    TF-IDF features built from ``Abstract`` + ``Description_of_intervention``
    are used instead.

    The number of clusters is the candidate with the highest silhouette score.
    A 2-D UMAP projection is computed for plotting and an interactive Plotly
    scatter plot is shown.

    Args:
        df: DataFrame of studies. It is modified IN PLACE: the columns
            ``cluster``, ``x`` and ``y`` are added (or overwritten).
        n_clusters_range: ``(start, stop)`` pair of candidate cluster counts.
            ``stop`` is exclusive, so the default evaluates 5 through 19.
            Candidates below 2 or not smaller than the number of studies are
            skipped because the silhouette score is undefined for them.

    Returns:
        Tuple ``(df, cluster_summaries)`` where ``df`` is the same object that
        was passed in and ``cluster_summaries`` is a list with one dict per
        cluster (keys ``cluster_id``, ``size``, ``top_keywords``,
        ``main_countries``, ``sample_studies``).

    Raises:
        ValueError: if no candidate cluster count is usable for the number of
            studies in ``df``.
    """
    print("🔬 ANALYSIS 1: Discovering Natural Research Communities")
    print("=" * 60)

    # Silhouette scoring needs at least 2 clusters and fewer clusters than samples.
    candidate_counts = list(range(max(2, n_clusters_range[0]),
                                  min(n_clusters_range[1], len(df))))
    if not candidate_counts:
        raise ValueError(
            f"No usable cluster count in range {tuple(n_clusters_range)} "
            f"for a dataset of {len(df)} studies"
        )

    embeddings = None
    if 'Search_vector' in df.columns:
        # Parse embeddings if stored as strings
        parsed_vectors = []
        for vec in df['Search_vector']:
            try:
                if isinstance(vec, str):
                    # Handle bracketed and un-bracketed string formats
                    cleaned = vec.replace('[', '').replace(']', '')
                    embedding = np.array([float(x) for x in cleaned.split(',')])
                else:
                    embedding = np.asarray(vec, dtype=float)
            except (TypeError, ValueError):
                embedding = None

            # A missing cell (NaN) parses to a 0-d array; treat it, empty
            # vectors and vectors containing NaN/inf as unusable.
            if embedding is not None and (
                    embedding.ndim != 1
                    or embedding.size == 0
                    or not np.isfinite(embedding).all()):
                embedding = None
            parsed_vectors.append(embedding)

        # Use the most common vector length as the reference dimension so the
        # placeholder rows always stack into a rectangular matrix.
        dim_counts = Counter(len(e) for e in parsed_vectors if e is not None)
        if dim_counts:
            embedding_dim = dim_counts.most_common(1)[0][0]
            usable = [e is not None and len(e) == embedding_dim for e in parsed_vectors]
            n_unusable = len(usable) - sum(usable)
            if n_unusable:
                print(f"⚠️ {n_unusable} studies have a missing or malformed "
                      f"Search_vector; using zero vectors for them.")
            embeddings = np.array([
                e if ok else np.zeros(embedding_dim)
                for e, ok in zip(parsed_vectors, usable)
            ])
        else:
            print("No usable vectors found in 'Search_vector'.")

    if embeddings is None:
        # Create embeddings from text if not available
        print("Creating embeddings from text...")
        vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
        text_data = df['Abstract'].fillna('') + ' ' + df['Description_of_intervention'].fillna('')
        embeddings = vectorizer.fit_transform(text_data).toarray()

    # Find optimal number of clusters using silhouette score
    silhouette_scores = []
    for n_clusters in candidate_counts:
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        cluster_labels = kmeans.fit_predict(embeddings)
        silhouette_avg = silhouette_score(embeddings, cluster_labels)
        silhouette_scores.append(silhouette_avg)
        print(f"Clusters: {n_clusters}, Silhouette Score: {silhouette_avg:.3f}")

    # Use optimal number of clusters
    optimal_n = candidate_counts[int(np.argmax(silhouette_scores))]
    print(f"\n✨ Optimal number of clusters: {optimal_n}")

    # Perform final clustering
    kmeans = KMeans(n_clusters=optimal_n, random_state=42, n_init=10)
    df['cluster'] = kmeans.fit_predict(embeddings)

    # Reduce dimensions for visualization
    print("\nCreating 2D visualization...")
    reducer = umap.UMAP(n_components=2, random_state=42)
    embeddings_2d = reducer.fit_transform(embeddings)
    df['x'] = embeddings_2d[:, 0]
    df['y'] = embeddings_2d[:, 1]

    # Analyze each cluster
    print("\n📊 Cluster Analysis:")
    print("-" * 60)
    cluster_summaries = []

    for cluster_id in range(optimal_n):
        cluster_data = df[df['cluster'] == cluster_id]
        size = len(cluster_data)

        # Get most common keywords. The keyword strings in this dataset mix
        # ';' and ',' as separators, so both are treated as delimiters.
        all_keywords = []
        for keywords in cluster_data['keywords'].dropna():
            if isinstance(keywords, str):
                all_keywords.extend(keywords.replace(';', ',').split(','))

        keyword_freq = Counter(k.strip() for k in all_keywords if k.strip())
        top_keywords = keyword_freq.most_common(5)

        # Get most common countries
        countries = cluster_data['Country'].dropna().value_counts().head(3)

        # Sample titles (seeded so repeated runs show the same examples)
        sample_titles = cluster_data['Title'].sample(
            min(3, len(cluster_data)), random_state=42
        ).tolist()

        cluster_summary = {
            'cluster_id': cluster_id,
            'size': size,
            'top_keywords': top_keywords,
            'main_countries': countries.to_dict(),
            'sample_studies': sample_titles
        }
        cluster_summaries.append(cluster_summary)

        # str() guards against missing (NaN) titles, which are floats and cannot be sliced.
        first_title = str(sample_titles[0]) if sample_titles else 'N/A'

        print(f"\n🔹 Cluster {cluster_id} ({size} studies)")
        print(f"   Top Keywords: {', '.join([k[0] for k in top_keywords[:3]])}")
        print(f"   Main Countries: {', '.join(str(c) for c in countries.index[:3])}")
        print(f"   Sample Study: {first_title[:60]}...")

    # Create interactive visualization
    fig = px.scatter(df, x='x', y='y', color='cluster',
                     hover_data=['Title', 'Country', 'Year'],
                     title='Natural Research Communities in Development Economics',
                     labels={'cluster': 'Research Community'},
                     color_continuous_scale='Viridis')
    fig.update_traces(marker=dict(size=8))
    fig.show()

    return df, cluster_summaries

# =====================================
# ANALYSIS 2: TIMELINE OF RESEARCH PARADIGMS
# =====================================

def analyze_research_timeline(df, min_studies_per_year=1):
    """
    Analyze how research topics have evolved over time.

    For every year between the earliest and latest valid ``Year`` value, the
    share of studies mentioning each paradigm is computed. A study mentions a
    paradigm when any of the paradigm's keywords occurs (case-insensitive
    substring match) in its ``Abstract``, ``Description_of_intervention`` or
    ``keywords`` text. The shares are plotted with Plotly and a short
    peak/trend summary is printed for paradigms with more than five years of
    data.

    The input frame is not modified; rows whose ``Year`` cannot be converted
    to a number are ignored.

    Args:
        df: DataFrame with columns ``Year``, ``Abstract``,
            ``Description_of_intervention`` and ``keywords``.
        min_studies_per_year: Years with fewer studies than this are left out
            of the timeline. The default of 1 keeps every year that has at
            least one study; raise it to stop single-study years from showing
            up as 0% / 100% spikes.

    Returns:
        ``defaultdict`` mapping paradigm name -> ``defaultdict`` mapping year
        (int) -> percentage of that year's studies mentioning the paradigm.
        Empty when no study has a usable year.
    """
    print("\n📅 ANALYSIS 2: Evolution of Development Research Paradigms")
    print("=" * 60)

    # Define paradigm keywords
    # NOTE(review): matching is plain substring search, so short keywords such
    # as 'app' also hit words like 'approach' — consider word-boundary matching.
    paradigms = {
        'Microfinance Era': ['microfinance', 'microcredit', 'micro-loan', 'micro loan', 'credit', 'lending'],
        'Behavioral Revolution': ['behavioral', 'nudge', 'psychology', 'cognitive', 'bias', 'framing'],
        'Digital Transformation': ['digital', 'mobile', 'technology', 'app', 'online', 'internet', 'sms', 'text message'],
        'Cash Transfer Wave': ['cash transfer', 'unconditional', 'UBI', 'basic income', 'direct transfer'],
        'Women Empowerment': ['women', 'gender', 'female', 'girl', 'empowerment', 'maternal'],
        'Health Interventions': ['health', 'medical', 'vaccine', 'nutrition', 'disease', 'mortality'],
        'Education Innovation': ['education', 'school', 'learning', 'teacher', 'student', 'literacy'],
        'Agriculture & Climate': ['agriculture', 'farming', 'crop', 'climate', 'weather', 'irrigation'],
        'Governance & Institutions': ['governance', 'corruption', 'accountability', 'transparency', 'institution']
    }

    paradigm_timeline = defaultdict(lambda: defaultdict(int))

    # Coerce years without writing back into the caller's DataFrame
    numeric_years = pd.to_numeric(df['Year'], errors='coerce')
    has_year = numeric_years.notna().to_numpy()
    if not has_year.any():
        print("No studies with a usable 'Year' value; skipping timeline analysis.")
        return paradigm_timeline
    year_values = numeric_years.to_numpy()[has_year].astype(int)

    # Build the searchable text once per study instead of once per paradigm
    dated = df[has_year]
    texts = [
        (str(abstract) + ' ' + str(description) + ' ' + str(keywords)).lower()
        for abstract, description, keywords in zip(
            dated['Abstract'], dated['Description_of_intervention'], dated['keywords'])
    ]

    # Plain Python ints keep the year keys JSON-serialisable
    year_min = int(year_values.min())
    year_max = int(year_values.max())
    years = range(year_min, year_max + 1)
    year_totals = {year: int((year_values == year).sum()) for year in years}
    min_required = max(1, int(min_studies_per_year))

    # Calculate paradigm prevalence by year
    for paradigm, keywords in paradigms.items():
        # The text is lowercased, so keywords must be too ('UBI' would never match otherwise)
        lowered_keywords = [keyword.lower() for keyword in keywords]
        hits = np.array(
            [any(keyword in text for keyword in lowered_keywords) for text in texts],
            dtype=bool
        )
        for year in years:
            total_studies = year_totals[year]
            if total_studies >= min_required:
                count = int(hits[year_values == year].sum())
                paradigm_timeline[paradigm][year] = (count / total_studies) * 100

    # Create visualization
    fig = go.Figure()

    colors = px.colors.qualitative.Set3
    for i, (paradigm, year_data) in enumerate(paradigm_timeline.items()):
        years_list = sorted(year_data.keys())
        percentages = [year_data[y] for y in years_list]

        fig.add_trace(go.Scatter(
            x=years_list,
            y=percentages,
            mode='lines+markers',
            name=paradigm,
            line=dict(width=2, color=colors[i % len(colors)]),
            marker=dict(size=6)
        ))

    fig.update_layout(
        title='Evolution of Development Research Paradigms Over Time',
        xaxis_title='Year',
        yaxis_title='Percentage of Studies (%)',
        hovermode='x unified',
        height=600
    )
    fig.show()

    # Identify paradigm shifts
    print("\n🔄 Detected Paradigm Shifts:")
    print("-" * 40)

    for paradigm, year_data in paradigm_timeline.items():
        years_sorted = sorted(year_data.items())
        if len(years_sorted) > 5:
            # Find peak year
            peak_year = max(years_sorted, key=lambda x: x[1])

            # Calculate growth rate: mean of the first three years vs. the last
            # three. The 0.1 floor only prevents division by zero, so a
            # near-zero early average still produces a very large percentage.
            early_avg = np.mean([v for y, v in years_sorted[:3]])
            recent_avg = np.mean([v for y, v in years_sorted[-3:]])
            growth_rate = ((recent_avg - early_avg) / max(early_avg, 0.1)) * 100

            status = "📈 Rising" if growth_rate > 20 else "📉 Declining" if growth_rate < -20 else "➡️ Stable"

            print(f"{paradigm}:")
            print(f"  Peak Year: {peak_year[0]} ({peak_year[1]:.1f}% of studies)")
            print(f"  Trend: {status} ({growth_rate:+.1f}% change)")
            print()

    return paradigm_timeline

# =====================================
# ANALYSIS 3: SIMILAR PROBLEMS, DIFFERENT SOLUTIONS
# =====================================

def find_intervention_menus(df, n_problems=10):
    """
    Identify common problems and create menus of different interventions tried.

    A study addresses a problem when any of the problem's keywords occurs
    (substring match on lowercased text) in its ``Primary_outcomes`` or
    ``Secondary_outcomes``. Problems matched by fewer than three studies are
    dropped. Within each problem, every study is assigned to the FIRST
    intervention type whose keywords occur in its
    ``Description_of_intervention``; the order of the rules therefore matters.

    After printing the menus, a similarity analysis is run for the first
    problem in the result via ``create_similarity_network``.

    Args:
        df: DataFrame with columns ``Title``, ``Primary_outcomes``,
            ``Secondary_outcomes``, ``Description_of_intervention``,
            ``Country``, ``Year`` and ``Findings``.
        n_problems: Currently unused; kept so existing callers keep working.

    Returns:
        Dict mapping problem name -> ``{'total_studies': int,
        'intervention_types': {type name: [study dict, ...]}}`` where each
        study dict has the keys ``title``, ``intervention``, ``country``,
        ``year`` and ``findings``.
    """
    print("\n🎯 ANALYSIS 3: Intervention Menus for Common Challenges")
    print("=" * 60)

    # Common development challenges
    problem_categories = {
        'School Enrollment': ['enrollment', 'attendance', 'dropout', 'school participation'],
        'Learning Outcomes': ['test scores', 'learning', 'literacy', 'numeracy', 'academic'],
        'Health Service Uptake': ['vaccination', 'immunization', 'clinic visits', 'health service'],
        'Maternal Health': ['maternal', 'pregnancy', 'prenatal', 'antenatal', 'delivery'],
        'Agricultural Productivity': ['yield', 'harvest', 'crop production', 'agricultural output'],
        'Financial Inclusion': ['savings', 'bank account', 'financial access', 'credit'],
        'Income Generation': ['income', 'earnings', 'wages', 'poverty', 'consumption'],
        'Women Empowerment': ['women decision', 'female empowerment', 'gender equality', 'women control'],
        'Nutrition': ['malnutrition', 'stunting', 'nutrition', 'dietary', 'food security'],
        'Employment': ['employment', 'job', 'unemployment', 'work', 'labor']
    }

    # Ordered classification rules: the first rule with a matching word wins.
    intervention_type_rules = [
        ('Financial Incentives', ['cash', 'payment', 'transfer', 'subsidy']),
        ('Education/Information', ['information', 'education', 'training', 'workshop']),
        ('Behavioral Nudges', ['nudge', 'reminder', 'sms', 'text', 'message']),
        ('Technology Solutions', ['technology', 'digital', 'app', 'mobile']),
        ('Social/Community', ['community', 'group', 'peer', 'social']),
        ('Infrastructure', ['infrastructure', 'facility', 'building', 'construction']),
    ]

    # Build each study's outcome text and record once, rather than once per problem.
    study_records = []
    for _, row in df.iterrows():
        outcomes = str(row['Primary_outcomes']).lower() + ' ' + str(row['Secondary_outcomes']).lower()
        study_records.append((outcomes, {
            'title': row['Title'],
            'intervention': row['Description_of_intervention'],
            'country': row['Country'],
            'year': row['Year'],
            'findings': row['Findings']
        }))

    intervention_menus = {}

    for problem, keywords in problem_categories.items():
        # Find studies addressing this problem (fresh dict per problem so
        # entries are never shared between menus)
        relevant_studies = [
            dict(record)
            for outcomes, record in study_records
            if any(keyword in outcomes for keyword in keywords)
        ]

        if len(relevant_studies) >= 3:  # Only include problems with multiple studies
            # Categorize interventions
            intervention_types = defaultdict(list)

            for study in relevant_studies:
                intervention_text = str(study['intervention']).lower()

                # Classify intervention type
                type_key = next(
                    (label for label, words in intervention_type_rules
                     if any(word in intervention_text for word in words)),
                    'Other Approaches'
                )
                intervention_types[type_key].append(study)

            intervention_menus[problem] = {
                'total_studies': len(relevant_studies),
                'intervention_types': dict(intervention_types)
            }

    # Display intervention menus
    print("\n📋 Intervention Menus for Common Development Challenges:\n")

    for problem, menu in intervention_menus.items():
        print(f"🎯 {problem} ({menu['total_studies']} studies)")
        print("-" * 50)

        for intervention_type, studies in menu['intervention_types'].items():
            print(f"\n  💡 {intervention_type} ({len(studies)} studies):")
            for study in studies[:2]:  # Show top 2 examples
                # str() guards against missing (NaN) titles, which are floats
                title = str(study['title'])
                title_short = title[:60] + '...' if len(title) > 60 else title
                print(f"     • {title_short}")
                print(f"       ({study['country']}, {study['year']})")
        print()

    # Create similarity network for one problem
    if intervention_menus:
        sample_problem = list(intervention_menus.keys())[0]
        print(f"\n🔗 Creating similarity network for: {sample_problem}")
        create_similarity_network(df, sample_problem, problem_categories[sample_problem])

    return intervention_menus

def create_similarity_network(df, problem_name, keywords):
    """
    Print the most similar pairs of studies addressing one problem.

    Studies are selected when any of ``keywords`` occurs (substring match on
    lowercased text) in ``Primary_outcomes`` or ``Secondary_outcomes``. When
    more than two studies are selected, TF-IDF vectors are built from
    ``Abstract`` + ``Description_of_intervention`` and the (up to) five pairs
    with cosine similarity above 0.3 are printed, most similar first.
    Nothing is printed when two or fewer studies match.

    Args:
        df: DataFrame of studies; any index is accepted because rows are
            selected by position.
        problem_name: Label used in the printed heading.
        keywords: Iterable of lowercase keywords identifying the problem.

    Returns:
        None. Results are printed only.
    """
    # Filter relevant studies with a positional boolean mask. (Index labels
    # from iterrows() are not valid .iloc positions once the frame has been
    # filtered or re-indexed.)
    is_relevant = np.array([
        any(keyword in (str(primary).lower() + ' ' + str(secondary).lower())
            for keyword in keywords)
        for primary, secondary in zip(df['Primary_outcomes'], df['Secondary_outcomes'])
    ], dtype=bool)

    if is_relevant.sum() > 2:
        relevant_df = df[is_relevant].copy()

        # Create TF-IDF vectors
        vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
        text_data = relevant_df['Abstract'].fillna('') + ' ' + relevant_df['Description_of_intervention'].fillna('')
        tfidf_matrix = vectorizer.fit_transform(text_data)

        # Calculate similarity
        similarity_matrix = cosine_similarity(tfidf_matrix)

        # Find most similar pairs (upper triangle only, so each pair is seen once)
        print(f"\n  Most similar intervention pairs for {problem_name}:")
        similar_pairs = []
        for i in range(len(similarity_matrix)):
            for j in range(i+1, len(similarity_matrix)):
                if similarity_matrix[i, j] > 0.3:  # Threshold for similarity
                    similar_pairs.append((i, j, similarity_matrix[i, j]))

        similar_pairs.sort(key=lambda x: x[2], reverse=True)

        for i, j, sim in similar_pairs[:5]:
            study1 = relevant_df.iloc[i]
            study2 = relevant_df.iloc[j]
            print(f"\n  Similarity: {sim:.2f}")
            # str() guards against missing (NaN) titles, which are floats
            print(f"  Study 1: {str(study1['Title'])[:50]}...")
            print(f"  Study 2: {str(study2['Title'])[:50]}...")

# =====================================
# ANALYSIS 4: SURPRISING RESULTS
# =====================================

def analyze_surprising_findings(df):
    """
    Flag studies whose text suggests unexpected results, using keyword matching.

    The lowercased ``Findings`` and ``Abstract`` of each study are searched
    for phrases grouped into five surprise types. A study's surprise score is
    the number of types with at least one matching phrase; studies scoring
    zero are dropped. The top ten are printed, followed by a count per
    surprise type and a Plotly histogram of the flagged studies' years.

    Args:
        df: DataFrame with columns ``Title``, ``Year``, ``Country``,
            ``Description_of_intervention``, ``Findings`` and ``Abstract``.

    Returns:
        List of dicts sorted by descending ``surprise_score``. Each dict has
        the keys ``title``, ``year``, ``country``, ``intervention``,
        ``findings`` (all copied unchanged from the row, so they may be NaN),
        ``surprise_score`` and ``surprise_types``.
    """
    def _excerpt(value, limit):
        """Return a cell as display text cut to ``limit`` characters ('' when missing)."""
        text = '' if pd.isna(value) else str(value)
        return text[:limit]

    print("\n😲 ANALYSIS 4: Studies with Surprising or Unexpected Results")
    print("=" * 60)

    # Keywords indicating surprise or unexpected results
    surprise_keywords = {
        'unexpected': ['unexpected', 'surprising', 'contrary to', 'against expectations', 'paradox'],
        'no_effect': ['no effect', 'no impact', 'no significant', 'null result', 'no difference'],
        'negative': ['negative effect', 'backfired', 'worse', 'reduced', 'decreased', 'harmful'],
        'much_larger': ['larger than expected', 'exceeded', 'remarkable', 'extraordinary', 'substantial'],
        'opposite': ['opposite', 'reverse', 'contrary', 'inverse', 'contradictory']
    }

    surprising_studies = []

    for idx, row in df.iterrows():
        findings = str(row['Findings']).lower()
        abstract = str(row['Abstract']).lower()
        combined_text = findings + ' ' + abstract

        # One point per surprise type with at least one matching phrase
        surprise_types = [
            surprise_type
            for surprise_type, keywords in surprise_keywords.items()
            if any(keyword in combined_text for keyword in keywords)
        ]
        surprise_score = len(surprise_types)

        if surprise_score > 0:
            surprising_studies.append({
                'title': row['Title'],
                'year': row['Year'],
                'country': row['Country'],
                'intervention': row['Description_of_intervention'],
                'findings': row['Findings'],
                'surprise_score': surprise_score,
                'surprise_types': surprise_types
            })

    # Sort by surprise score
    surprising_studies.sort(key=lambda x: x['surprise_score'], reverse=True)

    print(f"\nFound {len(surprising_studies)} studies with potentially surprising results")
    print("\n🏆 Top 10 Most Surprising Studies:\n")

    for i, study in enumerate(surprising_studies[:10], 1):
        # A study can be flagged through its abstract alone, in which case
        # 'findings' (or the title) may be NaN; _excerpt keeps printing safe.
        print(f"{i}. {_excerpt(study['title'], 70)}...")
        print(f"   📍 {study['country']}, {study['year']}")
        print(f"   🎯 Surprise Type: {', '.join(study['surprise_types'])}")
        print(f"   📊 Finding excerpt: {_excerpt(study['findings'], 150)}...")
        print()

    # Categorize by surprise type
    print("\n📊 Distribution of Surprise Types:")
    print("-" * 40)
    surprise_distribution = defaultdict(int)
    for study in surprising_studies:
        for surprise_type in study['surprise_types']:
            surprise_distribution[surprise_type] += 1

    for surprise_type, count in sorted(surprise_distribution.items(), key=lambda x: x[1], reverse=True):
        print(f"  {surprise_type.replace('_', ' ').title()}: {count} studies")

    # Create visualization
    if surprising_studies:
        years = [s['year'] for s in surprising_studies if pd.notna(s['year'])]
        if years:
            fig = px.histogram(x=years, nbins=20,
                             title='Distribution of Surprising Results Over Time',
                             labels={'x': 'Year', 'y': 'Number of Surprising Studies'})
            fig.show()

    return surprising_studies

# =====================================
# ENHANCED SENTIMENT ANALYSIS WITH AI
# =====================================

def ai_sentiment_analysis(df, sample_size=50):
    """
    Build TogetherAI prompts for a sample of studies and return sentiment scores.

    WARNING: the TogetherAI call is still commented out. The prompt is built
    for each sampled study but never sent, and the returned scores are random
    integers in 1..5 that carry no information about the studies.

    Args:
        df: DataFrame with columns ``Title``, ``Description_of_intervention``
            and ``Findings``.
        sample_size: Maximum number of studies to sample (without
            replacement); capped at ``len(df)``.

    Returns:
        DataFrame with one row per sampled study and the columns ``title``,
        ``surprise_level``, ``positivity`` and ``confidence``. Empty (with
        those columns) when ``df`` has no rows.
    """
    print("\n🤖 Enhanced AI Sentiment Analysis")
    print("=" * 60)
    print("⚠️ TogetherAI call not enabled: scores below are random placeholders.")

    # Sample studies for AI analysis
    sample_df = df.sample(min(sample_size, len(df)))

    sentiments = []

    for idx, row in sample_df.iterrows():
        # Literal braces of the JSON template are doubled ({{ }}) so the
        # f-string does not try to evaluate them as replacement fields.
        prompt = f"""
        Analyze the sentiment and surprise level of this research finding.

        Study: {row['Title']}
        Intervention: {row['Description_of_intervention']}
        Findings: {row['Findings']}

        Rate on a scale of 1-5:
        1. Surprise Level (1=expected, 5=very surprising)
        2. Positivity (1=very negative, 5=very positive)
        3. Confidence (1=uncertain, 5=very confident)

        Provide a brief explanation of why the results might be surprising.

        Format your response as JSON:
        {{
            "surprise_level": X,
            "positivity": X,
            "confidence": X,
            "explanation": "..."
        }}
        """

        # This is where you'd call TogetherAI
        # response = together.Complete.create(
        #     model="meta-llama/Llama-2-70b-chat-hf",
        #     prompt=prompt,
        #     max_tokens=200,
        #     temperature=0.3
        # )

        # For now, using placeholder (np.random.randint's upper bound is exclusive)
        sentiments.append({
            'title': row['Title'],
            'surprise_level': np.random.randint(1, 6),
            'positivity': np.random.randint(1, 6),
            'confidence': np.random.randint(1, 6)
        })

    # Convert to DataFrame for analysis; explicit columns keep the schema
    # stable even when nothing was sampled.
    sentiment_df = pd.DataFrame(
        sentiments,
        columns=['title', 'surprise_level', 'positivity', 'confidence']
    )

    # Display most surprising according to AI
    print("\n🎯 Most Surprising Studies (AI Analysis):")
    if not sentiment_df.empty:
        most_surprising = sentiment_df.nlargest(10, 'surprise_level')
        for _, row in most_surprising.iterrows():
            # str() guards against missing (NaN) titles, which are floats
            print(f"  • {str(row['title'])[:60]}...")
            print(f"    Surprise: {'⭐' * int(row['surprise_level'])}")

    return sentiment_df

# =====================================
# MAIN EXECUTION
# =====================================

def run_all_analyses(df):
    """
    Run the clustering, timeline, intervention-menu and surprising-findings
    analyses in that order and persist their results.

    Two files are written to the AEA_RCT_Parsed folder on the mounted Drive:
    ``studies_clustered.csv`` (the frame returned by the clustering step) and
    ``analysis_results.json`` (the collected summaries; values that are not
    JSON-native are written via ``str``).

    Args:
        df: DataFrame of studies; it is handed unchanged to each analysis.

    Returns:
        Dict with the keys ``clusters``, ``timeline``, ``intervention_menus``
        and ``surprising_studies``.
    """
    banner_rule = "=" * 80

    # Header with basic dataset facts
    print("🚀 Starting Comprehensive RCT Analysis")
    print(banner_rule)
    print(f"Dataset: {len(df)} studies")
    print(f"Years: {df['Year'].min()} - {df['Year'].max()}")
    print(f"Countries: {df['Country'].nunique()} unique countries")
    print(banner_rule)

    # The four analyses, executed strictly in sequence
    df_clustered, cluster_summaries = cluster_studies_by_embeddings(df)  # 1. Natural Topic Clustering
    paradigm_timeline = analyze_research_timeline(df)                    # 2. Timeline Analysis
    intervention_menus = find_intervention_menus(df)                     # 3. Intervention Menus
    surprising_studies = analyze_surprising_findings(df)                 # 4. Surprising Findings

    results = {
        'clusters': cluster_summaries,
        'timeline': paradigm_timeline,
        'intervention_menus': intervention_menus,
        'surprising_studies': surprising_studies,
    }

    # Persist everything to Drive
    print("\n💾 Saving analysis results...")

    # Studies with their cluster assignments
    df_clustered.to_csv('/content/drive/MyDrive/AEA_RCT_Parsed/studies_clustered.csv', index=False)

    # Analysis summaries
    with open('/content/drive/MyDrive/AEA_RCT_Parsed/analysis_results.json', 'w') as results_file:
        json.dump(results, results_file, indent=2, default=str)

    print("\n✅ Analysis complete! Results saved.")
    print("   - studies_clustered.csv: Studies with cluster assignments")
    print("   - analysis_results.json: Detailed analysis summaries")

    return results

# Execute all analyses
# The guard is true when this code runs as the main program; the recorded
# output of this cell shows that it executed here. `df` is the module-level
# frame loaded from studies_with_ai_keywords.csv earlier in this cell.
if __name__ == "__main__":
    results = run_all_analyses(df)

🚀 Starting Comprehensive RCT Analysis
Dataset: 2117 studies
Years: 1960 - 2025
Countries: 125 unique countries
🔬 ANALYSIS 1: Discovering Natural Research Communities
Clusters: 5, Silhouette Score: 0.032
Clusters: 6, Silhouette Score: 0.034
Clusters: 7, Silhouette Score: 0.038
Clusters: 8, Silhouette Score: 0.035
Clusters: 9, Silhouette Score: 0.037
Clusters: 10, Silhouette Score: 0.037
Clusters: 11, Silhouette Score: 0.037
Clusters: 12, Silhouette Score: 0.037
Clusters: 13, Silhouette Score: 0.036
Clusters: 14, Silhouette Score: 0.035
Clusters: 15, Silhouette Score: 0.035
Clusters: 16, Silhouette Score: 0.036
Clusters: 17, Silhouette Score: 0.036
Clusters: 18, Silhouette Score: 0.035
Clusters: 19, Silhouette Score: 0.036

✨ Optimal number of clusters: 7

Creating 2D visualization...

📊 Cluster Analysis:
------------------------------------------------------------

🔹 Cluster 0 (287 studies)
   Top Keywords: Labor, Gender; Labor, Education; Labor
   Main Countries: United States of Ameri


📅 ANALYSIS 2: Evolution of Development Research Paradigms



🔄 Detected Paradigm Shifts:
----------------------------------------
Microfinance Era:
  Peak Year: 1992 (100.0% of studies)
  Trend: 📈 Rising (+7970.4% change)

Behavioral Revolution:
  Peak Year: 1987 (50.0% of studies)
  Trend: 📈 Rising (+22893.4% change)

Digital Transformation:
  Peak Year: 1960 (100.0% of studies)
  Trend: ➡️ Stable (-10.1% change)

Cash Transfer Wave:
  Peak Year: 1999 (100.0% of studies)
  Trend: 📈 Rising (+266.7% change)

Women Empowerment:
  Peak Year: 1960 (100.0% of studies)
  Trend: ➡️ Stable (-19.6% change)

Health Interventions:
  Peak Year: 1960 (100.0% of studies)
  Trend: 📉 Declining (-46.6% change)

Education Innovation:
  Peak Year: 1962 (100.0% of studies)
  Trend: 📉 Declining (-43.8% change)

Agriculture & Climate:
  Peak Year: 1998 (33.3% of studies)
  Trend: 📈 Rising (+7558.8% change)

Governance & Institutions:
  Peak Year: 2002 (66.7% of studies)
  Trend: 📈 Rising (+24417.7% change)


🎯 ANALYSIS 3: Intervention Menus for Common Challenges

📋 


💾 Saving analysis results...

✅ Analysis complete! Results saved.
   - studies_clustered.csv: Studies with cluster assignments
   - analysis_results.json: Detailed analysis summaries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import re
from collections import Counter, defaultdict
import json
import ast
import warnings
warnings.filterwarnings('ignore')

# For TogetherAI integration
import together
import requests
from typing import List, Dict, Tuple

# Initialize TogetherAI (you'll need to set your API key)
# together.api_key = "YOUR_API_KEY_HERE"

# Load the data
# Same file as loaded by the previous analysis cell: the "with AI keywords"
# split written by the first cell of this notebook. Running this line
# replaces the module-level `df` with a fresh copy read from Drive.
df = pd.read_csv('/content/drive/MyDrive/AEA_RCT_Parsed/studies_with_ai_keywords.csv')

# =====================================
# ANALYSIS 1: NATURAL TOPIC CLUSTERING
# =====================================

def _parse_search_vectors(raw_vectors):
    """
    Turn a column of stored embedding vectors into a 2-D float matrix.

    Strings are parsed as comma-separated numbers with optional square
    brackets; any other value is converted with ``np.asarray``. Entries that
    cannot be parsed, contain non-finite numbers, or whose length differs
    from the most common length are replaced by zero vectors of that length,
    so the resulting matrix is always rectangular.

    Returns:
        (matrix, n_failed): ``matrix`` is ``None`` when no entry is usable;
        ``n_failed`` is the number of entries replaced by zeros.
    """
    parsed = []
    for vec in raw_vectors:
        try:
            if isinstance(vec, str):
                cleaned = vec.replace('[', '').replace(']', '')
                embedding = np.array([float(x) for x in cleaned.split(',')])
            else:
                embedding = np.asarray(vec, dtype=float)
            # A missing cell (NaN) becomes a 0-d array; reject it explicitly
            if embedding.ndim != 1 or embedding.size == 0 or not np.all(np.isfinite(embedding)):
                raise ValueError("not a usable 1-D vector")
        except (TypeError, ValueError):
            embedding = None
        parsed.append(embedding)

    length_counts = Counter(len(e) for e in parsed if e is not None)
    if not length_counts:
        return None, len(parsed)

    # Use the dominant length as the embedding dimension instead of guessing one
    dim = length_counts.most_common(1)[0][0]
    rows = [e if e is not None and len(e) == dim else np.zeros(dim) for e in parsed]
    n_failed = sum(1 for e in parsed if e is None or len(e) != dim)
    return np.array(rows), n_failed


def cluster_studies_by_embeddings(df, n_clusters_range=(5, 20)):
    """
    Cluster studies using embeddings to discover natural research communities.

    Embeddings come from the 'Search_vector' column when it holds parseable
    vectors; otherwise TF-IDF features are built from 'Abstract' and
    'Description_of_intervention'. K-means is run for every cluster count in
    ``range(*n_clusters_range)`` and the count with the best silhouette score
    is kept.

    Args:
        df: Studies frame. It is modified in place: the columns 'cluster',
            'x' and 'y' are added.
        n_clusters_range: (start, stop) pair of candidate cluster counts;
            ``stop`` is exclusive and is capped at the number of studies.

    Returns:
        (df, cluster_summaries): the same frame, and one summary dict per
        cluster (id, size, top keywords, main countries, sample titles).

    Raises:
        ValueError: if there are too few studies to test any cluster count.
    """
    print("🔬 ANALYSIS 1: Discovering Natural Research Communities")
    print("=" * 60)

    embeddings = None
    if 'Search_vector' in df.columns:
        embeddings, n_failed = _parse_search_vectors(df['Search_vector'])
        if embeddings is None:
            print("No usable vectors found in 'Search_vector'; falling back to text features.")
        elif n_failed:
            print(f"⚠️ {n_failed} of {len(df)} stored vectors could not be parsed; using zero vectors for them.")

    if embeddings is None:
        # Create embeddings from text if stored vectors are missing or unusable
        print("Creating embeddings from text...")
        vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
        text_data = df['Abstract'].fillna('') + ' ' + df['Description_of_intervention'].fillna('')
        embeddings = vectorizer.fit_transform(text_data).toarray()

    # silhouette_score needs fewer clusters than samples, so cap the range
    first_k, stop_k = n_clusters_range
    stop_k = min(stop_k, len(embeddings))
    candidate_counts = range(first_k, stop_k)
    if len(candidate_counts) == 0:
        raise ValueError(
            f"Need more than {first_k} studies to test cluster counts from "
            f"{first_k}; got {len(embeddings)}."
        )

    # Find optimal number of clusters using silhouette score
    silhouette_scores = []
    for n_clusters in candidate_counts:
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        cluster_labels = kmeans.fit_predict(embeddings)
        silhouette_avg = silhouette_score(embeddings, cluster_labels)
        silhouette_scores.append(silhouette_avg)
        print(f"Clusters: {n_clusters}, Silhouette Score: {silhouette_avg:.3f}")

    optimal_n = int(first_k + np.argmax(silhouette_scores))
    print(f"\n✨ Optimal number of clusters: {optimal_n}")

    # Perform final clustering
    kmeans = KMeans(n_clusters=optimal_n, random_state=42, n_init=10)
    df['cluster'] = kmeans.fit_predict(embeddings)

    # Reduce dimensions for visualization
    print("\nCreating 2D visualization...")
    reducer = umap.UMAP(n_components=2, random_state=42)
    embeddings_2d = reducer.fit_transform(embeddings)
    df['x'] = embeddings_2d[:, 0]
    df['y'] = embeddings_2d[:, 1]

    # Analyze each cluster
    print("\n📊 Cluster Analysis:")
    print("-" * 60)
    cluster_summaries = []

    for cluster_id in range(optimal_n):
        cluster_data = df[df['cluster'] == cluster_id]
        size = len(cluster_data)

        # Keyword lists in the data mix ',' and ';' separators, so split on both
        all_keywords = []
        for keywords in cluster_data['keywords'].dropna():
            if isinstance(keywords, str):
                all_keywords.extend(re.split(r'[;,]', keywords))

        keyword_freq = Counter([k.strip() for k in all_keywords if k.strip()])
        top_keywords = keyword_freq.most_common(5)

        # Get most common countries
        countries = cluster_data['Country'].dropna().value_counts().head(3)

        # Sample titles (seeded like the clustering; missing titles are skipped)
        titles = cluster_data['Title'].dropna().astype(str)
        sample_titles = titles.sample(min(3, len(titles)), random_state=42).tolist()

        cluster_summary = {
            'cluster_id': cluster_id,
            'size': size,
            'top_keywords': top_keywords,
            'main_countries': countries.to_dict(),
            'sample_studies': sample_titles
        }
        cluster_summaries.append(cluster_summary)

        first_title = sample_titles[0][:60] if sample_titles else 'n/a'
        print(f"\n🔹 Cluster {cluster_id} ({size} studies)")
        print(f"   Top Keywords: {', '.join([k[0] for k in top_keywords[:3]])}")
        print(f"   Main Countries: {', '.join(str(c) for c in countries.index[:3])}")
        print(f"   Sample Study: {first_title}...")

    # Create interactive visualization
    fig = px.scatter(df, x='x', y='y', color='cluster',
                     hover_data=['Title', 'Country', 'Year'],
                     title='Natural Research Communities in Development Economics',
                     labels={'cluster': 'Research Community'},
                     color_continuous_scale='Viridis')
    fig.update_traces(marker=dict(size=8))
    fig.show()

    return df, cluster_summaries

# =====================================
# ANALYSIS 2: TIMELINE OF RESEARCH PARADIGMS
# =====================================

def analyze_research_timeline(df, min_studies_per_year=1):
    """
    Analyze how research topics have evolved over time.

    For every year, computes the percentage of studies whose abstract,
    intervention description or keywords mention at least one keyword of
    each paradigm, plots the series and prints a simple trend summary.

    Matching is case-insensitive. Keywords of three characters or fewer
    ('app', 'sms', 'UBI') are matched as whole words (optional plural 's')
    so they do not fire inside words such as "approach" or "mechanisms";
    longer keywords are matched as substrings so that stems still catch
    plurals and derived forms.

    Args:
        df: Studies frame with 'Year', 'Abstract',
            'Description_of_intervention' and 'keywords' columns. The frame
            is not modified.
        min_studies_per_year: Years with fewer studies than this are left out
            of the timeline. The default of 1 keeps every year that has data.

    Returns:
        Nested mapping ``{paradigm: {year: percentage_of_studies}}``.
    """
    print("\n📅 ANALYSIS 2: Evolution of Development Research Paradigms")
    print("=" * 60)

    # Define paradigm keywords
    paradigms = {
        'Microfinance Era': ['microfinance', 'microcredit', 'micro-loan', 'micro loan', 'credit', 'lending'],
        'Behavioral Revolution': ['behavioral', 'nudge', 'psychology', 'cognitive', 'bias', 'framing'],
        'Digital Transformation': ['digital', 'mobile', 'technology', 'app', 'online', 'internet', 'sms', 'text message'],
        'Cash Transfer Wave': ['cash transfer', 'unconditional', 'UBI', 'basic income', 'direct transfer'],
        'Women Empowerment': ['women', 'gender', 'female', 'girl', 'empowerment', 'maternal'],
        'Health Interventions': ['health', 'medical', 'vaccine', 'nutrition', 'disease', 'mortality'],
        'Education Innovation': ['education', 'school', 'learning', 'teacher', 'student', 'literacy'],
        'Agriculture & Climate': ['agriculture', 'farming', 'crop', 'climate', 'weather', 'irrigation'],
        'Governance & Institutions': ['governance', 'corruption', 'accountability', 'transparency', 'institution']
    }

    def _keyword_pattern(keywords):
        """Build one regex that matches any keyword of a paradigm in lower-cased text."""
        parts = []
        for keyword in keywords:
            escaped = re.escape(keyword.lower())
            if len(keyword) <= 3:
                parts.append(r'\b' + escaped + r's?\b')
            else:
                parts.append(escaped)
        return '|'.join(parts)

    # Work on a private copy of the needed columns so the caller's frame is untouched
    text_columns = ['Abstract', 'Description_of_intervention', 'keywords']
    work = df[['Year'] + text_columns].copy()
    work['Year'] = pd.to_numeric(work['Year'], errors='coerce')
    work = work.dropna(subset=['Year'])
    work['Year'] = work['Year'].astype(int)

    paradigm_timeline = defaultdict(lambda: defaultdict(int))
    if work.empty:
        print("No studies with a usable Year value; nothing to analyze.")
        return paradigm_timeline

    # Build the searchable text once instead of once per paradigm and year
    text = (work['Abstract'].fillna('').astype(str) + ' ' +
            work['Description_of_intervention'].fillna('').astype(str) + ' ' +
            work['keywords'].fillna('').astype(str)).str.lower()

    year_values = work['Year'].to_numpy()
    studies_per_year = work.groupby('Year').size()
    threshold = max(1, min_studies_per_year)
    # groupby sorts its keys, so eligible years come out in ascending order
    eligible_years = [int(year) for year, n in studies_per_year.items() if n >= threshold]

    # Calculate paradigm prevalence by year
    for paradigm, keywords in paradigms.items():
        matches = text.str.contains(_keyword_pattern(keywords), regex=True)
        share_by_year = matches.groupby(year_values).mean() * 100
        for year in eligible_years:
            paradigm_timeline[paradigm][year] = float(share_by_year.loc[year])

    # Create visualization
    fig = go.Figure()

    colors = px.colors.qualitative.Set3
    for i, (paradigm, year_data) in enumerate(paradigm_timeline.items()):
        years_list = sorted(year_data.keys())
        percentages = [year_data[y] for y in years_list]

        fig.add_trace(go.Scatter(
            x=years_list,
            y=percentages,
            mode='lines+markers',
            name=paradigm,
            line=dict(width=2, color=colors[i % len(colors)]),
            marker=dict(size=6)
        ))

    fig.update_layout(
        title='Evolution of Development Research Paradigms Over Time',
        xaxis_title='Year',
        yaxis_title='Percentage of Studies (%)',
        hovermode='x unified',
        height=600
    )
    fig.show()

    # Identify paradigm shifts
    print("\n🔄 Detected Paradigm Shifts:")
    print("-" * 40)

    for paradigm, year_data in paradigm_timeline.items():
        years_sorted = sorted(year_data.items())
        if len(years_sorted) > 5:
            # Find peak year
            peak_year = max(years_sorted, key=lambda x: x[1])

            # Growth compares the first three and last three years in the
            # timeline. Years with very few studies make both figures noisy;
            # raise min_studies_per_year to damp that.
            early_avg = np.mean([v for y, v in years_sorted[:3]])
            recent_avg = np.mean([v for y, v in years_sorted[-3:]])
            # Floor the denominator at 0.1 to avoid dividing by zero
            growth_rate = ((recent_avg - early_avg) / max(early_avg, 0.1)) * 100

            status = "📈 Rising" if growth_rate > 20 else "📉 Declining" if growth_rate < -20 else "➡️ Stable"

            print(f"{paradigm}:")
            print(f"  Peak Year: {peak_year[0]} ({peak_year[1]:.1f}% of studies)")
            print(f"  Trend: {status} ({growth_rate:+.1f}% change)")
            print()

    return paradigm_timeline

# =====================================
# ANALYSIS 3: SIMILAR PROBLEMS, DIFFERENT SOLUTIONS
# =====================================

def find_intervention_menus(df, n_problems=10):
    """
    Identify common problems and create menus of different interventions tried.

    A study addresses a problem when its primary or secondary outcomes mention
    one of the problem's keywords (case-insensitive substring match). Problems
    with at least three such studies get a menu that groups the studies by
    intervention type, where the type is the first rule whose words appear in
    the intervention description.

    Args:
        df: Studies frame with 'Title', 'Primary_outcomes',
            'Secondary_outcomes', 'Description_of_intervention', 'Country',
            'Year' and 'Findings' columns.
        n_problems: Accepted for interface compatibility; the implementation
            does not currently use it.

    Returns:
        ``{problem: {'total_studies': int, 'intervention_types': {type: [study, ...]}}}``
        where each study is a dict with title, intervention, country, year
        and findings.
    """
    print("\n🎯 ANALYSIS 3: Intervention Menus for Common Challenges")
    print("=" * 60)

    # Common development challenges
    problem_categories = {
        'School Enrollment': ['enrollment', 'attendance', 'dropout', 'school participation'],
        'Learning Outcomes': ['test scores', 'learning', 'literacy', 'numeracy', 'academic'],
        'Health Service Uptake': ['vaccination', 'immunization', 'clinic visits', 'health service'],
        'Maternal Health': ['maternal', 'pregnancy', 'prenatal', 'antenatal', 'delivery'],
        'Agricultural Productivity': ['yield', 'harvest', 'crop production', 'agricultural output'],
        'Financial Inclusion': ['savings', 'bank account', 'financial access', 'credit'],
        'Income Generation': ['income', 'earnings', 'wages', 'poverty', 'consumption'],
        'Women Empowerment': ['women decision', 'female empowerment', 'gender equality', 'women control'],
        'Nutrition': ['malnutrition', 'stunting', 'nutrition', 'dietary', 'food security'],
        'Employment': ['employment', 'job', 'unemployment', 'work', 'labor']
    }

    # Ordered rules: the first rule with a matching word decides the type
    intervention_rules = [
        ('Financial Incentives', ['cash', 'payment', 'transfer', 'subsidy']),
        ('Education/Information', ['information', 'education', 'training', 'workshop']),
        ('Behavioral Nudges', ['nudge', 'reminder', 'sms', 'text', 'message']),
        ('Technology Solutions', ['technology', 'digital', 'app', 'mobile']),
        ('Social/Community', ['community', 'group', 'peer', 'social']),
        ('Infrastructure', ['infrastructure', 'facility', 'building', 'construction']),
    ]

    def _classify(intervention):
        """Return the intervention type for one intervention description."""
        intervention_text = str(intervention).lower()
        for type_key, words in intervention_rules:
            if any(word in intervention_text for word in words):
                return type_key
        return 'Other Approaches'

    # Build each study's outcome text and record once, not once per problem
    study_records = []
    for _, row in df.iterrows():
        outcomes = str(row['Primary_outcomes']).lower() + ' ' + str(row['Secondary_outcomes']).lower()
        study_records.append((outcomes, {
            'title': row['Title'],
            'intervention': row['Description_of_intervention'],
            'country': row['Country'],
            'year': row['Year'],
            'findings': row['Findings']
        }))

    intervention_menus = {}

    for problem, keywords in problem_categories.items():
        # Each menu gets its own copies of the matching study records
        relevant_studies = [
            dict(record) for outcomes, record in study_records
            if any(keyword in outcomes for keyword in keywords)
        ]

        if len(relevant_studies) >= 3:  # Only include problems with multiple studies
            intervention_types = defaultdict(list)
            for study in relevant_studies:
                intervention_types[_classify(study['intervention'])].append(study)

            intervention_menus[problem] = {
                'total_studies': len(relevant_studies),
                'intervention_types': dict(intervention_types)
            }

    # Display intervention menus
    print("\n📋 Intervention Menus for Common Development Challenges:\n")

    for problem, menu in intervention_menus.items():
        print(f"🎯 {problem} ({menu['total_studies']} studies)")
        print("-" * 50)

        for intervention_type, studies in menu['intervention_types'].items():
            print(f"\n  💡 {intervention_type} ({len(studies)} studies):")
            for study in studies[:2]:  # Show top 2 examples
                # Titles can be missing (NaN), which is a float and cannot be sliced
                raw_title = study['title']
                title = str(raw_title) if pd.notna(raw_title) else 'Untitled'
                title_short = title[:60] + '...' if len(title) > 60 else title
                print(f"     • {title_short}")
                print(f"       ({study['country']}, {study['year']})")
        print()

    # Create similarity network for one problem
    if intervention_menus:
        sample_problem = list(intervention_menus.keys())[0]
        print(f"\n🔗 Creating similarity network for: {sample_problem}")
        create_similarity_network(df, sample_problem, problem_categories[sample_problem])

    return intervention_menus

def create_similarity_network(df, problem_name, keywords):
    """
    Print the most similar pairs of studies addressing a specific problem.

    Studies are selected when their primary or secondary outcomes contain one
    of ``keywords`` (case-insensitive substring match). Similarity is the
    cosine similarity of TF-IDF vectors built from the abstract and the
    intervention description; up to five pairs scoring above 0.3 are printed.
    Nothing is printed when fewer than three studies match.

    Args:
        df: Studies frame with 'Title', 'Abstract',
            'Description_of_intervention', 'Primary_outcomes' and
            'Secondary_outcomes' columns. Any index is accepted.
        problem_name: Label used in the printed heading.
        keywords: Lower-case keywords identifying the problem.

    Returns:
        None.
    """
    # Collect row positions, not index labels, so .iloc is correct for any index
    relevant_positions = []
    for position, (_, row) in enumerate(df.iterrows()):
        outcomes = str(row['Primary_outcomes']).lower() + ' ' + str(row['Secondary_outcomes']).lower()
        if any(keyword in outcomes for keyword in keywords):
            relevant_positions.append(position)

    if len(relevant_positions) > 2:
        relevant_df = df.iloc[relevant_positions].copy()

        # Create TF-IDF vectors
        vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
        text_data = relevant_df['Abstract'].fillna('') + ' ' + relevant_df['Description_of_intervention'].fillna('')
        tfidf_matrix = vectorizer.fit_transform(text_data)

        # Calculate similarity
        similarity_matrix = cosine_similarity(tfidf_matrix)

        # Find most similar pairs (upper triangle only, so each pair appears once)
        print(f"\n  Most similar intervention pairs for {problem_name}:")
        similar_pairs = []
        for i in range(len(similarity_matrix)):
            for j in range(i+1, len(similarity_matrix)):
                if similarity_matrix[i, j] > 0.3:  # Threshold for similarity
                    similar_pairs.append((i, j, similarity_matrix[i, j]))

        similar_pairs.sort(key=lambda x: x[2], reverse=True)

        if not similar_pairs:
            print("  (no pairs above the similarity threshold)")

        for i, j, sim in similar_pairs[:5]:
            study1 = relevant_df.iloc[i]
            study2 = relevant_df.iloc[j]
            # str() keeps a missing (NaN) title from breaking the slice
            print(f"\n  Similarity: {sim:.2f}")
            print(f"  Study 1: {str(study1['Title'])[:50]}...")
            print(f"  Study 2: {str(study2['Title'])[:50]}...")

# =====================================
# ANALYSIS 4: SURPRISING RESULTS
# =====================================

def analyze_surprising_findings(df):
    """
    Flag studies whose findings or abstract contain surprise-related phrases.

    This is plain keyword matching, not a sentiment model: each study gets one
    point per surprise category (unexpected, no effect, negative, much larger,
    opposite) that has at least one phrase in the lower-cased findings or
    abstract. Studies with a score above zero are returned, highest score
    first, and the top ten are printed along with a per-category count and a
    histogram over years.

    Args:
        df: Studies frame with 'Title', 'Year', 'Country',
            'Description_of_intervention', 'Findings' and 'Abstract' columns.
            Missing values are allowed in any of them.

    Returns:
        List of dicts (title, year, country, intervention, findings,
        surprise_score, surprise_types) holding the raw column values.
    """
    print("\n😲 ANALYSIS 4: Studies with Surprising or Unexpected Results")
    print("=" * 60)

    # Keywords indicating surprise or unexpected results
    surprise_keywords = {
        'unexpected': ['unexpected', 'surprising', 'contrary to', 'against expectations', 'paradox'],
        'no_effect': ['no effect', 'no impact', 'no significant', 'null result', 'no difference'],
        'negative': ['negative effect', 'backfired', 'worse', 'reduced', 'decreased', 'harmful'],
        'much_larger': ['larger than expected', 'exceeded', 'remarkable', 'extraordinary', 'substantial'],
        'opposite': ['opposite', 'reverse', 'contrary', 'inverse', 'contradictory']
    }

    def _excerpt(value, limit, placeholder):
        """Return the first ``limit`` characters of ``value``, or ``placeholder`` if it is missing."""
        if pd.isna(value):
            return placeholder
        return str(value)[:limit]

    surprising_studies = []

    for _, row in df.iterrows():
        combined_text = str(row['Findings']).lower() + ' ' + str(row['Abstract']).lower()

        surprise_types = [
            surprise_type
            for surprise_type, keywords in surprise_keywords.items()
            if any(keyword in combined_text for keyword in keywords)
        ]

        if surprise_types:
            surprising_studies.append({
                'title': row['Title'],
                'year': row['Year'],
                'country': row['Country'],
                'intervention': row['Description_of_intervention'],
                'findings': row['Findings'],
                'surprise_score': len(surprise_types),
                'surprise_types': surprise_types
            })

    # Sort by surprise score (stable, so ties keep their original order)
    surprising_studies.sort(key=lambda x: x['surprise_score'], reverse=True)

    print(f"\nFound {len(surprising_studies)} studies with potentially surprising results")
    print("\n🏆 Top 10 Most Surprising Studies:\n")

    for i, study in enumerate(surprising_studies[:10], 1):
        # A study can match through its abstract alone, leaving findings or
        # title as NaN; format those safely instead of slicing a float.
        print(f"{i}. {_excerpt(study['title'], 70, 'Untitled')}...")
        print(f"   📍 {study['country']}, {study['year']}")
        print(f"   🎯 Surprise Type: {', '.join(study['surprise_types'])}")
        print(f"   📊 Finding excerpt: {_excerpt(study['findings'], 150, 'No findings reported')}...")
        print()

    # Categorize by surprise type
    print("\n📊 Distribution of Surprise Types:")
    print("-" * 40)
    surprise_distribution = defaultdict(int)
    for study in surprising_studies:
        for surprise_type in study['surprise_types']:
            surprise_distribution[surprise_type] += 1

    for surprise_type, count in sorted(surprise_distribution.items(), key=lambda x: x[1], reverse=True):
        print(f"  {surprise_type.replace('_', ' ').title()}: {count} studies")

    # Create visualization
    if surprising_studies:
        years = [s['year'] for s in surprising_studies if pd.notna(s['year'])]
        if years:
            fig = px.histogram(x=years, nbins=20,
                             title='Distribution of Surprising Results Over Time',
                             labels={'x': 'Year', 'y': 'Number of Surprising Studies'})
            fig.show()

    return surprising_studies

# =====================================
# ENHANCED SENTIMENT ANALYSIS WITH AI
# =====================================

def ai_sentiment_analysis(df, sample_size=50, random_state=None):
    """
    Score a sample of studies for surprise, positivity and confidence.

    A prompt is built for each sampled study, but the TogetherAI call is
    still commented out, so the scores returned are random placeholders in
    the range 1-5.

    Args:
        df: Studies frame with 'Title', 'Description_of_intervention' and
            'Findings' columns.
        sample_size: Maximum number of studies to sample.
        random_state: Optional seed for both the sampling and the placeholder
            scores. With the default of None the global NumPy random state is
            used, as before.

    Returns:
        DataFrame with columns title, surprise_level, positivity, confidence
        (empty, with those columns, when ``df`` has no rows).
    """
    print("\n🤖 Enhanced AI Sentiment Analysis")
    print("=" * 60)

    # Sample studies for AI analysis
    sample_df = df.sample(min(sample_size, len(df)), random_state=random_state)
    rng = np.random.RandomState(random_state) if random_state is not None else np.random

    sentiments = []

    for _, row in sample_df.iterrows():
        # Literal braces of the JSON template are doubled; inside an f-string
        # single braces would be parsed as a replacement field.
        prompt = f"""
        Analyze the sentiment and surprise level of this research finding.

        Study: {row['Title']}
        Intervention: {row['Description_of_intervention']}
        Findings: {row['Findings']}

        Rate on a scale of 1-5:
        1. Surprise Level (1=expected, 5=very surprising)
        2. Positivity (1=very negative, 5=very positive)
        3. Confidence (1=uncertain, 5=very confident)

        Provide a brief explanation of why the results might be surprising.

        Format your response as JSON:
        {{
            "surprise_level": X,
            "positivity": X,
            "confidence": X,
            "explanation": "..."
        }}
        """

        # This is where you'd call TogetherAI
        # response = together.Complete.create(
        #     model="meta-llama/Llama-2-70b-chat-hf",
        #     prompt=prompt,
        #     max_tokens=200,
        #     temperature=0.3
        # )

        # For now, using placeholder scores (randint's upper bound is exclusive)
        sentiments.append({
            'title': row['Title'],
            'surprise_level': rng.randint(1, 6),
            'positivity': rng.randint(1, 6),
            'confidence': rng.randint(1, 6)
        })

    # Fixing the columns keeps the frame well-formed even with no rows
    sentiment_df = pd.DataFrame(
        sentiments,
        columns=['title', 'surprise_level', 'positivity', 'confidence']
    )

    # Display most surprising according to AI
    print("\n🎯 Most Surprising Studies (AI Analysis):")
    if not sentiment_df.empty:
        most_surprising = sentiment_df.nlargest(10, 'surprise_level')
        for _, row in most_surprising.iterrows():
            print(f"  • {str(row['title'])[:60]}...")
            print(f"    Surprise: {'⭐' * int(row['surprise_level'])}")

    return sentiment_df

# =====================================
# MAIN EXECUTION
# =====================================

def run_all_analyses(df, output_dir='/content/drive/MyDrive/AEA_RCT_Parsed'):
    """
    Execute all four analyses in sequence and save their results.

    Runs clustering, the paradigm timeline, the intervention menus and the
    surprising-findings scan on ``df``, then writes two files into
    ``output_dir``: ``studies_clustered.csv`` (the frame returned by the
    clustering step) and ``analysis_results.json`` (all summaries; values
    that JSON cannot encode are written as strings).

    Args:
        df: Studies frame shared by all analyses.
        output_dir: Directory for the two output files. Defaults to the
            notebook's Google Drive folder.

    Returns:
        Dict with keys 'clusters', 'timeline', 'intervention_menus' and
        'surprising_studies'.
    """
    print("🚀 Starting Comprehensive RCT Analysis")
    print("=" * 80)
    print(f"Dataset: {len(df)} studies")
    print(f"Years: {df['Year'].min()} - {df['Year'].max()}")
    print(f"Countries: {df['Country'].nunique()} unique countries")
    print("=" * 80)

    # Run analyses
    results = {}

    # 1. Natural Topic Clustering
    df_clustered, clusters = cluster_studies_by_embeddings(df)
    results['clusters'] = clusters

    # 2. Timeline Analysis
    timeline = analyze_research_timeline(df)
    results['timeline'] = timeline

    # 3. Intervention Menus
    menus = find_intervention_menus(df)
    results['intervention_menus'] = menus

    # 4. Surprising Findings
    surprising = analyze_surprising_findings(df)
    results['surprising_studies'] = surprising

    # Save results
    print("\n💾 Saving analysis results...")
    output_root = str(output_dir).rstrip('/')

    # Save clustered dataframe
    df_clustered.to_csv(f'{output_root}/studies_clustered.csv', index=False)

    # Save analysis summaries
    with open(f'{output_root}/analysis_results.json', 'w') as f:
        json.dump(results, f, indent=2, default=str)

    print("\n✅ Analysis complete! Results saved.")
    print("   - studies_clustered.csv: Studies with cluster assignments")
    print("   - analysis_results.json: Detailed analysis summaries")

    return results

# Execute all analyses on the frame loaded at the top of this cell. The guard
# means the pipeline only runs when this code is executed directly (as it is
# in a notebook cell), not when it is imported as a module.
if __name__ == "__main__":
    results = run_all_analyses(df)

🚀 Starting Comprehensive RCT Analysis
Dataset: 2117 studies
Years: 1960 - 2025
Countries: 125 unique countries
🔬 ANALYSIS 1: Discovering Natural Research Communities
Clusters: 5, Silhouette Score: 0.032
Clusters: 6, Silhouette Score: 0.034
Clusters: 7, Silhouette Score: 0.038
Clusters: 8, Silhouette Score: 0.035
Clusters: 9, Silhouette Score: 0.037
Clusters: 10, Silhouette Score: 0.037
Clusters: 11, Silhouette Score: 0.037
Clusters: 12, Silhouette Score: 0.037
Clusters: 13, Silhouette Score: 0.036
Clusters: 14, Silhouette Score: 0.035
Clusters: 15, Silhouette Score: 0.035
Clusters: 16, Silhouette Score: 0.036
Clusters: 17, Silhouette Score: 0.036
Clusters: 18, Silhouette Score: 0.035
Clusters: 19, Silhouette Score: 0.036

✨ Optimal number of clusters: 7

Creating 2D visualization...

📊 Cluster Analysis:
------------------------------------------------------------

🔹 Cluster 0 (287 studies)
   Top Keywords: Labor, Gender; Labor, Education; Labor
   Main Countries: United States of Ameri


📅 ANALYSIS 2: Evolution of Development Research Paradigms



🔄 Detected Paradigm Shifts:
----------------------------------------
Microfinance Era:
  Peak Year: 1992 (100.0% of studies)
  Trend: 📈 Rising (+7970.4% change)

Behavioral Revolution:
  Peak Year: 1987 (50.0% of studies)
  Trend: 📈 Rising (+22893.4% change)

Digital Transformation:
  Peak Year: 1960 (100.0% of studies)
  Trend: ➡️ Stable (-10.1% change)

Cash Transfer Wave:
  Peak Year: 1999 (100.0% of studies)
  Trend: 📈 Rising (+266.7% change)

Women Empowerment:
  Peak Year: 1960 (100.0% of studies)
  Trend: ➡️ Stable (-19.6% change)

Health Interventions:
  Peak Year: 1960 (100.0% of studies)
  Trend: 📉 Declining (-46.6% change)

Education Innovation:
  Peak Year: 1962 (100.0% of studies)
  Trend: 📉 Declining (-43.8% change)

Agriculture & Climate:
  Peak Year: 1998 (33.3% of studies)
  Trend: 📈 Rising (+7558.8% change)

Governance & Institutions:
  Peak Year: 2002 (66.7% of studies)
  Trend: 📈 Rising (+24417.7% change)


🎯 ANALYSIS 3: Intervention Menus for Common Challenges

📋 


💾 Saving analysis results...

✅ Analysis complete! Results saved.
   - studies_clustered.csv: Studies with cluster assignments
   - analysis_results.json: Detailed analysis summaries


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
import umap
from sklearn.metrics import silhouette_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime
import re
from collections import Counter, defaultdict
import json
import ast
import warnings
warnings.filterwarnings('ignore')

# For TogetherAI integration
import together
import requests
from typing import List, Dict, Tuple

# Initialize TogetherAI (you'll need to set your API key)
# together.api_key = "YOUR_API_KEY_HERE"

# Load the subset of studies that carry AI-generated keywords. This CSV is
# written by the first cell of the notebook, and the path assumes Google Drive
# is already mounted at /content/drive.
df = pd.read_csv('/content/drive/MyDrive/AEA_RCT_Parsed/studies_with_ai_keywords.csv')

# =====================================
# ANALYSIS 1: NATURAL TOPIC CLUSTERING
# =====================================

def _parse_search_vectors(raw_vectors):
    """
    Turn a column of stored embedding vectors into a 2-D float matrix.

    Strings are parsed as comma-separated numbers with optional square
    brackets; any other value is converted with ``np.asarray``. Entries that
    cannot be parsed, contain non-finite numbers, or whose length differs
    from the most common length are replaced by zero vectors of that length,
    so the resulting matrix is always rectangular.

    Returns:
        (matrix, n_failed): ``matrix`` is ``None`` when no entry is usable;
        ``n_failed`` is the number of entries replaced by zeros.
    """
    parsed = []
    for vec in raw_vectors:
        try:
            if isinstance(vec, str):
                cleaned = vec.replace('[', '').replace(']', '')
                embedding = np.array([float(x) for x in cleaned.split(',')])
            else:
                embedding = np.asarray(vec, dtype=float)
            # A missing cell (NaN) becomes a 0-d array; reject it explicitly
            if embedding.ndim != 1 or embedding.size == 0 or not np.all(np.isfinite(embedding)):
                raise ValueError("not a usable 1-D vector")
        except (TypeError, ValueError):
            embedding = None
        parsed.append(embedding)

    length_counts = Counter(len(e) for e in parsed if e is not None)
    if not length_counts:
        return None, len(parsed)

    # Use the dominant length as the embedding dimension instead of guessing one
    dim = length_counts.most_common(1)[0][0]
    rows = [e if e is not None and len(e) == dim else np.zeros(dim) for e in parsed]
    n_failed = sum(1 for e in parsed if e is None or len(e) != dim)
    return np.array(rows), n_failed


def cluster_studies_by_embeddings(df, n_clusters_range=(5, 20)):
    """
    Cluster studies using embeddings to discover natural research communities.

    Embeddings come from the 'Search_vector' column when it holds parseable
    vectors; otherwise TF-IDF features are built from 'Abstract' and
    'Description_of_intervention'. K-means is run for every cluster count in
    ``range(*n_clusters_range)`` and the count with the best silhouette score
    is kept.

    Args:
        df: Studies frame. It is modified in place: the columns 'cluster',
            'x' and 'y' are added.
        n_clusters_range: (start, stop) pair of candidate cluster counts;
            ``stop`` is exclusive and is capped at the number of studies.

    Returns:
        (df, cluster_summaries): the same frame, and one summary dict per
        cluster (id, size, top keywords, main countries, sample titles).

    Raises:
        ValueError: if there are too few studies to test any cluster count.
    """
    print("🔬 ANALYSIS 1: Discovering Natural Research Communities")
    print("=" * 60)

    embeddings = None
    if 'Search_vector' in df.columns:
        embeddings, n_failed = _parse_search_vectors(df['Search_vector'])
        if embeddings is None:
            print("No usable vectors found in 'Search_vector'; falling back to text features.")
        elif n_failed:
            print(f"⚠️ {n_failed} of {len(df)} stored vectors could not be parsed; using zero vectors for them.")

    if embeddings is None:
        # Create embeddings from text if stored vectors are missing or unusable
        print("Creating embeddings from text...")
        vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')
        text_data = df['Abstract'].fillna('') + ' ' + df['Description_of_intervention'].fillna('')
        embeddings = vectorizer.fit_transform(text_data).toarray()

    # silhouette_score needs fewer clusters than samples, so cap the range
    first_k, stop_k = n_clusters_range
    stop_k = min(stop_k, len(embeddings))
    candidate_counts = range(first_k, stop_k)
    if len(candidate_counts) == 0:
        raise ValueError(
            f"Need more than {first_k} studies to test cluster counts from "
            f"{first_k}; got {len(embeddings)}."
        )

    # Find optimal number of clusters using silhouette score
    silhouette_scores = []
    for n_clusters in candidate_counts:
        kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
        cluster_labels = kmeans.fit_predict(embeddings)
        silhouette_avg = silhouette_score(embeddings, cluster_labels)
        silhouette_scores.append(silhouette_avg)
        print(f"Clusters: {n_clusters}, Silhouette Score: {silhouette_avg:.3f}")

    optimal_n = int(first_k + np.argmax(silhouette_scores))
    print(f"\n✨ Optimal number of clusters: {optimal_n}")

    # Perform final clustering
    kmeans = KMeans(n_clusters=optimal_n, random_state=42, n_init=10)
    df['cluster'] = kmeans.fit_predict(embeddings)

    # Reduce dimensions for visualization
    print("\nCreating 2D visualization...")
    reducer = umap.UMAP(n_components=2, random_state=42)
    embeddings_2d = reducer.fit_transform(embeddings)
    df['x'] = embeddings_2d[:, 0]
    df['y'] = embeddings_2d[:, 1]

    # Analyze each cluster
    print("\n📊 Cluster Analysis:")
    print("-" * 60)
    cluster_summaries = []

    for cluster_id in range(optimal_n):
        cluster_data = df[df['cluster'] == cluster_id]
        size = len(cluster_data)

        # Keyword lists in the data mix ',' and ';' separators, so split on both
        all_keywords = []
        for keywords in cluster_data['keywords'].dropna():
            if isinstance(keywords, str):
                all_keywords.extend(re.split(r'[;,]', keywords))

        keyword_freq = Counter([k.strip() for k in all_keywords if k.strip()])
        top_keywords = keyword_freq.most_common(5)

        # Get most common countries
        countries = cluster_data['Country'].dropna().value_counts().head(3)

        # Sample titles (seeded like the clustering; missing titles are skipped)
        titles = cluster_data['Title'].dropna().astype(str)
        sample_titles = titles.sample(min(3, len(titles)), random_state=42).tolist()

        cluster_summary = {
            'cluster_id': cluster_id,
            'size': size,
            'top_keywords': top_keywords,
            'main_countries': countries.to_dict(),
            'sample_studies': sample_titles
        }
        cluster_summaries.append(cluster_summary)

        first_title = sample_titles[0][:60] if sample_titles else 'n/a'
        print(f"\n🔹 Cluster {cluster_id} ({size} studies)")
        print(f"   Top Keywords: {', '.join([k[0] for k in top_keywords[:3]])}")
        print(f"   Main Countries: {', '.join(str(c) for c in countries.index[:3])}")
        print(f"   Sample Study: {first_title}...")

    # Create interactive visualization
    fig = px.scatter(df, x='x', y='y', color='cluster',
                     hover_data=['Title', 'Country', 'Year'],
                     title='Randomized Social Science Trial Researcher Community Clusters',
                     labels={'cluster': 'Research Community',
                             'x': 'Similarity Dimension 1 (studies closer together are more similar)',
                             'y': 'Similarity Dimension 2 (studies closer together are more similar)'},
                     color_continuous_scale='Viridis')
    fig.update_traces(marker=dict(size=8))
    fig.show()

    return df, cluster_summaries

# =====================================
# ANALYSIS 2: TIMELINE OF RESEARCH PARADIGMS
# =====================================

def analyze_research_timeline(df, min_studies_per_year=10):
    """
    Analyze how research topics have evolved over time.

    A study counts towards a paradigm when at least one of the paradigm's
    keywords occurs as a case-insensitive substring of the study's Abstract,
    Description_of_intervention and keywords text. For every calendar year
    the share of studies per paradigm is computed, plotted as a line chart,
    and summarised as a rising / declining / stable trend.

    Args:
        df: DataFrame with 'Year', 'Abstract', 'Description_of_intervention'
            and 'keywords' columns. The frame is not modified.
        min_studies_per_year: Years with fewer studies than this are left out
            of both the chart and the printed trend summary, because a
            handful of studies yields meaningless 0% / 100% shares.

    Returns:
        Nested mapping ``{paradigm: {year: percentage}}`` with one entry per
        year that has at least one study (sparse years are still included in
        the returned data). Empty when no row has a usable year.
    """
    print("\n📅 ANALYSIS 2: Evolution of Development Research Paradigms")
    print("=" * 60)

    # Define paradigm keywords
    paradigms = {
        'Microfinance Era': ['microfinance', 'microcredit', 'micro-loan', 'micro loan', 'credit', 'lending'],
        'Behavioral Revolution': ['behavioral', 'nudge', 'psychology', 'cognitive', 'bias', 'framing'],
        'Digital Transformation': ['digital', 'mobile', 'technology', 'app', 'online', 'internet', 'sms', 'text message'],
        'Cash Transfer Wave': ['cash transfer', 'unconditional', 'UBI', 'basic income', 'direct transfer'],
        'Women Empowerment': ['women', 'gender', 'female', 'girl', 'empowerment', 'maternal'],
        'Health Interventions': ['health', 'medical', 'vaccine', 'nutrition', 'disease', 'mortality'],
        'Education Innovation': ['education', 'school', 'learning', 'teacher', 'student', 'literacy'],
        'Agriculture & Climate': ['agriculture', 'farming', 'crop', 'climate', 'weather', 'irrigation'],
        'Governance & Institutions': ['governance', 'corruption', 'accountability', 'transparency', 'institution']
    }
    # The study text is lower-cased before matching, so the keywords must be
    # too -- otherwise an upper-case entry such as 'UBI' can never match.
    lowered_paradigms = {
        name: [keyword.lower() for keyword in keywords]
        for name, keywords in paradigms.items()
    }

    # Work on a private copy so the caller's 'Year' column is left untouched.
    timeline_df = df[['Year', 'Abstract', 'Description_of_intervention', 'keywords']].copy()
    timeline_df['Year'] = pd.to_numeric(timeline_df['Year'], errors='coerce')
    timeline_df = timeline_df.dropna(subset=['Year'])

    paradigm_timeline = defaultdict(lambda: defaultdict(int))
    if timeline_df.empty:
        print("\nNo studies with a usable Year value; skipping timeline analysis.")
        return paradigm_timeline

    # Plain Python ints keep the year keys JSON-serialisable.
    study_years = timeline_df['Year'].astype(int).tolist()
    # Build each study's searchable text once instead of once per paradigm.
    study_texts = [
        (str(abstract) + ' ' + str(intervention) + ' ' + str(keywords)).lower()
        for abstract, intervention, keywords in zip(
            timeline_df['Abstract'],
            timeline_df['Description_of_intervention'],
            timeline_df['keywords'],
        )
    ]

    years = range(min(study_years), max(study_years) + 1)

    # Count studies and paradigm matches per year in a single pass
    total_studies_by_year = {year: 0 for year in years}
    matches_by_paradigm = {name: defaultdict(int) for name in paradigms}
    for year, text in zip(study_years, study_texts):
        total_studies_by_year[year] += 1
        for name, needles in lowered_paradigms.items():
            if any(needle in text for needle in needles):
                matches_by_paradigm[name][year] += 1

    # Calculate paradigm prevalence by year (years without studies get no entry)
    for year in years:
        total_studies = total_studies_by_year[year]
        if total_studies > 0:
            for name in paradigms:
                paradigm_timeline[name][year] = (matches_by_paradigm[name][year] / total_studies) * 100

    # Filter out years with too few studies
    years_to_include = {
        year for year in years if total_studies_by_year[year] >= min_studies_per_year
    }

    # Create visualization
    fig = go.Figure()

    colors = px.colors.qualitative.Set3
    for i, (paradigm, year_data) in enumerate(paradigm_timeline.items()):
        # Filter years to only include those with sufficient data
        years_list = [year for year in sorted(year_data.keys()) if year in years_to_include]
        percentages = [year_data[y] for y in years_list]

        fig.add_trace(go.Scatter(
            x=years_list,
            y=percentages,
            mode='lines+markers',
            name=paradigm,
            line=dict(width=2, color=colors[i % len(colors)]),
            marker=dict(size=6)
        ))

    fig.update_layout(
        title='Evolution of Development Research Paradigms Over Time',
        xaxis_title='Year',
        yaxis_title='Percentage of Studies (%)',
        hovermode='x unified',
        height=600
    )
    fig.show()

    # Identify paradigm shifts
    print("\n🔄 Detected Paradigm Shifts:")
    print("-" * 40)

    for paradigm, year_data in paradigm_timeline.items():
        # Use the same sufficiently-populated years as the chart; otherwise a
        # year with one or two studies dominates both the peak and the trend.
        years_sorted = sorted(
            (year, value) for year, value in year_data.items() if year in years_to_include
        )
        if len(years_sorted) > 5:
            # Find peak year
            peak_year = max(years_sorted, key=lambda x: x[1])

            # Calculate growth rate (first three vs. last three retained years);
            # the 0.1 floor avoids dividing by zero when the early share is 0.
            early_avg = np.mean([v for y, v in years_sorted[:3]])
            recent_avg = np.mean([v for y, v in years_sorted[-3:]])
            growth_rate = ((recent_avg - early_avg) / max(early_avg, 0.1)) * 100

            status = "📈 Rising" if growth_rate > 20 else "📉 Declining" if growth_rate < -20 else "➡️ Stable"

            print(f"{paradigm}:")
            print(f"  Peak Year: {peak_year[0]} ({peak_year[1]:.1f}% of studies)")
            print(f"  Trend: {status} ({growth_rate:+.1f}% change)")
            print()

    return paradigm_timeline

# =====================================
# ANALYSIS 3: SIMILAR PROBLEMS, DIFFERENT SOLUTIONS
# =====================================

def _classify_intervention(intervention_text):
    """Return the first intervention category whose cue words occur in the lower-cased text."""
    # Order matters: the first matching category wins.
    rules = (
        ('Financial Incentives', ('cash', 'payment', 'transfer', 'subsidy')),
        ('Education/Information', ('information', 'education', 'training', 'workshop')),
        ('Behavioral Nudges', ('nudge', 'reminder', 'sms', 'text', 'message')),
        ('Technology Solutions', ('technology', 'digital', 'app', 'mobile')),
        ('Social/Community', ('community', 'group', 'peer', 'social')),
        ('Infrastructure', ('infrastructure', 'facility', 'building', 'construction')),
    )
    for type_key, words in rules:
        if any(word in intervention_text for word in words):
            return type_key
    return 'Other Approaches'


def find_intervention_menus(df, n_problems=10):
    """
    Identify common problems and create menus of different interventions tried.

    A study addresses a problem when any of the problem's keywords occurs as a
    substring of its lower-cased Primary_outcomes + Secondary_outcomes text.
    Problems with fewer than three matching studies are dropped. Matching
    studies are grouped by intervention type using substring cues in
    Description_of_intervention (first matching category wins).

    Args:
        df: DataFrame with 'Title', 'Primary_outcomes', 'Secondary_outcomes',
            'Description_of_intervention', 'Country', 'Year' and 'Findings'.
        n_problems: Maximum number of problem categories to analyse, taken in
            definition order. The default of 10 covers every built-in
            category; ``None`` also means "all".

    Returns:
        ``{problem: {'total_studies': int, 'intervention_types': {type: [study, ...]}}}``
        where each study is a dict with 'title', 'intervention', 'country',
        'year' and 'findings'.
    """
    print("\n🎯 ANALYSIS 3: Intervention Menus for Common Challenges")
    print("=" * 60)

    # Common development challenges
    problem_categories = {
        'School Enrollment': ['enrollment', 'attendance', 'dropout', 'school participation'],
        'Learning Outcomes': ['test scores', 'learning', 'literacy', 'numeracy', 'academic'],
        'Health Service Uptake': ['vaccination', 'immunization', 'clinic visits', 'health service'],
        'Maternal Health': ['maternal', 'pregnancy', 'prenatal', 'antenatal', 'delivery'],
        'Agricultural Productivity': ['yield', 'harvest', 'crop production', 'agricultural output'],
        'Financial Inclusion': ['savings', 'bank account', 'financial access', 'credit'],
        'Income Generation': ['income', 'earnings', 'wages', 'poverty', 'consumption'],
        'Women Empowerment': ['women decision', 'female empowerment', 'gender equality', 'women control'],
        'Nutrition': ['malnutrition', 'stunting', 'nutrition', 'dietary', 'food security'],
        'Employment': ['employment', 'job', 'unemployment', 'work', 'labor']
    }

    selected_problems = list(problem_categories.items())
    if n_problems is not None:
        selected_problems = selected_problems[:max(int(n_problems), 0)]

    # Walk the frame once and reuse the outcome text for every problem,
    # instead of re-running iterrows() per problem category.
    studies = []
    for _, row in df.iterrows():
        outcomes = str(row['Primary_outcomes']).lower() + ' ' + str(row['Secondary_outcomes']).lower()
        studies.append((outcomes, {
            'title': row['Title'],
            'intervention': row['Description_of_intervention'],
            'country': row['Country'],
            'year': row['Year'],
            'findings': row['Findings']
        }))

    intervention_menus = {}

    for problem, keywords in selected_problems:
        # Find studies addressing this problem (copied so menus don't share dicts)
        relevant_studies = [
            dict(study)
            for outcomes, study in studies
            if any(keyword in outcomes for keyword in keywords)
        ]

        if len(relevant_studies) >= 3:  # Only include problems with multiple studies
            # Categorize interventions
            intervention_types = defaultdict(list)
            for study in relevant_studies:
                type_key = _classify_intervention(str(study['intervention']).lower())
                intervention_types[type_key].append(study)

            intervention_menus[problem] = {
                'total_studies': len(relevant_studies),
                'intervention_types': dict(intervention_types)
            }

    # Display intervention menus
    print("\n📋 Intervention Menus for Common Development Challenges:\n")

    for problem, menu in intervention_menus.items():
        print(f"🎯 {problem} ({menu['total_studies']} studies)")
        print("-" * 50)

        for intervention_type, type_studies in menu['intervention_types'].items():
            print(f"\n  💡 {intervention_type} ({len(type_studies)} studies):")
            for study in type_studies[:2]:  # Show top 2 examples
                # Titles can be missing (NaN); coerce before measuring/slicing.
                title = str(study['title'])
                title_short = title[:60] + '...' if len(title) > 60 else title
                print(f"     • {title_short}")
                print(f"       ({study['country']}, {study['year']})")
        print()

    # Create similarity network for one problem
    if intervention_menus:
        sample_problem = list(intervention_menus.keys())[0]
        print(f"\n🔗 Creating similarity network for: {sample_problem}")
        create_similarity_network(df, sample_problem, problem_categories[sample_problem])

    return intervention_menus

def create_similarity_network(df, problem_name, keywords):
    """
    Print the most similar study pairs among studies addressing one problem.

    Studies are selected when any keyword occurs as a substring of their
    lower-cased Primary_outcomes + Secondary_outcomes text. Their Abstract and
    Description_of_intervention text is vectorised with TF-IDF and compared by
    cosine similarity; the five most similar pairs scoring above 0.3 are
    printed. Nothing is printed when two or fewer studies match.

    Args:
        df: DataFrame with 'Title', 'Abstract', 'Description_of_intervention',
            'Primary_outcomes' and 'Secondary_outcomes' columns.
        problem_name: Label used in the printed heading.
        keywords: Lower-case substrings identifying the problem in the outcomes.

    Returns:
        None. Results are printed only.
    """
    # Collect row *positions* (not index labels) so that the .iloc lookup
    # below stays correct when df has a filtered or otherwise non-default index.
    relevant_positions = []
    for position, (primary, secondary) in enumerate(zip(df['Primary_outcomes'], df['Secondary_outcomes'])):
        outcomes = str(primary).lower() + ' ' + str(secondary).lower()
        if any(keyword in outcomes for keyword in keywords):
            relevant_positions.append(position)

    if len(relevant_positions) > 2:
        relevant_df = df.iloc[relevant_positions].copy()

        # Create TF-IDF vectors
        vectorizer = TfidfVectorizer(max_features=100, stop_words='english')
        text_data = relevant_df['Abstract'].fillna('') + ' ' + relevant_df['Description_of_intervention'].fillna('')
        try:
            tfidf_matrix = vectorizer.fit_transform(text_data)
        except ValueError as exc:
            # Raised e.g. when every document is empty or contains only stop words.
            print(f"\n  Could not build similarity network for {problem_name}: {exc}")
            return

        # Calculate similarity
        similarity_matrix = cosine_similarity(tfidf_matrix)

        # Find most similar pairs (upper triangle only, so each pair appears once)
        print(f"\n  Most similar intervention pairs for {problem_name}:")
        similar_pairs = []
        for i in range(len(similarity_matrix)):
            for j in range(i+1, len(similarity_matrix)):
                if similarity_matrix[i, j] > 0.3:  # Threshold for similarity
                    similar_pairs.append((i, j, similarity_matrix[i, j]))

        similar_pairs.sort(key=lambda x: x[2], reverse=True)

        for i, j, sim in similar_pairs[:5]:
            study1 = relevant_df.iloc[i]
            study2 = relevant_df.iloc[j]
            print(f"\n  Similarity: {sim:.2f}")
            # str() guards against missing (NaN) titles, which cannot be sliced.
            print(f"  Study 1: {str(study1['Title'])[:50]}...")
            print(f"  Study 2: {str(study2['Title'])[:50]}...")

# =====================================
# ANALYSIS 4: SURPRISING RESULTS
# =====================================

def analyze_surprising_findings(df):
    """
    Flag studies whose findings or abstract contain "surprise" cue phrases.

    This is a keyword heuristic rather than model-based sentiment analysis:
    each study's lower-cased Findings + Abstract text is searched for the
    phrases of five cue groups (unexpected, no_effect, negative, much_larger,
    opposite). The surprise score is the number of groups with at least one
    substring hit, so generic words such as 'reduced' or 'substantial' also
    count. The top ten studies, the distribution of cue groups and a
    histogram over years are displayed.

    Args:
        df: DataFrame with 'Title', 'Year', 'Country',
            'Description_of_intervention', 'Findings' and 'Abstract' columns.

    Returns:
        List of dicts (title, year, country, intervention, findings,
        surprise_score, surprise_types) for every study with a score above
        zero, sorted by descending score. Field values are stored as found in
        df, so missing values stay NaN.
    """
    print("\n😲 ANALYSIS 4: Studies with Surprising or Unexpected Results")
    print("=" * 60)

    # Keywords indicating surprise or unexpected results
    surprise_keywords = {
        'unexpected': ['unexpected', 'surprising', 'contrary to', 'against expectations', 'paradox'],
        'no_effect': ['no effect', 'no impact', 'no significant', 'null result', 'no difference'],
        'negative': ['negative effect', 'backfired', 'worse', 'reduced', 'decreased', 'harmful'],
        'much_larger': ['larger than expected', 'exceeded', 'remarkable', 'extraordinary', 'substantial'],
        'opposite': ['opposite', 'reverse', 'contrary', 'inverse', 'contradictory']
    }

    surprising_studies = []

    for idx, row in df.iterrows():
        findings = str(row['Findings']).lower()
        abstract = str(row['Abstract']).lower()
        combined_text = findings + ' ' + abstract

        # One point per cue group that has at least one phrase in the text
        surprise_types = [
            surprise_type
            for surprise_type, keywords in surprise_keywords.items()
            if any(keyword in combined_text for keyword in keywords)
        ]
        surprise_score = len(surprise_types)

        if surprise_score > 0:
            surprising_studies.append({
                'title': row['Title'],
                'year': row['Year'],
                'country': row['Country'],
                'intervention': row['Description_of_intervention'],
                'findings': row['Findings'],
                'surprise_score': surprise_score,
                'surprise_types': surprise_types
            })

    # Sort by surprise score
    surprising_studies.sort(key=lambda x: x['surprise_score'], reverse=True)

    print(f"\nFound {len(surprising_studies)} studies with potentially surprising results")
    print("\n🏆 Top 10 Most Surprising Studies:\n")

    for i, study in enumerate(surprising_studies[:10], 1):
        # A study can qualify through its abstract alone, so Title/Findings may
        # be NaN (a float) here; coerce before slicing to avoid a TypeError.
        print(f"{i}. {str(study['title'])[:70]}...")
        print(f"   📍 {study['country']}, {study['year']}")
        print(f"   🎯 Surprise Type: {', '.join(study['surprise_types'])}")
        if pd.isna(study['findings']):
            print("   📊 Finding excerpt: (not reported)")
        else:
            print(f"   📊 Finding excerpt: {str(study['findings'])[:150]}...")
        print()

    # Categorize by surprise type
    print("\n📊 Distribution of Surprise Types:")
    print("-" * 40)
    surprise_distribution = defaultdict(int)
    for study in surprising_studies:
        for surprise_type in study['surprise_types']:
            surprise_distribution[surprise_type] += 1

    for surprise_type, count in sorted(surprise_distribution.items(), key=lambda x: x[1], reverse=True):
        print(f"  {surprise_type.replace('_', ' ').title()}: {count} studies")

    # Create visualization
    if surprising_studies:
        years = [s['year'] for s in surprising_studies if pd.notna(s['year'])]
        if years:
            fig = px.histogram(x=years, nbins=20,
                             title='Distribution of Surprising Results Over Time',
                             labels={'x': 'Year', 'y': 'Number of Surprising Studies'})
            fig.show()

    return surprising_studies

# =====================================
# ENHANCED SENTIMENT ANALYSIS WITH AI
# =====================================

def ai_sentiment_analysis(df, sample_size=50, random_state=None):
    """
    Score a random sample of studies for surprise, positivity and confidence.

    NOTE: the TogetherAI call is still commented out, so the returned scores
    are random placeholders in the range 1-5, not model output. The prompt is
    built for each study so it is ready once the API call is enabled.

    Args:
        df: DataFrame with 'Title', 'Description_of_intervention' and
            'Findings' columns.
        sample_size: Maximum number of studies to sample.
        random_state: Optional integer seed. When given, both the sampling and
            the placeholder scores are reproducible; when None the global
            NumPy random state is used, as before.

    Returns:
        DataFrame with columns 'title', 'surprise_level', 'positivity' and
        'confidence', one row per sampled study (empty if df is empty).
    """
    print("\n🤖 Enhanced AI Sentiment Analysis")
    print("=" * 60)

    # Sample studies for AI analysis
    sample_df = df.sample(min(sample_size, len(df)), random_state=random_state)
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    sentiments = []

    for idx, row in sample_df.iterrows():
        # Literal braces of the JSON template are doubled ({{ }}); single
        # braces inside an f-string are parsed as replacement fields and made
        # this prompt fail to build.
        prompt = f"""
        Analyze the sentiment and surprise level of this research finding.

        Study: {row['Title']}
        Intervention: {row['Description_of_intervention']}
        Findings: {row['Findings']}

        Rate on a scale of 1-5:
        1. Surprise Level (1=expected, 5=very surprising)
        2. Positivity (1=very negative, 5=very positive)
        3. Confidence (1=uncertain, 5=very confident)

        Provide a brief explanation of why the results might be surprising.

        Format your response as JSON:
        {{
            "surprise_level": X,
            "positivity": X,
            "confidence": X,
            "explanation": "..."
        }}
        """

        # This is where you'd call TogetherAI
        # response = together.Complete.create(
        #     model="meta-llama/Llama-2-70b-chat-hf",
        #     prompt=prompt,
        #     max_tokens=200,
        #     temperature=0.3
        # )

        # For now, using placeholder (random scores, upper bound exclusive)
        sentiments.append({
            'title': row['Title'],
            'surprise_level': int(rng.randint(1, 6)),
            'positivity': int(rng.randint(1, 6)),
            'confidence': int(rng.randint(1, 6))
        })

    # Convert to DataFrame for analysis; explicit columns keep the schema
    # stable even when nothing was sampled.
    sentiment_df = pd.DataFrame(
        sentiments, columns=['title', 'surprise_level', 'positivity', 'confidence']
    )
    if sentiment_df.empty:
        print("\nNo studies available for sentiment analysis.")
        return sentiment_df

    # Display most surprising according to AI
    most_surprising = sentiment_df.nlargest(10, 'surprise_level')
    print("\n🎯 Most Surprising Studies (AI Analysis):")
    for _, row in most_surprising.iterrows():
        print(f"  • {str(row['title'])[:60]}...")
        print(f"    Surprise: {'⭐' * int(row['surprise_level'])}")

    return sentiment_df

# =====================================
# MAIN EXECUTION
# =====================================

def run_all_analyses(df, output_dir='/content/drive/MyDrive/AEA_RCT_Parsed'):
    """
    Execute all four analyses in sequence and save their results.

    Runs clustering, the paradigm timeline, the intervention menus and the
    surprising-findings scan on ``df``, then writes two files into
    ``output_dir``: 'studies_clustered.csv' (the frame returned by the
    clustering step) and 'analysis_results.json' (all summaries; values that
    are not JSON-native are written via ``str``).

    Args:
        df: Study-level DataFrame; must contain at least the 'Year' and
            'Country' columns plus whatever the individual analyses read.
        output_dir: Existing directory to write the two result files to.
            Defaults to the Colab Google Drive folder used by this notebook.

    Returns:
        Dict with keys 'clusters', 'timeline', 'intervention_menus' and
        'surprising_studies'.
    """
    import os  # local import: only needed to build the output paths

    print("🚀 Starting Comprehensive RCT Analysis")
    print("=" * 80)
    print(f"Dataset: {len(df)} studies")
    print(f"Years: {df['Year'].min()} - {df['Year'].max()}")
    print(f"Countries: {df['Country'].nunique()} unique countries")
    print("=" * 80)

    # Run analyses
    results = {}

    # 1. Natural Topic Clustering
    df_clustered, clusters = cluster_studies_by_embeddings(df)
    results['clusters'] = clusters

    # 2. Timeline Analysis
    timeline = analyze_research_timeline(df)
    results['timeline'] = timeline

    # 3. Intervention Menus
    menus = find_intervention_menus(df)
    results['intervention_menus'] = menus

    # 4. Surprising Findings
    surprising = analyze_surprising_findings(df)
    results['surprising_studies'] = surprising

    # Save results
    print("\n💾 Saving analysis results...")

    # Save clustered dataframe
    clustered_path = os.path.join(output_dir, 'studies_clustered.csv')
    df_clustered.to_csv(clustered_path, index=False)

    # Save analysis summaries
    results_path = os.path.join(output_dir, 'analysis_results.json')
    with open(results_path, 'w') as f:
        json.dump(results, f, indent=2, default=str)

    print("\n✅ Analysis complete! Results saved.")
    print("   - studies_clustered.csv: Studies with cluster assignments")
    print("   - analysis_results.json: Detailed analysis summaries")

    return results

# Execute all analyses.
# The guard is true when the cell is run in a notebook kernel or as a script,
# and false if this code is imported as a module.
# NOTE(review): `df` is not defined in this cell; it must already exist from
# an earlier cell (e.g. the data-loading cell) -- confirm the run order.
if __name__ == "__main__":
    results = run_all_analyses(df)

🚀 Starting Comprehensive RCT Analysis
Dataset: 2117 studies
Years: 1960 - 2025
Countries: 125 unique countries
🔬 ANALYSIS 1: Discovering Natural Research Communities
Clusters: 5, Silhouette Score: 0.032
Clusters: 6, Silhouette Score: 0.034
Clusters: 7, Silhouette Score: 0.038
Clusters: 8, Silhouette Score: 0.035
Clusters: 9, Silhouette Score: 0.037
Clusters: 10, Silhouette Score: 0.037
Clusters: 11, Silhouette Score: 0.037
Clusters: 12, Silhouette Score: 0.037
Clusters: 13, Silhouette Score: 0.036
Clusters: 14, Silhouette Score: 0.035
Clusters: 15, Silhouette Score: 0.035
Clusters: 16, Silhouette Score: 0.036
Clusters: 17, Silhouette Score: 0.036
Clusters: 18, Silhouette Score: 0.035
Clusters: 19, Silhouette Score: 0.036

✨ Optimal number of clusters: 7

Creating 2D visualization...

📊 Cluster Analysis:
------------------------------------------------------------

🔹 Cluster 0 (287 studies)
   Top Keywords: Labor, Gender; Labor, Education; Labor
   Main Countries: United States of Ameri


📅 ANALYSIS 2: Evolution of Development Research Paradigms



🔄 Detected Paradigm Shifts:
----------------------------------------
Microfinance Era:
  Peak Year: 1992 (100.0% of studies)
  Trend: 📈 Rising (+7970.4% change)

Behavioral Revolution:
  Peak Year: 1987 (50.0% of studies)
  Trend: 📈 Rising (+22893.4% change)

Digital Transformation:
  Peak Year: 1960 (100.0% of studies)
  Trend: ➡️ Stable (-10.1% change)

Cash Transfer Wave:
  Peak Year: 1999 (100.0% of studies)
  Trend: 📈 Rising (+266.7% change)

Women Empowerment:
  Peak Year: 1960 (100.0% of studies)
  Trend: ➡️ Stable (-19.6% change)

Health Interventions:
  Peak Year: 1960 (100.0% of studies)
  Trend: 📉 Declining (-46.6% change)

Education Innovation:
  Peak Year: 1962 (100.0% of studies)
  Trend: 📉 Declining (-43.8% change)

Agriculture & Climate:
  Peak Year: 1998 (33.3% of studies)
  Trend: 📈 Rising (+7558.8% change)

Governance & Institutions:
  Peak Year: 2002 (66.7% of studies)
  Trend: 📈 Rising (+24417.7% change)


🎯 ANALYSIS 3: Intervention Menus for Common Challenges

📋 


💾 Saving analysis results...

✅ Analysis complete! Results saved.
   - studies_clustered.csv: Studies with cluster assignments
   - analysis_results.json: Detailed analysis summaries


In [None]:
import json

import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# Reload the artefacts written by run_all_analyses().
# Adjust paths if not in Colab
with open('/content/drive/MyDrive/AEA_RCT_Parsed/analysis_results.json') as f:
    results = json.load(f)
df_clustered = pd.read_csv('/content/drive/MyDrive/AEA_RCT_Parsed/studies_clustered.csv')

In [None]:
# Scatter of the 2-D embedding with one discrete colour per cluster (the
# cluster id is cast to str so it is treated as a category, not a number).
# Colour sequence, marker size, hover data, etc. can be changed freely.
fig = (
    px.scatter(
        df_clustered,
        x='x',
        y='y',
        color=df_clustered['cluster'].astype(str),
        hover_data=['Title', 'Country', 'Year'],
        title='Research Community Clusters',
        labels={'color': 'Cluster', 'x': 'Similarity Dim 1', 'y': 'Similarity Dim 2'},
        color_discrete_sequence=px.colors.qualitative.Set3,
    )
    .update_traces(marker={'size': 8, 'opacity': 0.85})
    .update_layout(legend={'title': 'Cluster', 'itemsizing': 'constant'})
)
fig.show()

In [None]:
# Shape of menus: {problem: {total_studies, intervention_types: {type: [...]}}}
menus = results['intervention_menus']

# Visualise the first problem in the menu (swap in any other key, or loop).
problem = next(iter(menus))
types = menus[problem]['intervention_types']

# One row per intervention type, largest group first.
menu_df = pd.DataFrame(
    [dict(type=type_name, count=len(members)) for type_name, members in types.items()]
).sort_values('count', ascending=False)

fig = px.bar(
    menu_df,
    x='type',
    y='count',
    title=f'Intervention Types for {problem}',
    labels={'type': 'Intervention Type', 'count': 'Number of Studies'},
).update_layout(xaxis_tickangle=-30)
fig.show()

In [None]:
# Saved years may come back from JSON as strings, so coerce them to numbers.
surprising = pd.DataFrame(results['surprising_studies']).assign(
    year=lambda frame: pd.to_numeric(frame['year'], errors='coerce')
)

# Histogram over time, restricted to studies with a usable year
# (bins, colors, labels, etc. are easy to tweak)
fig = px.histogram(
    surprising[surprising['year'].notna()],
    x='year',
    nbins=20,
    title='Distribution of Surprising Results Over Time',
    labels={'year': 'Year', 'count': 'Number of Studies'},
)
fig.show()

# Optional: bar of surprise types. A study can carry several types, so the
# list column is exploded to one row per (study, type) before counting.
counts = (
    surprising
    .explode('surprise_types')
    .groupby('surprise_types')
    .size()
    .reset_index(name='n')
    .sort_values('n', ascending=False)
)
fig2 = px.bar(
    counts,
    x='surprise_types',
    y='n',
    title='Surprise Types Count',
    labels={'surprise_types': 'Type', 'n': 'Studies'},
)
fig2.update_layout(xaxis_tickangle=-20)
fig2.show()

In [None]:
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go

# `results` is not loaded here; it must already be in memory from the cell
# that reads analysis_results.json.
menus = results.get("intervention_menus", {})

# Flatten to one record per (problem, intervention_type) with its study count
records = [
    {"problem": problem_name, "intervention_type": type_name, "count": len(type_studies)}
    for problem_name, menu_entry in menus.items()
    for type_name, type_studies in menu_entry.get("intervention_types", {}).items()
]

# Alphabetical by problem, largest intervention type first within each problem
df_counts = (
    pd.DataFrame(records)
    .sort_values(["problem", "count"], ascending=[True, False])
    .reset_index(drop=True)
)
df_counts

Unnamed: 0,problem,intervention_type,count
0,Agricultural Productivity,Education/Information,19
1,Agricultural Productivity,Other Approaches,13
2,Agricultural Productivity,Financial Incentives,9
3,Agricultural Productivity,Behavioral Nudges,2
4,Agricultural Productivity,Social/Community,2
...,...,...,...
58,School Enrollment,Behavioral Nudges,2
59,School Enrollment,Infrastructure,1
60,Women Empowerment,Financial Incentives,5
61,Women Empowerment,Education/Information,3


In [None]:
# Both hierarchical views share the same problem -> intervention type layout,
# sizing and colouring, so the common arguments are defined once.
hierarchy_args = dict(
    path=["problem", "intervention_type"],
    values="count",
    color="intervention_type",
    color_discrete_sequence=px.colors.qualitative.Set3,
)

# Sunburst
fig_sb = px.sunburst(
    df_counts,
    title="Intervention Menus: Problems → Types (Sunburst)",
    **hierarchy_args,
).update_traces(textinfo="label+percent entry")
fig_sb.show()

# Icicle (alternate hierarchical view)
fig_ic = px.icicle(
    df_counts,
    title="Intervention Menus: Problems → Types (Icicle)",
    **hierarchy_args,
)
fig_ic.show()

In [None]:
# Treemap of the same problem -> intervention type hierarchy; tiles are sized
# by study count and labelled with that count.
fig_tm = (
    px.treemap(
        df_counts,
        title="Intervention Menus: Problems → Types (Treemap)",
        path=["problem", "intervention_type"],
        values="count",
        color="intervention_type",
        color_discrete_sequence=px.colors.qualitative.Set3,
    )
    .update_traces(textinfo="label+value")
)
fig_tm.show()

In [None]:
# Wide problem x intervention-type table of study counts; combinations that
# never occur are filled with 0 so the heatmap has no gaps.
matrix = pd.pivot(
    df_counts, index="problem", columns="intervention_type", values="count"
)
matrix = matrix.fillna(0).astype(int)

fig_hm = px.imshow(
    matrix,
    title="Counts by Problem × Intervention Type",
    aspect="auto",
    color_continuous_scale="Blues",
).update_layout(xaxis_title="Intervention Type", yaxis_title="Problem")
fig_hm.show()

In [None]:
# Share of each intervention type within its problem (shares sum to 1 per problem).
problem_totals = df_counts.groupby("problem")["count"].transform("sum")
df_share = df_counts.assign(
    total=problem_totals,
    share=df_counts["count"] / problem_totals,
)

# Only label segments of at least 12% so small slices stay uncluttered.
share_labels = df_share["share"].map(lambda v: f"{v:.0%}").where(df_share["share"] >= 0.12, None)

fig_stacked = px.bar(
    df_share,
    x="problem",
    y="share",
    color="intervention_type",
    text=share_labels,
    title="Intervention Menu Composition by Problem (Shares)",
    color_discrete_sequence=px.colors.qualitative.Set3,
).update_layout(barmode="stack", yaxis_tickformat=".0%")
fig_stacked.show()

In [None]:
# Small multiples: one bar chart of intervention-type counts per problem,
# laid out three panels per row.
fig_facets = (
    px.bar(
        df_counts,
        x="intervention_type",
        y="count",
        text="count",
        color="intervention_type",
        facet_col="problem",
        facet_col_wrap=3,
        title="Intervention Types per Problem (Faceted)",
        color_discrete_sequence=px.colors.qualitative.Set3,
    )
    # Independent x axes with visible tick labels on every panel
    .update_xaxes(tickangle=-30, matches=None, showticklabels=True)
    # Facet titles default to "problem=<name>"; keep only the name
    .for_each_annotation(lambda annotation: annotation.update(text=annotation.text.split("=")[-1]))
    .update_layout(showlegend=False)
)
fig_facets.show()

In [None]:
# Sankey: flows from problems to intervention types (fixed target indexing)
import plotly.graph_objects as go

# Node list: all problems first, then all intervention types.
problems = df_counts["problem"].unique().tolist()
types = df_counts["intervention_type"].unique().tolist()
labels = problems + types

# One shared label -> node-index lookup, so no manual offset is needed.
idx = dict(zip(labels, range(len(labels))))

# One link per (problem, intervention type) row, weighted by study count.
sources = [idx[problem_label] for problem_label in df_counts["problem"]]
targets = [idx[type_label] for type_label in df_counts["intervention_type"]]
values = [int(study_count) for study_count in df_counts["count"]]

sankey_trace = go.Sankey(
    node=dict(label=labels, pad=12, thickness=16),
    link=dict(source=sources, target=targets, value=values),
)
fig_sk = go.Figure(data=[sankey_trace])
fig_sk.update_layout(title_text="Evidence Flows: Problems → Intervention Types", font_size=12)
fig_sk.show()

In [None]:
# Animated bar chart: one frame per problem.
# Plotly Express does not rescale axes between animation frames, and it only
# draws the colour groups that exist in the first frame. So the data is
# completed to a full problem x intervention-type grid (missing combinations
# count as 0), and the y range and x category order are fixed up front.
anim_problems = df_counts["problem"].unique()
anim_types = df_counts["intervention_type"].unique()
anim_grid = pd.MultiIndex.from_product(
    [anim_problems, anim_types], names=["problem", "intervention_type"]
)
df_anim = (
    df_counts
    .set_index(["problem", "intervention_type"])["count"]
    .reindex(anim_grid, fill_value=0)
    .reset_index()
    .sort_values(["problem", "count"], ascending=[True, False])
)

fig_anim = px.bar(
    df_anim,
    x="intervention_type",
    y="count",
    color="intervention_type",
    animation_frame="problem",
    category_orders={"intervention_type": list(anim_types)},
    range_y=[0, df_counts["count"].max() * 1.1],  # headroom above the tallest bar
    color_discrete_sequence=px.colors.qualitative.Set3,
    title="Intervention Menus (Animate by Problem)"
)
fig_anim.update_xaxes(tickangle=-30)
fig_anim.show()