<a href="https://colab.research.google.com/github/leosammallahti/AnalysisCoLab/blob/main/RCTAnalysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter
import re
import warnings
warnings.filterwarnings('ignore')

# Mount Google Drive so the parsed registry export is reachable from Colab
from google.colab import drive
drive.mount('/content/drive')

# Folder on Drive that holds the parsed AEA RCT files
base_path = '/content/drive/MyDrive/AEA_RCT_Parsed/'

# Read the enriched dataset (includes the AI-generated fields)
df = pd.read_csv(f"{base_path}structured_studies_full_v2.csv")

n_loaded = len(df)
print(f"Loaded {n_loaded} studies")
print(f"Columns: {list(df.columns)}")

# Keep every row whose tldr_error is anything other than the
# "No abstract available" marker (rows with a missing tldr_error are kept too)
has_usable_abstract = df['tldr_error'].ne('No abstract available')
df_clean = df.loc[has_usable_abstract].copy()
print(f"After filtering inadequate abstracts: {len(df_clean)} studies")

# Peek at a handful of key fields
key_fields = ['title', 'intervention_type', 'outcome_extracted', 'country_extracted']
print("\nSample of key fields:")
print(df_clean[key_fields].head())

Mounted at /content/drive
Loaded 2289 studies
Columns: ['rct_id', 'title', 'final_has_findings', 'final_findings_snippet', 'final_findings_source_type', 'country', 'start_date', 'end_date', 'primary_investigator_name', 'origin_url', 'initial_registration_date', 'first_published', 'last_updated', 'region', 'status', 'keywords', 'keywords_additional', 'jel_codes', 'secondary_ids', 'abstract', 'registration_citation', 'interventions', 'intervention_hidden', 'intervention_start_date', 'intervention_end_date', 'primary_outcomes_endpoints', 'primary_outcomes_explanation', 'secondary_outcomes_endpoints', 'secondary_outcomes_explanation', 'experimental_design', 'experimental_design_details', 'randomization_method', 'randomization_unit', 'treatment_clustered', 'sample_size_clusters', 'sample_size_observations', 'sample_size_by_arm', 'minimum_detectable_effect', 'external_links', 'emails_found', 'primary_investigator_affiliation', 'other_pi_1_name', 'other_pi_1_affiliation', 'relevant_1_abstract

In [None]:
# INTERVENTION DNA ANALYSIS
# Breaking down interventions into their component parts

def extract_intervention_dna(df):
    """
    Flag which intervention components ("DNA") each study mentions.

    The free-text columns ``intervention_structured``, ``interventions`` and
    ``intervention_type`` are concatenated (space separated, missing values
    treated as empty strings) into ``intervention_text``. Any of these columns
    that is absent from ``df`` is skipped, so the function works on datasets
    that only carry a subset of them. One 0/1 column ``dna_<component>`` is
    then added per keyword pattern (case-insensitive regex search).

    Args:
        df (pd.DataFrame): Studies table with at least one of the three
            intervention text columns.

    Returns:
        pd.DataFrame: A copy of ``df`` with ``intervention_text`` and the
        ``dna_*`` indicator columns added. The input frame is not modified.

    Raises:
        KeyError: If none of the intervention text columns is present.
    """

    candidate_cols = ('intervention_structured', 'interventions', 'intervention_type')
    text_cols = [col for col in candidate_cols if col in df.columns]
    if not text_cols:
        raise KeyError(
            f"None of the intervention text columns {list(candidate_cols)} are present"
        )

    # Work on a copy so re-running the cell never compounds changes on the caller's frame
    result = df.copy()

    # Create a combined intervention text field for analysis
    parts = [result[col].fillna('').astype(str) for col in text_cols]
    combined = parts[0]
    for part in parts[1:]:
        combined = combined + ' ' + part
    result['intervention_text'] = combined

    # Define DNA components with search patterns.
    # Short tokens are anchored with \b because the search is a case-insensitive
    # substring match: unanchored, "UCT" hits "product"/"conduct", "SMS" hits
    # "mechanisms", "app" hits "approach", "grant" hits "migrant" and "road"
    # hits "abroad".
    dna_patterns = {
        'cash_transfer': r'cash|money|payment|transfer|\bgrants?\b|subsidy|unconditional|\bCCT\b|\bUCT\b',
        'training': r'training|education|workshop|teach|capacity building|skill|curriculum',
        'digital_tech': r'mobile|\bSMS\b|\bapps?\b|digital|phone|internet|online|platform|software',
        'health_service': r'health|medical|clinic|doctor|nurse|vaccine|treatment|medicine',
        'information': r'information|awareness|campaign|messaging|communication|media',
        'microfinance': r'credit|loan|savings|microfinance|microcredit|financial',
        'infrastructure': r'infrastructure|building|\broads?\b|water|electricity|sanitation|construction',
        'nudge_behavioral': r'nudge|reminder|behavioral|psychology|framing|social norm',
        'community_based': r'community|group|collective|peer|social|network|participat',
        'governance': r'governance|accountability|transparency|corruption|monitoring|audit',
        'agriculture': r'agricultur|farm|crop|seed|fertilizer|irrigation|livestock',
        'women_focused': r'women|female|gender|girl|maternal|empowerment'
    }

    # Extract DNA components as 0/1 indicators
    for component, pattern in dna_patterns.items():
        result[f'dna_{component}'] = combined.str.contains(
            pattern, case=False, na=False
        ).astype(int)

    return result

# Apply DNA extraction
df_dna = extract_intervention_dna(df_clean)

# Columns holding the binary component flags
dna_cols = [col for col in df_dna.columns if col.startswith('dna_')]

# Per-component study count and share of studies (in percent), most common first
dna_stats = pd.DataFrame({
    'Component': [col.replace('dna_', '').replace('_', ' ').title() for col in dna_cols],
    'Count': [df_dna[col].sum() for col in dna_cols],
    'Percentage': [df_dna[col].mean() * 100 for col in dna_cols]
}).sort_values('Count', ascending=False)

# Visualize DNA components
fig_dna = px.bar(dna_stats,
                 x='Count',
                 y='Component',
                 orientation='h',
                 title='Intervention DNA: Most Common Components in RCTs',
                 labels={'Count': 'Number of Studies', 'Component': 'Intervention Component'},
                 color='Percentage',
                 color_continuous_scale='Viridis',
                 text='Count',
                 # Expose the percentage to the text template; bar traces have no
                 # `%{color}` template variable, so it is passed as customdata.
                 custom_data=['Percentage'])

fig_dna.update_traces(texttemplate='%{text} (%{customdata[0]:.1f}%)', textposition='outside')
fig_dna.update_layout(height=600, width=900)
fig_dna.show()

print("\nIntervention DNA Statistics:")
print(dna_stats.to_string(index=False))

KeyError: 'intervention_structured'

In [None]:
# Inventory of the columns actually present in the filtered dataset
all_columns = df_clean.columns.tolist()
print("Available columns in the dataset:")
print(all_columns)


def _columns_containing(fragment):
    """Return the df_clean column names whose lower-cased form contains `fragment`."""
    return [name for name in df_clean.columns if fragment in name.lower()]


# Columns that mention interventions
intervention_cols = _columns_containing('intervention')
print("\nIntervention-related columns found:")
print(intervention_cols)

# Columns that mention abstracts or findings
abstract_cols = _columns_containing('abstract')
findings_cols = _columns_containing('finding')

print("\nAbstract-related columns:")
print(abstract_cols)
print("\nFindings-related columns:")
print(findings_cols)

# Show the first two non-missing values for up to three intervention columns
print("\nSample of intervention data:")
for col in intervention_cols[:3]:
    first_values = df_clean[col].dropna().iloc[:2].tolist()
    print(f"\n{col}:")
    print(first_values)

Available columns in the dataset:
['rct_id', 'title', 'final_has_findings', 'final_findings_snippet', 'final_findings_source_type', 'country', 'start_date', 'end_date', 'primary_investigator_name', 'origin_url', 'initial_registration_date', 'first_published', 'last_updated', 'region', 'status', 'keywords', 'keywords_additional', 'jel_codes', 'secondary_ids', 'abstract', 'registration_citation', 'interventions', 'intervention_hidden', 'intervention_start_date', 'intervention_end_date', 'primary_outcomes_endpoints', 'primary_outcomes_explanation', 'secondary_outcomes_endpoints', 'secondary_outcomes_explanation', 'experimental_design', 'experimental_design_details', 'randomization_method', 'randomization_unit', 'treatment_clustered', 'sample_size_clusters', 'sample_size_observations', 'sample_size_by_arm', 'minimum_detectable_effect', 'external_links', 'emails_found', 'primary_investigator_affiliation', 'other_pi_1_name', 'other_pi_1_affiliation', 'relevant_1_abstract', 'relevant_1_prelim

In [None]:
# INTERVENTION DNA ANALYSIS - Using YOUR AI-GENERATED KEYWORDS
# This leverages the work you've already done!

# Report which of the expected AI-generated fields exist and how populated they are.
# (Previously the header was printed but `ai_fields` was never used, so the
# section was empty.)
print("AI-Generated Fields Available:")
ai_fields = ['intervention_type', 'outcome_extracted', 'population_extracted',
             'country_extracted', 'tldr']
for field in ai_fields:
    if field in df_clean.columns:
        print(f"  • {field}: {df_clean[field].notna().sum()} non-missing values")
    else:
        print(f"  • {field}: not found in dataset")

# Also check for the enhanced keyword columns
keyword_cols = [col for col in df_clean.columns if 'keywords' in col.lower()]
print(f"\nKeyword columns: {keyword_cols}")

# Let's look at samples of these AI-extracted fields
print("\n" + "="*60)
print("SAMPLES OF AI-EXTRACTED INTERVENTION TYPES:")
print("="*60)
# The 20 most frequent intervention_type strings (missing values excluded)
intervention_types = df_clean['intervention_type'].dropna().value_counts().head(20)
print(intervention_types)

# Now let's analyze the distribution and patterns
print("\n" + "="*60)
print("ANALYZING INTERVENTION PATTERNS FROM AI KEYWORDS:")
print("="*60)

# Create intervention DNA from the AI-extracted intervention_type field
def create_dna_from_ai_keywords(df):
    """
    Count the most frequent terms in the AI-extracted ``intervention_type`` field.

    Each *distinct* (lower-cased) intervention_type string contributes its
    words once, so the counts measure how many distinct intervention
    descriptions use a term, not how many studies do.

    Words are extracted with a regex rather than ``str.split()`` so that list
    separators stay out of the tokens: the field uses ``,`` and ``;`` between
    items, and whitespace splitting counted "training," and "training" as
    different terms.

    Args:
        df (pd.DataFrame): Studies table with an ``intervention_type`` column.

    Returns:
        tuple: ``(df, common_terms)`` where ``df`` is the unmodified input and
        ``common_terms`` is a list of up to 30 ``(term, count)`` pairs, most
        frequent first.
    """

    # Get all unique intervention types (these are AI-generated)
    all_interventions = df['intervention_type'].dropna().astype(str).str.lower().unique()

    print(f"Found {len(all_interventions)} unique AI-identified intervention types")

    # Words are runs of letters/digits, optionally joined by "-" or "'"
    # (e.g. "rule-following"); punctuation such as "," and ";" is dropped.
    word_pattern = re.compile(r"[^\W_]+(?:[-'][^\W_]+)*")

    word_freq = Counter()
    for intervention in all_interventions:
        word_freq.update(word_pattern.findall(intervention))

    # Get the most common intervention terms (as identified by AI)
    common_terms = word_freq.most_common(30)

    # Show up to 15 terms, skipping short words first so that the filter does
    # not shrink the displayed list.
    displayed_terms = [(term, count) for term, count in common_terms if len(term) > 3][:15]

    print("\nMost common terms in AI-extracted intervention types:")
    for term, count in displayed_terms:
        print(f"  {term}: {count} occurrences")

    return df, common_terms

df_analyzed, intervention_terms = create_dna_from_ai_keywords(df_clean)

# Preview the enhanced keyword columns, but only when the dataset carries them
# (the presence of 'keywords_methodology' is used as the indicator)
if 'keywords_methodology' in df_clean.columns:
    section_rule = "=" * 60
    print("\n" + section_rule)
    print("ANALYZING ENHANCED AI KEYWORDS:")
    print(section_rule)

    # Enhanced keyword categories previewed here
    enhanced_categories = [
        'keywords_methodology', 'keywords_sector', 'keywords_mechanisms',
        'keywords_implementation', 'keywords_context', 'keywords_partners'
    ]

    categories_present = [name for name in enhanced_categories if name in df_clean.columns]
    for category in categories_present:
        # First three non-missing entries of this category
        sample_keywords = df_clean[category].dropna().head(3)
        category_label = category.replace('keywords_', '').upper()
        print(f"\n{category_label}:")
        for keywords in sample_keywords:
            # Only the first 100 characters of each entry are shown
            print(f"  • {str(keywords)[:100]}...")

# Create a matrix of intervention types vs outcomes (both AI-extracted)
print("\n" + "="*60)
print("INTERVENTION-OUTCOME MATRIX (from AI extractions):")
print("="*60)

# Create crosstab of AI-extracted fields
# Study counts for every (intervention_type, outcome_extracted) pair;
# missing values are grouped under 'Not specified'
intervention_labels = df_clean['intervention_type'].fillna('Not specified')
outcome_labels = df_clean['outcome_extracted'].fillna('Not specified')
intervention_outcome_matrix = pd.crosstab(index=intervention_labels, columns=outcome_labels)

# The ten most frequent values of each field (missing values are not counted)
top_interventions = df_clean['intervention_type'].value_counts().index[:10]
top_outcomes = df_clean['outcome_extracted'].value_counts().index[:10]

# Restrict the matrix to those rows and columns
keep_rows = intervention_outcome_matrix.index.isin(top_interventions)
keep_cols = intervention_outcome_matrix.columns.isin(top_outcomes)
matrix_subset = intervention_outcome_matrix.loc[keep_rows, keep_cols]

# Visualize the matrix
import plotly.graph_objects as go

# Heatmap trace: one cell per intervention/outcome pair, annotated with its count
heatmap_trace = go.Heatmap(
    x=matrix_subset.columns,
    y=matrix_subset.index,
    z=matrix_subset.values,
    text=matrix_subset.values,
    texttemplate='%{text}',
    textfont=dict(size=10),
    colorscale='Blues',
)
fig = go.Figure(data=heatmap_trace)

# Titles and figure size
heatmap_layout = dict(
    title='AI-Identified Intervention Types vs Outcomes',
    xaxis_title='Outcome (AI-extracted)',
    yaxis_title='Intervention Type (AI-extracted)',
    height=600,
    width=1000,
)
fig.update_layout(**heatmap_layout)

fig.show()

print("\nTop Intervention-Outcome Combinations:")
# Collect every non-zero cell of the subset matrix, then rank by count so the
# list really shows the *top* combinations. (Previously cells were printed in
# matrix order and only the first 5 outcome columns were scanned.)
combination_counts = [
    (int(matrix_subset.iat[i, j]), intervention, outcome)
    for i, intervention in enumerate(matrix_subset.index)
    for j, outcome in enumerate(matrix_subset.columns)
    if matrix_subset.iat[i, j] > 0
]
# list.sort is stable, so ties keep their matrix order
combination_counts.sort(key=lambda combo: combo[0], reverse=True)
for count, intervention, outcome in combination_counts[:10]:
    print(f"  • {intervention} → {outcome}: {count} studies")

AI-Generated Fields Available:

Keyword columns: ['keywords', 'keywords_additional', 'keywords_methodology', 'keywords_sector', 'keywords_mechanisms', 'keywords_implementation', 'keywords_context', 'keywords_partners']

SAMPLES OF AI-EXTRACTED INTERVENTION TYPES:
intervention_type
social identity manipulation; moral values alignment; group similarity; rule-following task                                          3
information treatments, fiscal policy uncertainty, exogenous changes, randomized                                                     2
export procedure training, business training, capacity building, skill development                                                   2
online survey experiment; information provision; random assignment                                                                   2
construal level manipulation; psychological distance; social distance; temporal distance; advertising appeals                        2
savings lockboxes, financial access innovat


Top Intervention-Outcome Combinations:
  • savings lockboxes, financial access innovations, promotion of savings → alcohol consumption, blood pressure, temptation spending, health outcomes: 2 studies


In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from openai import OpenAI, OpenAIError
import json
import os

# --- API KEY CONFIGURATION ---
# The Together AI key is read from the TOGETHER_API_KEY environment variable so
# that no secret is ever stored in the notebook itself.
# When the variable is unset, the placeholder below is used; the clustering
# function in this cell recognises that placeholder and skips the LLM call.
TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY", "your-api-key-goes-here")

# --- 1. Create a Sample DataFrame ---
# Ten toy studies, written one per row as
# (intervention text, title, intervention type).
_sample_studies = [
    ("cash transfer program for poor households to improve nutrition",
     "Nutrition Impact of Cash Transfers", "Cash Transfer"),
    ("microfinance loans for small business owners in rural areas",
     "Microfinance and Rural Business", "Finance"),
    ("agricultural training for farmers on new crop varieties",
     "Modern Farming Techniques Study", "Agriculture"),
    ("conditional cash transfers linked to school attendance",
     "School Attendance and CCTs", "Cash Transfer"),
    ("small loans and financial literacy training for women entrepreneurs",
     "Women's Entrepreneurship Support", "Finance"),
    ("providing fertilizer and seeds to improve crop yields for farmers",
     "Crop Yield Improvement Program", "Agriculture"),
    ("unconditional cash aid to families in poverty",
     "Poverty Alleviation with UCT", "Cash Transfer"),
    ("business grants for new startups in urban centers",
     "Urban Startup Grant Effects", "Finance"),
    ("irrigation system improvements for local farming communities",
     "Farming Irrigation Project", "Agriculture"),
    ("scholarships for girls to encourage secondary education enrollment",
     "Girls' Education Scholarships", "Education"),
]

# Transpose the rows into the column-oriented dict that pandas consumes
_texts, _titles, _types = (list(column) for column in zip(*_sample_studies))
data = {
    'intervention_full_text': _texts,
    'title': _titles,
    'intervention_type': _types,
}
df_clustered = pd.DataFrame(data)

# --- 2. Create TF-IDF Vectors from Text Data ---
# Vectorise the intervention text. English stop words are removed, and a term
# is kept only if it occurs in at least 2 documents and in at most 95% of them.
print("Creating TF-IDF vectors from intervention text...")
vectorizer = TfidfVectorizer(min_df=2, max_df=0.95, stop_words='english')
intervention_corpus = df_clustered['intervention_full_text']
intervention_vectors = vectorizer.fit_transform(intervention_corpus)
vector_shape = intervention_vectors.shape
print(f"Successfully created a sparse matrix with shape: {vector_shape}")
print("-" * 30)


# --- 3. Function to Find Optimal Clusters ---
# This is the function you provided, now fully runnable with the data prepared above.

# Together AI exposes an OpenAI-compatible endpoint, so the stock OpenAI client
# is used with a different base_url and the Together key.
# `client` stays None when construction fails, which disables the LLM step.
client = None
try:
    client = OpenAI(
        base_url="https://api.together.xyz/v1",
        api_key=TOGETHER_API_KEY,
    )
except (ImportError, OpenAIError):
    print("OpenAI library not found or API key is invalid. LLM analysis will be skipped.")

def _best_silhouette_k(metrics_df):
    """Return, as a plain int, the tested k whose silhouette score is highest."""
    best_row = metrics_df['silhouette'].idxmax()
    return int(metrics_df.loc[best_row, 'k'])


def _validated_llm_k(raw_k, tested_ks):
    """Coerce the LLM's ``optimal_k`` to an int and check that it was evaluated.

    Raises:
        ValueError: if ``raw_k`` is not a whole number or is not in ``tested_ks``.
        TypeError: if ``raw_k`` cannot be converted to a number at all.
    """
    if isinstance(raw_k, bool):
        raise ValueError(f"optimal_k must be an integer, got {raw_k!r}")
    k = int(raw_k)
    if k != float(raw_k):
        raise ValueError(f"optimal_k must be a whole number, got {raw_k!r}")
    if k not in tested_ks:
        raise ValueError(f"optimal_k={k} is outside the tested values {sorted(tested_ks)}")
    return k


def find_optimal_clusters_with_llm(df, vectors, min_k=3, max_k=8):
    """
    Use multiple metrics and optional LLM analysis to find the optimal number of clusters.

    For each k in ``[min_k, max_k]`` a KMeans model is fitted and the silhouette
    score, Davies-Bouldin index, inertia and cluster-size statistics are
    recorded. If the module-level ``client`` is available and the module-level
    ``TOGETHER_API_KEY`` is set, the metrics table is sent to the LLM, whose
    recommended k is used only when it is a whole number among the tested
    values. In every other case (no client, no key, request failure, malformed
    or out-of-range answer) the k with the highest silhouette score is returned.

    Args:
        df (pd.DataFrame): The dataframe containing the source data (only its
            length is used, for the prompt).
        vectors (scipy.sparse.matrix or array-like): The TF-IDF vectors to cluster.
        min_k (int): The minimum number of clusters to test.
        max_k (int): The maximum number of clusters to test; lowered to
            ``n_samples - 1`` when it is too large for the data.

    Returns:
        tuple: ``(optimal_k, metrics_df, recommendation)``. ``recommendation`` is
        the parsed LLM answer, or None when the LLM was not used or its answer
        was rejected. When no k could be evaluated, ``(min_k, empty DataFrame,
        None)`` is returned.
    """
    print("Finding optimal number of clusters...")

    # Adjust max_k if it's larger than the number of samples
    n_samples = vectors.shape[0]
    if max_k >= n_samples:
        max_k = n_samples - 1
        print(f"Warning: max_k is too high for the number of samples. Adjusting to {max_k}.")

    # davies_bouldin_score is given a dense array; convert once, and accept
    # inputs that are already dense (no .toarray method).
    dense_vectors = vectors.toarray() if hasattr(vectors, 'toarray') else vectors

    # Step 1: Calculate clustering metrics for different k values
    metrics = []

    for k in range(min_k, max_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
        labels = kmeans.fit_predict(vectors)

        # Ensure there's more than one cluster to calculate metrics
        if len(set(labels)) > 1:
            silhouette = silhouette_score(vectors, labels)
            davies_bouldin = davies_bouldin_score(dense_vectors, labels)
            inertia = kmeans.inertia_

            cluster_sizes = np.bincount(labels)
            size_std = np.std(cluster_sizes)
            min_size = np.min(cluster_sizes)

            metrics.append({
                'k': k,
                'silhouette': silhouette,
                'davies_bouldin': davies_bouldin,
                'inertia': inertia,
                'size_std': size_std,
                'min_cluster_size': min_size
            })

            print(f"k={k}: silhouette={silhouette:.3f}, DB={davies_bouldin:.3f}, min_size={min_size}")
        else:
            print(f"k={k}: Only one cluster was found. Metrics cannot be calculated.")

    if not metrics:
        print("Could not calculate metrics for any k value. Aborting.")
        return min_k, pd.DataFrame(), None

    metrics_df = pd.DataFrame(metrics)
    tested_ks = {entry['k'] for entry in metrics}

    # Step 2: Use LLM to analyze the metrics and suggest optimal k
    key_missing = (not TOGETHER_API_KEY) or TOGETHER_API_KEY == "your-api-key-goes-here"
    if not client or key_missing:
        if not client:
             print("\nOpenAI client library not initialized. Skipping LLM analysis.")
        else:
             print("\nAPI key not set. Skipping LLM analysis.")
        # Fallback: simple logic based on silhouette score
        return _best_silhouette_k(metrics_df), metrics_df, None

    prompt = f"""Analyze these clustering metrics for an RCT intervention dataset:

{metrics_df.to_string()}

Context: We're clustering {len(df)} development economics RCT studies based on their interventions.

Consider:
1. Silhouette score (higher is better, measures cluster separation)
2. Davies-Bouldin index (lower is better, measures cluster compactness)
3. Minimum cluster size (avoid too many tiny clusters)
4. Size standard deviation (prefer balanced clusters)

Recommend the optimal number of clusters (k) that:
- Provides meaningful groupings for policy analysis
- Avoids too many micro-clusters
- Balances statistical metrics with interpretability

Respond with ONLY the JSON object, without any additional text or formatting:
{{
    "optimal_k": <number>,
    "reasoning": "...",
    "alternatives": [<number>, <number>],
    "expected_interpretation": "..."
}}"""

    try:
        print("\nQuerying Together AI with Llama 3.3 model for optimal k recommendation...")
        response = client.chat.completions.create(
            # Using the correct Meta Llama 3.3 model name for Together AI
            model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
            messages=[
                {"role": "system", "content": "You are an expert in clustering analysis and development economics research. Respond only with a valid JSON object."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            response_format={"type": "json_object"}
        )

        recommendation = json.loads(response.choices[0].message.content)

        # The model's answer is untrusted text: only accept a whole-number k
        # that was actually evaluated above. Anything else raises and is
        # handled by the fallback below.
        llm_k = _validated_llm_k(recommendation['optimal_k'], tested_ks)

        print(f"\nLLM Recommendation: {llm_k} clusters")
        print(f"Reasoning: {recommendation.get('reasoning', 'not provided')}")

        return llm_k, metrics_df, recommendation

    except Exception as e:
        # Deliberately broad: any request, parsing or validation problem should
        # degrade to the metric-based choice rather than stop the notebook.
        print(f"\nLLM analysis failed: {e}")
        # Fallback: find k with the best silhouette score
        print("Using fallback logic to determine optimal k.")
        return _best_silhouette_k(metrics_df), metrics_df, None

# --- 4. Run the Optimization ---
# Search k = 2..5 only; the sample data is too small for a wider range.
cluster_search_result = find_optimal_clusters_with_llm(
    df_clustered,
    intervention_vectors,
    min_k=2,
    max_k=5,
)
optimal_k, metrics_df, recommendation = cluster_search_result

print("-" * 30)
print(f"\nFinal Analysis: Using {optimal_k} clusters.")

# The metrics table is empty when no k could be evaluated
has_metrics = not metrics_df.empty
if has_metrics:
    print("\nMetrics Table:")
    print(metrics_df)


Creating TF-IDF vectors from intervention text...
Successfully created a sparse matrix with shape: (10, 9)
------------------------------
Finding optimal number of clusters...
k=2: silhouette=0.290, DB=1.067, min_size=3
k=3: silhouette=0.325, DB=1.057, min_size=2
k=4: silhouette=0.355, DB=0.956, min_size=2
k=5: silhouette=0.388, DB=0.601, min_size=1

API key not set. Skipping LLM analysis.
------------------------------

Final Analysis: Using 5 clusters.

Metrics Table:
   k  silhouette  davies_bouldin   inertia  size_std  min_cluster_size
0  2    0.289613        1.066546  4.085539  2.000000                 3
1  3    0.324597        1.056938  2.816366  1.247219                 2
2  4    0.355044        0.955539  1.878044  0.500000                 2
3  5    0.387522        0.601293  1.211377  0.632456                 1


In [None]:
import os

# Never store the key in the notebook: read it from the TOGETHER_API_KEY
# environment variable. When the variable is unset, the placeholder below is
# used, which the clustering function treats as "API key not set".
# NOTE(review): a real key was previously committed on this line; it should be
# revoked and replaced with a new one.
TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY", "your-api-key-goes-here")

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from openai import OpenAI, OpenAIError
import json
import os

# --- API KEY CONFIGURATION ---
# The Together AI key is read from the TOGETHER_API_KEY environment variable so
# that no secret is ever stored in the notebook itself.
# When the variable is unset, the placeholder below is used; the clustering
# function in this cell recognises that placeholder and skips the LLM call.
# NOTE(review): a real key was previously committed on this line; it should be
# revoked and replaced with a new one.
TOGETHER_API_KEY = os.environ.get("TOGETHER_API_KEY", "your-api-key-goes-here")

# --- 1. Create a Sample DataFrame ---
# Ten toy studies, one record per study.
_sample_records = [
    {'intervention_full_text': "cash transfer program for poor households to improve nutrition",
     'title': "Nutrition Impact of Cash Transfers",
     'intervention_type': "Cash Transfer"},
    {'intervention_full_text': "microfinance loans for small business owners in rural areas",
     'title': "Microfinance and Rural Business",
     'intervention_type': "Finance"},
    {'intervention_full_text': "agricultural training for farmers on new crop varieties",
     'title': "Modern Farming Techniques Study",
     'intervention_type': "Agriculture"},
    {'intervention_full_text': "conditional cash transfers linked to school attendance",
     'title': "School Attendance and CCTs",
     'intervention_type': "Cash Transfer"},
    {'intervention_full_text': "small loans and financial literacy training for women entrepreneurs",
     'title': "Women's Entrepreneurship Support",
     'intervention_type': "Finance"},
    {'intervention_full_text': "providing fertilizer and seeds to improve crop yields for farmers",
     'title': "Crop Yield Improvement Program",
     'intervention_type': "Agriculture"},
    {'intervention_full_text': "unconditional cash aid to families in poverty",
     'title': "Poverty Alleviation with UCT",
     'intervention_type': "Cash Transfer"},
    {'intervention_full_text': "business grants for new startups in urban centers",
     'title': "Urban Startup Grant Effects",
     'intervention_type': "Finance"},
    {'intervention_full_text': "irrigation system improvements for local farming communities",
     'title': "Farming Irrigation Project",
     'intervention_type': "Agriculture"},
    {'intervention_full_text': "scholarships for girls to encourage secondary education enrollment",
     'title': "Girls' Education Scholarships",
     'intervention_type': "Education"},
]

# Pivot the records into the column-oriented dict that pandas consumes
data = {
    field: [record[field] for record in _sample_records]
    for field in ('intervention_full_text', 'title', 'intervention_type')
}
df_clustered = pd.DataFrame(data)

# --- 2. Create TF-IDF Vectors from Text Data ---
# Vectorise the intervention text. English stop words are removed, and a term
# is kept only if it occurs in at least 2 documents and in at most 95% of them.
print("Creating TF-IDF vectors from intervention text...")
vectorizer = TfidfVectorizer(min_df=2, max_df=0.95, stop_words='english')
intervention_corpus = df_clustered['intervention_full_text']
intervention_vectors = vectorizer.fit_transform(intervention_corpus)
vector_shape = intervention_vectors.shape
print(f"Successfully created a sparse matrix with shape: {vector_shape}")
print("-" * 30)


# --- 3. Function to Find Optimal Clusters ---
# This is the function you provided, now fully runnable with the data prepared above.

# Together AI exposes an OpenAI-compatible endpoint, so the stock OpenAI client
# is used with a different base_url and the Together key.
# `client` stays None when construction fails, which disables the LLM step.
client = None
try:
    client = OpenAI(
        base_url="https://api.together.xyz/v1",
        api_key=TOGETHER_API_KEY,
    )
except (ImportError, OpenAIError):
    print("OpenAI library not found or API key is invalid. LLM analysis will be skipped.")

def _best_silhouette_k(metrics_df):
    """Return, as a plain int, the tested k whose silhouette score is highest."""
    best_row = metrics_df['silhouette'].idxmax()
    return int(metrics_df.loc[best_row, 'k'])


def _validated_llm_k(raw_k, tested_ks):
    """Coerce the LLM's ``optimal_k`` to an int and check that it was evaluated.

    Raises:
        ValueError: if ``raw_k`` is not a whole number or is not in ``tested_ks``.
        TypeError: if ``raw_k`` cannot be converted to a number at all.
    """
    if isinstance(raw_k, bool):
        raise ValueError(f"optimal_k must be an integer, got {raw_k!r}")
    k = int(raw_k)
    if k != float(raw_k):
        raise ValueError(f"optimal_k must be a whole number, got {raw_k!r}")
    if k not in tested_ks:
        raise ValueError(f"optimal_k={k} is outside the tested values {sorted(tested_ks)}")
    return k


def find_optimal_clusters_with_llm(df, vectors, min_k=3, max_k=8):
    """
    Use multiple metrics and optional LLM analysis to find the optimal number of clusters.

    For each k in ``[min_k, max_k]`` a KMeans model is fitted and the silhouette
    score, Davies-Bouldin index, inertia and cluster-size statistics are
    recorded. If the module-level ``client`` is available and the module-level
    ``TOGETHER_API_KEY`` is set, the metrics table is sent to the LLM, whose
    recommended k is used only when it is a whole number among the tested
    values. In every other case (no client, no key, request failure, malformed
    or out-of-range answer) the k with the highest silhouette score is returned.

    Args:
        df (pd.DataFrame): The dataframe containing the source data (only its
            length is used, for the prompt).
        vectors (scipy.sparse.matrix or array-like): The TF-IDF vectors to cluster.
        min_k (int): The minimum number of clusters to test.
        max_k (int): The maximum number of clusters to test; lowered to
            ``n_samples - 1`` when it is too large for the data.

    Returns:
        tuple: ``(optimal_k, metrics_df, recommendation)``. ``recommendation`` is
        the parsed LLM answer, or None when the LLM was not used or its answer
        was rejected. When no k could be evaluated, ``(min_k, empty DataFrame,
        None)`` is returned.
    """
    print("Finding optimal number of clusters...")

    # Adjust max_k if it's larger than the number of samples
    n_samples = vectors.shape[0]
    if max_k >= n_samples:
        max_k = n_samples - 1
        print(f"Warning: max_k is too high for the number of samples. Adjusting to {max_k}.")

    # davies_bouldin_score is given a dense array; convert once, and accept
    # inputs that are already dense (no .toarray method).
    dense_vectors = vectors.toarray() if hasattr(vectors, 'toarray') else vectors

    # Step 1: Calculate clustering metrics for different k values
    metrics = []

    for k in range(min_k, max_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
        labels = kmeans.fit_predict(vectors)

        # Ensure there's more than one cluster to calculate metrics
        if len(set(labels)) > 1:
            silhouette = silhouette_score(vectors, labels)
            davies_bouldin = davies_bouldin_score(dense_vectors, labels)
            inertia = kmeans.inertia_

            cluster_sizes = np.bincount(labels)
            size_std = np.std(cluster_sizes)
            min_size = np.min(cluster_sizes)

            metrics.append({
                'k': k,
                'silhouette': silhouette,
                'davies_bouldin': davies_bouldin,
                'inertia': inertia,
                'size_std': size_std,
                'min_cluster_size': min_size
            })

            print(f"k={k}: silhouette={silhouette:.3f}, DB={davies_bouldin:.3f}, min_size={min_size}")
        else:
            print(f"k={k}: Only one cluster was found. Metrics cannot be calculated.")

    if not metrics:
        print("Could not calculate metrics for any k value. Aborting.")
        return min_k, pd.DataFrame(), None

    metrics_df = pd.DataFrame(metrics)
    tested_ks = {entry['k'] for entry in metrics}

    # Step 2: Use LLM to analyze the metrics and suggest optimal k
    key_missing = (not TOGETHER_API_KEY) or TOGETHER_API_KEY == "your-api-key-goes-here"
    if not client or key_missing:
        if not client:
             print("\nOpenAI client library not initialized. Skipping LLM analysis.")
        else:
             print("\nAPI key not set. Skipping LLM analysis.")
        # Fallback: simple logic based on silhouette score
        return _best_silhouette_k(metrics_df), metrics_df, None

    prompt = f"""Analyze these clustering metrics for an RCT intervention dataset:

{metrics_df.to_string()}

Context: We're clustering {len(df)} development economics RCT studies based on their interventions.

Consider:
1. Silhouette score (higher is better, measures cluster separation)
2. Davies-Bouldin index (lower is better, measures cluster compactness)
3. Minimum cluster size (avoid too many tiny clusters)
4. Size standard deviation (prefer balanced clusters)

Recommend the optimal number of clusters (k) that:
- Provides meaningful groupings for policy analysis
- Avoids too many micro-clusters
- Balances statistical metrics with interpretability

Respond with ONLY the JSON object, without any additional text or formatting:
{{
    "optimal_k": <number>,
    "reasoning": "...",
    "alternatives": [<number>, <number>],
    "expected_interpretation": "..."
}}"""

    try:
        print("\nQuerying Together AI with Llama 3.3 model for optimal k recommendation...")
        response = client.chat.completions.create(
            # Using the correct Meta Llama 3.3 model name for Together AI
            model="meta-llama/Llama-3.3-70B-Instruct-Turbo-Free",
            messages=[
                {"role": "system", "content": "You are an expert in clustering analysis and development economics research. Respond only with a valid JSON object."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            response_format={"type": "json_object"}
        )

        recommendation = json.loads(response.choices[0].message.content)

        # The model's answer is untrusted text: only accept a whole-number k
        # that was actually evaluated above. Anything else raises and is
        # handled by the fallback below.
        llm_k = _validated_llm_k(recommendation['optimal_k'], tested_ks)

        print(f"\nLLM Recommendation: {llm_k} clusters")
        print(f"Reasoning: {recommendation.get('reasoning', 'not provided')}")

        return llm_k, metrics_df, recommendation

    except Exception as e:
        # Deliberately broad: any request, parsing or validation problem should
        # degrade to the metric-based choice rather than stop the notebook.
        print(f"\nLLM analysis failed: {e}")
        # Fallback: find k with the best silhouette score
        print("Using fallback logic to determine optimal k.")
        return _best_silhouette_k(metrics_df), metrics_df, None

# --- 4. Run the Optimization ---
# Search k = 2..5 only; the sample data is too small for a wider range.
cluster_search_result = find_optimal_clusters_with_llm(
    df_clustered,
    intervention_vectors,
    min_k=2,
    max_k=5,
)
optimal_k, metrics_df, recommendation = cluster_search_result

print("-" * 30)
print(f"\nFinal Analysis: Using {optimal_k} clusters.")

# The metrics table is empty when no k could be evaluated
has_metrics = not metrics_df.empty
if has_metrics:
    print("\nMetrics Table:")
    print(metrics_df)


Creating TF-IDF vectors from intervention text...
Successfully created a sparse matrix with shape: (10, 9)
------------------------------
Finding optimal number of clusters...
k=2: silhouette=0.290, DB=1.067, min_size=3
k=3: silhouette=0.325, DB=1.057, min_size=2
k=4: silhouette=0.355, DB=0.956, min_size=2
k=5: silhouette=0.388, DB=0.601, min_size=1

Querying Together AI with Llama 3.3 model for optimal k recommendation...

LLM Recommendation: 4 clusters
Reasoning: The optimal k is chosen based on a balance of statistical metrics and interpretability. The silhouette score increases as k increases, indicating better separation between clusters. The Davies-Bouldin index decreases as k increases, indicating more compact clusters. However, the minimum cluster size decreases as k increases, which could lead to too many micro-clusters. The size standard deviation is relatively low for k=4, indicating balanced clusters. Considering these factors, k=4 provides a good balance between statistica

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
import numpy as np
import pandas as pd
import time

def create_intervention_clusters_with_llm(df, n_clusters=12):
    """
    Group studies into intervention clusters (TF-IDF + K-Means) and summarise each cluster.

    Note: despite the name, this function makes no LLM call; it only prepares the
    per-cluster summaries (top terms, common intervention types, sample titles).

    Args:
        df (pd.DataFrame): Studies to cluster. Reads the columns 'intervention_type',
            'interventions', 'tldr', 'outcome_extracted' and 'title'. The caller's
            frame is not modified.
        n_clusters (int): Requested number of K-Means clusters. Lowered to the number
            of usable studies when it exceeds it, because K-Means cannot fit more
            clusters than samples.

    Returns:
        tuple: (df_valid, cluster_analysis). df_valid is a copy of the rows that have
        intervention text, with added 'intervention_full_text' and 'cluster' columns.
        cluster_analysis is a list with one dict per cluster holding 'cluster_id',
        'n_studies', 'top_terms', 'top_intervention_types' and 'sample_titles'.

    Raises:
        ValueError: If no study has any intervention text.
    """

    # Step 1: Create text representation for clustering
    print("Step 1: Preparing intervention data for clustering...")

    # Combine all intervention-related text. Missing cells become '' and every
    # cell is cast to str so a stray numeric value cannot break the concatenation.
    df = df.copy()
    df['intervention_full_text'] = (
        df['intervention_type'].fillna('').astype(str) + ' ' +
        df['interventions'].fillna('').astype(str).str[:500] + ' ' +  # Limit length
        df['tldr'].fillna('').astype(str) + ' ' +
        df['outcome_extracted'].fillna('').astype(str)
    )

    # Keep only rows with some text. The explicit copy makes the later
    # `df_valid['cluster'] = ...` an assignment on an independent frame.
    df_valid = df[df['intervention_full_text'].str.strip() != ''].copy()

    n_samples = len(df_valid)
    if n_samples == 0:
        raise ValueError("No studies with intervention text are available for clustering.")

    print(f"Working with {n_samples} studies with intervention data")

    if n_clusters > n_samples:
        print(f"Warning: n_clusters={n_clusters} exceeds the {n_samples} available studies. "
              f"Adjusting to {n_samples}.")
        n_clusters = n_samples

    # Step 2: Create TF-IDF vectors for clustering
    print("\nStep 2: Creating TF-IDF vectors...")
    vectorizer = TfidfVectorizer(max_features=100, stop_words='english', min_df=2)
    intervention_vectors = vectorizer.fit_transform(df_valid['intervention_full_text'].tolist())

    # Step 3: Perform clustering
    print(f"\nStep 3: Clustering into {n_clusters} groups...")
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    df_valid['cluster'] = kmeans.fit_predict(intervention_vectors)

    # Step 4: Analyze each cluster (without LLM for now to test)
    print("\nStep 4: Analyzing clusters...")

    cluster_analysis = []

    # Vocabulary terms, indexed like the columns of the TF-IDF matrix
    feature_names = vectorizer.get_feature_names_out()

    for cluster_id in range(n_clusters):
        cluster_studies = df_valid[df_valid['cluster'] == cluster_id]
        n_studies = len(cluster_studies)

        # The heaviest centroid components are the terms that characterise the cluster
        cluster_center = kmeans.cluster_centers_[cluster_id]
        top_indices = cluster_center.argsort()[-10:][::-1]
        top_terms = [feature_names[i] for i in top_indices]

        # Most common intervention types in this cluster
        top_intervention_types = cluster_studies['intervention_type'].value_counts().head(3)

        # Reproducible sample of up to five studies; the first three titles are reported
        if n_studies > 0:
            sample_studies = cluster_studies.sample(n=min(5, n_studies), random_state=42)
            sample_titles = sample_studies['title'].head(3).tolist()
        else:
            sample_titles = []

        cluster_analysis.append({
            'cluster_id': cluster_id,
            'n_studies': n_studies,
            'top_terms': top_terms[:5],
            'top_intervention_types': top_intervention_types.to_dict(),
            'sample_titles': sample_titles
        })

        print(f"Cluster {cluster_id}: {n_studies} studies")
        print(f"  Top terms: {', '.join(top_terms[:5])}")
        if len(top_intervention_types) > 0:
            print(f"  Most common type: {top_intervention_types.index[0]}")

    return df_valid, cluster_analysis

# Cluster the cleaned studies into 12 intervention groups
df_clustered, clusters = create_intervention_clusters_with_llm(df_clean, n_clusters=12)

# Report header
print("\n" + "="*80, "INTERVENTION CLUSTER ANALYSIS RESULTS", "="*80, sep="\n")

# One report section per non-empty cluster
for cluster in clusters:
    if not cluster['n_studies'] > 0:
        continue

    print(f"\n📊 CLUSTER {cluster['cluster_id']} ({cluster['n_studies']} studies)")
    print(f"   Key terms: {', '.join(cluster['top_terms'])}")

    if cluster['top_intervention_types']:
        print(f"   Top intervention types:")
        for itype, count in list(cluster['top_intervention_types'].items())[:3]:
            print(f"      • {itype}: {count} studies")

    if cluster['sample_titles']:
        print(f"   Example studies:")
        for title in cluster['sample_titles']:
            print(f"      - {title[:80]}...")

# Tabulate cluster sizes, largest first
cluster_summary = pd.DataFrame(clusters)
print("\n" + "="*80, "CLUSTER SIZE DISTRIBUTION", "="*80, sep="\n")
print(cluster_summary.sort_values('n_studies', ascending=False)[['cluster_id', 'n_studies']])

# Persist the studies together with their cluster labels
df_clustered.to_csv(f"{base_path}studies_with_clusters.csv", index=False)
print(f"\nClustered data saved to: {base_path}studies_with_clusters.csv")

Step 1: Preparing intervention data for clustering...
Working with 2289 studies with intervention data

Step 2: Creating TF-IDF vectors...

Step 3: Clustering into 12 groups...

Step 4: Analyzing clusters...
Cluster 0: 102 studies
  Top terms: school, schools, program, students, education
  Most common type: school-based intervention; youth empowerment; entrepreneurship training; employment skills
Cluster 1: 167 studies
  Top terms: available, hidden, intervention, effectiveness, experiment
Cluster 2: 127 studies
  Top terms: training, skills, program, business, employment
  Most common type: export procedure training, business training, capacity building, skill development
Cluster 3: 145 studies
  Top terms: health, intervention, program, social, information
  Most common type: community monitoring; health scorecards; non-financial awards; public commendations
Cluster 4: 161 studies
  Top terms: students, learning, student, academic, performance
  Most common type: mentoring; career g

In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, davies_bouldin_score
from openai import OpenAI
import json

# --- 1. Create a Sample DataFrame ---
# In a real scenario, you would load your data here.
# This sample DataFrame mimics the structure of your 'df_clustered':
# ten toy studies with a free-text description, a title and an intervention type.
data = {
    'intervention_full_text': [
        "cash transfer program for poor households to improve nutrition",
        "microfinance loans for small business owners in rural areas",
        "agricultural training for farmers on new crop varieties",
        "conditional cash transfers linked to school attendance",
        "small loans and financial literacy training for women entrepreneurs",
        "providing fertilizer and seeds to improve crop yields for farmers",
        "unconditional cash aid to families in poverty",
        "business grants for new startups in urban centers",
        "irrigation system improvements for local farming communities",
        "scholarships for girls to encourage secondary education enrollment"
    ],
    'title': [
        "Nutrition Impact of Cash Transfers",
        "Microfinance and Rural Business",
        "Modern Farming Techniques Study",
        "School Attendance and CCTs",
        "Women's Entrepreneurship Support",
        "Crop Yield Improvement Program",
        "Poverty Alleviation with UCT",
        "Urban Startup Grant Effects",
        "Farming Irrigation Project",
        "Girls' Education Scholarships"
    ],
    'intervention_type': [
        "Cash Transfer", "Finance", "Agriculture", "Cash Transfer", "Finance",
        "Agriculture", "Cash Transfer", "Finance", "Agriculture", "Education"
    ]
}
# NOTE(review): this rebinds `df_clustered`, which the clustering cell above assigned
# from the real studies. After this cell runs, every later cell that reads
# `df_clustered` works on the 10-row toy frame instead -- confirm that is intended,
# or give the sample frame its own name.
df_clustered = pd.DataFrame(data)

# --- 2. Create TF-IDF Vectors from Text Data ---
# This block creates the module-level 'intervention_vectors' variable that later
# cells pass to the clustering helpers. The vectors are built from the sample
# frame above, so they have one row per sample study.
print("Creating TF-IDF vectors from intervention text...")
# English stop words are removed; min_df / max_df bound how rare or how common
# a term may be to stay in the vocabulary.
vectorizer = TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)
intervention_vectors = vectorizer.fit_transform(df_clustered['intervention_full_text'])
print(f"Successfully created a sparse matrix with shape: {intervention_vectors.shape}")
print("-" * 30)


# --- 3. Function to Find Optimal Clusters ---
# This is the function you provided, now fully runnable with the data prepared above.

# Note: the LLM step needs a valid OpenAI API key. The key is read from the
# OPENAI_API_KEY environment variable so that no credential is stored in the
# notebook. When the variable is unset, `client` is None and
# find_optimal_clusters_with_llm falls back to the silhouette-based choice of k.
import os

if os.environ.get("OPENAI_API_KEY"):
    client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])
else:
    print("OPENAI_API_KEY is not set. LLM analysis will be skipped.")
    client = None

def find_optimal_clusters_with_llm(df, vectors, min_k=3, max_k=8):
    """
    Use multiple metrics and optional LLM analysis to find the optimal number of clusters.

    K-Means is fitted for every k in [min_k, max_k]; silhouette, Davies-Bouldin,
    inertia and cluster-size statistics are collected per k. If the module-level
    `client` is set, the metrics table is sent to the LLM and its recommendation is
    used, provided it is one of the evaluated k values. In every other case (no
    client, API error, malformed or out-of-range answer) the k with the highest
    silhouette score is returned.

    Args:
        df (pd.DataFrame): The dataframe containing the source data (only its length
            is used, for the prompt).
        vectors (scipy.sparse matrix or array-like): The TF-IDF vectors to cluster.
        min_k (int): The minimum number of clusters to test.
        max_k (int): The maximum number of clusters to test. Lowered to
            n_samples - 1 when it is too large for the data.

    Returns:
        tuple: (optimal_k, metrics_df, recommendation). optimal_k is always a plain
        int so it can be passed straight to KMeans. metrics_df is empty and optimal_k
        is min_k when no k could be scored. recommendation is the parsed LLM answer,
        or None when the fallback was used.
    """
    print("Finding optimal number of clusters...")

    # Adjust max_k if it's larger than the number of samples
    if max_k >= vectors.shape[0]:
        max_k = vectors.shape[0] - 1
        print(f"Warning: max_k is too high for the number of samples. Adjusting to {max_k}.")

    # davies_bouldin_score is fed a dense array; densify once instead of once per k
    dense_vectors = vectors.toarray() if hasattr(vectors, "toarray") else np.asarray(vectors)

    # Step 1: Calculate clustering metrics for different k values
    metrics = []

    for k in range(min_k, max_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=42, n_init='auto')
        labels = kmeans.fit_predict(vectors)

        # Ensure there's more than one cluster to calculate metrics
        if len(set(labels)) > 1:
            silhouette = silhouette_score(vectors, labels)
            davies_bouldin = davies_bouldin_score(dense_vectors, labels)
            inertia = kmeans.inertia_

            cluster_sizes = np.bincount(labels)
            size_std = np.std(cluster_sizes)
            min_size = np.min(cluster_sizes)

            metrics.append({
                'k': k,
                'silhouette': silhouette,
                'davies_bouldin': davies_bouldin,
                'inertia': inertia,
                'size_std': size_std,
                'min_cluster_size': min_size
            })

            print(f"k={k}: silhouette={silhouette:.3f}, DB={davies_bouldin:.3f}, min_size={min_size}")
        else:
            print(f"k={k}: Only one cluster was found. Metrics cannot be calculated.")

    if not metrics:
        print("Could not calculate metrics for any k value. Aborting.")
        return min_k, pd.DataFrame(), None

    metrics_df = pd.DataFrame(metrics)
    tested_ks = metrics_df['k'].tolist()

    # Silhouette-based fallback. Selecting the single 'k' cell (rather than a whole
    # mixed-dtype row) and casting keeps it an int; a float k is rejected by KMeans.
    fallback_k = int(metrics_df.loc[metrics_df['silhouette'].idxmax(), 'k'])

    # Step 2: Use LLM to analyze the metrics and suggest optimal k
    if not client:
        print("\nOpenAI client not initialized. Skipping LLM analysis.")
        return fallback_k, metrics_df, None

    prompt = f"""Analyze these clustering metrics for an RCT intervention dataset:

{metrics_df.to_string()}

Context: We're clustering {len(df)} development economics RCT studies based on their interventions.

Consider:
1. Silhouette score (higher is better, measures cluster separation)
2. Davies-Bouldin index (lower is better, measures cluster compactness)
3. Minimum cluster size (avoid too many tiny clusters)
4. Size standard deviation (prefer balanced clusters)

Recommend the optimal number of clusters (k) that:
- Provides meaningful groupings for policy analysis
- Avoids too many micro-clusters
- Balances statistical metrics with interpretability

Respond with ONLY the JSON object, without any additional text or formatting:
{{
    "optimal_k": <number>,
    "reasoning": "...",
    "alternatives": [<number>, <number>],
    "expected_interpretation": "..."
}}"""

    try:
        print("\nQuerying LLM for optimal k recommendation...")
        response = client.chat.completions.create(
            model="gpt-4o-mini",
            messages=[
                {"role": "system", "content": "You are an expert in clustering analysis and development economics research."},
                {"role": "user", "content": prompt}
            ],
            temperature=0.3,
            response_format={"type": "json_object"}
        )

        recommendation = json.loads(response.choices[0].message.content)

        # The answer is untrusted model output: coerce it to int and accept it only
        # if it is a k that was actually evaluated above.
        recommended_k = int(recommendation['optimal_k'])
        if recommended_k not in tested_ks:
            raise ValueError(
                f"LLM recommended k={recommended_k}, which is not among the evaluated values {tested_ks}"
            )

        print(f"\nLLM Recommendation: {recommended_k} clusters")
        print(f"Reasoning: {recommendation.get('reasoning', 'n/a')}")

        return recommended_k, metrics_df, recommendation

    except Exception as e:
        # Deliberate best-effort: any API, parsing or validation problem falls back
        # to the purely statistical choice instead of stopping the notebook.
        print(f"\nLLM analysis failed: {e}")
        print("Using fallback logic to determine optimal k.")
        return fallback_k, metrics_df, None

# --- 4. Run the Optimization ---
# The function is now called with the dataframe and the newly created vectors.
# Note: The sample data is small, so we test a smaller range of k.
optimal_k, metrics_df, recommendation = find_optimal_clusters_with_llm(
    df_clustered,
    intervention_vectors,
    min_k=2,
    max_k=5 # Adjusted for small sample size
)

print("-" * 30)
print(f"\nFinal Analysis: Using {optimal_k} clusters.")
# The function returns an empty frame when no k could be scored; skip the table then.
if not metrics_df.empty:
    print("\nMetrics Table:")
    print(metrics_df)


NameError: name 'intervention_vectors' is not defined

In [None]:
def identify_cluster_outliers(df_valid, intervention_vectors, n_clusters,
                              distance_quantile=0.95, prob_threshold=0.5,
                              ambiguity_margin=0.2):
    """
    Cluster the studies with K-Means and flag those that fit their cluster poorly.

    Three (possibly overlapping) groups are reported:
      * true outliers - distance to the nearest centroid above the
        `distance_quantile` quantile of all such distances;
      * borderline    - largest membership weight below `prob_threshold`;
      * ambiguous     - gap between the two largest membership weights below
        `ambiguity_margin`.

    Membership weights are a softmax over negative centroid distances. They are
    relative scores, not calibrated probabilities, and the largest weight can never
    be lower than 1 / n_clusters, so `prob_threshold` should be chosen with the
    number of clusters in mind.

    Args:
        df_valid (pd.DataFrame): One row per study, aligned with the rows of
            `intervention_vectors`. Reads 'title' and 'intervention_type' for the
            printed examples. The caller's frame is not modified.
        intervention_vectors: Feature matrix (e.g. TF-IDF) with one row per study.
        n_clusters (int): Number of K-Means clusters.
        distance_quantile (float): Quantile of nearest-centroid distance above which
            a study counts as a true outlier.
        prob_threshold (float): Upper bound on the largest membership weight for a
            study to count as borderline.
        ambiguity_margin (float): Upper bound on the gap between the two largest
            weights for a study to count as ambiguous.

    Returns:
        tuple: (df_annotated, outlier_summary, true_outliers). df_annotated is a copy
        of df_valid with 'cluster', 'distance_to_center', 'max_probability' and
        'second_best_prob' columns; outlier_summary has one row per outlier type with
        its count and percentage; true_outliers is the subset of far-away studies.

    Raises:
        ValueError: If df_valid and intervention_vectors have different row counts.
    """
    if intervention_vectors.shape[0] != len(df_valid):
        raise ValueError(
            f"df_valid has {len(df_valid)} rows but intervention_vectors has "
            f"{intervention_vectors.shape[0]}; they must describe the same studies."
        )

    def _clip(value, limit):
        """Text form of a cell cut to `limit` characters; missing values become ''."""
        return '' if pd.isna(value) else str(value)[:limit]

    print("\nIdentifying cluster outliers and borderline cases...")

    # Work on a copy: the caller's frame may already carry a 'cluster' column from
    # another clustering run, and it must not be silently overwritten.
    df_valid = df_valid.copy()

    # Perform clustering with optimal k (cast guards against a float k)
    kmeans = KMeans(n_clusters=int(n_clusters), random_state=42, n_init=10)
    df_valid['cluster'] = kmeans.fit_predict(intervention_vectors)

    # Distance of every study to every centroid; the row minimum is the distance
    # to the study's own cluster centre.
    distances = kmeans.transform(intervention_vectors)
    df_valid['distance_to_center'] = distances.min(axis=1)

    # Softmax over negative distances. Subtracting the row minimum first leaves the
    # result mathematically unchanged but avoids 0/0 when all distances are large.
    weights = np.exp(-(distances - distances.min(axis=1, keepdims=True)))
    probabilities = weights / weights.sum(axis=1, keepdims=True)
    ranked = np.sort(probabilities, axis=1)
    df_valid['max_probability'] = ranked[:, -1]
    # With a single cluster there is no runner-up weight
    df_valid['second_best_prob'] = ranked[:, -2] if ranked.shape[1] > 1 else 0.0

    # Identify different types of outliers

    # 1. Studies far from all clusters (true outliers)
    distance_threshold = df_valid['distance_to_center'].quantile(distance_quantile)
    true_outliers = df_valid[df_valid['distance_to_center'] > distance_threshold].copy()

    # 2. Studies between clusters (low max probability)
    borderline_studies = df_valid[df_valid['max_probability'] < prob_threshold].copy()

    # 3. Studies that could belong to multiple clusters
    ambiguous_studies = df_valid[
        (df_valid['max_probability'] - df_valid['second_best_prob']) < ambiguity_margin
    ].copy()

    print(f"\nOutlier Analysis:")
    print(f"True outliers (far from all clusters): {len(true_outliers)} studies")
    print(f"Borderline cases (between clusters): {len(borderline_studies)} studies")
    print(f"Ambiguous cases (could fit multiple clusters): {len(ambiguous_studies)} studies")

    # Show examples of outliers
    print("\n" + "="*60)
    print("EXAMPLES OF STUDIES THAT DON'T FIT WELL")
    print("="*60)

    print("\n🔴 TRUE OUTLIERS (most unique studies):")
    for _, row in true_outliers.nlargest(5, 'distance_to_center').iterrows():
        # _clip keeps a missing title / intervention type from raising on slicing
        print(f"  • {_clip(row['title'], 80)}...")
        print(f"    Distance from nearest cluster: {row['distance_to_center']:.3f}")
        print(f"    Intervention: {_clip(row['intervention_type'], 100)}...")
        print()

    print("\n🟡 BORDERLINE CASES (between clusters):")
    for _, row in borderline_studies.nsmallest(5, 'max_probability').iterrows():
        print(f"  • {_clip(row['title'], 80)}...")
        print(f"    Max cluster probability: {row['max_probability']:.2%}")
        print(f"    Could belong to clusters: {row['cluster']} or others")
        print()

    # Create outlier summary
    n_total = len(df_valid)
    counts = [len(true_outliers), len(borderline_studies), len(ambiguous_studies)]
    outlier_summary = pd.DataFrame({
        'outlier_type': ['True Outliers', 'Borderline', 'Ambiguous'],
        'count': counts,
        'percentage': [c / n_total * 100 for c in counts]
    })

    return df_valid, outlier_summary, true_outliers

# Flag poorly fitting studies, clustering with `optimal_k` groups
df_with_outliers, outlier_summary, outliers = identify_cluster_outliers(
    df_clustered, intervention_vectors, optimal_k
)

# Banner followed by the per-type counts and percentages
print("\n" + "="*60, "OUTLIER SUMMARY", "="*60, outlier_summary, sep="\n")

# Persist the annotated studies and the true outliers next to the other outputs
df_with_outliers.to_csv(f"{base_path}studies_with_clusters_and_outliers.csv", index=False)
outliers.to_csv(f"{base_path}unique_outlier_studies.csv", index=False)

NameError: name 'intervention_vectors' is not defined

In [None]:
from scipy.cluster.hierarchy import dendrogram, linkage, fcluster
from sklearn.preprocessing import normalize

def hierarchical_clustering_analysis(intervention_vectors, df_valid, max_clusters=20, min_clusters=5):
    """
    Use Ward hierarchical clustering to find natural numbers of clusters.

    The merge heights of the dendrogram are scanned for unusually large jumps
    (above the 90th percentile of all jumps). Cutting the tree just below such a
    jump gives a candidate number of clusters; candidates inside
    [min_clusters, max_clusters] are kept, and the first three are materialised as
    'hier_cluster_<n>' label columns.

    Args:
        intervention_vectors: Feature matrix (sparse or dense), one row per study.
        df_valid (pd.DataFrame): One row per study, aligned with the matrix rows.
            The caller's frame is not modified.
        max_clusters (int): Largest candidate cluster count to keep.
        min_clusters (int): Smallest candidate cluster count to keep.

    Returns:
        tuple: (df_labelled, natural_clusters). df_labelled is a copy of df_valid
        with one 'hier_cluster_<n>' column per tested candidate; natural_clusters is
        the list of candidate cluster counts, in order of increasing merge height
        (i.e. decreasing cluster count).
    """

    print("\nPerforming hierarchical clustering analysis...")

    df_valid = df_valid.copy()

    # Normalize vectors for better clustering (accept sparse or dense input)
    dense = intervention_vectors.toarray() if hasattr(intervention_vectors, "toarray") \
        else np.asarray(intervention_vectors)
    vectors_normalized = normalize(dense)
    n_samples = vectors_normalized.shape[0]

    # Fewer than three rows give fewer than two merges, so there is no jump to find
    if n_samples < 3:
        print("Need at least 3 studies to look for jumps in merge distance.")
        return df_valid, []

    # Perform hierarchical clustering
    linkage_matrix = linkage(vectors_normalized, method='ward')

    # Column 2 holds the height of each merge; a large increase between two
    # consecutive merges marks a natural place to cut the tree.
    merge_heights = linkage_matrix[:, 2]
    height_jumps = np.diff(merge_heights)
    jump_indices = np.where(height_jumps > np.percentile(height_jumps, 90))[0]

    # Calculate number of clusters at each jump
    natural_clusters = []
    for idx in jump_indices[-10:]:  # Look at last 10 big jumps
        # height_jumps[idx] separates merge idx from merge idx + 1. Cutting there
        # keeps merges 0..idx (idx + 1 merges), leaving n_samples - (idx + 1) clusters.
        n_clusters = int(n_samples - idx - 1)
        if min_clusters <= n_clusters <= max_clusters:
            natural_clusters.append(n_clusters)

    print(f"Natural cluster numbers based on dendrogram: {natural_clusters}")

    # Test each natural clustering
    for n in natural_clusters[:3]:  # Test the first 3 options
        clusters = fcluster(linkage_matrix, n, criterion='maxclust')
        df_valid[f'hier_cluster_{n}'] = clusters

        # fcluster labels start at 1, so count the labels that occur rather than
        # using np.bincount, whose slot 0 would always report an empty cluster.
        cluster_sizes = np.unique(clusters, return_counts=True)[1]
        print(f"\n{n} clusters: sizes range from {cluster_sizes.min()} to {cluster_sizes.max()}")

    return df_valid, natural_clusters

# Look for natural cluster counts (at most 20) in the Ward dendrogram
df_hier, natural_ns = hierarchical_clustering_analysis(
    intervention_vectors, df_clustered, max_clusters=20
)

# Report the first three candidates
print(f"\nSuggested natural cluster numbers: {natural_ns[:3]}")

NameError: name 'intervention_vectors' is not defined

In [None]:
!pip install together

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from together import Together
import os

# --- 1. Setup and Sample Data Generation ---
# In a real scenario, you would load your data here.
# For this example, we create a synthetic 100-row DataFrame: 10 intervention types
# cycled 10 times, and 13 template sentences cycled and cut to 100 entries.
print("Setting up sample data...")
data = {
    'title': [f'Study {i+1}' for i in range(100)],
    'intervention_type': [
        'Cognitive Behavioral Therapy', 'Mindfulness Training', 'Pharmacological Treatment',
        'Dietary Intervention', 'Exercise Program', 'Surgical Procedure', 'Placebo Control',
        'Virtual Reality Exposure', 'Music Therapy', 'Art Therapy'
    ] * 10,
    'intervention_full_text': ([
        "Participants received weekly sessions of cognitive behavioral therapy focusing on negative thought patterns.",
        "A daily mindfulness meditation program was implemented for all participants.",
        "The experimental group was administered a new SSRI medication daily.",
        "Subjects were placed on a strict ketogenic diet for the duration of the study.",
        "A high-intensity interval training (HIIT) regimen was followed three times a week.",
        "Patients underwent a minimally invasive laparoscopic surgery.",
        "The control group received a sugar pill with no active ingredients.",
        "Exposure therapy was conducted using a custom-built virtual reality environment.",
        "Classical music sessions were held to measure effects on anxiety.",
        "Weekly art therapy allowed participants to express emotions through painting.",
        # Add some unique text to create outliers
        "A novel intervention using quantum entanglement biofeedback was tested on a small cohort.",
        "This study explored the effects of zero-gravity on cellular regeneration.",
        "Participants consumed only fermented cabbage to study gut microbiome changes.",
    ] * (100 // 13 + 1))[:100] # 13 texts x 8 repeats = 104 items; cut to 100 so all columns have equal length
}
# NOTE(review): this rebinds `df_clustered` to synthetic data, replacing any frame
# of the same name produced by an earlier cell -- confirm that is intended.
# The texts cycle with period 13 and the types with period 10, so a row's
# 'intervention_type' does not generally describe its 'intervention_full_text'.
df_clustered = pd.DataFrame(data)
print(f"Sample DataFrame created with {len(df_clustered)} entries.")

# --- 2. TF-IDF Vectorization and Parameter Setup ---
# This section creates the numerical vectors from text and sets the number of clusters.
print("\nCreating TF-IDF vectors from intervention text...")
vectorizer = TfidfVectorizer(max_features=100, stop_words='english', min_df=2)
texts = df_clustered['intervention_full_text'].tolist()
intervention_vectors = vectorizer.fit_transform(texts)

# Set optimal_k to a default value if not defined.
# NOTE(review): at cell top level locals() is the notebook namespace, so an
# `optimal_k` left over from an earlier cell (computed on a different dataset) is
# silently reused here; the result of this cell therefore depends on execution order.
if 'optimal_k' not in locals():
    optimal_k = 5  # Default number of clusters for this example
print(f"Using {optimal_k} clusters for analysis.")

# --- 3. Outlier Identification Function ---
# This is the core function for performing clustering and identifying outliers.
def identify_cluster_outliers(df_valid, intervention_vectors, n_clusters,
                              distance_quantile=0.95, prob_threshold=0.5,
                              ambiguity_margin=0.2):
    """
    Identify studies that don't fit well into any cluster using K-Means.

    Three (possibly overlapping) groups are reported:
      * true outliers - distance to the nearest centroid above the
        `distance_quantile` quantile of all such distances;
      * borderline    - largest membership weight below `prob_threshold`;
      * ambiguous     - gap between the two largest membership weights below
        `ambiguity_margin`.

    Membership weights are a softmax over negative centroid distances. They are
    relative scores, not calibrated probabilities, and the largest weight can never
    be lower than 1 / n_clusters, so `prob_threshold` should be chosen with the
    number of clusters in mind.

    Args:
        df_valid (pd.DataFrame): One row per study, aligned with the rows of
            `intervention_vectors`. Reads 'title' and 'intervention_type' for the
            report. The caller's frame is not modified.
        intervention_vectors: Feature matrix (e.g. TF-IDF) with one row per study.
        n_clusters (int): Number of K-Means clusters.
        distance_quantile (float): Quantile of nearest-centroid distance above which
            a study counts as a true outlier.
        prob_threshold (float): Upper bound on the largest membership weight for a
            study to count as borderline.
        ambiguity_margin (float): Upper bound on the gap between the two largest
            weights for a study to count as ambiguous.

    Returns:
        tuple: (df_annotated, outlier_summary, true_outliers, full_report).
        df_annotated is a copy of df_valid with 'cluster', 'distance_to_center',
        'max_probability' and 'second_best_prob' columns; outlier_summary has one row
        per outlier type; true_outliers is the subset of far-away studies;
        full_report is the printed text report as a single string.

    Raises:
        ValueError: If df_valid and intervention_vectors have different row counts.
    """
    if intervention_vectors.shape[0] != len(df_valid):
        raise ValueError(
            f"df_valid has {len(df_valid)} rows but intervention_vectors has "
            f"{intervention_vectors.shape[0]}; they must describe the same studies."
        )

    def _clip(value, limit):
        """Text form of a cell cut to `limit` characters; missing values become ''."""
        return '' if pd.isna(value) else str(value)[:limit]

    print("\nPerforming clustering and identifying outliers...")

    # Work on a copy so the caller's frame is not silently given new columns
    df_valid = df_valid.copy()

    # Perform clustering with optimal k (cast guards against a float k)
    kmeans = KMeans(n_clusters=int(n_clusters), random_state=42, n_init=10)
    df_valid['cluster'] = kmeans.fit_predict(intervention_vectors)

    # Distance of every study to every centroid; the row minimum is the distance
    # to the study's own cluster centre.
    distances = kmeans.transform(intervention_vectors)
    df_valid['distance_to_center'] = distances.min(axis=1)

    # Softmax over negative distances. Subtracting the row minimum first leaves the
    # result mathematically unchanged but avoids 0/0 when all distances are large.
    weights = np.exp(-(distances - distances.min(axis=1, keepdims=True)))
    probabilities = weights / weights.sum(axis=1, keepdims=True)
    ranked = np.sort(probabilities, axis=1)
    df_valid['max_probability'] = ranked[:, -1]
    # With a single cluster there is no runner-up weight
    df_valid['second_best_prob'] = ranked[:, -2] if ranked.shape[1] > 1 else 0.0

    # Identify different types of outliers

    # 1. Studies far from all clusters (true outliers)
    distance_threshold = df_valid['distance_to_center'].quantile(distance_quantile)
    true_outliers = df_valid[df_valid['distance_to_center'] > distance_threshold].copy()

    # 2. Studies between clusters (low max probability)
    borderline_studies = df_valid[df_valid['max_probability'] < prob_threshold].copy()

    # 3. Studies that could belong to multiple clusters
    ambiguous_studies = df_valid[
        (df_valid['max_probability'] - df_valid['second_best_prob']) < ambiguity_margin
    ].copy()

    print(f"\nOutlier Analysis Complete:")
    print(f"True outliers (far from all clusters): {len(true_outliers)} studies")
    print(f"Borderline cases (between clusters): {len(borderline_studies)} studies")
    print(f"Ambiguous cases (could fit multiple clusters): {len(ambiguous_studies)} studies")

    # Report lines are collected so the same text can be printed and handed to the AI
    report_buffer = [
        "="*60,
        "EXAMPLES OF STUDIES THAT DON'T FIT WELL",
        "="*60,
    ]

    report_buffer.append("\n🔴 TRUE OUTLIERS (most unique studies):")
    for _, row in true_outliers.nlargest(5, 'distance_to_center').iterrows():
        # _clip keeps a missing title / intervention type from raising on slicing
        report_buffer.extend([
            f"  • {_clip(row['title'], 80)}...",
            f"    Distance from nearest cluster: {row['distance_to_center']:.3f}",
            f"    Intervention: {_clip(row['intervention_type'], 100)}...",
            "",
        ])

    report_buffer.append("\n🟡 BORDERLINE CASES (between clusters):")
    for _, row in borderline_studies.nsmallest(5, 'max_probability').iterrows():
        report_buffer.extend([
            f"  • {_clip(row['title'], 80)}...",
            f"    Max cluster probability: {row['max_probability']:.2%}",
            f"    Could belong to clusters: {row['cluster']} or others",
            "",
        ])

    # Create outlier summary DataFrame
    n_total = len(df_valid)
    counts = [len(true_outliers), len(borderline_studies), len(ambiguous_studies)]
    outlier_summary = pd.DataFrame({
        'outlier_type': ['True Outliers', 'Borderline', 'Ambiguous'],
        'count': counts,
        'percentage': [c / n_total * 100 for c in counts]
    })

    # Add summary to the report buffer
    report_buffer.append("\n" + "="*60)
    report_buffer.append("OUTLIER SUMMARY")
    report_buffer.append("="*60)
    report_buffer.append(outlier_summary.to_string())

    # Print the full report to the console
    full_report = "\n".join(report_buffer)
    print(full_report)

    return df_valid, outlier_summary, true_outliers, full_report

# --- 4. Run Analysis and Prepare for AI ---
df_with_outliers, outlier_summary, outliers, report_for_ai = identify_cluster_outliers(
    df_clustered,
    intervention_vectors,
    optimal_k
)

# --- 5. TogetherAI Integration ---
# This section sends the analysis report to the Kimi model for interpretation.
print("\n" + "="*60)
print("SENDING REPORT TO TogetherAI FOR INSIGHTS...")
print("="*60)

# IMPORTANT: Make sure you have set your TOGETHER_API_KEY as an environment variable
# For example: export TOGETHER_API_KEY='your_api_key_here'
if not os.getenv("TOGETHER_API_KEY"):
    print("ERROR: TOGETHER_API_KEY environment variable not set.")
    print("Please set your API key to use the TogetherAI service.")
else:
    try:
        # Kept under its own name: the global `client` is the OpenAI client that
        # find_optimal_clusters_with_llm reads, and must not be replaced by a
        # Together instance as a side effect of running this cell.
        together_client = Together()

        # Create the prompt for the AI model
        prompt = f"""
        You are a data science research assistant. Below is a report on cluster outlier analysis
        for a set of scientific studies based on their intervention descriptions.

        Your task is to:
        1. Briefly summarize the findings in simple terms.
        2. Explain what 'True Outliers' and 'Borderline Cases' mean in this context.
        3. Suggest a potential next step for a human researcher based on these findings.

        Here is the report:
        {report_for_ai}
        """

        print("Querying model: moonshotai/Kimi-K2-Instruct...")
        response = together_client.chat.completions.create(
            model="moonshotai/Kimi-K2-Instruct",
            messages=[
                {"role": "user", "content": prompt}
            ]
        )

        ai_insights = response.choices[0].message.content

        print("\n🤖 AI-Generated Insights:")
        print("-" * 25)
        print(ai_insights)

    except Exception as e:
        # Best-effort step: the statistical results above remain valid without the
        # AI summary, so a failed request is reported instead of raised.
        print(f"\nAn error occurred while contacting TogetherAI: {e}")

# --- 6. Save Final Results ---
# The script concludes by saving the processed data.
# base_path = './' # Define your output path
# df_with_outliers.to_csv(base_path + 'studies_with_clusters_and_outliers.csv', index=False)
# outliers.to_csv(base_path + 'unique_outlier_studies.csv', index=False)
# print("\nAnalysis complete. Results saved to CSV files.")


Setting up sample data...
Sample DataFrame created with 100 entries.

Creating TF-IDF vectors from intervention text...
Using 5 clusters for analysis.

Performing clustering and identifying outliers...

Outlier Analysis Complete:
True outliers (far from all clusters): 0 studies
Borderline cases (between clusters): 100 studies
Ambiguous cases (could fit multiple clusters): 84 studies
EXAMPLES OF STUDIES THAT DON'T FIT WELL

🔴 TRUE OUTLIERS (most unique studies):

🟡 BORDERLINE CASES (between clusters):
  • Study 11...
    Max cluster probability: 26.88%
    Could belong to clusters: 1 or others

  • Study 24...
    Max cluster probability: 26.88%
    Could belong to clusters: 1 or others

  • Study 37...
    Max cluster probability: 26.88%
    Could belong to clusters: 1 or others

  • Study 50...
    Max cluster probability: 26.88%
    Could belong to clusters: 1 or others

  • Study 63...
    Max cluster probability: 26.88%
    Could belong to clusters: 1 or others


OUTLIER SUMMARY
  

In [None]:
!pip install together

import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from together import Together
import os

# --- 1. Setup and Sample Data Generation ---
# In a real scenario, you would load your data here.
# For this example, we create a synthetic 100-row DataFrame.
print("Setting up sample data...")
data = {
    'title': [f'Study {i+1}' for i in range(100)],
    'intervention_type': [
        'Cognitive Behavioral Therapy', 'Mindfulness Training', 'Pharmacological Treatment',
        'Dietary Intervention', 'Exercise Program', 'Surgical Procedure', 'Placebo Control',
        'Virtual Reality Exposure', 'Music Therapy', 'Art Therapy'
    ] * 10,
    # 13 template sentences repeated 8 times give 104 strings. The list itself is
    # cut to 100 items: pd.DataFrame rejects a dict whose columns differ in length,
    # so slicing the finished DataFrame afterwards would be too late.
    'intervention_full_text': ([
        "Participants received weekly sessions of cognitive behavioral therapy focusing on negative thought patterns.",
        "A daily mindfulness meditation program was implemented for all participants.",
        "The experimental group was administered a new SSRI medication daily.",
        "Subjects were placed on a strict ketogenic diet for the duration of the study.",
        "A high-intensity interval training (HIIT) regimen was followed three times a week.",
        "Patients underwent a minimally invasive laparoscopic surgery.",
        "The control group received a sugar pill with no active ingredients.",
        "Exposure therapy was conducted using a custom-built virtual reality environment.",
        "Classical music sessions were held to measure effects on anxiety.",
        "Weekly art therapy allowed participants to express emotions through painting.",
        # Add some unique text to create outliers
        "A novel intervention using quantum entanglement biofeedback was tested on a small cohort.",
        "This study explored the effects of zero-gravity on cellular regeneration.",
        "Participants consumed only fermented cabbage to study gut microbiome changes.",
    ] * (100 // 13 + 1))[:100]
}
df_clustered = pd.DataFrame(data)
print(f"Sample DataFrame created with {len(df_clustered)} entries.")

# --- 2. TF-IDF Vectorization and Parameter Setup ---
# This section creates the numerical vectors from text and sets the number of clusters.
print("\nCreating TF-IDF vectors from intervention text...")
vectorizer = TfidfVectorizer(max_features=100, stop_words='english', min_df=2)
texts = df_clustered['intervention_full_text'].tolist()
intervention_vectors = vectorizer.fit_transform(texts)

# Set optimal_k to a default value if not defined
if 'optimal_k' not in locals():
    optimal_k = 5  # Default number of clusters for this example
print(f"Using {optimal_k} clusters for analysis.")

# --- 3. Outlier Identification Function ---
# This is the core function for performing clustering and identifying outliers.
def _clip_text(value, limit):
    """Return at most `limit` characters of `value`, or "N/A" when it is missing."""
    if isinstance(value, str):
        return value[:limit]
    if value is None or (isinstance(value, float) and np.isnan(value)):
        return "N/A"
    return str(value)[:limit]


def identify_cluster_outliers(df_valid, intervention_vectors, n_clusters):
    """
    Identify studies that don't fit well into any cluster using K-Means.

    Parameters
    ----------
    df_valid : pandas.DataFrame
        One row per study, aligned row-for-row with ``intervention_vectors``.
        The frame is modified IN PLACE: the columns ``cluster``,
        ``distance_to_center``, ``max_probability`` and ``second_best_prob``
        are added (or overwritten). ``title`` and ``intervention_type`` are
        used for the text report when present; missing columns or missing
        values are rendered as "N/A".
    intervention_vectors : array-like or sparse matrix
        Feature matrix passed to ``KMeans.fit_predict`` / ``KMeans.transform``.
    n_clusters : int
        Number of K-Means clusters.

    Returns
    -------
    tuple
        ``(df_valid, outlier_summary, true_outliers, full_report)`` where
        ``df_valid`` is the same (mutated) input object, ``outlier_summary`` is
        a 3-row DataFrame with count/percentage per outlier type,
        ``true_outliers`` is a copy of the rows whose distance to the nearest
        centre exceeds the 95th percentile, and ``full_report`` is the printed
        plain-text report.
    """
    print("\nPerforming clustering and identifying outliers...")

    # Perform clustering with optimal k (fixed seed for reproducible labels)
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init=10)
    df_valid['cluster'] = kmeans.fit_predict(intervention_vectors)

    # Distance of every study to every cluster centre; keep the nearest one
    distances = kmeans.transform(intervention_vectors)
    nearest_distance = distances.min(axis=1, keepdims=True)
    df_valid['distance_to_center'] = nearest_distance.ravel()

    # Soft assignment: softmax over negative distances. Subtracting the row
    # minimum first is mathematically a no-op but keeps at least one exponent
    # at exp(0) = 1, so large distances cannot underflow to 0/0 = NaN.
    weights = np.exp(-(distances - nearest_distance))
    probabilities = weights / weights.sum(axis=1, keepdims=True)
    df_valid['max_probability'] = probabilities.max(axis=1)
    if probabilities.shape[1] > 1:
        df_valid['second_best_prob'] = np.sort(probabilities, axis=1)[:, -2]
    else:
        # With a single cluster there is no runner-up to compare against.
        df_valid['second_best_prob'] = 0.0

    # Identify different types of outliers

    # 1. Studies far from all clusters (true outliers): top 5% by distance
    distance_threshold = df_valid['distance_to_center'].quantile(0.95)
    true_outliers = df_valid[df_valid['distance_to_center'] > distance_threshold].copy()

    # 2. Studies between clusters (low max probability)
    prob_threshold = 0.5
    borderline_studies = df_valid[df_valid['max_probability'] < prob_threshold].copy()

    # 3. Studies that could belong to multiple clusters (best and second-best
    #    probabilities are less than 0.2 apart)
    ambiguous_studies = df_valid[
        (df_valid['max_probability'] - df_valid['second_best_prob']) < 0.2
    ].copy()

    print(f"\nOutlier Analysis Complete:")
    print(f"True outliers (far from all clusters): {len(true_outliers)} studies")
    print(f"Borderline cases (between clusters): {len(borderline_studies)} studies")
    print(f"Ambiguous cases (could fit multiple clusters): {len(ambiguous_studies)} studies")

    # Collect the report lines so the same text can be printed and returned
    report_buffer = []

    # Show examples of outliers and add to buffer
    report_buffer.append("="*60)
    report_buffer.append("EXAMPLES OF STUDIES THAT DON'T FIT WELL")
    report_buffer.append("="*60)

    report_buffer.append("\n🔴 TRUE OUTLIERS (most unique studies):")
    for _, row in true_outliers.nlargest(5, 'distance_to_center').iterrows():
        report_buffer.extend([
            f"  • {_clip_text(row.get('title'), 80)}...",
            f"    Distance from nearest cluster: {row['distance_to_center']:.3f}",
            f"    Intervention: {_clip_text(row.get('intervention_type'), 100)}...",
            "",
        ])

    report_buffer.append("\n🟡 BORDERLINE CASES (between clusters):")
    for _, row in borderline_studies.nsmallest(5, 'max_probability').iterrows():
        report_buffer.extend([
            f"  • {_clip_text(row.get('title'), 80)}...",
            f"    Max cluster probability: {row['max_probability']:.2%}",
            f"    Could belong to clusters: {row['cluster']} or others",
            "",
        ])

    # Create outlier summary DataFrame
    outlier_summary = pd.DataFrame({
        'outlier_type': ['True Outliers', 'Borderline', 'Ambiguous'],
        'count': [len(true_outliers), len(borderline_studies), len(ambiguous_studies)],
        'percentage': [
            len(true_outliers) / len(df_valid) * 100,
            len(borderline_studies) / len(df_valid) * 100,
            len(ambiguous_studies) / len(df_valid) * 100
        ]
    })

    # Add summary to the report buffer
    report_buffer.append("\n" + "="*60)
    report_buffer.append("OUTLIER SUMMARY")
    report_buffer.append("="*60)
    report_buffer.append(outlier_summary.to_string())

    # Print the full report to the console
    full_report = "\n".join(report_buffer)
    print(full_report)

    return df_valid, outlier_summary, true_outliers, full_report

# --- 4. Run Analysis and Prepare for AI ---
# Cluster the sample studies; `report_for_ai` holds the plain-text report that
# is embedded in the LLM prompt below.
(df_with_outliers, outlier_summary, outliers, report_for_ai) = identify_cluster_outliers(
    df_valid=df_clustered,
    intervention_vectors=intervention_vectors,
    n_clusters=optimal_k,
)

# --- 5. TogetherAI Integration ---
# Ask the Kimi model to interpret the report. The request is only attempted
# when an API key is present in the environment.
print(f"\n{'=' * 60}")
print("SENDING REPORT TO TogetherAI FOR INSIGHTS...")
print("=" * 60)

# IMPORTANT: the key is read from the TOGETHER_API_KEY environment variable,
# e.g.  export TOGETHER_API_KEY='your_api_key_here'
if os.getenv("TOGETHER_API_KEY"):
    try:
        client = Together()

        # Instruction text followed by the report generated in step 4
        prompt = f"""
        You are a data science research assistant. Below is a report on cluster outlier analysis
        for a set of scientific studies based on their intervention descriptions.

        Your task is to:
        1. Briefly summarize the findings in simple terms.
        2. Explain what 'True Outliers' and 'Borderline Cases' mean in this context.
        3. Suggest a potential next step for a human researcher based on these findings.

        Here is the report:
        {report_for_ai}
        """

        print("Querying model: moonshotai/Kimi-K2-Instruct...")
        response = client.chat.completions.create(
            model="moonshotai/Kimi-K2-Instruct",
            messages=[{"role": "user", "content": prompt}],
        )

        # Only the first returned choice is used
        ai_insights = response.choices[0].message.content

        print("\n🤖 AI-Generated Insights:", "-" * 25, ai_insights, sep="\n")

    except Exception as err:
        # Best effort: report the failure and let the rest of the notebook run
        print(f"\nAn error occurred while contacting TogetherAI: {err}")
else:
    print(
        "ERROR: TOGETHER_API_KEY environment variable not set.",
        "Please set your API key to use the TogetherAI service.",
        sep="\n",
    )

# --- 6. Save Final Results ---
# The script concludes by saving the processed data.
# base_path = './' # Define your output path
# df_with_outliers.to_csv(base_path + 'studies_with_clusters_and_outliers.csv', index=False)
# outliers.to_csv(base_path + 'unique_outlier_studies.csv', index=False)
# print("\nAnalysis complete. Results saved to CSV files.")


Collecting together
  Downloading together-1.5.25-py3-none-any.whl.metadata (16 kB)
Collecting eval-type-backport<0.3.0,>=0.1.3 (from together)
  Downloading eval_type_backport-0.2.2-py3-none-any.whl.metadata (2.2 kB)
Collecting typer<0.16,>=0.9 (from together)
  Downloading typer-0.15.4-py3-none-any.whl.metadata (15 kB)
Collecting click<9.0.0,>=8.1.7 (from together)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading together-1.5.25-py3-none-any.whl (103 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m103.3/103.3 kB[0m [31m1.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading eval_type_backport-0.2.2-py3-none-any.whl (5.8 kB)
Downloading typer-0.15.4-py3-none-any.whl (45 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m45.3/45.3 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading click-8.1.8-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m4.6 MB/s[0m eta [

Setting up sample data...


ValueError: All arrays must be of the same length