In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# --- Step 1: Load the Two Datasets ---

# Input CSV exports, given as bare file names (no directory component).
# IMPORTANT: these names must match your EXPORTED CSV files exactly.
projects_file, outcomes_file = 'project_list.csv', 'WBG_merged.csv'

# Echo both paths up front so a wrong file name is visible before loading starts.
print(
    f"Attempting to load Projects data from: {projects_file}",
    f"Attempting to load Outcomes data from: {outcomes_file}",
    sep="\n",
)


Attempting to load Projects data from: project_list.csv
Attempting to load Outcomes data from: WBG_merged.csv


In [3]:
# --- V V V --- VERIFY/ADJUST LOADING PARAMETERS BELOW --- V V V ---
# Assuming header is on row 1 (the second row). Verify this!
# Also verify the delimiter (',' or ';' or '\t').


def _load_csv(path, delimiter, header_row, skiprows):
    """Read one CSV export into a DataFrame, logging the parameters used.

    Parameters
    ----------
    path : str
        File to read.
    delimiter : str
        Field separator handed to ``pd.read_csv``.
    header_row : int
        Zero-based index of the row that holds the column names
        (0 = first row, 1 = second row, ...).
    skiprows : int, list-like or None
        Passed through unchanged to ``pd.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The parsed file.

    Raises
    ------
    Exception
        Whatever ``pd.read_csv`` raised. A short diagnostic is printed first and
        the original exception is then re-raised, so the notebook stops at this
        cell with a full traceback. (Calling ``exit()`` here instead would end
        the kernel session and discard every variable in it.)
    """
    print(f"\nLoading {path} with: delimiter='{delimiter}', header={header_row}, skiprows={skiprows}")
    try:
        frame = pd.read_csv(
            path,
            header=header_row,
            delimiter=delimiter,
            skiprows=skiprows,
        )
    except Exception as e:
        print(f"\n--- ERROR LOADING {path} ---")
        print(f"Error message: {e}")
        raise
    print(f"Loaded {path} successfully. Shape: {frame.shape}")
    return frame


# --- Parameters for project_list.csv ---
projects_delimiter = ','   # Verify this
projects_header_row = 1    # Verify this (row 0=first, 1=second, etc.)
projects_skiprows = None
df_projects = _load_csv(projects_file, projects_delimiter, projects_header_row, projects_skiprows)

# --- Parameters for WBG_merged.csv ---
outcomes_delimiter = ','   # Verify this
outcomes_header_row = 1    # Verify this
outcomes_skiprows = None
df_outcomes = _load_csv(outcomes_file, outcomes_delimiter, outcomes_header_row, outcomes_skiprows)



Loading project_list.csv with: delimiter=',', header=1, skiprows=None
Loaded project_list.csv successfully. Shape: (445, 16)

Loading WBG_merged.csv with: delimiter=',', header=1, skiprows=None
Loaded WBG_merged.csv successfully. Shape: (166, 34)


In [4]:
# --- Step 2: Clean and Standardize Column Names & Define Variables ---

def standardize_cols(df):
    """Rewrite ``df``'s column labels as lowercase, underscore-separated names.

    Each label goes through these steps, in order:

    1. cast to ``str``, trim surrounding whitespace, lowercase;
    2. replace every space with an underscore (one underscore per space);
    3. replace each run of ``/``, ``$``, ``.`` (or space) with one underscore;
    4. delete every character that is not ``a-z``, ``0-9`` or ``_``;
    5. trim leading and trailing underscores.

    Underscores inside a name are not collapsed, so ``"Country / Economy"``
    becomes ``"country___economy"``.

    The labels are overwritten on the DataFrame that was passed in, and that
    same object is returned; pass a copy to leave the original untouched.
    """
    labels = df.columns.astype(str).str.strip().str.lower()
    # (pattern, replacement, is_regex) rules; the order is significant because
    # the literal space rule must run before the run-collapsing regex.
    substitutions = (
        (' ', '_', False),
        (r'[ /$.]+', '_', True),
        (r'[^a-z0-9_]+', '', True),
    )
    for pattern, replacement, is_regex in substitutions:
        labels = labels.str.replace(pattern, replacement, regex=is_regex)
    df.columns = labels.str.strip('_')
    return df


In [5]:
# Standardize the column names on copies and rebind both names to the results.
df_projects, df_outcomes = (
    standardize_cols(raw_frame.copy()) for raw_frame in (df_projects, df_outcomes)
)

# Show the standardized names so the assignments in the following cells can be checked against them.
print("\n--- Standardized Column Names ---")
projects_columns = list(df_projects.columns)
outcomes_columns = list(df_outcomes.columns)
print("Projects Dataset Columns (standardized):", projects_columns)
print("Outcomes Dataset Columns (standardized):", outcomes_columns)
print("--- >>>> Please verify the variable assignments below match these lists <<< ---")



--- Standardized Column Names ---
Projects Dataset Columns (standardized): ['project_id', 'region', 'country', 'project_name', 'project_url', 'year', 'month', 'day', 'sum_of_grant_amount__us', 'last_stage_reached', 'financing_type', 'effective_date', 'year_1', 'quarter', 'month_1', 'day_1']
Outcomes Dataset Columns (standardized): ['as_of_date', 'project_id', 'project_name_x', 'wb_region', 'country___economy', 'country___economy_fcs_lending_group', 'country___economy_lending_group', 'practice_group', 'global_practice', 'agreement_type', 'lending_instrument_type', 'sum_of_approval_fy', 'sum_of_final_closing_fy', 'evaluation_type', 'outcome', 'outcome_1', 'quality_at_entry', 'quality_of_supervision', 'bank_performance', 'sum_of_evaluation_fy', 'region', 'country', 'project_name_y', 'project_url', 'year', 'month', 'day', 'last_stage_reached', 'financing_type', 'effective_date', 'year_1', 'quarter', 'month_1', 'day_1']
--- >>>> Please verify the variable assignments below match these list

In [6]:
# --- Variable Assignments (Focus on H3 Relevant Columns) ---

# Merge key: the project identifier has the same standardized name in both datasets.
project_id_col_projects = project_id_col_outcomes = 'project_id'

# Outcome column: 'outcome_1' is the one used for the text ratings.
outcome_col = 'outcome_1'
print(f"\nUsing '{outcome_col}' as the outcome column (for text ratings).")



Using 'outcome_1' as the outcome column (for text ratings).


In [7]:
# --- List of Project Type Candidates for H3 ---
# Each entry is tested against the outcome in the H3 loop. Edit the list to
# add or remove tests; names must EXACTLY match the standardized column lists
# printed above.
project_type_candidates = [
    # Present in the Outcomes dataset only
    'lending_instrument_type',
    'practice_group',
    'global_practice',
    # Present in BOTH standardized column lists; the location check that
    # follows looks in Outcomes first, so that copy is the one analysed.
    'financing_type',
]
print(f"\nPotential Project Type columns for H3 analysis: {project_type_candidates}")



Potential Project Type columns for H3 analysis: ['lending_instrument_type', 'practice_group', 'global_practice', 'financing_type']


In [8]:
# --- Prepare for Merge: Standardize Merge Key ---
merge_key = 'project_id_merged'


def _with_merge_key(df, id_col, dataset_label):
    """Return ``df`` with its project-ID column named ``merge_key``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose ID column should be renamed.
    id_col : str
        Current (standardized) name of the project-ID column.
    dataset_label : str
        Human-readable dataset name used in the error message.

    Returns
    -------
    pandas.DataFrame
        A renamed frame, or ``df`` itself when it already carries ``merge_key``
        (i.e. this cell has been run before), which keeps the cell re-runnable.

    Raises
    ------
    KeyError
        If neither ``id_col`` nor ``merge_key`` is a column of ``df``. Raising
        (rather than calling ``exit()``) keeps the kernel and its state alive.
    """
    if id_col in df.columns:
        return df.rename(columns={id_col: merge_key})
    if merge_key in df.columns:
        return df
    raise KeyError(f"Column '{id_col}' not found in {dataset_label} DF.")


df_projects = _with_merge_key(df_projects, project_id_col_projects, "Projects")
df_outcomes = _with_merge_key(df_outcomes, project_id_col_outcomes, "Outcomes")
print(f"Renamed project ID columns to '{merge_key}'.")


Renamed project ID columns to 'project_id_merged'.


In [9]:
# Define ONLY columns needed for H3 analysis: the outcome plus every project-type candidate
essential_cols_needed = [outcome_col] + project_type_candidates
essential_locations = {}  # column name -> 'outcomes' or 'projects'

# The outcome column is only looked for in the Outcomes dataset.
if outcome_col in df_outcomes.columns:
    essential_locations[outcome_col] = 'outcomes'
# Project-type candidates: Outcomes takes precedence when a column exists in both datasets.
for pt_col in project_type_candidates:
    if pt_col in df_outcomes.columns:
        essential_locations[pt_col] = 'outcomes'
    elif pt_col in df_projects.columns:
        essential_locations[pt_col] = 'projects'

# Verify all essential columns were located
found_cols = list(essential_locations.keys())
missing_essential = [col for col in essential_cols_needed if col not in essential_locations]
if missing_essential:
    print("\nError: Could not locate all essential analysis columns needed for H3:")
    print(f"  Missing: {missing_essential}")
    print("  Check the variable assignments and candidate list match the standardized lists exactly.")
    # Raise instead of exit(): the cell stops with a traceback and the kernel state survives.
    raise KeyError(f"Missing essential H3 columns: {missing_essential}")

print("\nVerified essential analysis columns locations:")
#for col, source in essential_locations.items(): print(f"  '{col}' found in {source} dataset.") # Optional detail

# Columns to select for the merge: merge key first, then the located columns.
# dict.fromkeys de-duplicates while keeping insertion order, so the column order
# of the merged frame is the same on every run (a set would not guarantee that).
essential_cols_projects = list(dict.fromkeys(
    [merge_key] + [col for col, source in essential_locations.items() if source == 'projects']
))
essential_cols_outcomes = list(dict.fromkeys(
    [merge_key] + [col for col, source in essential_locations.items() if source == 'outcomes']
))

print(f"\nColumns intended to be kept from Projects: {essential_cols_projects}")
print(f"Columns intended to be kept frm Outcomes: {essential_cols_outcomes}")


Verified essential analysis columns locations:

Columns intended to be kept from Projects: ['project_id_merged']
Columns intended to be kept frm Outcomes: ['outcome_1', 'global_practice', 'practice_group', 'lending_instrument_type', 'financing_type', 'project_id_merged']


In [10]:
# --- Step 3: Merge the Datasets ---
# Inner join on the merge key, keeping only the columns selected for the H3 analysis.

cols_to_keep_projects = essential_cols_projects
cols_to_keep_outcomes = essential_cols_outcomes
print("\nSelecting columns for merge...")

df_merged = pd.merge(
    df_projects[cols_to_keep_projects],
    df_outcomes[cols_to_keep_outcomes],
    on=merge_key,
    how='inner',
)

print(f"\nMerged DataFrame shape: {df_merged.shape}")
print(f"DEBUG: Columns present in df_merged: {df_merged.columns.tolist()}")

# An inner join can only return more rows than the Outcomes frame when a key
# value occurs more than once on the Projects side; each outcome row is then
# repeated, which would inflate the counts used by the tests further down.
if len(df_merged) > len(df_outcomes):
    print(f"Warning: merge produced {len(df_merged)} rows from {len(df_outcomes)} outcome rows; "
          f"'{merge_key}' is duplicated in the Projects data.")

if df_merged.empty:
    # Raise instead of exit(): the cell stops with a traceback and the kernel state survives.
    raise ValueError("Merged DataFrame is empty.")




Selecting columns for merge...

Merged DataFrame shape: (166, 6)
DEBUG: Columns present in df_merged: ['project_id_merged', 'outcome_1', 'global_practice', 'practice_group', 'lending_instrument_type', 'financing_type']


In [11]:
# --- Step 4: Preprocess Merged Data (Focus on H3 Columns) ---

# Report missing values for the outcome column and every project-type candidate
print("\nMissing values in merged data (before handling):")
key_analysis_cols_h3 = [outcome_col] + project_type_candidates
print(df_merged[key_analysis_cols_h3].isnull().sum())

rows_before_drop = df_merged.shape[0]
# Drop rows where ANY H3 key analysis column is missing.
# NOTE(review): a row missing one candidate is removed from the tests of all
# other candidates too; confirm this listwise deletion is intended.
df_merged = df_merged.dropna(subset=key_analysis_cols_h3)
rows_after_drop = df_merged.shape[0]
print(f"\nDropped {rows_before_drop - rows_after_drop} rows due to missing values in H3 key columns.")
print(f"Merged DataFrame shape after dropping NAs: {df_merged.shape}")
if df_merged.empty:
    # Raise instead of exit(): the cell stops with a traceback and the kernel state survives.
    raise ValueError("DataFrame empty after dropping NAs for H3 columns.")

# Normalise the outcome column to stripped text and list its distinct values,
# so the rating lists used for the binary mapping can be checked against them.
print(f"\nUnique values found in outcome column '{outcome_col}' (expected text):")
df_merged = df_merged.assign(**{outcome_col: df_merged[outcome_col].astype(str).str.strip()})
unique_outcomes = df_merged[outcome_col].unique()
print(unique_outcomes)



Missing values in merged data (before handling):
outcome_1                  0
lending_instrument_type    0
practice_group             0
global_practice            0
financing_type             0
dtype: int64

Dropped 0 rows due to missing values in H3 key columns.
Merged DataFrame shape after dropping NAs: (166, 6)

Unique values found in outcome column 'outcome_1' (expected text):
['Unsatisfactory' 'Satisfactory' 'Moderately Satisfactory'
 'Moderately Unsatisfactory' 'Highly Satisfactory' 'Highly Unsatisfactory']


In [12]:
# *** REVIEW UNIQUE OUTCOMES ABOVE AND ADJUST MAPPING LISTS IF NEEDED ***
# Full rating labels plus their short codes; matching is exact and case-sensitive.
satisfactory_ratings = [
    'Satisfactory', 'Moderately Satisfactory', 'Highly Satisfactory',
    'SA', 'MS', 'HS',
]
unsatisfactory_ratings = [
    'Unsatisfactory', 'Moderately Unsatisfactory', 'Highly Unsatisfactory',
    'U', 'MU', 'HU',
]


def map_text_outcome(outcome_text):
    """Convert a text rating to a binary code.

    Returns 1 when ``outcome_text`` is listed in ``unsatisfactory_ratings``,
    0 when it is listed in ``satisfactory_ratings``, and ``np.nan`` for any
    other value. The unsatisfactory list is consulted first.
    """
    if outcome_text in unsatisfactory_ratings:
        return 1
    if outcome_text in satisfactory_ratings:
        return 0
    return np.nan
# Binarise the text rating: 1 = unsatisfactory, 0 = satisfactory, NaN = not in either list.
df_merged['outcome_binary'] = df_merged[outcome_col].apply(map_text_outcome)

print(f"\nOutcome mapping summary (using text ratings from '{outcome_col}'):")
print(f"  Mapped to 0 (Satisfactory): { (df_merged['outcome_binary'] == 0).sum() }")
print(f"  Mapped to 1 (Unsatisfactory): { (df_merged['outcome_binary'] == 1).sum() }")
unmapped_mask = df_merged['outcome_binary'].isnull()
unmapped_values = df_merged.loc[unmapped_mask, outcome_col].unique()
print(f"  Became NaN (unmapped: {unmapped_values}): { unmapped_mask.sum() }")
if unmapped_mask.any():
    print("  >> If unmapped values exist, update satisfactory/unsatisfactory_ratings lists! <<")

# Discard unmapped rows, then store the code as an integer. Both steps return a
# new frame, so df_merged is rebound rather than modified in place.
df_merged = (
    df_merged
    .dropna(subset=['outcome_binary'])
    .astype({'outcome_binary': int})
)

print(f"\nFinal shape after outcome mapping: {df_merged.shape}")
print("Value counts for binary outcome (1=Unsatisfactory, 0=Satisfactory):")
print(df_merged['outcome_binary'].value_counts(dropna=False))
if df_merged['outcome_binary'].nunique() < 2:
    # A contingency test needs both outcome classes. Raise instead of exit():
    # the cell stops with a traceback and the kernel state survives.
    raise ValueError("Only one outcome category exists after text mapping.")




Outcome mapping summary (using text ratings from 'outcome_1'):
  Mapped to 0 (Satisfactory): 117
  Mapped to 1 (Unsatisfactory): 49
  Became NaN (unmapped: []): 0

Final shape after outcome mapping: (166, 7)
Value counts for binary outcome (1=Unsatisfactory, 0=Satisfactory):
outcome_binary
0    117
1     49
Name: count, dtype: int64


In [13]:
# --- Step 5: Hypothesis Testing (H3 Only) ---
# For each project-type candidate, cross-tabulate it against the binary outcome
# and run a chi-squared test of independence at significance level `alpha`.

print("\n--- Hypothesis Testing (H3 Only) ---")
alpha = 0.05

# --- H3 (Loop): Testing different Project Type columns vs Outcome ---
print("\n--- H3 (Loop): Testing Potential Project Type Columns vs. Outcome ---")

# NOTE(review): every candidate is compared against the same unadjusted alpha;
# no multiple-comparison correction is applied across the candidates.
for candidate_col in project_type_candidates:
    print(f"\n--- H3 Testing: '{candidate_col}' vs. Outcome ---")

    # Guard: the column must exist and take at least two distinct values.
    if candidate_col not in df_merged.columns:
        print(f"Skipping H3 test: Column '{candidate_col}' not found in final merged/cleaned data.")
        continue
    if df_merged[candidate_col].nunique() < 2:
        print(f"Skipping H3 test: Column '{candidate_col}' has less than 2 unique values after cleaning.")
        continue

    # Rows = candidate categories, columns = outcome_binary values (0 and 1).
    contingency_table_h3 = pd.crosstab(df_merged[candidate_col], df_merged['outcome_binary'])

    if contingency_table_h3.shape[0] < 2 or contingency_table_h3.shape[1] < 2:
        print(f"Warning H3 ({candidate_col}): Contingency table too small for Chi-squared test (Shape: {contingency_table_h3.shape}).")
        continue

    # Only the test call sits inside the try, so a ValueError raised while
    # printing or sorting is never reported as a failed chi-squared test.
    try:
        chi2_stat, p_value_h3, dof, expected_freq = stats.chi2_contingency(contingency_table_h3)
    except ValueError as ve:
        print(f"Error H3 ({candidate_col}): Chi-Squared test failed. ValueError: {ve}")
        continue
    print(f"Chi-Squared Statistic: {chi2_stat:.4f}, P-value: {p_value_h3:.4g}, DoF: {dof}")

    # Assumption check: flag any cell whose expected frequency is below 5.
    low_freq_count = (expected_freq < 5).sum()
    perc_low = (low_freq_count / expected_freq.size) * 100
    if low_freq_count > 0:
        print(f"Warning H3 ({candidate_col}): {low_freq_count} cells ({perc_low:.1f}%) have expected frequency < 5. Caution advised.")
    else:
        print(f"Check H3 ({candidate_col}): Expected frequency assumption met.")

    # Interpretation
    if p_value_h3 < alpha:
        print(f"Result H3 ({candidate_col}): Reject null hypothesis (p < {alpha}). Significant association between {candidate_col} and outcome.")
        contingency_table_h3['unsat_rate'] = contingency_table_h3[1] / (contingency_table_h3[0] + contingency_table_h3[1])
        print(f"\nUnsatisfactory rates by {candidate_col}:")
        print(contingency_table_h3.sort_values('unsat_rate', ascending=False))

        # Optional visualization for significant results: stacked bars of the
        # within-category outcome proportions. A plotting failure is reported
        # but does not stop the loop.
        try:
            counts = contingency_table_h3[[0, 1]]
            props = counts.div(counts.sum(axis=1), axis=0)
            fig, ax = plt.subplots(figsize=(10, 6))
            try:
                props.plot(kind='bar', stacked=True, ax=ax)
                ax.set_title(f'Outcome Proportion by {candidate_col}')
                ax.set_xlabel(f'{candidate_col}')
                ax.set_ylabel('Proportion')
                plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
                ax.legend(['Satisfactory (0)', 'Unsatisfactory (1)'], title='Outcome')
                fig.tight_layout()
                plt.show()
            finally:
                # Release the figure so repeated iterations do not accumulate open figures.
                plt.close(fig)
        except Exception as e:
            print(f"\nCould not generate plot for H3 ({candidate_col}): {e}")
    else:
        print(f"Result H3 ({candidate_col}): Fail to reject null hypothesis (p >= {alpha}). No significant association between {candidate_col} and outcome.")
# --- (End of H3 loop) ---



--- Hypothesis Testing (H3 Only) ---

--- H3 (Loop): Testing Potential Project Type Columns vs. Outcome ---

--- H3 Testing: 'lending_instrument_type' vs. Outcome ---
Chi-Squared Statistic: 1.3722, P-value: 0.5035, DoF: 2
Result H3 (lending_instrument_type): Fail to reject null hypothesis (p >= 0.05). No significant association between lending_instrument_type and outcome.

--- H3 Testing: 'practice_group' vs. Outcome ---
Chi-Squared Statistic: 6.5825, P-value: 0.1597, DoF: 4
Result H3 (practice_group): Fail to reject null hypothesis (p >= 0.05). No significant association between practice_group and outcome.

--- H3 Testing: 'global_practice' vs. Outcome ---
Chi-Squared Statistic: 14.6138, P-value: 0.3321, DoF: 13
Result H3 (global_practice): Fail to reject null hypothesis (p >= 0.05). No significant association between global_practice and outcome.

--- H3 Testing: 'financing_type' vs. Outcome ---
Chi-Squared Statistic: 1.0159, P-value: 0.6017, DoF: 2
Check H3 (financing_type): Expecte