In [1]:
import numpy as np
import pandas as pd
from scipy import stats

In [2]:
df = pd.read_csv('/kaggle/input/sa2-datasets/reduced_dengue_by_top10region.csv')

In [3]:
# Extract data for two regions to compare
ncr = df[df['Region'] == 'NCR']['Cases'].values
region3 = df[df['Region'] == 'III']['Cases'].values

In [4]:
# Observed difference in means
obs_diff = np.mean(ncr) - np.mean(region3)
print(f"Observed difference in mean cases (NCR - III): {obs_diff:.1f}\n")

Observed difference in mean cases (NCR - III): -873.0



In [5]:
# Permutation test
combined = np.concatenate([ncr, region3])
n_permutations = 10000
perm_diffs = np.zeros(n_permutations)

for i in range(n_permutations):
    np.random.shuffle(combined)
    perm_ncr = combined[:len(ncr)]
    perm_region3 = combined[len(ncr):]
    perm_diffs[i] = np.mean(perm_ncr) - np.mean(perm_region3)

In [6]:
# Calculate p-value
p_value = (np.sum(np.abs(perm_diffs) >= np.abs(obs_diff)) + 1) / (n_permutations + 1)
print(f"Permutation p-value: {p_value:.3f}")

Permutation p-value: 0.871


In [7]:
# Calculate observed death rates
ncr_deaths = df[df['Region'] == 'NCR']['Deaths'].sum()
ncr_cases = df[df['Region'] == 'NCR']['Cases'].sum()
iva_deaths = df[df['Region'] == 'IVA']['Deaths'].sum()
iva_cases = df[df['Region'] == 'IVA']['Cases'].sum()

ncr_rate = ncr_deaths / ncr_cases
iva_rate = iva_deaths / iva_cases
obs_diff = ncr_rate - iva_rate
print(f"\nObserved death rates:\n- NCR: {ncr_rate:.4f}\n- IVA: {iva_rate:.4f}")
print(f"Observed difference (NCR - IVA): {obs_diff:.4f}\n")


Observed death rates:
- NCR: 0.0042
- IVA: 0.0033
Observed difference (NCR - IVA): 0.0008



In [8]:
# Bootstrap function
def bootstrap_rate_diff(data, region1, region2, n_boot=10000):
    rate_diffs = []
    for _ in range(n_boot):
        sampled_years = np.random.choice(data['Year'].unique(), size=len(data['Year'].unique()), replace=True)
        boot_data = data[data['Year'].isin(sampled_years)]
        rate1 = boot_data[boot_data['Region'] == region1]['Deaths'].sum() / boot_data[boot_data['Region'] == region1]['Cases'].sum()
        rate2 = boot_data[boot_data['Region'] == region2]['Deaths'].sum() / boot_data[boot_data['Region'] == region2]['Cases'].sum()
        rate_diffs.append(rate1 - rate2)
    return np.array(rate_diffs)

boot_diffs = bootstrap_rate_diff(df, 'NCR', 'IVA')

# 95% CI and significance
ci_low, ci_high = np.percentile(boot_diffs, [2.5, 97.5])
print(f"Bootstrap 95% CI for rate difference: [{ci_low:.4f}, {ci_high:.4f}]")

Bootstrap 95% CI for rate difference: [0.0003, 0.0014]
