In [1]:
import numpy as np
import pandas as pd
from scipy import stats

In [2]:
# Load the dengue table for the top-10 regions from the Kaggle input mount
df = pd.read_csv(
    filepath_or_buffer='/kaggle/input/sa2-datasets/reduced_dengue_by_top10region.csv'
)

In [3]:
# Pull the 'Cases' column for each of the two regions being compared,
# as plain arrays (one entry per matching row of `df`)
ncr = df.loc[df['Region'].eq('NCR'), 'Cases'].values
region3 = df.loc[df['Region'].eq('III'), 'Cases'].values

In [4]:
# Pool both samples into one array (NCR values first, then region 'III');
# the permutation test reassigns group labels within this pool
combined = np.concatenate((ncr, region3))

# Test statistic on the real labels: mean(NCR) - mean(region 'III')
obs_diff = ncr.mean() - region3.mean()

In [5]:
# Permutation test: build the null distribution of the difference in means
# by repeatedly reassigning the pooled observations to two groups of the
# original sizes.
n_permutations = 10000
perm_diffs = np.zeros(n_permutations)

# Seeded generator so Restart & Run All reproduces the same null distribution
# (and therefore the same p-value).
perm_rng = np.random.default_rng(42)
n_ncr = len(ncr)

for i in range(n_permutations):
    # permutation() returns a shuffled *copy*; the pooled array `combined`
    # keeps its original order, so this cell is idempotent and does not
    # silently alter state created by an earlier cell.
    shuffled = perm_rng.permutation(combined)
    perm_ncr = shuffled[:n_ncr]
    perm_region3 = shuffled[n_ncr:]
    perm_diffs[i] = np.mean(perm_ncr) - np.mean(perm_region3)

In [6]:
# Two-sided p-value: share of permuted differences at least as extreme (in
# absolute value) as the observed one. The +1 in numerator and denominator
# counts the observed labelling itself, so the estimate is never exactly 0.
n_extreme = (np.abs(perm_diffs) >= np.abs(obs_diff)).sum()
p_value = (n_extreme + 1) / (n_permutations + 1)

In [7]:
# Bootstrap function
def bootstrap_rate_diff(data, region1, region2, n_boot=10000, seed=None):
    """Bootstrap the difference in death rate (Deaths / Cases) between two regions.

    The resampling unit is the year: each replicate draws as many years as
    there are distinct values in ``data['Year']``, with replacement, and a
    year drawn k times contributes its deaths and cases k times. (Filtering
    with ``isin`` would count a repeated year only once, turning every
    replicate into a plain subset of years rather than a bootstrap sample.)

    Parameters
    ----------
    data : pandas.DataFrame
        Must provide the columns 'Year', 'Region', 'Deaths' and 'Cases'.
    region1, region2 : str
        Values of the 'Region' column to compare.
    n_boot : int, default 10000
        Number of bootstrap replicates.
    seed : int, numpy.random.Generator or None, default None
        Passed to ``numpy.random.default_rng``; give an int for
        reproducible replicates.

    Returns
    -------
    numpy.ndarray
        Array of length ``n_boot`` holding ``rate(region1) - rate(region2)``
        for each replicate. An entry is NaN when a replicate has zero total
        cases for either region.
    """
    rng = np.random.default_rng(seed)
    years = data['Year'].unique()
    n_years = len(years)

    # No years to resample: every rate is 0/0, i.e. undefined.
    if n_years == 0:
        return np.full(n_boot, np.nan)

    def _yearly_totals(region):
        # Deaths and cases per year for one region, aligned to `years`;
        # years in which the region has no rows contribute zero.
        totals = (
            data.loc[data['Region'] == region]
            .groupby('Year')[['Deaths', 'Cases']]
            .sum()
            .reindex(years, fill_value=0)
        )
        return (
            totals['Deaths'].to_numpy(dtype=float),
            totals['Cases'].to_numpy(dtype=float),
        )

    deaths1, cases1 = _yearly_totals(region1)
    deaths2, cases2 = _yearly_totals(region2)

    # One row of year positions per replicate. Indexing the per-year totals
    # with these positions keeps duplicates, so multiplicity is respected.
    picks = rng.integers(0, n_years, size=(n_boot, n_years))

    rate1 = deaths1[picks].sum(axis=1) / cases1[picks].sum(axis=1)
    rate2 = deaths2[picks].sum(axis=1) / cases2[picks].sum(axis=1)
    return rate1 - rate2

# Bootstrap distribution of the NCR-minus-IVA death-rate difference
boot_diffs = bootstrap_rate_diff(data=df, region1='NCR', region2='IVA')

# 95% percentile interval: the 2.5th and 97.5th percentiles of the replicates
ci_low, ci_high = (np.percentile(boot_diffs, q) for q in (2.5, 97.5))