In [225]:

MARKET_SHARE = 0.05 # from 
AVERAGE_HOUSEHOLD_SIZE = 2.5 # US average according to ...
# recommended default baseline (actuarial starting point). Tune / replace with real data if available.
BASE_RATE = 1 / 18  # US average annual claim probability per property (sensible starting point)
NUM_TRIALS = 10000  # Monte Carlo draws per county/state

In [226]:
# imports 
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from scipy.stats import gamma
from scipy.stats import gamma
from scipy.integrate import quad
from numpy.linalg import cond

In [227]:
# ----------------------------
# Cell A – Data Loading + Preprocessing
# ----------------------------

# Relevant property information
FLORIDA = {
    "name": "Florida",
    "average_property_cost": 325000, 
}
WASHINGTON = {
    "name": "Washington",
    "average_property_cost": 610000, 
}

# Load FEMA CSVs
FLORIDA['df'] = pd.read_csv('fl_county.csv')
WASHINGTON['df'] = pd.read_csv('wa_county.csv')

# ----------------------------
# CLEAN + PREPARE FEMA DATA
# ----------------------------
def clean_fema_df(state_dict):
    df = state_dict["df"].copy()

    # Keep only the necessary columns
    needed = ["COUNTY", "POPULATION", "BUILDVALUE", "EAL_VALT", "RISK_VALUE", "RISK_SCORE"]
    df = df[needed]

    # Rename for clarity
    df = df.rename(columns={
        "BUILDVALUE": "exposure",  # total building value
        "EAL_VALT": "EAL",         # total expected annual loss
        "POPULATION": "population"
    })

    # Convert numeric
    df["exposure"] = pd.to_numeric(df["exposure"], errors="coerce")
    df["EAL"] = pd.to_numeric(df["EAL"], errors="coerce")
    df["population"] = pd.to_numeric(df["population"], errors="coerce")
    df["RISK_VALUE"] = pd.to_numeric(df["RISK_VALUE"], errors="coerce")
    df["RISK_SCORE"] = pd.to_numeric(df["RISK_SCORE"], errors="coerce")

    # Drop rows with missing exposure or EAL
    df = df.dropna(subset=["exposure", "EAL"])

    state_dict["df_clean"] = df
    return state_dict

FLORIDA = clean_fema_df(FLORIDA)
WASHINGTON = clean_fema_df(WASHINGTON)

# ----------------------------
# Preprocessing for GLM / Monte Carlo
# ----------------------------

def preprocess_location_df(location):
    """
    Produces county-level variables for frequency × severity modeling:
    - insured_properties: population / average household size
    - lambda_base: expected claims per county based on empirical claim rate (~1/18)
    - severity_mean: EAL per expected claim
    - risk_scaled: normalized FEMA RISK_VALUE
    """
    df = location["df_clean"].copy()

    # Estimate # of properties
    df["properties"] = df["population"] / AVERAGE_HOUSEHOLD_SIZE

    # Assume ~1/18 of properties claim per year
    base_claim_rate = 1/18
    df["insured_properties"] = df["properties"] * base_claim_rate

    # baseline lambda = insured properties * empirical base rate
    df["lambda_base"] = df["insured_properties"] * BASE_RATE


    # severity per claim
    df["severity_mean"] = df["EAL"] / df["lambda_base"]

    # normalized risk factor
    df["risk_scaled"] = df["RISK_VALUE"] / df["RISK_VALUE"].max()

    location["df_glm"] = df
    return df

FLORIDA["df_glm"] = preprocess_location_df(FLORIDA)
WASHINGTON["df_glm"] = preprocess_location_df(WASHINGTON)

# Quick sanity check
print("Sample counties for Florida:")
print(FLORIDA["df_glm"][["COUNTY", "insured_properties", "lambda_base", "severity_mean"]].head())


Sample counties for Florida:
     COUNTY  insured_properties  lambda_base  severity_mean
0   Alachua         6177.422222   343.190123  196545.729472
1     Baker          618.933333    34.385185  137107.148543
2       Bay         3885.977778   215.887654  712090.788201
3  Bradford          627.400000    34.855556  239810.046710
4   Brevard        13468.155556   748.230864  553818.713457


In [228]:
#Cell B

def generate_county_level_parameters(location, perturb_std_lambda=0.05, perturb_std_severity=0.1, random_state=None):
    """
    Produce county-level stochastic frequency (lambda) and severity.
    
    Args:
        location: dict containing df_clean / df_glm
        perturb_std_lambda: relative std dev for lambda perturbation (5% default)
        perturb_std_severity: relative std dev for severity perturbation (10% default)
        random_state: seed for reproducibility
    """
    rng = np.random.default_rng(random_state)
    df = location["df_glm"].copy()
    
    # --- Perturb lambda around base ---
    df["lambda"] = df["lambda_base"] * (1 + rng.normal(0, perturb_std_lambda, size=len(df)))
    df["lambda"] = df["lambda"].clip(lower=0.01)  # prevent negative lambda

    # --- Perturb severity around mean ---
    df["severity_mean"] = df["severity_mean"] * (1 + rng.normal(0, perturb_std_severity, size=len(df)))
    df["severity_mean"] = df["severity_mean"].clip(lower=1000)  # prevent absurdly low severity

    
    location["df_sim"] = df
    return location

# Apply to both states
FLORIDA = generate_county_level_parameters(FLORIDA, random_state=42)
WASHINGTON = generate_county_level_parameters(WASHINGTON, random_state=42)

# Quick check
FLORIDA["df_sim"][["COUNTY", "insured_properties", "lambda", "severity_mean"]].head()


Unnamed: 0,COUNTY,insured_properties,lambda,severity_mean
0,Alachua,6177.422222,348.418918,187458.402435
1,Baker,618.933333,32.597183,148870.611203
2,Bay,3885.977778,223.988312,698468.183451
3,Bradford,627.4,36.494751,209217.807032
4,Brevard,13468.155556,675.239627,491055.146773


In [229]:
# ----------------------------
# Cell D – Monte Carlo simulation of county losses
# ----------------------------

def simulate_claims_mc(location, n_sim=NUM_TRIALS, random_state=None):
    """
    Generate n_sim Monte Carlo draws of total state losses:
      - Poisson(lambda) per county
      - Multiply by severity_mean per claim
    Returns:
      - state_losses: array of total state losses for n_sim draws
      - updates location["df_sim"] with per-county mean loss (optional)
    """
    rng = np.random.default_rng(random_state)
    df = location["df_sim"].copy()
    
    state_losses = np.zeros(n_sim)
    
    county_mean_losses = []
    
    for _, row in df.iterrows():
        n_claims = rng.poisson(lam=row['lambda'], size=n_sim)  # shape (n_sim,)
        county_loss = n_claims * row['severity_mean']          # shape (n_sim,)
        state_losses += county_loss
        
        # store mean county loss (optional, sense check)
        county_mean_losses.append(county_loss.mean())
    
    # Update df_sim with mean county loss for reference
    df['mean_total_loss'] = county_mean_losses
    location["df_sim"] = df
    
    return state_losses

# Run MC simulations using global NUM_TRIALS
FL_state_losses = simulate_claims_mc(FLORIDA, n_sim=NUM_TRIALS, random_state=42)
WA_state_losses = simulate_claims_mc(WASHINGTON, n_sim=NUM_TRIALS, random_state=42)

# Quick sense check
print("Florida total losses (MC):")
print(pd.Series(FL_state_losses).describe())
print("\nWashington total losses (MC):")
print(pd.Series(WA_state_losses).describe())

# Optional: view per-county mean total losses
print("\nFlorida per-county mean total losses (first 5 counties):")
print(FLORIDA["df_sim"][["COUNTY", "mean_total_loss"]].head())

#Diagnostics
print("Number of MC draws:", len(FL_state_losses))  # should be 10000
print("Sample of state losses:", FL_state_losses[:5])
FL_df = FLORIDA['df_sim']
print("Lambda per county (first 5):", FL_df['lambda'].head(5))
print("Severity_mean per county (first 5):", FL_df['severity_mean'].head(5))

# Compute Poisson relative std dev per county
print("Relative Poisson std dev per county (first 5):", np.sqrt(FL_df['lambda'].head(5)) / FL_df['lambda'].head(5))
rng = np.random.default_rng(42)
n_claims = rng.poisson(lam=FL_df['lambda'].iloc[0], size=10_000)
county_losses = n_claims * FL_df['severity_mean'].iloc[0]
print("County 0 losses: mean", county_losses.mean(), "std", county_losses.std())


Florida total losses (MC):
count    1.000000e+04
mean     8.770492e+09
std      5.947307e+07
min      8.559936e+09
25%      8.730771e+09
50%      8.770819e+09
75%      8.809872e+09
max      9.082146e+09
dtype: float64

Washington total losses (MC):
count    1.000000e+04
mean     2.133658e+09
std      2.466344e+07
min      2.035570e+09
25%      2.116966e+09
50%      2.133714e+09
75%      2.150284e+09
max      2.221199e+09
dtype: float64

Florida per-county mean total losses (first 5 counties):
     COUNTY  mean_total_loss
0   Alachua     6.533210e+07
1     Baker     4.851068e+06
2       Bay     1.564930e+08
3  Bradford     7.614712e+06
4   Brevard     3.315520e+08
Number of MC draws: 10000
Sample of state losses: [8.68338076e+09 8.79279320e+09 8.83236533e+09 8.86787140e+09
 8.70840571e+09]
Lambda per county (first 5): 0    348.418918
1     32.597183
2    223.988312
3     36.494751
4    675.239627
Name: lambda, dtype: float64
Severity_mean per county (first 5): 0    187458.402435
1    14

In [230]:
# --- Cell E: Deductible & Coinsurance ---

def apply_deductible_coinsurance_vectorized(county_df, deductible=10000, coinsurance=0.8, n_sim=NUM_TRIALS, random_state=42):
    """
    Apply deductible and coinsurance to county-level loss simulation.
    Inputs:
        county_df: DataFrame with columns ['lambda', 'severity_mean', 'insured_properties']
        deductible: per-claim deductible
        coinsurance: proportion insurer pays above deductible
        n_sim: number of Monte Carlo draws per county
    Returns:
        state_losses: array of total state-level losses for n_sim draws
    """
    rng = np.random.default_rng(random_state)
    state_losses = np.zeros(n_sim)

    for _, row in county_df.iterrows():
        # --- simulate number of claims for this county ---
        n_claims = rng.poisson(lam=row['lambda'], size=n_sim)
        
        # --- simulate individual claim severities ---
        # assume gamma with mean=severity_mean, var=(0.5*severity_mean)^2 for variability
        severity_shape = 4  # can be tuned
        severity_scale = row['severity_mean'] / severity_shape
        claim_severities = rng.gamma(shape=severity_shape, scale=severity_scale, size=(n_sim, n_claims.max()))
        
        # zero out excess columns if fewer claims
        mask = np.arange(n_claims.max()) < n_claims[:, None]
        claim_severities = claim_severities * mask
        
        # --- apply deductible & coinsurance ---
        payout = np.maximum(claim_severities - deductible, 0) * coinsurance
        
        # sum across claims per draw
        county_total = payout.sum(axis=1)
        
        # add to state total
        state_losses += county_total

    return state_losses


# Quick sense check with first 5 counties
FL_sim_losses = apply_deductible_coinsurance_vectorized(FLORIDA['df_sim'].head(5), n_sim=NUM_TRIALS)
WA_sim_losses = apply_deductible_coinsurance_vectorized(WASHINGTON['df_sim'].head(5), n_sim=NUM_TRIALS)

print(f"Florida sample losses (first 5 counties, {NUM_TRIALS} draws):")
print(pd.Series(FL_sim_losses).describe())

print(f"\nWashington sample losses (first 5 counties, {NUM_TRIALS} draws):")
print(pd.Series(WA_sim_losses).describe())



Florida sample losses (first 5 counties, 10000 draws):
count    1.000000e+04
mean     4.425066e+08
std      1.519984e+07
min      3.838450e+08
25%      4.322039e+08
50%      4.425425e+08
75%      4.528414e+08
max      4.991036e+08
dtype: float64

Washington sample losses (first 5 counties, 10000 draws):
count    1.000000e+04
mean     6.521152e+07
std      3.684424e+06
min      5.270021e+07
25%      6.268817e+07
50%      6.513936e+07
75%      6.765580e+07
max      7.950740e+07
dtype: float64


In [231]:
# --- Cell F: Metrics ---
def compute_metrics(state_losses, loading=1.2, n_bootstrap=NUM_TRIALS, random_state=42):
    """
    Compute premium, profit samples, VaR, TVaR, and bootstrap CI.
    Inputs:
        state_losses: array of total state-level losses (after deductible & coinsurance)
        loading: premium multiplier
        n_bootstrap: number of bootstrap resamples for mean profit CI
    Returns:
        metrics: dict with expected loss, premium, VaR, TVaR, profit samples, bootstrap CI
    """
    rng = np.random.default_rng(random_state)
    state_losses = np.array(state_losses)
    
    # expected loss and premium
    expected_loss = state_losses.mean()
    premium = expected_loss * loading
    
    # profit samples
    profit_samples = premium - state_losses
    
    # risk metrics
    VaR_95 = np.percentile(profit_samples, 5)
    TVaR_95 = profit_samples[profit_samples <= VaR_95].mean()
    
    # bootstrap CI for mean profit
    bootstrap_means = np.array([rng.choice(profit_samples, size=len(profit_samples), replace=True).mean() for _ in range(n_bootstrap)])
    ci_lower, ci_upper = np.percentile(bootstrap_means, [2.5, 97.5])
    
    metrics = {
        'expected_loss': expected_loss,
        'premium': premium,
        'VaR_95': VaR_95,
        'TVaR_95': TVaR_95,
        'profit_mean': profit_samples.mean(),
        'profit_std': profit_samples.std(),
        'profit_bootstrap_CI': (ci_lower, ci_upper)
    }
    
    return metrics

# --- Quick sense check ---
FL_metrics = compute_metrics(FL_sim_losses)
WA_metrics = compute_metrics(WA_sim_losses)

print("Florida metrics:")
for k, v in FL_metrics.items():
    print(k, v)

print("\nWashington metrics:")
for k, v in WA_metrics.items():
    print(k, v)


Florida metrics:
expected_loss 442506591.7526041
premium 531007910.1031249
VaR_95 63676677.19751305
TVaR_95 56972608.47487817
profit_mean 88501318.35052085
profit_std 15199081.138163047
profit_bootstrap_CI (np.float64(88201673.60618936), np.float64(88796423.96507366))

Washington metrics:
expected_loss 65211516.89780401
premium 78253820.2773648
VaR_95 6814496.882143637
TVaR_95 5237562.866146011
profit_mean 13042303.37956081
profit_std 3684240.1879088366
profit_bootstrap_CI (np.float64(12970178.307748068), np.float64(13114509.267337078))


In [232]:
# ----------------------------
# Cell G – Diagnostics
# ----------------------------

epsilon = 1e-5  # for finite-difference gradient

# --- Frequency GLM (Poisson) ---
df = FLORIDA['df_glm'].copy()
y_freq = df['insured_properties']
X_freq = sm.add_constant(df[['risk_scaled']])  # example covariate

freq_model = sm.GLM(y_freq, X_freq, family=sm.families.Poisson())
freq_results = freq_model.fit()
print("Frequency GLM summary:")
print(freq_results.summary())

# Finite-difference gradient check
beta = freq_results.params.values
grad_approx = np.zeros_like(beta)
for i in range(len(beta)):
    beta_plus = beta.copy(); beta_plus[i] += epsilon
    beta_minus = beta.copy(); beta_minus[i] -= epsilon
    mu_plus = np.exp(X_freq @ beta_plus)
    mu_minus = np.exp(X_freq @ beta_minus)
    grad_approx[i] = (mu_plus.sum() - mu_minus.sum()) / (2*epsilon)

# Analytical gradient (score function)
mu = freq_results.fittedvalues
grad_analytical = X_freq.values.T @ (y_freq.values - mu)

print("\nFrequency GLM gradients:")
print("Approximate (finite-diff):", grad_approx)
print("Analytical:", grad_analytical)

# Condition number
X_freq_cond = cond(X_freq.values)
print("Frequency GLM design matrix condition number:", X_freq_cond)

# --- Severity GLM (Gamma, log link) ---
y_sev = df['severity_mean']
X_sev = sm.add_constant(df[['risk_scaled']])  # same covariate

sev_model = sm.GLM(y_sev, X_sev, family=sm.families.Gamma(sm.families.links.log()))
sev_results = sev_model.fit()
print("\nSeverity GLM summary:")
print(sev_results.summary())

# Finite-difference gradient check
beta_sev = sev_results.params.values
grad_approx_sev = np.zeros_like(beta_sev)
for i in range(len(beta_sev)):
    beta_plus = beta_sev.copy(); beta_plus[i] += epsilon
    beta_minus = beta_sev.copy(); beta_minus[i] -= epsilon
    mu_plus = np.exp(X_sev @ beta_plus)
    mu_minus = np.exp(X_sev @ beta_minus)
    grad_approx_sev[i] = (mu_plus.sum() - mu_minus.sum()) / (2*epsilon)

# Analytical gradient (simplified)
mu_sev = sev_results.fittedvalues
grad_analytical_sev = X_sev.values.T @ ((y_sev.values - mu_sev) / (mu_sev ** 2))

print("\nSeverity GLM gradients:")
print("Approximate (finite-diff):", grad_approx_sev)
print("Analytical:", grad_analytical_sev)

# Condition number
X_sev_cond = cond(X_sev.values)
print("Severity GLM design matrix condition number:", X_sev_cond)


Frequency GLM summary:
                 Generalized Linear Model Regression Results                  
Dep. Variable:     insured_properties   No. Observations:                   67
Model:                            GLM   Df Residuals:                       65
Model Family:                 Poisson   Df Model:                            1
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:            -1.3547e+05
Date:                Thu, 11 Dec 2025   Deviance:                   2.7028e+05
Time:                        20:02:21   Pearson chi2:                 3.09e+05
No. Iterations:                     6   Pseudo R-squ. (CS):              1.000
Covariance Type:            nonrobust                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
const           8.1538     

