# Employment and Fertility

H0: There is no statistical association between fertility and employment (female labor-force participation or unemployment), across countries and over time.

In [16]:
import numpy as np
import pandas as pd
from scipy.stats import pearsonr
from sqlalchemy import create_engine

# Load panel & add log income
engine = create_engine("sqlite:///analytics_panel.sqlite")
panel = pd.read_sql("SELECT * FROM v_panel_all;", con=engine)
panel["log_gdppc"] = np.log(panel["current_usd"]).replace([np.inf, -np.inf], np.nan)

# Build Employment slice (keep wide window; WB coverage ~1990+)
emp = (
    panel[["iso_code","year","fertility_rate",
            "female_participation","unemployment","log_gdppc","married_percentage"]]
    .dropna(subset=["fertility_rate"])
    .copy()
)

# Plausibility bounds for percentage-like vars
for col in ["female_participation","unemployment"]:
    emp[col] = pd.to_numeric(emp[col], errors="coerce")
    emp = emp[(emp[col].isna()) | emp[col].between(0, 100)]

summary = {
    "rows_in_panel": len(panel),
    "rows_in_emp_slice": len(emp),
    "date_range_panel": (int(panel["year"].min()), int(panel["year"].max())),
    "date_range_emp": (int(emp["year"].min()), int(emp["year"].max())),
    "non_null_fraction_female_participation": round(panel["female_participation"].notna().mean()*100,1),
    "non_null_fraction_unemployment": round(panel["unemployment"].notna().mean()*100,1),
}
summary


{'rows_in_panel': 17927,
 'rows_in_emp_slice': 16928,
 'date_range_panel': (1960, 2024),
 'date_range_emp': (1960, 2023),
 'non_null_fraction_female_participation': 45.8,
 'non_null_fraction_unemployment': 44.5}

In [17]:
# Countries & coverage in Employment slice 

# Total unique countries
n_countries_panel = panel["iso_code"].nunique(dropna=True)
n_countries_emp   = emp["iso_code"].nunique(dropna=True)

print(f"Unique countries in PANEL:      {n_countries_panel}")
print(f"Unique countries in EMP slice:  {n_countries_emp}")

# How many rows per country in the EMP slice (useful to spot thin coverage)
rows_per_country = (
    emp.groupby("iso_code", dropna=True)
       .size()
       .rename("rows")
       .sort_values(ascending=True)
)

# Employment-variable coverage by country (non-null counts)
cov = (
    emp.groupby("iso_code", dropna=True)
       .agg(
           rows=("iso_code", "size"),
           n_female_participation=("female_participation", lambda s: s.notna().sum()),
           n_unemployment=("unemployment", lambda s: s.notna().sum()),
       )
       .assign(
           has_female_participation=lambda d: (d["n_female_participation"] > 0).astype(int),
           has_unemployment=lambda d: (d["n_unemployment"] > 0).astype(int),
       )
       .sort_values(["has_female_participation","has_unemployment","rows"], ascending=[True, True, True])
)

#print("\nCountries with the thinnest EMP coverage (first 15):")
#display(cov.head(15))

# Quick summary of coverage breadth
print("\nCoverage summary:")
print("Countries with ANY female_participation data:", int(cov["has_female_participation"].sum()))
print("Countries with ANY unemployment data:        ", int(cov["has_unemployment"].sum()))
print("Countries with BOTH employment vars:         ",
      int(((cov["has_female_participation"] == 1) & (cov["has_unemployment"] == 1)).sum()))


Unique countries in PANEL:      278
Unique countries in EMP slice:  265

Coverage summary:
Countries with ANY female_participation data: 235
Countries with ANY unemployment data:         235
Countries with BOTH employment vars:          235


In [18]:
# Pooled correlations 
from scipy.stats import pearsonr, spearmanr

def corr_tests(df, x, y="fertility_rate", min_n=10):
    sub = df[[y, x]].dropna()
    n = len(sub)
    if n < min_n:
        return {"var": x, "n": n, "pearson_r": np.nan, "pearson_p": np.nan,
                "spearman_r": np.nan, "spearman_p": np.nan}
    r_p, p_p = pearsonr(sub[y].to_numpy(), sub[x].to_numpy())
    r_s, p_s = spearmanr(sub[y].to_numpy(), sub[x].to_numpy())
    return {"var": x, "n": n, "pearson_r": float(r_p), "pearson_p": float(p_p),
            "spearman_r": float(r_s), "spearman_p": float(p_s)}

pooled_rows = [corr_tests(emp, v) for v in ["female_participation","unemployment"]]
emp_pooled_sig = pd.DataFrame(pooled_rows)[["var","n","pearson_r","pearson_p","spearman_r","spearman_p"]]
emp_pooled_sig


Unnamed: 0,var,n,pearson_r,pearson_p,spearman_r,spearman_p
0,female_participation,7986,0.098179,1.448453e-18,0.088141,3.00347e-15
1,unemployment,7751,-0.121932,4.601028e-27,-0.165152,1.601575e-48


Female participation: effect small

Unemployment: effect small

Interpretation: Both employment metrics are statistically significant but weak correlates of fertility in pooled data: fertility is slightly higher where female participation is higher and slightly lower where unemployment is higher. Given small effect sizes (|r| ≈ 0.10–0.17) and thinner coverage, treat these as low practical significance until we check robustness.

In [19]:
# Within-country (demeaned) 
def demean_by_country(df, cols):
    out = df.copy()
    for c in cols:
        out[c] = out.groupby("iso_code")[c].transform(lambda s: s - s.mean())
    return out

emp_w = demean_by_country(emp, ["fertility_rate","female_participation","unemployment"])
within_rows = [corr_tests(emp_w, v) for v in ["female_participation","unemployment"]]
emp_within_sig = pd.DataFrame(within_rows)[["var","n","pearson_r","pearson_p","spearman_r","spearman_p"]]
emp_within_sig


Unnamed: 0,var,n,pearson_r,pearson_p,spearman_r,spearman_p
0,female_participation,7986,-0.03183,0.004445,-0.011529,0.302937
1,unemployment,7751,-0.044853,7.8e-05,-0.020671,0.068799


Decision: Pearson shows statistical significance but very small effects (|r| ≈ 0.03–0.05); Spearman is not significant for both. After demeaning, employment variables exhibit minimal within-country association with fertility. Under multiple-testing correction, these would be borderline/likely non-robust.
Employment metrics contribute little to explaining year-to-year fertility movement once country fixed differences are removed.

In [20]:
# Lags
emp_l = emp.sort_values(["iso_code","year"]).copy()
for v in ["female_participation","unemployment"]:
    for k in (1,2,3):
        emp_l[f"{v}_lag{k}"] = emp_l.groupby("iso_code")[v].shift(k)

lag_rows = []
for v in ["female_participation","unemployment"]:
    for k in (1,2,3):
        x = f"{v}_lag{k}"
        lag_rows.append(corr_tests(emp_l, x))
emp_lags_sig = (pd.DataFrame(lag_rows)
                [["var","n","pearson_r","pearson_p","spearman_r","spearman_p"]]
                .sort_values(["var"]))
emp_lags_sig


Unnamed: 0,var,n,pearson_r,pearson_p,spearman_r,spearman_p
0,female_participation_lag1,7754,0.106041,7.785512e-21,0.093079,2.161973e-16
1,female_participation_lag2,7520,0.11449,2.2928910000000003e-23,0.098912,8.174775e-18
2,female_participation_lag3,7285,0.122169,1.2529880000000001e-25,0.104077,5.296713e-19
3,unemployment_lag1,7519,-0.129031,2.786391e-29,-0.172909,1.506811e-51
4,unemployment_lag2,7285,-0.135112,5.008147e-31,-0.17934,1.033447e-53
5,unemployment_lag3,7050,-0.138382,1.746051e-31,-0.181167,4.398504e-53


Results (pooled, your outputs):

Female participation: small and positive.

Unemployment: small and negative.

Interpretation: In pooled data, higher female participation in prior years is associated with slightly higher fertility, and higher unemployment with slightly lower fertility. Magnitudes are small (|r|≈0.10–0.18) and likely reflect between-country structure and persistence rather than time-local causal effects. Treat as statistical rather than practically large signal.

In [21]:
# Partial correlations | log(GDPpc) and | marriage 


# Ensure we have the needed columns in `emp`
need_cols = {"iso_code","year","fertility_rate","female_participation","unemployment","log_gdppc","married_percentage"}
missing = sorted(list(need_cols - set(emp.columns)))
if missing:
    # merge from the full panel (assumes `panel` is in memory from E1)
    emp = emp.merge(panel[[c for c in missing if c in panel.columns] + ["iso_code","year"]],
                    on=["iso_code","year"], how="left")
    print("Merged missing columns into `emp`:", missing)

# Helper: partial r(X,Y | Z) with p-value
def partial_corr(df, x, y, z, min_n=10):
    sub = df[[x, y, z]].dropna()
    n = len(sub)
    if n < min_n:
        return {"var": x, "control": z, "n": n, "partial_r": np.nan, "partial_p": np.nan}
    # residualize y ~ z
    b1y, b0y = np.polyfit(sub[z], sub[y], 1); y_res = sub[y] - (b1y*sub[z] + b0y)
    # residualize x ~ z
    b1x, b0x = np.polyfit(sub[z], sub[x], 1); x_res = sub[x] - (b1x*sub[z] + b0x)
    r, p = pearsonr(y_res.to_numpy(), x_res.to_numpy())
    return {"var": x, "control": z, "n": n, "partial_r": float(r), "partial_p": float(p)}

# Partials controlling for log(GDPpc)
rows_log = []
for v in ["female_participation","unemployment"]:
    rows_log.append(partial_corr(emp, x=v, y="fertility_rate", z="log_gdppc"))
emp_partial_loggdp = pd.DataFrame(rows_log)[["var","control","n","partial_r","partial_p"]]

# Partials controlling for marriage (optional but useful)
rows_mar = []
if "married_percentage" in emp.columns:
    for v in ["female_participation","unemployment"]:
        rows_mar.append(partial_corr(emp, x=v, y="fertility_rate", z="married_percentage"))
    emp_partial_marriage = pd.DataFrame(rows_mar)[["var","control","n","partial_r","partial_p"]]
else:
    emp_partial_marriage = pd.DataFrame(columns=["var","control","n","partial_r","partial_p"])

# Show results explicitly
print("\n============= Employment partials | log(GDPpc)=============")
print(emp_partial_loggdp.to_string(index=False))

print("\n================== Employment partials | marriage ==================")
if len(emp_partial_marriage):
    print(emp_partial_marriage.to_string(index=False))
else:
    print("married_percentage not available in `emp`—skipped.")



                 var   control    n  partial_r    partial_p
female_participation log_gdppc 7823   0.094616 5.028473e-17
        unemployment log_gdppc 7602  -0.132480 4.116892e-31

                 var            control    n  partial_r    partial_p
female_participation married_percentage 6286   0.064106 3.640858e-07
        unemployment married_percentage 6101   0.049312 1.165654e-04


Partial correlations: Employment metrics retain statistically significant but small associations after controlling for income (and even after controlling for marriage). Practical effect sizes are limited.

In [22]:
# First differences 
from scipy.stats import pearsonr, spearmanr

def corr_cols(df, xcol, ycol):
    sub = df[[ycol, xcol]].dropna()
    n = len(sub)
    if n < 10:
        return {"var": xcol, "n": n, "pearson_r": np.nan, "pearson_p": np.nan,
                "spearman_r": np.nan, "spearman_p": np.nan}
    r_p, p_p = pearsonr(sub[ycol].to_numpy(), sub[xcol].to_numpy())
    r_s, p_s = spearmanr(sub[ycol].to_numpy(), sub[xcol].to_numpy())
    return {"var": xcol, "n": n, "pearson_r": float(r_p), "pearson_p": float(p_p),
            "spearman_r": float(r_s), "spearman_p": float(p_s)}

emp_d = emp.sort_values(["iso_code","year"]).copy()
emp_d["d_fertility"] = emp_d.groupby("iso_code")["fertility_rate"].diff()
emp_d["d_female_participation"] = emp_d.groupby("iso_code")["female_participation"].diff()
emp_d["d_unemployment"] = emp_d.groupby("iso_code")["unemployment"].diff()

delta_rows = [
    corr_cols(emp_d, "d_female_participation", "d_fertility"),
    corr_cols(emp_d, "d_unemployment", "d_fertility")
]
emp_delta_sig = pd.DataFrame(delta_rows)[["var","n","pearson_r","pearson_p","spearman_r","spearman_p"]]
emp_delta_sig


Unnamed: 0,var,n,pearson_r,pearson_p,spearman_r,spearman_p
0,d_female_participation,7751,0.034226,0.002581292,0.043093,0.0001475928
1,d_unemployment,7516,-0.101957,7.9433639999999995e-19,-0.14339,8.074422e-36


# SUMMARY

H0: There is no statistical association between fertility and employment (female labor-force participation or unemployment).

Results:
female participation: n = 7,751, r = +0.034, p = 2.58×10⁻³; ρ = +0.043, p = 1.48×10⁻⁴: reject H0 (effect tiny, r² ≈ 0.001).
unemployment: n = 7,516, r = −0.102, p = 7.94×10⁻¹⁹; ρ = −0.143, p = 8.07×10⁻³⁶: reject H0 (effect small, r² ≈ 0.010).

Year-to-year changes in employment conditions have very limited association with year-to-year fertility changes. The unemployment signal is statistically clear but small in magnitude; female participation is minuscule. We can reject null hypothesis for statistically clear results, but the effect is negligible. 