In [3]:
import pandas as pd
from pathlib import Path

# --- paths ---
# Directory that holds both interim CSV inputs read by this cell.
INTERIM = Path(r"C:\Repositories\white-bowblis-nhmc\data\interim")
# MCR-side file; its CHOW count column is expected to be n_chow.
MCR_FP = INTERIM.joinpath("mcr_chow_provider_events_all.csv")
# Ownership-side file; its CHOW count column is num_chows (or similar).
OWN_FP = INTERIM.joinpath("ccn_chow_summary.csv")

# --- helpers ---
def pick_col(cols, options):
    """Return the first entry of ``options`` that is present in ``cols``.

    Candidates are tried in the order given, so earlier names take priority.
    Returns ``None`` when none of the candidates is found.
    """
    return next((name for name in options if name in cols), None)

# --- load ---
# Read the CCN as a string in both files so the identifier is never parsed
# as a number.
ccn_dtype = {"cms_certification_number": "string"}
mcr = pd.read_csv(MCR_FP, dtype=ccn_dtype)
own = pd.read_csv(OWN_FP, dtype=ccn_dtype)

print(f"[MCR] providers total = {len(mcr):,}")
print(f"[OWN] providers total = {len(own):,}")

# identify CHOW count columns (the first matching spelling wins)
count_col_candidates = ["n_chow", "num_chow", "n_chows", "num_chows"]
mcr_cnt_col = pick_col(mcr.columns, count_col_candidates)
own_cnt_col = pick_col(own.columns, count_col_candidates)

# Fail early with a source-specific message when no count column is present.
if mcr_cnt_col is None:
    raise KeyError("Could not find a CHOW count column in MCR file.")
if own_cnt_col is None:
    raise KeyError("Could not find a CHOW count column in Ownership file.")

# keep just what we need: the CCN plus the count, under a source-specific name
mcr_keep = mcr.loc[:, ["cms_certification_number", mcr_cnt_col]].rename(
    columns={mcr_cnt_col: "n_chow_mcr"}
)
own_keep = own.loc[:, ["cms_certification_number", own_cnt_col]].rename(
    columns={own_cnt_col: "n_chow_own"}
)

# --- merge on CCN (inner = only providers present in both universes) ---
# The row count printed below is only a provider count when each CCN occurs
# once per source: an inner join multiplies rows for any repeated key. Report
# repeats instead of letting them inflate every downstream bucket silently.
for _src, _frame in (("MCR", mcr_keep), ("OWN", own_keep)):
    _n_dup = int(_frame["cms_certification_number"].duplicated().sum())
    if _n_dup:
        print(f"[WARN] {_src}: {_n_dup:,} repeated CCN row(s); merged rows will exceed distinct providers")

merged = pd.merge(mcr_keep, own_keep, on="cms_certification_number", how="inner")
print(f"[MERGED] providers overlap = {len(merged):,}")

# ensure ints (coerce if any str slipped in)
# Values that are missing or cannot be parsed become 0 below, which makes them
# indistinguishable from a genuine "no CHOW" count; say so when it happens.
for _col in ("n_chow_mcr", "n_chow_own"):
    _as_num = pd.to_numeric(merged[_col], errors="coerce")
    _n_bad = int(_as_num.isna().sum())
    if _n_bad:
        print(f"[WARN] {_col}: {_n_bad:,} missing/non-numeric value(s) treated as 0")
    merged[_col] = _as_num.fillna(0).astype(int)

# --- buckets ---
# Row masks for the three "agreement" cells of the comparison.
mcr_n = merged["n_chow_mcr"]
own_n = merged["n_chow_own"]
agree_zero = mcr_n.eq(0) & own_n.eq(0)
agree_one = mcr_n.eq(1) & own_n.eq(1)
agree_2p = mcr_n.ge(2) & own_n.ge(2)

both_zero = merged.loc[agree_zero]
both_one = merged.loc[agree_one]
both_2p = merged.loc[agree_2p]

# discrepancies = everything else in the overlap
discrep = merged.loc[~(agree_zero | agree_one | agree_2p)]

# --- report ---
print("\n=== Bucket Counts ===")
print(f"Total MCR providers           : {len(mcr):,}")
print(f"Overlap with ownership        : {len(merged):,}")
print(f"Both zero (0 vs 0)            : {len(both_zero):,}")
print(f"Both one  (1 vs 1)            : {len(both_one):,}")
print(f"Both 2+   (≥2 vs ≥2)          : {len(both_2p):,}")
print(f"Discrepancies (everything else): {len(discrep):,}")

# optional quick peek at discrepancies (at most 10 rows, fixed seed)
print("\n=== Discrepancy examples ===")
if len(discrep):
    print(discrep.sample(min(10, len(discrep)), random_state=1))
else:
    print("None")

[MCR] providers total = 15,124
[OWN] providers total = 13,419
[MERGED] providers overlap = 13,348

=== Bucket Counts ===
Total MCR providers           : 15,124
Overlap with ownership        : 13,348
Both zero (0 vs 0)            : 5,945
Both one  (1 vs 1)            : 1,659
Both 2+   (≥2 vs ≥2)          : 546
Discrepancies (everything else): 5,198

=== Discrepancy examples ===
      cms_certification_number  n_chow_mcr  n_chow_own
3135                    145971           1           3
11597                   505532           0           1
6998                    285279           1           0
9993                    395636           0           4
5189                    225179           0           1
10865                   455497           1           0
12323                   555737           1           2
12438                   555905           0           1
8319                    345557           0           4
8233                    345426           0           4


In [4]:
# Tabulate the size of each agreement bucket, plus the overlap total, as a
# one-column frame indexed by bucket label.
frames_by_label = {
    "Both zero (0 vs 0)": both_zero,
    "Both one (1 vs 1)": both_one,
    "Both 2+ (≥2 vs ≥2)": both_2p,
    "Discrepancies (other)": discrep,
    "Total Overlap": merged,
}
bucket_counts = {label: len(frame) for label, frame in frames_by_label.items()}

bucket_table = pd.Series(bucket_counts, name="Count").to_frame()
print(bucket_table)

                       Count
Both zero (0 vs 0)      5945
Both one (1 vs 1)       1659
Both 2+ (≥2 vs ≥2)       546
Discrepancies (other)   5198
Total Overlap          13348


In [5]:
# Cross-tabulate the two CHOW counts after collapsing each to "0" / "1" / "2+".
# Every label is a string (no int/str mix on the axes), values below zero are
# folded into "0", and the reindex pins a 3x3 layout so a bucket with no rows
# still appears as a zero row/column instead of vanishing from the table.
_bucket_labels = ["0", "1", "2+"]
_label_for = {0: "0", 1: "1", 2: "2+"}
ctab = pd.crosstab(
    merged["n_chow_mcr"].clip(lower=0, upper=2).map(_label_for),
    merged["n_chow_own"].clip(lower=0, upper=2).map(_label_for),
    rownames=["MCR n_chow"], colnames=["Ownership n_chow"]
).reindex(index=_bucket_labels, columns=_bucket_labels, fill_value=0)
print(ctab)

Ownership n_chow     0     1    2+
MCR n_chow                        
0                 5945  1843  1147
1                  642  1659  1337
2+                  50   179   546


In [6]:
import pandas as pd
from pathlib import Path

# ---------- Paths ----------
# Directory that holds the interim inputs; the outputs of this cell are
# written here as well.
INTERIM = Path(r"C:\Repositories\white-bowblis-nhmc\data\interim")
# columns: cms_certification_number, n_chow, ...
MCR_FP = INTERIM.joinpath("mcr_chow_provider_events_all.csv")
# columns: cms_certification_number, (n_chow|num_chow|n_chows|num_chows), ...
OWN_FP = INTERIM.joinpath("ccn_chow_summary.csv")

# ---------- Helpers ----------
def pick_col(cols, options):
    """Pick the highest-priority candidate column that actually exists.

    ``options`` is scanned in order; the first name found in ``cols`` is
    returned, or ``None`` if no candidate is present.
    """
    present = [candidate for candidate in options if candidate in cols]
    return present[0] if present else None

def bucketize_count(n: int) -> str:
    """Collapse a count into one of the labels "0", "1" or "2+".

    Values at or below zero map to "0", a value equal to one maps to "1",
    and anything else maps to "2+".
    """
    if n <= 0:
        return "0"
    return "1" if n == 1 else "2+"

# ---------- Load ----------
# Read the CCN as a string in both files so the identifier is never parsed
# as a number.
ccn_dtype = {"cms_certification_number": "string"}
mcr = pd.read_csv(MCR_FP, dtype=ccn_dtype)
own = pd.read_csv(OWN_FP, dtype=ccn_dtype)

# Locate the CHOW count column in each file (the first matching spelling wins).
count_col_candidates = ["n_chow", "num_chow", "n_chows", "num_chows"]
mcr_cnt_col = pick_col(mcr.columns, count_col_candidates)
own_cnt_col = pick_col(own.columns, count_col_candidates)
if mcr_cnt_col is None:
    raise KeyError("Could not find a CHOW count column in MCR file.")
if own_cnt_col is None:
    raise KeyError("Could not find a CHOW count column in Ownership file.")

# Keep only the CCN and the count, renamed so the two sources stay distinct.
mcr_keep = mcr.loc[:, ["cms_certification_number", mcr_cnt_col]].rename(
    columns={mcr_cnt_col: "n_chow_mcr"}
)
own_keep = own.loc[:, ["cms_certification_number", own_cnt_col]].rename(
    columns={own_cnt_col: "n_chow_own"}
)

# ---------- Merge (inner overlap only) ----------
# Every count derived from `merged` is only a provider count when each CCN
# occurs once per source: an inner join multiplies rows for any repeated key.
# Report repeats instead of letting them inflate the exported tables silently.
for _src, _frame in (("MCR", mcr_keep), ("OWN", own_keep)):
    _n_dup = int(_frame["cms_certification_number"].duplicated().sum())
    if _n_dup:
        print(f"[WARN] {_src}: {_n_dup:,} repeated CCN row(s); merged rows will exceed distinct providers")

merged = pd.merge(mcr_keep, own_keep, on="cms_certification_number", how="inner")

# Coerce both counts to int. Values that are missing or cannot be parsed become
# 0, which makes them indistinguishable from a genuine "no CHOW" count, so the
# number of affected rows is reported when it is non-zero.
for _col in ("n_chow_mcr", "n_chow_own"):
    _as_num = pd.to_numeric(merged[_col], errors="coerce")
    _n_bad = int(_as_num.isna().sum())
    if _n_bad:
        print(f"[WARN] {_col}: {_n_bad:,} missing/non-numeric value(s) treated as 0")
    merged[_col] = _as_num.fillna(0).astype(int)

# ---------- Buckets ----------
# Row masks for the three "agreement" cells; all remaining rows are discrepancies.
mcr_n = merged["n_chow_mcr"]
own_n = merged["n_chow_own"]
agree_zero = mcr_n.eq(0) & own_n.eq(0)
agree_one = mcr_n.eq(1) & own_n.eq(1)
agree_2p = mcr_n.ge(2) & own_n.ge(2)

both_zero = merged.loc[agree_zero]
both_one = merged.loc[agree_one]
both_2p = merged.loc[agree_2p]
discrep = merged.loc[~(agree_zero | agree_one | agree_2p)]

# One (label, size) row per bucket, with the overlap total as the last row.
bucket_rows = [
    ("Both zero (0 vs 0)", len(both_zero)),
    ("Both one (1 vs 1)", len(both_one)),
    ("Both 2+ (≥2 vs ≥2)", len(both_2p)),
    ("Discrepancies (other)", len(discrep)),
    ("Total Overlap", len(merged)),
]
bucket_table = pd.DataFrame(bucket_rows, columns=["Bucket", "Count"])

# ---------- Crosstab (0 / 1 / 2+) ----------
# Work on a copy of the merged frame that carries the collapsed labels.
tmp = merged.assign(
    **{
        "MCR n_chow": merged["n_chow_mcr"].map(bucketize_count),
        "OWN n_chow": merged["n_chow_own"].map(bucketize_count),
    }
)

# Fixed label order on both axes; a bucket with no rows is filled with 0.
bucket_order = ["0", "1", "2+"]
ctab = pd.crosstab(
    index=tmp["MCR n_chow"],
    columns=tmp["OWN n_chow"],
    rownames=["MCR n_chow"],
    colnames=["Ownership n_chow"],
)
ctab = ctab.reindex(index=bucket_order, columns=bucket_order, fill_value=0)

# ---------- Save: Excel ----------
# One workbook with two sheets: the summary is written without its index,
# the crosstab with its row labels.
excel_fp = INTERIM.joinpath("chow_agreement_tables.xlsx")
sheet_specs = (
    ("SummaryBuckets", bucket_table, False),
    ("Crosstab_0_1_2plus", ctab, True),
)
with pd.ExcelWriter(excel_fp, engine="openpyxl") as xw:
    for sheet_name, frame, keep_index in sheet_specs:
        frame.to_excel(xw, index=keep_index, sheet_name=sheet_name)
print(f"[saved] Excel -> {excel_fp}")

# ---------- Save: LaTeX ----------
# Both tables are written with escape=False, so every label reaches the .tex
# file verbatim. Text that LaTeX cannot typeset as-is is therefore converted
# here, on TeX-only copies (the Excel sheets and console preview keep the
# original labels):
#   * "≥" in the bucket labels -> $\geq$ (the raw Unicode character is not
#     set up under default pdfLaTeX);
#   * "_" in the crosstab axis names -> \_ (a bare underscore outside math
#     mode is a LaTeX error).

# a) Summary buckets (pre-format Count with commas for nicer TeX)
bucket_tex = bucket_table.assign(
    Bucket=bucket_table["Bucket"].str.replace("≥", r"$\geq$", regex=False),
    Count=bucket_table["Count"].map("{:,}".format),
).to_latex(
    index=False, escape=False, bold_rows=False, longtable=False, multicolumn=False,
    caption="Agreement buckets between MCR and ownership CHOW counts",
    label="tab:chow_agreement_summary", column_format="lr"
)
(tex_fp := INTERIM / "chow_agreement_summary.tex").write_text(bucket_tex, encoding="utf-8")
print(f"[saved] LaTeX summary -> {tex_fp}")

# b) Crosstab (order preserved, add caption/label)
# Counts are left as plain ints; only the axis names need TeX-safe spelling.
ctab_tex = ctab.rename_axis(
    index=r"MCR n\_chow", columns=r"Ownership n\_chow"
).to_latex(
    index=True, escape=False, bold_rows=False, longtable=False, multicolumn=False,
    caption="Crosstab of MCR vs Ownership CHOW counts (collapsed to 0/1/2+)",
    label="tab:chow_agreement_crosstab", column_format="lrrr"
)
(tex2_fp := INTERIM / "chow_agreement_crosstab.tex").write_text(ctab_tex, encoding="utf-8")
print(f"[saved] LaTeX crosstab -> {tex2_fp}")

# ---------- (Optional) Console preview ----------
print("\n=== Summary Buckets ===")
print(bucket_table.to_string(index=False))
print("\n=== Crosstab (0/1/2+) ===")
print(ctab)

[saved] Excel -> C:\Repositories\white-bowblis-nhmc\data\interim\chow_agreement_tables.xlsx
[saved] LaTeX summary -> C:\Repositories\white-bowblis-nhmc\data\interim\chow_agreement_summary.tex
[saved] LaTeX crosstab -> C:\Repositories\white-bowblis-nhmc\data\interim\chow_agreement_crosstab.tex

=== Summary Buckets ===
               Bucket  Count
   Both zero (0 vs 0)   5945
    Both one (1 vs 1)   1659
   Both 2+ (≥2 vs ≥2)    546
Discrepancies (other)   5198
        Total Overlap  13348

=== Crosstab (0/1/2+) ===
Ownership n_chow     0     1    2+
MCR n_chow                        
0                 5945  1843  1147
1                  642  1659  1337
2+                  50   179   546
