In [1]:
import pandas as pd

# Load files
lite_path = r"C:\Repositories\white-bowblis-nhmc\data\interim\ccn_chow_lite.csv"
mcr_path  = r"C:\Repositories\white-bowblis-nhmc\data\interim\mcr_chow_provider_events_all.csv"

lite = pd.read_csv(lite_path, dtype={"cms_certification_number": "string"})
mcr  = pd.read_csv(mcr_path, dtype={"cms_certification_number": "string"})

# Harmonize columns
lite_comp = lite[["cms_certification_number", "num_chows"]].rename(columns={"num_chows": "num_chows_lite"})
mcr_comp  = mcr[["cms_certification_number", "n_chow"]].rename(columns={"n_chow": "num_chows_mcr"})

# Merge all CCNs
merged = lite_comp.merge(mcr_comp, on="cms_certification_number", how="outer")

# Create binary CHOW flags
merged["is_chow_lite"] = merged["num_chows_lite"].fillna(0) > 0
merged["is_chow_mcr"]  = merged["num_chows_mcr"].fillna(0) > 0

# Build 2x2 cross-tab
crosstab = pd.crosstab(merged["is_chow_lite"], merged["is_chow_mcr"],
                       rownames=["Lite (ours)"], colnames=["MCR"])

print("\n=== Crosstab of CHOW presence ===")
print(crosstab)

# Explicit categories
both_zero = merged[(~merged["is_chow_lite"]) & (~merged["is_chow_mcr"])]
both_yes  = merged[( merged["is_chow_lite"]) & ( merged["is_chow_mcr"])]
only_lite = merged[( merged["is_chow_lite"]) & (~merged["is_chow_mcr"])]
only_mcr  = merged[(~merged["is_chow_lite"]) & ( merged["is_chow_mcr"])]

print("\n=== Counts ===")
print("Both zero:", len(both_zero))
print("Both yes :", len(both_yes))
print("Only lite:", len(only_lite))
print("Only mcr :", len(only_mcr))
print("Total    :", len(merged))

# Some quick info about mismatches
print("\n=== Only in Lite (examples) ===")
print(only_lite.head())

print("\n=== Only in MCR (examples) ===")
print(only_mcr.head())


=== Crosstab of CHOW presence ===
MCR          False  True 
Lite (ours)              
False         7673    801
True          3000   3721

=== Counts ===
Both zero: 7673
Both yes : 3721
Only lite: 3000
Only mcr : 801
Total    : 15195

=== Only in Lite (examples) ===
   cms_certification_number  num_chows_lite  num_chows_mcr  is_chow_lite  \
7                    015019             2.0            0.0          True   
11                   015028             3.0            0.0          True   
23                   015048             1.0            0.0          True   
31                   015067             2.0            0.0          True   
37                   015084             3.0            0.0          True   

    is_chow_mcr  
7         False  
11        False  
23        False  
31        False  
37        False  

=== Only in MCR (examples) ===
    cms_certification_number  num_chows_lite  num_chows_mcr  is_chow_lite  \
24                    015049             0.0            1.

In [2]:
# Identify CHOW flags
lite["is_chow"] = lite["num_chows"] > 0
mcr["is_chow"]  = mcr["n_chow"] > 0

# Merge only the chow_date columns + flags
chow_date_cols_lite = [c for c in lite.columns if c.startswith("chow_date_")]
chow_date_cols_mcr  = [c for c in mcr.columns  if c.startswith("chow_") and c.endswith("_date")]

merged = lite[["cms_certification_number","is_chow"]+chow_date_cols_lite].merge(
    mcr[["cms_certification_number","is_chow"]+chow_date_cols_mcr],
    on="cms_certification_number", suffixes=("_lite","_mcr")
)

# Restrict to CCNs where both have at least 1 CHOW
both = merged[(merged["is_chow_lite"]) & (merged["is_chow_mcr"])].copy()

# Function: check if any date matches across the two sets
def any_date_match(row):
    lite_dates = set(pd.to_datetime(row[chow_date_cols_lite], errors="coerce").dropna().dt.date)
    mcr_dates  = set(pd.to_datetime(row[chow_date_cols_mcr],  errors="coerce").dropna().dt.date)
    return len(lite_dates & mcr_dates) > 0

# Apply
both["has_same_date"] = both.apply(any_date_match, axis=1)

# Frequency table
freq_table = both["has_same_date"].value_counts().rename(index={True:"Same date", False:"Different dates"}).to_frame("count")
freq_table["percent"] = 100 * freq_table["count"] / len(both)

print("\n=== Frequency of same CHOW dates (among CCNs with CHOW in both) ===")
print(freq_table)

# Optional: see examples
print("\n=== Example with same date ===")
print(both[both["has_same_date"]].head(5))

print("\n=== Example with different dates ===")
print(both[~both["has_same_date"]].head(5))


=== Frequency of same CHOW dates (among CCNs with CHOW in both) ===
                 count    percent
has_same_date                    
Same date         2287  61.461973
Different dates   1434  38.538027

=== Example with same date ===
   cms_certification_number  is_chow_lite chow_date_1 chow_date_2 chow_date_3  \
13                   015037          True  2017-12-01  2019-12-16         NaN   
21                   015050          True  2022-02-14  2022-05-01         NaN   
27                   015071          True  2022-10-01         NaN         NaN   
29                   015075          True  2020-01-01  2021-01-01         NaN   
48                   015119          True  2017-12-01  2019-12-16  2023-08-25   

   chow_date_4 chow_date_5 chow_date_6 chow_date_7 chow_date_8 chow_date_9  \
13         NaN         NaN         NaN         NaN         NaN         NaN   
21         NaN         NaN         NaN         NaN         NaN         NaN   
27         NaN         NaN         NaN    

In [1]:
import pandas as pd
import re

# --- paths ---
lite_fp = r"C:\Repositories\white-bowblis-nhmc\data\interim\ccn_chow_lite.csv"
mcr_fp  = r"C:\Repositories\white-bowblis-nhmc\data\interim\mcr_chow_provider_events_all.csv"
hosp_fp = r"C:\Users\Owner\OneDrive\NursingHomeData\provider-info-files\provider_resides_in_hospital_by_ccn.csv"

def std_ccn(df, col="cms_certification_number"):
    df[col] = (df[col].astype(str)
               .str.extract(r"(\d+)", expand=False)
               .str.zfill(6))
    return df

# --------- load ---------
lite = pd.read_csv(lite_fp, dtype={"cms_certification_number": str})
mcr  = pd.read_csv(mcr_fp, dtype={"cms_certification_number": str})
hosp = pd.read_csv(hosp_fp, dtype={"cms_certification_number": str})

# --------- standardize CCNs ---------
for df in (lite, mcr, hosp):
    std_ccn(df)

# --------- hospital filter (drop True) ---------
hosp_col = "provider_resides_in_hospital"
if hosp_col not in hosp.columns:
    # common variants
    alt = [c for c in hosp.columns if re.search(r"inhosp|resides", c, re.I)]
    if not alt:
        raise ValueError("Couldn't find hospital flag column in the hospital file.")
    hosp_col = alt[0]

def to_boolish(s):
    s = s.astype(str).str.strip().str.lower()
    return s.map({"1": True, "y": True, "yes": True, "true": True, "t": True,
                  "0": False, "n": False, "no": False, "false": False, "f": False}).astype("boolean")

hosp["in_hosp"] = to_boolish(hosp[hosp_col])

lite = lite.merge(hosp[["cms_certification_number","in_hosp"]], on="cms_certification_number", how="left")
mcr  = mcr.merge(hosp[["cms_certification_number","in_hosp"]],  on="cms_certification_number", how="left")

lite = lite[lite["in_hosp"] != True].copy()
mcr  = mcr[mcr["in_hosp"]  != True].copy()

# --------- derive num_chows + is_chow for LITE ---------
def derive_lite_counts(df):
    out = df.copy()
    if "num_chows" in out.columns:
        num = pd.to_numeric(out["num_chows"], errors="coerce").fillna(0).astype(int)
    else:
        chow_cols = [c for c in out.columns if re.match(r"chow_date_\d+$", c)]
        if chow_cols:
            num = out[chow_cols].notna().sum(axis=1).astype(int)
        else:
            # fallback: if only a single is_chow flag exists
            num = out.get("is_chow", pd.Series(False, index=out.index)).astype(bool).astype(int)
    out["num_chows_lite"] = num
    out["is_chow_lite"] = (num > 0)
    return out[["cms_certification_number","num_chows_lite","is_chow_lite"]].drop_duplicates("cms_certification_number")

lite_counts = derive_lite_counts(lite)

# --------- derive num_chows + is_chow for MCR (handles multiple schemas) ---------
def derive_mcr_counts(df):
    d = df.copy()

    # Standardize obvious event fields to strings
    for c in d.columns:
        if d[c].dtype == "bool":
            d[c] = d[c].astype("boolean")
        elif d[c].dtype.name != "category":
            try:
                d[c] = d[c].astype(str)
            except Exception:
                pass

    # 1) If already summarized
    if "num_chows" in d.columns and "cms_certification_number" in d.columns:
        g = (d[["cms_certification_number","num_chows"]]
               .dropna()
               .assign(num_chows=lambda x: pd.to_numeric(x["num_chows"], errors="coerce").fillna(0).astype(int))
               .groupby("cms_certification_number", as_index=False)["num_chows"].max())
        g["is_chow_mcr"] = g["num_chows"] > 0
        g = g.rename(columns={"num_chows":"num_chows_mcr"})
        return g

    # 2) Event-level: explicit is_chow flag
    flag_cols = [c for c in d.columns if c.lower() == "is_chow"]
    if flag_cols:
        c = flag_cols[0]
        d[c] = d[c].str.strip().str.lower().map({"1":True,"y":True,"yes":True,"true":True,"t":True,
                                                 "0":False,"n":False,"no":False,"false":False,"f":False}).fillna(False)
        g = (d.groupby("cms_certification_number", as_index=False)[c]
               .sum()
               .rename(columns={c:"num_chows_mcr"}))
        g["num_chows_mcr"] = g["num_chows_mcr"].astype(int)
        g["is_chow_mcr"] = g["num_chows_mcr"] > 0
        return g

    # 3) Event-level: infer from event type/name text
    # Look for a column that carries event type/name
    text_cols_pref = [c for c in d.columns if re.search(r"(event|type|name|desc)", c, re.I)]
    if text_cols_pref:
        # consider a row a CHOW if any text col contains 'change of ownership' or 'chow'
        patt = re.compile(r"\bchange of ownership\b|\bchow\b", re.I)
        chow_mask = pd.Series(False, index=d.index)
        for c in text_cols_pref:
            try:
                chow_mask = chow_mask | d[c].str.contains(patt, na=False)
            except Exception:
                pass
        d["_is_chow_row"] = chow_mask
        g = (d.groupby("cms_certification_number", as_index=False)["_is_chow_row"]
               .sum()
               .rename(columns={"_is_chow_row":"num_chows_mcr"}))
        g["num_chows_mcr"] = g["num_chows_mcr"].astype(int)
        g["is_chow_mcr"] = g["num_chows_mcr"] > 0
        return g

    # 4) Last resort: count unique event dates (likely over-count, but avoids KeyError)
    date_cols = [c for c in d.columns if re.search(r"date", c, re.I)]
    if date_cols:
        c = date_cols[0]
        g = (d.groupby("cms_certification_number")[c]
               .nunique(dropna=True)
               .reset_index(name="num_chows_mcr"))
        g["is_chow_mcr"] = g["num_chows_mcr"] > 0
        return g

    # If nothing matched, return zeros for seen CCNs
    ccns = d["cms_certification_number"].dropna().unique()
    g = pd.DataFrame({"cms_certification_number": ccns, "num_chows_mcr": 0})
    g["is_chow_mcr"] = False
    return g

mcr_counts = derive_mcr_counts(mcr)

# --------- merge and crosstab ---------
merged = (lite_counts
          .merge(mcr_counts, on="cms_certification_number", how="outer")
          .fillna({"num_chows_lite":0, "num_chows_mcr":0})
         )
merged["is_chow_lite"] = merged["is_chow_lite"].fillna(False).astype(bool)
merged["is_chow_mcr"]  = merged["is_chow_mcr"].fillna(False).astype(bool)

xtab = pd.crosstab(merged["is_chow_lite"], merged["is_chow_mcr"],
                   rownames=["Lite (ours)"], colnames=["MCR"])

print("=== Crosstab of CHOW presence (after hospital filter) ===")
print(xtab, "\n")

print("=== Counts ===")
both_zero = ((~merged["is_chow_lite"]) & (~merged["is_chow_mcr"])).sum()
both_yes  = (merged["is_chow_lite"] & merged["is_chow_mcr"]).sum()
only_lite = (merged["is_chow_lite"] & ~merged["is_chow_mcr"]).sum()
only_mcr  = (merged["is_chow_mcr"] & ~merged["is_chow_lite"]).sum()
print("Both zero:", int(both_zero))
print("Both yes :", int(both_yes))
print("Only lite:", int(only_lite))
print("Only mcr :", int(only_mcr))
print("Total    :", len(merged))

# Optional: peek at disagreements
only_lite_df = merged[(merged["is_chow_lite"]) & (~merged["is_chow_mcr"])].head(10)
only_mcr_df  = merged[(~merged["is_chow_lite"]) & (merged["is_chow_mcr"])].head(10)

print("\n=== Only in Lite (examples) ===")
print(only_lite_df[["cms_certification_number","num_chows_lite","num_chows_mcr","is_chow_lite","is_chow_mcr"]])

print("\n=== Only in MCR (examples) ===")
print(only_mcr_df[["cms_certification_number","num_chows_lite","num_chows_mcr","is_chow_lite","is_chow_mcr"]])

=== Crosstab of CHOW presence (after hospital filter) ===
MCR          False  True 
Lite (ours)              
False         7458    722
True          3000   3721 

=== Counts ===
Both zero: 7458
Both yes : 3721
Only lite: 3000
Only mcr : 722
Total    : 14901

=== Only in Lite (examples) ===
   cms_certification_number  num_chows_lite  num_chows_mcr  is_chow_lite  \
6                    015019             2.0            0.0          True   
10                   015028             3.0            0.0          True   
21                   015048             1.0            0.0          True   
29                   015067             2.0            0.0          True   
35                   015084             3.0            0.0          True   
43                   015104             1.0            0.0          True   
45                   015111             2.0            0.0          True   
47                   015113             2.0            0.0          True   
48                   015

  merged["is_chow_lite"] = merged["is_chow_lite"].fillna(False).astype(bool)
  merged["is_chow_mcr"]  = merged["is_chow_mcr"].fillna(False).astype(bool)
