# **Predicting High-Cost Medicare Beneficiaries: A Machine Learning Comparison Using CMS Synthetic Claims Data**

# **Part 0: Data Loading**

In [1]:
import numpy as np
import pandas as pd
import zipfile
import os

In [2]:
def load_csv_from_zip(zip_path, date_column=None):
    """
    Load a CSV from a zip file.
    Optionally extract a 'year' column from a date column.
    """
    with zipfile.ZipFile(zip_path) as z:
        file_name = z.namelist()[0]
        with z.open(file_name) as f:
            df = pd.read_csv(f, low_memory=False)
    
    if date_column and date_column in df.columns:
        df['year'] = pd.to_datetime(df[date_column]).dt.year
    
    return df

In [3]:
# Load files for beneficiaries
beneficiary_paths = {
    2008: "data/beneficiary_summary/2008/176586_DE1_0_2008_Beneficiary_Summary_File_Sample_9.zip",
    2009: "data/beneficiary_summary/2009/176582_DE1_0_2009_Beneficiary_Summary_File_Sample_9.zip",
    2010: "data/beneficiary_summary/2010/DE1_0_2010_Beneficiary_Summary_File_Sample_9.zip"
}

beneficiaries_list = []
for year, path in beneficiary_paths.items():
    df = load_csv_from_zip(path)
    df['year'] = int(year)
    beneficiaries_list.append(df)

beneficiaries_all = pd.concat(beneficiaries_list, ignore_index=True)
print("Beneficiary summary loaded:", beneficiaries_all.shape)
beneficiaries_all.to_pickle("data/beneficiaries_raw.pkl")

# Quick check
beneficiaries_all.head(5)

Beneficiary summary loaded: (343467, 33)


Unnamed: 0,DESYNPUF_ID,BENE_BIRTH_DT,BENE_DEATH_DT,BENE_SEX_IDENT_CD,BENE_RACE_CD,BENE_ESRD_IND,SP_STATE_CODE,BENE_COUNTY_CD,BENE_HI_CVRAGE_TOT_MONS,BENE_SMI_CVRAGE_TOT_MONS,...,MEDREIMB_IP,BENRES_IP,PPPYMT_IP,MEDREIMB_OP,BENRES_OP,PPPYMT_OP,MEDREIMB_CAR,BENRES_CAR,PPPYMT_CAR,year
0,000102649ED5601B,19311001,,2,1,0,49,290,12,12,...,0.0,0.0,0.0,360.0,90.0,0.0,860.0,170.0,0.0,2008
1,0002278C944E240A,19360901,,2,1,Y,34,250,12,12,...,0.0,0.0,0.0,2590.0,1530.0,0.0,5000.0,1230.0,0.0,2008
2,000330E625C93700,19330901,,2,1,0,4,420,12,12,...,0.0,0.0,0.0,0.0,0.0,0.0,270.0,150.0,0.0,2008
3,000374D5E110EDA6,19360801,,2,1,0,5,90,12,12,...,0.0,0.0,0.0,60.0,30.0,0.0,1090.0,360.0,0.0,2008
4,0003950E4B4FEC8D,19401101,,2,2,0,19,420,12,12,...,0.0,0.0,0.0,3690.0,420.0,0.0,3590.0,960.0,0.0,2008


In [5]:
# Load files for carriers
carrier_paths = {
    "data/carrier/DE1_0_2008_to_2010_Carrier_Claims_Sample_9A.zip",
    "data/carrier/DE1_0_2008_to_2010_Carrier_Claims_Sample_9B.zip"
}

# Because attempts to read would always encounter "ran out of input" error but it did load 5.1 GB data properly, I'll keep A and B files separate.
# Also used parquet to read instead because pickle is taking really long.
for path in carrier_paths:
    print("Loading:", path)
    df = load_csv_from_zip(path, date_column='CLM_FROM_DT')
    
    # Save raw chunk directly in data/
    out_file = path.split('/')[-1].replace('.zip', '_raw.parquet')
    df.to_parquet(f"data/{out_file}", engine='pyarrow', index=False)
    print(f"Saved {out_file} successfully!")

# Quick check for one chunk
carrier_a = pd.read_parquet("data/DE1_0_2008_to_2010_Carrier_Claims_Sample_9A_raw.parquet")
carrier_a.head()

Loading: data/carrier/DE1_0_2008_to_2010_Carrier_Claims_Sample_9A.zip
Saved DE1_0_2008_to_2010_Carrier_Claims_Sample_9A_raw.parquet successfully!
Loading: data/carrier/DE1_0_2008_to_2010_Carrier_Claims_Sample_9B.zip
Saved DE1_0_2008_to_2010_Carrier_Claims_Sample_9B_raw.parquet successfully!


Unnamed: 0,DESYNPUF_ID,CLM_ID,CLM_FROM_DT,CLM_THRU_DT,ICD9_DGNS_CD_1,ICD9_DGNS_CD_2,ICD9_DGNS_CD_3,ICD9_DGNS_CD_4,ICD9_DGNS_CD_5,ICD9_DGNS_CD_6,...,LINE_ICD9_DGNS_CD_5,LINE_ICD9_DGNS_CD_6,LINE_ICD9_DGNS_CD_7,LINE_ICD9_DGNS_CD_8,LINE_ICD9_DGNS_CD_9,LINE_ICD9_DGNS_CD_10,LINE_ICD9_DGNS_CD_11,LINE_ICD9_DGNS_CD_12,LINE_ICD9_DGNS_CD_13,year
0,000102649ED5601B,684813370207372,20080301,20080301,2720,49121,V5869,,,,...,2724.0,2724.0,2724.0,2724.0,2724.0,2724.0,2724.0,2724.0,2724.0,1970
1,000102649ED5601B,684253371655461,20080407,20080407,1741,V4571,,,,,...,,,,,,,,,,1970
2,000102649ED5601B,684323369524620,20080420,20080420,V285,,,,,,...,,,,,,,,,,1970
3,000102649ED5601B,684163371235030,20080524,20080524,7850,4011,78079,,,,...,,,,,,,,,,1970
4,000102649ED5601B,684913369780400,20080604,20080604,33111,,,,,,...,,,,,,,,,,1970


In [6]:
# Load files for inpatient, outpatient, and prescription drug events

ip_all = load_csv_from_zip("data/inpatient/176536_DE1_0_2008_to_2010_Inpatient_Claims_Sample_9.zip",
                           date_column='CLM_FROM_DT')
print("Inpatient claims loaded:", ip_all.shape)
ip_all.to_pickle("data/ip_raw.pkl")

op_all = load_csv_from_zip("data/outpatient/176624_DE1_0_2008_to_2010_Outpatient_Claims_Sample_9.zip",
                           date_column='CLM_FROM_DT')
print("Outpatient claims loaded:", op_all.shape)
op_all.to_pickle("data/op_raw.pkl")
pde_all = load_csv_from_zip("data/pde/DE1_0_2008_to_2010_Prescription_Drug_Events_Sample_9.zip",
                            date_column='SRVC_DT')
print("PDE claims loaded:", pde_all.shape)
pde_all.to_pickle("data/pde_raw.pkl")

ip_all.head(5)
#op_all.head(5)
#pde_all.head(5)

Inpatient claims loaded: (66763, 82)
Outpatient claims loaded: (790818, 77)
PDE claims loaded: (5552470, 9)


Unnamed: 0,DESYNPUF_ID,CLM_ID,SEGMENT,CLM_FROM_DT,CLM_THRU_DT,PRVDR_NUM,CLM_PMT_AMT,NCH_PRMRY_PYR_CLM_PD_AMT,AT_PHYSN_NPI,OP_PHYSN_NPI,...,HCPCS_CD_37,HCPCS_CD_38,HCPCS_CD_39,HCPCS_CD_40,HCPCS_CD_41,HCPCS_CD_42,HCPCS_CD_43,HCPCS_CD_44,HCPCS_CD_45,year
0,0002278C944E240A,992581161620535,1,20090314.0,20090418.0,3400QT,23000.0,0.0,698797200.0,8972194000.0,...,,,,,,,,,,1970.0
1,0002278C944E240A,992111161633637,1,20090620.0,20090625.0,3400ZQ,46000.0,0.0,4561865000.0,6779930000.0,...,,,,,,,,,,1970.0
2,0002278C944E240A,992231161630505,1,20090620.0,20090725.0,3400BT,27000.0,0.0,6343428000.0,9989246000.0,...,,,,,,,,,,1970.0
3,0002278C944E240A,992261161643487,1,20100501.0,20100503.0,3400VS,3000.0,0.0,8527411000.0,,...,,,,,,,,,,1970.0
4,000374D5E110EDA6,992261161657815,1,20091230.0,20091230.0,0500VS,9000.0,0.0,3909066000.0,,...,,,,,,,,,,1970.0
