In [3]:
import pandas as pd
import os

# Root of the project checkout. Defaults to the original author's location so
# existing runs are unchanged; set the HOMEWORK4_DIR environment variable to
# point the notebook at a checkout somewhere else.
project_dir = os.environ.get(
    "HOMEWORK4_DIR", "/Users/kathrynmawhinney/Documents/GitHub/Homework4"
).rstrip("/")
ffs_input_dir = f"{project_dir}/data/input/ffs-costs"

# File name for each year. Casing differs between years and 2015 is an Excel
# workbook, so the names are listed explicitly rather than generated.
ffs_files = {
    2007: "Aged07.csv",
    2008: "AGED08.csv",
    2009: "aged09.csv",
    2010: "aged10.csv",
    2011: "aged11.csv",
    2012: "aged12.csv",
    2013: "aged13.csv",
    2014: "aged14.csv",
    2015: "FFS15.xlsx",
}

# Full path for each year (joined with "/" so the default paths are unchanged)
ffs_paths = {
    year: f"{ffs_input_dir}/{file_name}" for year, file_name in ffs_files.items()
}

# Number of leading rows to skip in each year's file before the data starts
skip_rows = {
    2007: 5,
    2008: 4,
    2009: 7,
    2010: 7,
    2011: 2,
    2012: 2,
    2013: 2,
    2014: 2,
    2015: 2
}

# Every year that has a file must also have a skip count, and vice versa.
assert set(ffs_paths) == set(skip_rows), "ffs_paths and skip_rows cover different years"

# Accumulates one cleaned DataFrame per year; concatenated once all years are loaded
ffs_costs_list = []

# Column names assigned to the 2007-2008 files (note: "state" precedes "ssa" here)
cols_07_08 = [
    "state", "ssa", "county_name", "parta_enroll", "parta_reimb", "parta_percap",
    "parta_reimb_unadj", "parta_percap_unadj", "parta_ime", "parta_dsh",
    "parta_gme", "parta_demo", "partb_enroll", "partb_reimb", "partb_percap",
    "partb_demo", "mean_risk"
]

# Process the files for 2007 and 2008
for year in [2007, 2008]:
    # Files are read without a header row; "*" cells are treated as missing
    df = pd.read_csv(
        ffs_paths[year],
        skiprows=skip_rows[year],
        header=None,
        names=cols_07_08,
        na_values="*"
    )
    # Select relevant columns and add a year column
    ffs_costs = df[[
        "ssa", "state", "county_name", "parta_enroll", "parta_reimb",
        "partb_enroll", "partb_reimb", "mean_risk"
    ]].copy()
    ffs_costs["year"] = year

    # Apply the same numeric coercion the later-year loaders in this file use,
    # so every year's frame has matching dtypes when they are concatenated.
    ffs_costs["ssa"] = pd.to_numeric(ffs_costs["ssa"], errors="coerce")

    # Strip thousands separators, then convert; unparseable values become NaN
    for col in ["parta_enroll", "parta_reimb", "partb_enroll", "partb_reimb", "mean_risk"]:
        ffs_costs[col] = ffs_costs[col].astype(str).str.replace(",", "", regex=False)
        ffs_costs[col] = pd.to_numeric(ffs_costs[col], errors="coerce")

    ffs_costs_list.append(ffs_costs)

# %%
# Names given to the first 15 columns of each 2009-2014 file, in file order
cols_09_14 = [
    "ssa", "state", "county_name",
    "parta_enroll", "parta_reimb", "parta_percap",
    "parta_reimb_unadj", "parta_percap_unadj",
    "parta_ime", "parta_dsh", "parta_gme",
    "partb_enroll", "partb_reimb", "partb_percap",
    "mean_risk",
]

for year in range(2009, 2015):
    # Read headerless, skipping the year's leading rows; "*" is read as missing
    df = pd.read_csv(ffs_paths[year], skiprows=skip_rows[year], header=None, na_values="*")

    # Anything past the 15th column is dropped before the names are attached
    df = df.iloc[:, :15]
    df.columns = cols_09_14

    # Keep only the fields carried into the combined table, tagged with the year
    ffs_costs = df.loc[:, [
        "ssa", "state", "county_name",
        "parta_enroll", "parta_reimb",
        "partb_enroll", "partb_reimb",
        "mean_risk",
    ]].copy()
    ffs_costs["year"] = year

    # SSA code as a number; values that cannot be parsed become NaN
    ffs_costs["ssa"] = pd.to_numeric(ffs_costs["ssa"], errors="coerce")

    # Drop comma separators from the amount columns, then parse them as numbers
    for col in ("parta_enroll", "parta_reimb", "partb_enroll", "partb_reimb", "mean_risk"):
        without_commas = ffs_costs[col].astype(str).str.replace(",", "", regex=False)
        ffs_costs[col] = pd.to_numeric(without_commas, errors="coerce")

    ffs_costs_list.append(ffs_costs)

# Names for the 2015 workbook columns (the same 15 names used for 2009-2014)
cols_2015 = [
    "ssa", "state", "county_name",
    "parta_enroll", "parta_reimb", "parta_percap",
    "parta_reimb_unadj", "parta_percap_unadj",
    "parta_ime", "parta_dsh", "parta_gme",
    "partb_enroll", "partb_reimb", "partb_percap",
    "mean_risk",
]

# 2015 is an Excel file: read headerless, skip the leading rows, "*" is missing.
# NOTE(review): unlike the 2009-2014 loader, nothing here truncates to 15
# columns -- confirm the workbook has exactly 15 data columns.
df_2015 = pd.read_excel(
    ffs_paths[2015],
    skiprows=skip_rows[2015],
    header=None,
    names=cols_2015,
    na_values="*",
)

# Keep only the fields carried into the combined table, tagged with the year
ffs_costs_2015 = df_2015.loc[:, [
    "ssa", "state", "county_name",
    "parta_enroll", "parta_reimb",
    "partb_enroll", "partb_reimb",
    "mean_risk",
]].copy()
ffs_costs_2015["year"] = 2015

# SSA code as a number; values that cannot be parsed become NaN
ffs_costs_2015["ssa"] = pd.to_numeric(ffs_costs_2015["ssa"], errors="coerce")

# Drop comma separators from the amount columns, then parse them as numbers
for col in ("parta_enroll", "parta_reimb", "partb_enroll", "partb_reimb", "mean_risk"):
    without_commas = ffs_costs_2015[col].astype(str).str.replace(",", "", regex=False)
    ffs_costs_2015[col] = pd.to_numeric(without_commas, errors="coerce")

ffs_costs_list.append(ffs_costs_2015)

# Stack every year's frame into one table with a fresh 0..n-1 index
ffs_costs_final = pd.concat(ffs_costs_list, ignore_index=True)

# %% [markdown]
# Save the final DataFrame to a pickle file

# %%
# Project root for the output file. Defaults to the original author's location;
# set the HOMEWORK4_DIR environment variable to write under a different checkout.
project_dir = os.environ.get(
    "HOMEWORK4_DIR", "/Users/kathrynmawhinney/Documents/GitHub/Homework4"
).rstrip("/")
output_path = f"{project_dir}/data/output/ffs_costs.pkl"

# Create the output folder if needed, then persist the combined table
os.makedirs(os.path.dirname(output_path), exist_ok=True)
ffs_costs_final.to_pickle(output_path)

# Preview the first rows as the cell's displayed output
ffs_costs_final.head()


Unnamed: 0,ssa,state,county_name,parta_enroll,parta_reimb,partb_enroll,partb_reimb,mean_risk,year
0,1000.0,ALABAMA,AUTAUGA,4579,18053515,4323,16652361,0.91,2007
1,1010.0,ALABAMA,BALDWIN,22130,84948302,21210,76422234,0.944,2007
2,1020.0,ALABAMA,BARBOUR,2859,11565606,2800,9727886,1.016,2007
3,1030.0,ALABAMA,BIBB,2044,10453847,1955,7389636,0.98,2007
4,1040.0,ALABAMA,BLOUNT,4086,19736686,3887,13405269,1.005,2007
