In [1]:
import pandas as pd

In [2]:
# Dataset identifier for each discharge year; each identifier is appended to
# the base URL (followed by ".json") to form that year's request URL.
year_to_sparcs_key = {
    2009: "q6hk-esrj",
    2010: "mtfm-rxf4",
    2011: "pyhr-5eas",
    2012: "u4ud-w55t",
    2013: "npsr-cm47",
    2014: "rmwa-zns4",
}

In [3]:
# Common URL prefix for the datasets; a dataset identifier plus ".json" and a
# query string are appended to it when a request URL is built.
base_url = "https://health.data.ny.gov/resource/"

In [None]:
def get_df_across_multiple_files(key_dict, base_url, search_query, row_limit=10000):
    """Download one dataframe per dataset, applying the same query to each.

    Parameters
    ----------
    key_dict : dict
        Maps a label (for example a year) to a dataset identifier.
    base_url : str
        URL prefix; the dataset identifier and ".json" are appended to it.
    search_query : str
        Query-string filter such as "field=value". May be empty or None, in
        which case only the row limit is sent.
    row_limit : int, optional
        Value sent as the "$limit" query parameter (default 10000).
        NOTE(review): if more rows match than the limit, the extra rows are
        presumably not returned -- confirm row counts before relying on them.

    Returns
    -------
    dict
        Maps each label in key_dict to the dataframe read from its URL.
    """
    limit_clause = "$limit=" + str(row_limit)
    # Only join with "&" when there is a filter, so an empty query does not
    # produce a malformed "?&$limit=..." URL.
    query_string = search_query + "&" + limit_clause if search_query else limit_clause

    df_dict = {}  # label -> dataframe
    for key, ds_hash in key_dict.items():
        request_url = base_url + ds_hash + ".json" + "?" + query_string
        print("Extracting %s with '%s'" % (key, request_url))
        df_dict[key] = pd.read_json(request_url)
    return df_dict

In [None]:
# Fetch, for every year in year_to_sparcs_key, the rows whose
# ccs_procedure_code equals 105 (one dataframe per year).
kd_df_dicts = get_df_across_multiple_files(
    year_to_sparcs_key,
    base_url,
    "ccs_procedure_code=105",
)

Extracting 2009 with 'https://health.data.ny.gov/resource/q6hk-esrj.json?ccs_procedure_code=105&$limit=10000'
Extracting 2010 with 'https://health.data.ny.gov/resource/mtfm-rxf4.json?ccs_procedure_code=105&$limit=10000'
Extracting 2011 with 'https://health.data.ny.gov/resource/pyhr-5eas.json?ccs_procedure_code=105&$limit=10000'
Extracting 2012 with 'https://health.data.ny.gov/resource/u4ud-w55t.json?ccs_procedure_code=105&$limit=10000'
Extracting 2013 with 'https://health.data.ny.gov/resource/npsr-cm47.json?ccs_procedure_code=105&$limit=10000'
Extracting 2014 with 'https://health.data.ny.gov/resource/rmwa-zns4.json?ccs_procedure_code=105&$limit=10000'


In [None]:
def create_single_df_from_df_dict(df_dict, key_name):
    """Stack the dataframes in df_dict into one dataframe tagged by key.

    Parameters
    ----------
    df_dict : dict
        Maps a label (for example a year) to a pandas DataFrame.
    key_name : hashable
        Name of the column added to the result; every row holds the key of
        the dataframe it came from. An existing column of that name is
        overwritten in the result.

    Returns
    -------
    pandas.DataFrame
        The rows of every dataframe in df_dict, in dict iteration order, with
        a fresh 0..n-1 index and the extra key_name column.

    Raises
    ------
    ValueError
        If df_dict is empty.
    """
    if not df_dict:
        raise ValueError("df_dict must contain at least one dataframe")

    tagged_frames = []
    for df_key, df in df_dict.items():
        # Work on a copy so the caller's dataframes do not gain a new column.
        tagged = df.copy()
        tagged[key_name] = df_key
        tagged_frames.append(tagged)

    # One concat replaces repeated DataFrame.append calls (removed in
    # pandas 2.0), which also re-copied the accumulated rows on every step.
    return pd.concat(tagged_frames, ignore_index=True)

In [None]:
# Combine the per-year dataframes into one table; the dict key of each source
# dataframe is stored in the "discharge_year" column.
kidney_cy_09_14 = create_single_df_from_df_dict(
    df_dict=kd_df_dicts,
    key_name="discharge_year",
)

In [None]:
# Number of non-null discharge_year values in the combined table.
kidney_cy_09_14["discharge_year"].count()

In [None]:
# Non-null length_of_stay entries for each discharge year.
(
    kidney_cy_09_14
    .groupby("discharge_year")["length_of_stay"]
    .count()
)

In [None]:
# Convert length_of_stay to a numeric column; entries that cannot be parsed
# as numbers become NaN (errors="coerce") instead of raising.
kidney_cy_09_14["length_of_stay"] = pd.to_numeric(
    kidney_cy_09_14["length_of_stay"],
    errors="coerce",
)

In [None]:
# Mean length of stay per facility, pooled over all discharge years.
(
    kidney_cy_09_14
    .groupby(["facility_name"])["length_of_stay"]
    .mean()
)

In [None]:
# Peek at the first few discharge_year values.
kidney_cy_09_14["discharge_year"].head()

In [None]:
# Peek at the first few apr_drg_description values.
kidney_cy_09_14["apr_drg_description"].head()

In [None]:
# Peek at length_of_stay in the dataframe stored under the 2011 key.
kd_df_dicts[2011]["length_of_stay"].head()

In [None]:
# Record counts per facility (rows) and discharge year (columns), with
# row/column totals added by margins=True.
pd.crosstab(
    index=kidney_cy_09_14["facility_name"],
    columns=kidney_cy_09_14["discharge_year"],
    margins=True,
)

In [None]:
import numpy as np

In [None]:
# Mean length of stay per facility (rows) and discharge year (columns), with
# overall means in the margins. The string alias "mean" is passed instead of
# np.mean because recent pandas versions emit a FutureWarning for the NumPy
# callable; the alias keeps the same aggregation.
pd.crosstab(
    kidney_cy_09_14["facility_name"],
    kidney_cy_09_14["discharge_year"],
    margins=True,
    values=kidney_cy_09_14["length_of_stay"],
    aggfunc="mean",
)

In [None]:
# Median length of stay per facility (rows) and discharge year (columns), with
# overall medians in the margins. The string alias "median" is passed instead
# of np.median because recent pandas versions emit a FutureWarning for the
# NumPy callable; the alias keeps the same aggregation.
pd.crosstab(
    kidney_cy_09_14["facility_name"],
    kidney_cy_09_14["discharge_year"],
    margins=True,
    values=kidney_cy_09_14["length_of_stay"],
    aggfunc="median",
)

In [None]:
import seaborn as sb

In [None]:
%matplotlib inline

In [None]:
# Distribution of length of stay for each discharge year (all records).
sb.boxplot(
    data=kidney_cy_09_14,
    x="discharge_year",
    y="length_of_stay",
)

In [None]:
# Same box plot restricted to stays of at most 20, so that long stays do not
# compress the rest of the distribution.
sb.boxplot(
    data=kidney_cy_09_14.loc[kidney_cy_09_14["length_of_stay"] <= 20],
    x="discharge_year",
    y="length_of_stay",
)

In [None]:
# Violin plot of length of stay per discharge year, restricted to stays of at
# most 20.
sb.violinplot(
    data=kidney_cy_09_14.loc[kidney_cy_09_14["length_of_stay"] <= 20],
    x="discharge_year",
    y="length_of_stay",
)

In [None]:
# Write the combined table to CSV. The row index is written as the first
# column (to_csv default), and the ./data directory must already exist.
kidney_cy_09_14.to_csv(
    path_or_buf="./data/sparcs_ny_kidney_transplants_2009_2014.csv",
)