# Asthma ED Visit Rates - Data Cleaning

In [1]:
import os

import numpy as np
import pandas as pd
from siuba import _, select, rename, mutate, group_by, arrange, filter, summarize, distinct
# https://siuba.readthedocs.io/en/latest/api_index.html
# https://timmastny.rbind.io/blog/why-you-should-use-siuba-dplyr-for-python/

In [2]:
# Read the raw asthma ED visit rate CSV (2019 extract) from the shared
# Google Drive folder under the user's home directory.
raw_csv_path = '~/Google Drive/DS4A Empowerment/Team 103 Shared/Data Sources/California Department of Public Health/asthma-ed-visit-rates_2019.csv'
df_raw = pd.read_csv(raw_csv_path)

In [3]:
# Print a structural summary of the raw frame: per-column non-null counts
# and dtypes, plus memory usage.
df_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9484 entries, 0 to 9483
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   LGHC Indicator Name  9484 non-null   object 
 1   Geography            9484 non-null   object 
 2   Year                 9484 non-null   int64  
 3   Age Group            9484 non-null   object 
 4   Strata               9484 non-null   object 
 5   Strata Name          9484 non-null   object 
 6   Numerator            9484 non-null   object 
 7   Rate                 9370 non-null   float64
 8   LGHC Indicator ID    9484 non-null   int64  
 9   LGHC Target Rate     9484 non-null   int64  
dtypes: float64(1), int64(3), object(6)
memory usage: 741.1+ KB


In [4]:
# Dimensions of the raw frame as (rows, columns).
df_raw.shape

(9484, 10)

In [5]:
# Preview the first rows of the raw data (head() shows 5 rows by default).
df_raw.head()

Unnamed: 0,LGHC Indicator Name,Geography,Year,Age Group,Strata,Strata Name,Numerator,Rate,LGHC Indicator ID,LGHC Target Rate
0,Asthma ED Visit Rates,Alameda,2015,18 and Over,Total Population,18 and Over,6513,53.1,7,28
1,Asthma ED Visit Rates,Amador,2015,18 and Over,Total Population,18 and Over,151,49.1,7,28
2,Asthma ED Visit Rates,Butte,2015,18 and Over,Total Population,18 and Over,736,43.8,7,28
3,Asthma ED Visit Rates,Calaveras,2015,18 and Over,Total Population,18 and Over,139,42.7,7,28
4,Asthma ED Visit Rates,California,2015,18 and Over,Total Population,18 and Over,117854,40.0,7,28


In [6]:
# Dtype of every raw column. In the recorded output Numerator is `object`,
# i.e. not yet numeric.
df_raw.dtypes

LGHC Indicator Name     object
Geography               object
Year                     int64
Age Group               object
Strata                  object
Strata Name             object
Numerator               object
Rate                   float64
LGHC Indicator ID        int64
LGHC Target Rate         int64
dtype: object

In [7]:
# Build the cleaned frame from three named siuba steps, applied in order:
#   1. rename the source columns to shorter names,
#   2. keep only the listed columns, in the listed order,
#   3. remove commas from Count and cast it to int.
shorten_names = rename(Age = "Age Group", Category = "Strata", Demographic = "Strata Name", Count = "Numerator")
keep_columns = select(_.Year, _.Geography, _.Age, _.Category, _.Demographic, _.Count, _.Rate)
count_to_int = mutate(Count = _.Count.str.replace(",","").astype(int))

df = df_raw >> shorten_names >> keep_columns >> count_to_int

# test = df_raw \
#     .siu_rename(Age = "Age Group", Category = "Strata", Demographic = "Strata Name", Count = "Numerator") \
#     .siu_select(_.Year, _.Geography, _.Age, _.Category, _.Demographic, _.Count, _.Rate) \
#     .siu_mutate(Count = _.Count.str.replace(",","").astype(int))

In [8]:
# Summary statistics for the numeric columns of the cleaned frame.
df.describe()

Unnamed: 0,Year,Count,Rate
count,9484.0,9484.0,9370.0
mean,2015.03817,2002.594791,66.991793
std,2.584401,9425.124379,86.053953
min,2011.0,7.0,5.2
25%,2013.0,57.0,33.8
50%,2015.0,252.0,48.9
75%,2017.0,954.25,71.6
max,2019.0,191904.0,4564.1


In [9]:
# Dimensions (rows, columns) of the subset of rows that contain at least one null value.
row_has_null = df.isna().any(axis="columns")
df.loc[row_has_null].shape

(114, 7)

In [10]:
# Show only the columns that contain at least one null value (all rows kept).
column_has_null = df.isna().any(axis="index")
df.loc[:, column_has_null]

Unnamed: 0,Rate
0,53.1
1,49.1
2,43.8
3,42.7
4,40.0
...,...
9479,81.2
9480,90.0
9481,61.0
9482,52.4


In [11]:
# Summary statistics restricted to the rows that contain at least one null value.
rows_with_nulls = df.loc[df.isna().any(axis="columns")]
rows_with_nulls.describe()

Unnamed: 0,Year,Count,Rate
count,114.0,114.0,0.0
mean,2015.140351,10.912281,
std,2.636964,0.507665,
min,2011.0,7.0,
25%,2013.0,11.0,
50%,2015.0,11.0,
75%,2017.0,11.0,
max,2019.0,11.0,


In [12]:
# Write the cleaned frame to CSV (UTF-8, without the index column).
# DataFrame.to_csv does not create missing parent directories, so make sure
# the output folder exists first; otherwise this cell fails on a fresh
# checkout where `cleaned_data/` has not been created yet.
os.makedirs('cleaned_data', exist_ok=True)
df.to_csv('cleaned_data/asthma_clean_file.csv', encoding = 'utf-8', index = False)