# Main script to clean UW final satellite pm2.5 data at the zip code, monthly level

Modules: N/A <br>
Author: Cornelia Ilin <br>
Email: cilin@wisc.edu <br>
Date created: Oct 20, 2020 <br>

**Citations (data sources)**

``PM2.5 at the monthly level:``
1. https://sites.wustl.edu/acag/datasets/historical-pm2-5-across-north-america/


``Shapefiles for California ZIP codes (2010 census):``

2. https://www.census.gov/cgi-bin/geo/shapefiles/index.php?year=2010&layergroup=ZIP+Code+Tabulation+Areas

``Installation errors with Geopandas:``

3. https://stackoverflow.com/questions/54734667/error-installing-geopandas-a-gdal-api-version-must-be-specified-in-anaconda
    
**Citations (persons)**
1. N/A

**Preferred environment**
1. Code written in Jupyter Notebooks

#### Step 1: Import packages

In [1]:
# standard
import pandas as pd
import numpy as np
import os
import h5py

# geography
import geopandas as gpd
import osmnx as ox
import shapely

#### Step 2: Define working directories

In [2]:
# directory holding the 2010 Census ZCTA shapefiles (searched for '.shp' files)
in_dir_zip_shapes = 'C:/Users/cilin/Research/CA_hospitals/Input/raw_data/census_geo/shapefiles_zcta/'
# note: the data in the in_dir directory only includes years 2007 to 2012 because of storage issues;
# for the other years, download the files from the source indicated above
# directory holding the monthly satellite pm25 files (searched for '.h5' files)
in_dir = 'C:/Users/cilin/Research/CA_hospitals/Input/raw_data/pollution/satellite/UW/monthly/'
# directory where the cleaned zip code / monthly csv is written
out_dir = 'C:/Users/cilin/Research/CA_hospitals/Input/final_data/pollution/satellite/UW/monthly/'

#### Step 3: Define functions

``read data``

In [3]:
def read_census_geom():
    """ Read Census (lat, lon) boundary coordinates for California zip-codes

    Loads the ZCTA shapefile found in ``in_dir_zip_shapes``, extracts the
    vertices of the exterior ring of every polygon, and snaps them to the
    grid of the pm25 data (2 decimals + 0.005) so that the result can be
    merged on (latitude, longitude).

    parameters:
    -----------
    None (reads the module-level ``in_dir_zip_shapes`` directory)

    return:
    -------
    Df with columns latitude, longitude, ZCTA10, GEOID10; one row per unique
    (ZCTA10, latitude, longitude), sorted by those three columns. The index
    is not reset. An empty df with these columns is returned when the
    shapefile contains no (multi)polygons.

    raises:
    -------
    FileNotFoundError if ``in_dir_zip_shapes`` contains no '.shp' file
    """
    ### Step 1 ###
    ##############
    # Read the shapefile for California's ZIP codes
    shp_files = [name for name in os.listdir(in_dir_zip_shapes) if name.endswith('.shp')]
    if not shp_files:
        raise FileNotFoundError('no .shp file found in ' + in_dir_zip_shapes)
    # if several shapefiles are present, the last one listed is used
    gdf = gpd.read_file(os.path.join(in_dir_zip_shapes, shp_files[-1]))

    # keep only cols of interest
    # ('ZCTA5CE10' = 2010 Census ZIP codes, 'GEOID10' = 2010 Census geographic identifier)
    gdf = gdf[['ZCTA5CE10', 'GEOID10', 'geometry']]

    ### Step 2 ###
    ##############
    # For each zip code extract the polygon boundary with (lat, lon) info
    columns = ['latitude', 'longitude', 'ZCTA10', 'GEOID10']
    frames = []

    # iterate the columns in lockstep so the lookup does not depend on the index labels
    for geom, zcta, geoid in zip(gdf.geometry, gdf['ZCTA5CE10'], gdf['GEOID10']):
        if isinstance(geom, shapely.geometry.Polygon):
            polygons = [geom]
        elif isinstance(geom, shapely.geometry.MultiPolygon):
            # .geoms is required: multi-part geometries are not iterable in Shapely >= 2.0
            polygons = geom.geoms
        else:
            # other geometry types carry no exterior ring and are skipped
            continue

        for poly in polygons:
            # coords.xy returns (x, y) = (longitude, latitude)
            lon, lat = poly.exterior.coords.xy
            frames.append(pd.DataFrame({'latitude': lat,
                                        'longitude': lon,
                                        'ZCTA10': zcta,
                                        'GEOID10': geoid}))

    if not frames:
        return pd.DataFrame(columns=columns)

    # concatenate once; concatenating inside the loop is quadratic in the number of polygons
    zip_poly = pd.concat(frames, axis=0)

    # round (lat, lon) to 2 decimal points and add 0.005 to match the UW (lat, lon) values;
    # the final round(3) strips float noise (e.g. 37.465000000000003) that would otherwise
    # break an exact-equality merge on these columns
    zip_poly['latitude'] = (zip_poly['latitude'].round(2) + 0.005).round(3)
    zip_poly['longitude'] = (zip_poly['longitude'].round(2) + 0.005).round(3)
    zip_poly.sort_values(by=['ZCTA10', 'latitude', 'longitude'], inplace=True)
    zip_poly.drop_duplicates(subset=['ZCTA10', 'latitude', 'longitude'], inplace=True)

    return zip_poly

In [4]:
def read_uw_pm25(zip_poly):
    """Read UW pm25 data and average it by zip code and month

    Every '.h5' file in ``in_dir`` is read (the first six characters of the
    file name are taken as YYYYMM), its pm25 grid is reshaped to one row per
    (latitude, longitude), matched to the zip code boundary points in
    ``zip_poly`` and averaged per zip code.

    parameters:
    -----------
    zip_poly: df, contains latitude, longitude, ZCTA10 and GEOID10
              (e.g. the output of read_census_geom)

    return:
    -------
    df with columns year_month, ZCTA10, GEOID10, pm25 (mean over the matched
    grid points) and year_month_zcta, sorted by year_month with a fresh
    0..n-1 index. An empty df with these columns is returned when ``in_dir``
    contains no '.h5' file.
    """
    out_cols = ['year_month', 'ZCTA10', 'GEOID10', 'pm25', 'year_month_zcta']

    # normalise the merge keys once, to the same 3-decimal precision applied to the
    # grid below, so that float noise cannot break the exact-equality merge
    # (assign returns a copy; the caller's df is not modified)
    merge_keys = zip_poly.assign(latitude=zip_poly['latitude'].astype(float).round(3),
                                 longitude=zip_poly['longitude'].astype(float).round(3))

    monthly = []

    # os.listdir order is unspecified; sort so the files are processed chronologically
    for file in sorted(os.listdir(in_dir)):
        if not file.endswith('.h5'):
            continue

        year_month = file[:4] + '_' + file[4:6]
        print(year_month)

        # read everything into memory inside the with-block so the file handle is always closed
        with h5py.File(os.path.join(in_dir, file), 'r') as f:
            lat = np.ravel(f['latitude'][()])
            lon = np.ravel(f['longitude'][()])
            # read pm25 (divide by 100 as indicated here: https://zenodo.org/record/2616769#.X4999NBKg4c)
            pm25 = pd.DataFrame(f['CorrectedPM2.5'][()]) / 100

        # label rows with latitude and columns with longitude
        pm25.index = lat
        pm25.columns = lon
        pm25 = pm25.rename_axis('latitude').reset_index()

        # melt to long format: one row per (latitude, longitude) grid point
        # (no pre-sort needed: the inner merge below follows the row order of the zip code keys)
        pm25 = pd.melt(pm25, id_vars='latitude', var_name='longitude', value_name='pm25')

        # set lat and lon to 3 decimals
        pm25['latitude'] = pm25['latitude'].astype(float).round(3)
        pm25['longitude'] = pm25['longitude'].astype(float).round(3)

        # add year and month column
        pm25['year_month'] = year_month

        # keep only the grid points that coincide with a zip code boundary point
        pm25 = merge_keys.merge(pm25, on=['latitude', 'longitude'], how='inner')

        # group by zip code and geographic id (get mean for each zip code);
        # the string alias avoids the pandas deprecation of passing np.mean to agg
        pm25 = pm25.groupby(['year_month', 'ZCTA10', 'GEOID10'], as_index=False).agg({'pm25': 'mean'})

        # add year and zip column
        pm25['year_month_zcta'] = pm25.year_month.astype(str) + '_' + pm25.ZCTA10.astype(str)

        monthly.append(pm25)

    if not monthly:
        return pd.DataFrame(columns=out_cols)

    # concatenate once; concatenating inside the loop is quadratic in the number of files
    df = pd.concat(monthly, axis=0)

    # sort and reset index
    df.sort_values(by=['year_month'], inplace=True)
    df.reset_index(drop=True, inplace=True)

    return df

#### Step 4: Read data

In [5]:
# extract the rounded (lat, lon) boundary points of every zip code
zip_poly = read_census_geom()
# preview the first two rows
zip_poly.head(2)

Unnamed: 0,latitude,longitude,ZCTA10,GEOID10
235,37.465,-117.925,89010,689010
234,37.465,-117.915,89010,689010


In [6]:
# build the zip code / monthly pm25 table (prints each year_month as its file is processed)
df = read_uw_pm25(zip_poly)

2007_10
2007_11
2007_12
2008_01
2008_02
2008_03
2008_04
2008_05
2008_06
2008_07
2008_08
2008_09
2008_10
2008_11
2008_12
2009_01
2009_02
2009_03
2009_04
2009_05
2009_06
2009_07
2009_08
2009_09
2009_10
2009_11
2009_12
2010_01
2010_02
2010_03
2010_04
2010_05
2010_06
2010_07
2010_08
2010_09
2010_10
2010_11
2010_12
2011_01
2011_02
2011_03
2011_04
2011_05
2011_06
2011_07
2011_08
2011_10
2011_11
2012_01
2012_02
2012_03
2012_04
2012_05
2012_06
2012_07
2012_08
2012_09
2012_10
2012_11
2012_12


In [7]:
# order the rows by zip code, then geographic id, then month
df = df.sort_values(by=['ZCTA10', 'GEOID10', 'year_month'])
df.head()

Unnamed: 0,year_month,ZCTA10,GEOID10,pm25,year_month_zcta
0,2007_10,89010,689010,4.702143,2007_10_89010
3073,2007_11,89010,689010,6.472143,2007_11_89010
4687,2007_12,89010,689010,3.801429,2007_12_89010
6444,2008_01,89010,689010,3.231429,2008_01_89010
8200,2008_02,89010,689010,3.16,2008_02_89010


In [8]:
# renumber the rows 0..n-1, then write the table (index column included) to csv
out_path = out_dir + 'pm25_uw_zip_monthly_2007_2012.csv'
df = df.reset_index(drop=True)
df.to_csv(out_path)