In [1]:
%matplotlib inline

%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np

import xarray as xr

import regionmask

import pandas as pd

import matplotlib.pyplot as plt
import cartopy.crs as ccrs
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import cartopy.feature as cfeat
from scipy.interpolate import RectBivariateSpline
from datetime import datetime, timedelta
import geopandas

import calendar

import glob

import sys
sys.path.append('../')
import utils

np.seterr(invalid='ignore'); # disable a warning from matplotlib and cartopy


Bad key "text.kerning_factor" on line 4 in
/home/emfreese/anaconda3/envs/conda_env/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
http://github.com/matplotlib/matplotlib/blob/master/matplotlibrc.template
or from the matplotlib source distribution


# Data

## GEOS Chem Runs

Using egrid, NEI, and our model emissions inventories for NO, NO2, SO2

In [2]:
# Output directories of the three GEOS-Chem runs compared in this notebook.
egrid_path = '../../GCrundirs/type_NA-out_conc-yr_2016-gridmod_final-validation-egrid/OutputDir/'
NEI_path = '../../GCrundirs/type_NA-out_conc-yr_2016-gtropchem/merra2_05x0625_tropchem_na/OutputDir/'
normal_model_path = '../../GCrundirs/type_NA-out_conc-yr_2016-gridmod_final-validation/OutputDir/'

# Glob patterns for the two output collections read from each run directory.
speciesconc_output = 'GEOSChem.SpeciesConc.2016*.nc4'
aerosol_output = 'GEOSChem.AerosolMass.2016*.nc4'


def _import_run(run_path, run_name):
    """Load one run through utils.import_GC_runs_general using the shared file patterns."""
    return utils.import_GC_runs_general(run_path, speciesconc_output, aerosol_output, run_name)


ds_egrid = _import_run(egrid_path, 'egrid')
ds_NEI = _import_run(NEI_path, 'NEI')
ds_normal_model = _import_run(normal_model_path, 'normal_model')


NameError: name 'ds_MODEL' is not defined

In [4]:

# The three run datasets, in the same order as the names passed to
# utils.combine_and_convert_ds in the next cell.
datasets = [ds_egrid, ds_NEI, ds_normal_model]

In [7]:
# Combine the three runs into a single dataset.
# NOTE(review): the meaning of the last three arguments is defined in
# utils.combine_and_convert_ds, which is not visible here -- confirm there.
poll_ds = utils.combine_and_convert_ds(
    utils.gas_species_list,
    utils.aerosol_species_dict.keys(),
    datasets,
    ['egrid', 'NEI', 'normal_model'],
    ['egrid', 'NEI'],
    'normal_model',
    'model_NEI_egrid',
)




In [10]:
poll_ds.to_zarr('model_NEI_egrid_GC_ds.zarr')  # save the dataset

<xarray.backends.zarr.ZarrStore at 0x7fd3369cd4c0>

# Reload the combined dataset from the zarr store written in the previous cell.
poll_ds = xr.open_zarr('model_NEI_egrid_GC_ds.zarr/')

## EPA Observational data
Data are from the EPA AirData download page (https://aqs.epa.gov/aqsweb/airdata/download_files.html)
for the year 2016, using the files for O3, NO, SO2, and PM25 (FEM/FRM).


In [5]:
# Read every file matching daily*.csv in the observation directory into one dataframe.
# Later cells rely on it having 'date', 'species', 'Latitude', 'Longitude' and
# 'Arithmetic Mean' columns.
EPA_obs_df = utils.import_and_edit_EPAobs('../../GEOS_CHEM/obs_data/daily*.csv')

## IMPROVE Observational data

# Scale the two ammonium-salt value columns of `ammon_df` by a molar mass and a unit factor.
# NOTE(review): this cell has no execution count and reads `ammon_df`, which is
# only assigned in the following cell, so it fails on a fresh kernel run.
mw_dict = {'ammNO3': 80.043,
           'ammSO4': 132.14} #g/mol 
g_ug = 1e6  # presumably the grams -> micrograms factor; confirm

for species in mw_dict.keys():
    # Multiplies the '<species>f:Value' column in place by mw * g_ug.
    ammon_df[f'{species}f:Value'] *= mw_dict[species] * g_ug

In [20]:
# Load each IMPROVE 2016 export; the second argument is the species label used
# for the rows in this notebook, the third is passed through to utils.import_IMPROVE.
pm_df = utils.import_IMPROVE('../../GEOS_CHEM/obs_data/IMPROVE_2016_PM.txt', 'PM25', 'MF')
s_df = utils.import_IMPROVE('../../GEOS_CHEM/obs_data/IMPROVE_2016_Sulfate.txt', 'SO4', 'SO4f')
n_df = utils.import_IMPROVE('../../GEOS_CHEM/obs_data/IMPROVE_2016_Nitrate.txt', 'NIT', 'NO3f')
# NOTE(review): the next two variable names look swapped -- `oc_df` holds the
# ammonia file (labelled 'NH4') and `ammon_df` holds the OC file (labelled
# 'OC_EC'). The concatenation below is unaffected; names are kept as they were.
oc_df = utils.import_IMPROVE('../../GEOS_CHEM/obs_data/IMPROVE_2016_ammonia.txt', 'NH4', 'NH4f')
ammon_df = utils.import_IMPROVE('../../GEOS_CHEM/obs_data/IMPROVE_2016_OC.txt', 'OC_EC', 'ECf')

# Concatenate all dataframes and reset the index (ignore_index=True avoids the
# duplicate row labels that stacking five independently indexed frames produces).
IMPROVE_df = pd.concat([pm_df, s_df, n_df, oc_df, ammon_df], axis=0, ignore_index=True)
IMPROVE_df['Date'] = pd.to_datetime(IMPROVE_df['Date'])  # change to datetime
# Get rid of -999 readings where there is no data. The explicit copy lets later
# cells add columns to IMPROVE_df without writing into a slice of the
# concatenated frame.
IMPROVE_df = IMPROVE_df.loc[IMPROVE_df['Arithmetic Mean'] >= 0].copy()


## Lists/Dicts of Variables

In [22]:
# Value levels per species / difference type, built with np.arange
# (presumably contour levels for plotting -- they are not used in this section).
levels_dict = {'PM25':np.arange(0., 20., .5), 'SO2':np.arange(0., 5., .1), 
               'NO2':np.arange(0., 5., .1), 'NOx':np.arange(0., 5., .1), 'O3':np.arange(0., 70., 1.),
               'dif':np.arange(-.3, .31, .01), 'regional_dif':np.arange(-1.5, 1.51, .01)}


# Names of the model runs looped over by the interpolation cells below.
# They match the labels passed to utils.combine_and_convert_ds when poll_ds was
# built, so poll_ds.sel(model_name=...) can select each of them. Defined as a
# literal (rather than from `datasets`) so the notebook also works when poll_ds
# is reloaded from the zarr store without re-importing the raw runs.
model_names = ['egrid', 'NEI', 'normal_model']


# Stats on Observations vs. Models

## Interpolate data to get GC run data at observational points

#### EPA Interpolation

##### For egrid, NEI, normal

In [None]:
### Sample the monthly-mean model fields at the EPA monitor locations.
# Requires `model_names`, `poll_ds` and `EPA_obs_df` from earlier cells.
interp_columns = ['Arithmetic Mean', 'Longitude', 'Latitude', 'model', 'species', 'date']

# Months present in the observations, restricted to utils.month_string.
# Hoisted out of the loops: it depends on neither the model nor the species.
obs_months = pd.DatetimeIndex(
    EPA_obs_df.loc[EPA_obs_df.date.dt.month.isin(utils.month_string)]['date'].values
).month.unique().values

# Collect the per-(model, species, month) frames and concatenate once at the end
# instead of growing a dataframe with DataFrame.append inside the loop. The
# empty column-only frame goes first so the result has these columns even when
# nothing is appended.
interp_frames = [pd.DataFrame(columns=interp_columns)]

for model in model_names:
    for species in list(utils.species_dict.keys()):
        # Monthly mean field, computed once per (model, species)
        monthly_mean = poll_ds.sel(model_name=model)[f'{species}'].groupby('time.month').mean()

        # Unique monitor locations for this species. De-duplicate (lat, lon) as
        # PAIRS: calling .unique() on each column separately mis-pairs stations
        # whenever two sites share a latitude or a longitude.
        sites = EPA_obs_df.loc[
            EPA_obs_df['species'] == utils.species_dict[species], ['Latitude', 'Longitude']
        ].drop_duplicates()
        lats_new = sites['Latitude'].values
        lons_new = sites['Longitude'].values

        for month in obs_months:
            # data selected for this month
            data = monthly_mean.sel(month=month)

            # nearest model grid cell to each monitor
            interp_data = [
                data.sel(lat=site_lat, lon=site_lon, method='nearest').values.item()
                for site_lat, site_lon in zip(lats_new, lons_new)
            ]

            interp_frames.append(pd.DataFrame({
                'Arithmetic Mean': interp_data, 'Longitude': lons_new, 'Latitude': lats_new,
                'model': model, 'species': utils.species_dict[species], 'date': month}))

interp_EPA_df = pd.concat(interp_frames, sort=False, ignore_index=True)

In [None]:
# Replace the month number stored in 'date' with the last calendar day of that
# month in 2016. Rows are addressed by label 0..len-1, which holds because the
# frame was built with ignore_index=True.
# Not idempotent: a second run would pass a datetime where a month is expected.
for row in range(len(interp_EPA_df)):
    month_number = interp_EPA_df['date'][row]
    last_day = calendar.monthrange(2016, month_number)[1]
    interp_EPA_df.loc[row, 'date'] = datetime(2016, month_number, last_day)

##### For Nonuc

In [2]:
# Open the two zarr stores holding the no-nuclear run output.
poll_nonuc1 = xr.open_zarr('../ego_nonuclear_project/pm_nonuc_ds.zarr/')
# HNO3, NH3 and SO4 are removed from the second store before the merge below
# (presumably because the first store supplies them -- confirm).
poll_nonuc2 = xr.open_zarr('../ego_nonuclear_project/nonuc_normal_GC/').drop(
    labels=['HNO3', 'NH3', 'SO4']
)

In [3]:
# Convert the PM component species with utils.ppb_to_ug.
# NOTE(review): the original comment says "mol/mol -> ug/m3" while the helper
# is named ppb_to_ug -- confirm the input units.
# Values passed as the per-species weights (presumably g/mol).
mw_species = dict(
    NH4=18.,
    NIT=62.,
    SO4=96.,
    BCPI=12.,
    OCPI=12.,
    BCPO=12.,
    OCPO=12.,
    DST1=29.,
    DST2=29.,
    SALA=31.4,
)
# list of pm species (same order as the weights above)
pm_species = list(mw_species)

poll_nonuc1 = utils.ppb_to_ug(poll_nonuc1, pm_species, mw_species)


In [4]:
# Merge the converted PM-component dataset with the second no-nuclear dataset.
poll_nonuc = xr.merge([poll_nonuc1, poll_nonuc2])

In [16]:
### Sample the monthly-mean 'nonuc' model fields at the EPA monitor locations.
interp_columns = ['Arithmetic Mean', 'Longitude', 'Latitude', 'model', 'species', 'date']

# Months present in the observations, restricted to utils.month_string.
# Hoisted out of the loops: it does not depend on the species.
obs_months = pd.DatetimeIndex(
    EPA_obs_df.loc[EPA_obs_df.date.dt.month.isin(utils.month_string)]['date'].values
).month.unique().values

# Collect the per-(species, month) frames and concatenate once at the end
# instead of growing a dataframe with DataFrame.append inside the loop.
interp_frames = [pd.DataFrame(columns=interp_columns)]

for species in ['PM25', 'SO2', 'NO2', 'O3']:
    # Monthly mean field, computed once per species
    monthly_mean = poll_nonuc.sel(model_name='nonuc')[f'{species}'].groupby('time.month').mean()

    # Unique monitor locations for this species. De-duplicate (lat, lon) as
    # PAIRS: calling .unique() on each column separately mis-pairs stations
    # whenever two sites share a latitude or a longitude.
    sites = EPA_obs_df.loc[
        EPA_obs_df['species'] == utils.species_dict[species], ['Latitude', 'Longitude']
    ].drop_duplicates()
    lats_new = sites['Latitude'].values
    lons_new = sites['Longitude'].values

    for month in obs_months:
        # data selected for this month
        data = monthly_mean.sel(month=month)

        # nearest model grid cell to each monitor
        interp_data = [
            data.sel(lat=site_lat, lon=site_lon, method='nearest').values.item()
            for site_lat, site_lon in zip(lats_new, lons_new)
        ]

        # Note: rows are labelled with the model species key here, as before.
        interp_frames.append(pd.DataFrame({
            'Arithmetic Mean': interp_data, 'Longitude': lons_new, 'Latitude': lats_new,
            'model': 'nonuc', 'species': species, 'date': month}))

interp_EPA_nonuc_df = pd.concat(interp_frames, sort=False, ignore_index=True)

In [18]:
# Replace the month number stored in 'date' with the last calendar day of that
# month in 2016. Rows are addressed by label 0..len-1, which holds because the
# frame was built with ignore_index=True.
# Not idempotent: a second run would pass a datetime where a month is expected.
for row in range(len(interp_EPA_nonuc_df)):
    month_number = interp_EPA_nonuc_df['date'][row]
    last_day = calendar.monthrange(2016, month_number)[1]
    interp_EPA_nonuc_df.loc[row, 'date'] = datetime(2016, month_number, last_day)

#### IMPROVE Interpolation

##### for EGRID, NEI, NORMAL

In [26]:
# Import the dataset with the SO4 and NIT contributions from the model.
poll_2_ds = xr.open_zarr('pm_ds.zarr')

# Convert the PM component species with utils.ppb_to_ug.
# NOTE(review): the original comment says "mol/mol -> ug/m3" while the helper
# is named ppb_to_ug -- confirm the input units.
# Values passed as the per-species weights (presumably g/mol).
mw_species = dict(
    NH4=18.,
    NIT=62.,
    SO4=96.,
    BCPI=12.,
    OCPI=12.,
    BCPO=12.,
    OCPO=12.,
    DST1=29.,
    DST2=29.,
    SALA=31.4,
)
# list of pm species (same order as the weights above)
pm_species = list(mw_species)

poll_2_ds = utils.ppb_to_ug(poll_2_ds, pm_species, mw_species)

# Add the converted component variables to the combined model dataset.
poll_ds = xr.merge([poll_ds, poll_2_ds])

In [31]:
### Sample the monthly-mean model fields at the IMPROVE site locations.
# Requires `model_names`, `poll_ds` and `IMPROVE_df` from earlier cells.
interp_columns = ['Arithmetic Mean', 'Longitude', 'Latitude', 'model', 'species', 'date']

# Months present in the observations, restricted to utils.month_string.
# Hoisted out of the loops: it depends on neither the model nor the species.
obs_months = pd.DatetimeIndex(
    IMPROVE_df.loc[IMPROVE_df.Date.dt.month.isin(utils.month_string)]['Date'].values
).month.unique().values

# Model species that are sampled at the sites of a different observed species:
# NH3 uses the NH4 sites and HNO3 uses the NIT sites.
site_species_for = {'NH3': 'NH4', 'HNO3': 'NIT'}

# Collect the per-(model, species, month) frames and concatenate once at the end
# instead of growing a dataframe with DataFrame.append inside the loop.
interp_frames = [pd.DataFrame(columns=interp_columns)]

for model in model_names:
    for species in ['PM25','NIT','SO4','NH3','NH4','HNO3']:
        # Monthly mean field, computed once per (model, species)
        monthly_mean = poll_ds.sel(model_name=model)[f'{species}'].groupby('time.month').mean()

        # Unique site locations for this species. De-duplicate (lat, lon) as
        # PAIRS: calling .unique() on each column separately mis-pairs sites
        # whenever two of them share a latitude or a longitude.
        obs_species = site_species_for.get(species, species)
        sites = IMPROVE_df.loc[
            IMPROVE_df['species'] == obs_species, ['Latitude', 'Longitude']
        ].drop_duplicates()
        lats_new = sites['Latitude'].values
        lons_new = sites['Longitude'].values

        for month in obs_months:
            # data selected for this month
            data = monthly_mean.sel(month=month)

            # nearest model grid cell to each site
            interp_data = [
                data.sel(lat=site_lat, lon=site_lon, method='nearest').values.item()
                for site_lat, site_lon in zip(lats_new, lons_new)
            ]

            interp_frames.append(pd.DataFrame({
                'Arithmetic Mean': interp_data, 'Longitude': lons_new, 'Latitude': lats_new,
                'model': model, 'species': species, 'date': month}))

interp_IMPROVE_df = pd.concat(interp_frames, sort=False, ignore_index=True)

In [35]:
# Replace the month number stored in 'date' with the last calendar day of that
# month in 2016. Rows are addressed by label 0..len-1, which holds because the
# frame was built with ignore_index=True.
# Not idempotent: a second run would pass a datetime where a month is expected.
for row in range(len(interp_IMPROVE_df)):
    month_number = interp_IMPROVE_df['date'][row]
    last_day = calendar.monthrange(2016, month_number)[1]
    interp_IMPROVE_df.loc[row, 'date'] = datetime(2016, month_number, last_day)

##### For NONUC

In [73]:
# Open the two zarr stores holding the no-nuclear run output
# (repeats the load done for the EPA comparison above).
poll_nonuc1 = xr.open_zarr('../ego_nonuclear_project/pm_nonuc_ds.zarr/')
# HNO3, NH3 and SO4 are removed from the second store before the merge below
# (presumably because the first store supplies them -- confirm).
poll_nonuc2 = xr.open_zarr('../ego_nonuclear_project/nonuc_normal_GC/').drop(
    labels=['HNO3', 'NH3', 'SO4']
)

In [74]:
# Convert the PM component species with utils.ppb_to_ug.
# NOTE(review): the original comment says "mol/mol -> ug/m3" while the helper
# is named ppb_to_ug -- confirm the input units.
# Values passed as the per-species weights (presumably g/mol).
mw_species = dict(
    NH4=18.,
    NIT=62.,
    SO4=96.,
    BCPI=12.,
    OCPI=12.,
    BCPO=12.,
    OCPO=12.,
    DST1=29.,
    DST2=29.,
    SALA=31.4,
)
# list of pm species (same order as the weights above)
pm_species = list(mw_species)

poll_nonuc1 = utils.ppb_to_ug(poll_nonuc1, pm_species, mw_species)


In [75]:
# Merge the converted PM-component dataset with the second no-nuclear dataset.
poll_nonuc = xr.merge([poll_nonuc1, poll_nonuc2])

In [90]:
### Sample the monthly-mean 'nonuc' model fields at the IMPROVE site locations.
interp_columns = ['Arithmetic Mean', 'Longitude', 'Latitude', 'model', 'species', 'date']

# Months present in the observations, restricted to utils.month_string.
# Hoisted out of the loops: it does not depend on the species.
obs_months = pd.DatetimeIndex(
    IMPROVE_df.loc[IMPROVE_df.Date.dt.month.isin(utils.month_string)]['Date'].values
).month.unique().values

# Model species that are sampled at the sites of a different observed species:
# NH3 uses the NH4 sites and HNO3 uses the NIT sites.
site_species_for = {'NH3': 'NH4', 'HNO3': 'NIT'}

# Collect the per-(species, month) frames and concatenate once at the end
# instead of growing a dataframe with DataFrame.append inside the loop.
interp_frames = [pd.DataFrame(columns=interp_columns)]

for species in ['PM25','NIT','SO4','NH3','NH4','HNO3']:
    # Monthly mean field, computed once per species
    monthly_mean = poll_nonuc.sel(model_name='nonuc')[f'{species}'].groupby('time.month').mean()

    # Unique site locations for this species. De-duplicate (lat, lon) as
    # PAIRS: calling .unique() on each column separately mis-pairs sites
    # whenever two of them share a latitude or a longitude.
    obs_species = site_species_for.get(species, species)
    sites = IMPROVE_df.loc[
        IMPROVE_df['species'] == obs_species, ['Latitude', 'Longitude']
    ].drop_duplicates()
    lats_new = sites['Latitude'].values
    lons_new = sites['Longitude'].values

    for month in obs_months:
        # data selected for this month
        data = monthly_mean.sel(month=month)

        # nearest model grid cell to each site
        interp_data = [
            data.sel(lat=site_lat, lon=site_lon, method='nearest').values.item()
            for site_lat, site_lon in zip(lats_new, lons_new)
        ]

        interp_frames.append(pd.DataFrame({
            'Arithmetic Mean': interp_data, 'Longitude': lons_new, 'Latitude': lats_new,
            'model': 'nonuc', 'species': species, 'date': month}))

interp_IMPROVE_nonuc_df = pd.concat(interp_frames, sort=False, ignore_index=True)

In [91]:
# Replace the month number stored in 'date' with the last calendar day of that
# month in 2016. Rows are addressed by label 0..len-1, which holds because the
# frame was built with ignore_index=True.
# Not idempotent: a second run would pass a datetime where a month is expected.
for row in range(len(interp_IMPROVE_nonuc_df)):
    month_number = interp_IMPROVE_nonuc_df['date'][row]
    last_day = calendar.monthrange(2016, month_number)[1]
    interp_IMPROVE_nonuc_df.loc[row, 'date'] = datetime(2016, month_number, last_day)

## Create a monthly observational dataframe

#### EPA

In [None]:
# Build one point geometry per EPA observation row from its longitude/latitude.
station_points = geopandas.points_from_xy(EPA_obs_df.Longitude, EPA_obs_df.Latitude)
gdf = geopandas.GeoDataFrame(EPA_obs_df, geometry=station_points)

# WKT text of every point, as a plain array; used as a per-station key below.
geometries = gdf['geometry'].apply(lambda point: point.wkt).values

In [None]:
# Store the WKT strings from the previous cell as a station key column.
EPA_obs_df['geometry'] = geometries
# Index by date so the monthly pd.Grouper(freq='M') in the next cell can bin on the index.
EPA_obs_df.index = EPA_obs_df['date']

In [None]:
# Monthly mean of the EPA observations for every (station, species) pair.
# Stations are identified by the WKT string in the 'geometry' column, and
# EPA_obs_df must be indexed by date (both are set up in the previous cell).
monthly_columns = ['Arithmetic Mean', 'latitude', 'longitude', 'geometry', 'species', 'date']

# Collect one frame per pair and concatenate once at the end instead of growing
# a dataframe with DataFrame.append inside the loop. The empty column-only
# frame goes first so the result has these columns even with no data.
monthly_frames = [pd.DataFrame(columns=monthly_columns)]

# Group on the frame's own columns: this visits only the pairs that actually
# occur (in sorted order) and filters the observations once per pair, rather
# than re-evaluating a full boolean mask three times for every combination.
for (geometry, species), station_obs in EPA_obs_df.groupby(['geometry', 'species'], sort=True):
    # Calendar-month bins on the date index
    by_month = station_obs.groupby(pd.Grouper(freq='M'))
    data = by_month['Arithmetic Mean'].mean()
    lat = by_month['Latitude'].first().values
    lon = by_month['Longitude'].first().values
    monthly_frames.append(pd.DataFrame({
        'Arithmetic Mean': data.values, 'latitude': lat, 'longitude': lon,
        'geometry': geometry, 'species': species, 'date': data.index}))

monthly_EPA_df = pd.concat(monthly_frames, sort=False, ignore_index=True)

#### IMPROVE

In [21]:
# Build one point geometry per IMPROVE observation row from its longitude/latitude.
station_points = geopandas.points_from_xy(IMPROVE_df.Longitude, IMPROVE_df.Latitude)
gdf = geopandas.GeoDataFrame(IMPROVE_df, geometry=station_points)

# WKT text of every point, as a plain array; used as a per-site key below.
geometries = gdf['geometry'].apply(lambda point: point.wkt).values

In [22]:
# Store the WKT strings from the previous cell as a site key column.
IMPROVE_df['geometry'] = geometries
# Index by date so the monthly pd.Grouper(freq='M') in the next cell can bin on the index.
IMPROVE_df.index = IMPROVE_df['Date']

In [23]:
# Monthly mean of the IMPROVE observations for every (site, species) pair.
# Sites are identified by the WKT string in the 'geometry' column, and
# IMPROVE_df must be indexed by date (both are set up in the previous cell).
monthly_columns = ['Arithmetic Mean', 'latitude', 'longitude', 'geometry', 'species', 'date']

# Collect one frame per pair and concatenate once at the end instead of growing
# a dataframe with DataFrame.append inside the loop. The empty column-only
# frame goes first so the result has these columns even with no data.
monthly_frames = [pd.DataFrame(columns=monthly_columns)]

# Group on the frame's own columns: this visits only the pairs that actually
# occur (in sorted order) and filters the observations once per pair, rather
# than re-evaluating a full boolean mask three times for every combination.
for (geometry, species), site_obs in IMPROVE_df.groupby(['geometry', 'species'], sort=True):
    # Calendar-month bins on the date index
    by_month = site_obs.groupby(pd.Grouper(freq='M'))
    data = by_month['Arithmetic Mean'].mean()
    lat = by_month['Latitude'].first().values
    lon = by_month['Longitude'].first().values
    monthly_frames.append(pd.DataFrame({
        'Arithmetic Mean': data.values, 'latitude': lat, 'longitude': lon,
        'geometry': geometry, 'species': species, 'date': data.index}))

monthly_IMPROVE_df = pd.concat(monthly_frames, sort=False, ignore_index=True)

## Add Region Names to the dataframes and save out

In [88]:
# Add a 'Region' column to the dataframes based on the bounding boxes in
# utils.lat_lon_dict.

# Keys of utils.lat_lon_dict, applied in this order (a later box overwrites an
# earlier one where they overlap).
region_keys = ['SE_lat_lon','NW_lat_lon','NE_lat_lon','MW_lat_lon','SW_lat_lon']


def _assign_region(df, lon_col, lat_col):
    """Label each row of `df` in place with the region box containing it.

    Every row starts as 'a' (no region). For each key in `region_keys`, the
    entry of utils.lat_lon_dict is read as [lon_min, lon_max, lat_min, lat_max]
    and rows whose coordinates fall inside the box (bounds included) get that
    key as their 'Region'.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label; modified in place.
    lon_col, lat_col : str
        Names of the longitude and latitude columns in `df`.
    """
    df['Region'] = 'a'
    for region in region_keys:
        bounds = utils.lat_lon_dict[region]
        # Series.between includes both end points by default; the boolean
        # `inclusive=True` spelling is rejected by recent pandas versions.
        in_box = (
            df[lon_col].between(bounds[0], bounds[1])
            & df[lat_col].between(bounds[2], bounds[3])
        )
        df.loc[in_box, 'Region'] = region


# (frame, longitude column, latitude column). The monthly frames use lower-case
# column names. interp_EPA_df was previously initialised but never labelled
# because the loop referred to an undefined `interp_df`.
for frame, lon_col, lat_col in [
    (IMPROVE_df, 'Longitude', 'Latitude'),
    (interp_IMPROVE_df, 'Longitude', 'Latitude'),
    (EPA_obs_df, 'Longitude', 'Latitude'),
    (interp_EPA_df, 'Longitude', 'Latitude'),
    (monthly_EPA_df, 'longitude', 'latitude'),
    (monthly_IMPROVE_df, 'longitude', 'latitude'),
    (interp_IMPROVE_nonuc_df, 'Longitude', 'Latitude'),
]:
    _assign_region(frame, lon_col, lat_col)


In [21]:
# Label the 'nonuc' EPA interpolation rows with their region and save them.
# Every row starts as 'a' (no region); each entry of utils.lat_lon_dict is read
# as [lon_min, lon_max, lat_min, lat_max].
interp_EPA_nonuc_df['Region'] = 'a'
for region in ['SE_lat_lon','NW_lat_lon','NE_lat_lon','MW_lat_lon','SW_lat_lon']:
    bounds = utils.lat_lon_dict[region]
    # Series.between includes both end points by default; the boolean
    # `inclusive=True` spelling is rejected by recent pandas versions.
    in_box = (
        interp_EPA_nonuc_df['Longitude'].between(bounds[0], bounds[1])
        & interp_EPA_nonuc_df['Latitude'].between(bounds[2], bounds[3])
    )
    interp_EPA_nonuc_df.loc[in_box, 'Region'] = region
interp_EPA_nonuc_df.to_csv('interp_EPA_nonuc_df.csv', date_format='%Y%m%d', index=False)

In [89]:
# Write every dataframe to CSV in the working directory, without the index and
# with the same date format for all of them.
csv_outputs = [
    (interp_EPA_df, 'interp_EPA_df.csv'),
    (EPA_obs_df, 'EPA_obs_df.csv'),
    (monthly_EPA_df, 'EPA_monthly_obs_df.csv'),
    (IMPROVE_df, 'IMPROVE_df.csv'),
    (interp_IMPROVE_df, 'interp_IMPROVE_df.csv'),
    (monthly_IMPROVE_df, 'IMPROVE_monthly_obs_df.csv'),
    (interp_IMPROVE_nonuc_df, 'interp_IMPROVE_nonuc_df.csv'),
]
for frame, csv_name in csv_outputs:
    frame.to_csv(csv_name, date_format='%Y%m%d', index=False)