In [1]:
%matplotlib inline

%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np

import xarray as xr

import regionmask

import pandas as pd

import matplotlib.pyplot as plt
import cartopy.crs as ccrs
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import cartopy.feature as cfeat
from scipy.interpolate import RectBivariateSpline
from datetime import datetime, timedelta
import geopandas

import calendar

import glob

import sys
sys.path.append('../')
import utils

np.seterr(invalid='ignore'); # disable a warning from matplotlib and cartopy


Bad key "text.kerning_factor" on line 4 in
/home/emfreese/anaconda3/envs/conda_env/lib/python3.7/site-packages/matplotlib/mpl-data/stylelib/_classic_test_patch.mplstyle.
You probably need to get an updated matplotlibrc file from
http://github.com/matplotlib/matplotlib/blob/master/matplotlibrc.template
or from the matplotlib source distribution


# Data

## GEOS Chem Runs

Using egrid, NEI, and our model emissions inventories for NO, NO2, SO2

In [None]:
# File-name patterns (2016 only) for the two GEOS-Chem output collections.
aerosol_output = 'GEOSChem.AerosolMass.2016*.nc4'
speciesconc_output = 'GEOSChem.SpeciesConc.2016*.nc4'

# Output directory of each run: eGRID validation, NEI, and our model.
MODEL_path = '../../GCrundirs/type_NA-out_conc-yr_2016-gridmod_final-validation/OutputDir/'
NEI_path = '../../GCrundirs/type_NA-out_conc-yr_2016-gtropchem/merra2_05x0625_tropchem_na/OutputDir/'
egrid_path = '../../GCrundirs/type_NA-out_conc-yr_2016-gridmod_final-validation-egrid/OutputDir/'

# One dataset per run, returned in the order (egrid, NEI, MODEL).
# NOTE(review): what import_GC_runs does with the two patterns lives in utils
# and is not visible here.
ds_egrid, ds_NEI, ds_MODEL = utils.import_GC_runs(
    egrid_path,
    NEI_path,
    MODEL_path,
    aerosol_output,
    speciesconc_output,
)



In [None]:
# Order matters: it must line up with the model_name labels used when the
# runs are concatenated (egrid, NEI, MODEL).
datasets = [ds_egrid, ds_NEI, ds_MODEL]

In [None]:
# Stack the three runs along a new, labelled `model_name` dimension.
model_index = pd.Index(['egrid', 'NEI', 'MODEL'], name='model_name')
ds = xr.concat(datasets, model_index)

In [3]:
# Species we keep for the analysis.
species_list = ['NO', 'NO2', 'SO2', 'O3', 'NOx']

# Matching variable names in the GEOS-Chem dataset, plus PM25, which carries
# no 'SpeciesConc_' prefix.
species_tracers = [f'SpeciesConc_{spec}' for spec in species_list] + ['PM25']

# NOx is not an output variable: build it as NO + NO2.
ds['SpeciesConc_NOx'] = ds['SpeciesConc_NO'] + ds['SpeciesConc_NO2']

NameError: name 'ds' is not defined

In [11]:
# Keep only the selected tracers and strip the 'SpeciesConc_' prefix so the
# variables are addressed by plain species name (PM25 is already unprefixed).
short_names = {f'SpeciesConc_{spec}': spec for spec in species_list}
poll_ds = ds[species_tracers].rename(short_names)

In [12]:
# Convert the gas-phase species from mol/mol to ppbv.
# The check on the 'units' attribute makes the cell safe to re-run: without it
# a second execution would multiply the already-converted values by 1e9 again.
for species in species_list:
    if poll_ds[species].attrs.get('units') != 'ppbv':
        poll_ds[species] *= 1e9  # convert from mol/mol to ppbv
        poll_ds[species].attrs['units'] = 'ppbv'

# Runs that MODEL is compared against.
reference_runs = ('NEI', 'egrid')

# Raw string: '\m' is not a valid escape sequence in a normal string literal
# (the resulting text is unchanged, but the interpreter warns about it).
pm_units = r'PM $\mu$g m$^{-3}$'

# MODEL minus reference for every gas-phase species (ppbv).
for species in species_list:
    for reference in reference_runs:
        dif_name = f'dif_MODEL-{reference}_{species}'
        poll_ds[dif_name] = (
            poll_ds[species].sel(model_name='MODEL')
            - poll_ds[species].sel(model_name=reference)
        )
        poll_ds[dif_name].attrs['units'] = 'ppbv'

# MODEL minus reference for PM25 (first NEI, then egrid).
for reference in reference_runs:
    dif_name = f'dif_MODEL-{reference}_PM25'
    poll_ds[dif_name] = (
        poll_ds['PM25'].sel(model_name='MODEL')
        - poll_ds['PM25'].sel(model_name=reference)
    )
    poll_ds[dif_name].attrs['units'] = pm_units

# Keep only the first vertical level (index 0). Guarded so that re-running the
# cell after `lev` has already been reduced does not raise.
if 'lev' in poll_ds.dims:
    poll_ds = poll_ds.isel(lev = 0)

In [13]:
# Save the dataset. mode='w' overwrites an existing store, so the notebook can
# be re-run; the default mode refuses to write when the store already exists.
# The trailing ';' suppresses the ZarrStore repr in the cell output.
poll_ds.to_zarr('model_NEI_egrid_GC_ds.zarr', mode='w');

<xarray.backends.zarr.ZarrStore at 0x7f09d58ab7d8>

## EPA Observational data
Data are the daily files from https://aqs.epa.gov/aqsweb/airdata/download_files.html
for the year 2016.
Species used: O3, NO, SO2, PM25 (FEM/FRM).


In [2]:
# Read every daily EPA file. The paths are sorted because glob returns them in
# arbitrary (filesystem) order, which would make the row order of the combined
# frame differ between machines.
obs_files = sorted(glob.glob('../../GEOS_CHEM/obs_data/daily*.csv'))
if not obs_files:
    raise FileNotFoundError("no files match '../../GEOS_CHEM/obs_data/daily*.csv'")

# ignore_index=True gives one clean 0..N-1 index instead of repeating each
# file's own row numbers.
EPA_obs_df = pd.concat(map(pd.read_csv, obs_files), ignore_index=True)

# Time-stamp each daily record at 12:00 of its local date.
EPA_obs_df['date'] = pd.to_datetime(EPA_obs_df['Date Local']).dt.normalize() + timedelta(hours=12)

# Round coordinates to 8 decimals.
EPA_obs_df['Longitude'] = np.round(EPA_obs_df['Longitude'], decimals = 8)
EPA_obs_df['Latitude'] = np.round(EPA_obs_df['Latitude'], decimals = 8)

# Drop the columns that are not used further on.
unused_columns = [
    'State Code', 'County Code', 'Site Num', 'Parameter Code', 'POC', 'Datum',
    'Sample Duration', 'Date Local', 'Event Type', 'Observation Count',
    'Observation Percent', '1st Max Value', '1st Max Hour', 'Address',
    'County Name', 'City Name', 'CBSA Name', 'Date of Last Change', 'Method Name',
]
EPA_obs_df = EPA_obs_df.drop(columns = unused_columns)

# Only use the SO2 1-hour 2010 pollutant standard arithmetic means: drop the
# 3-hour 1971 records.
EPA_obs_df = EPA_obs_df.loc[~(EPA_obs_df['Pollutant Standard'] == 'SO2 3-hour 1971')]
EPA_obs_df = EPA_obs_df.rename(columns = {'Parameter Name':'species'})

# Convert Ozone to ppb (factor 1e3).
EPA_obs_df.loc[EPA_obs_df['species'] == 'Ozone','Arithmetic Mean'] *= 1e3 #ppb

# Remove zero and negative means. Written as ~(x <= 0) rather than (x > 0) so
# that rows with a missing mean are kept, as before.
EPA_obs_df = EPA_obs_df.loc[~(EPA_obs_df['Arithmetic Mean'] <= 0)]



## Lists/Dicts of Variables

In [3]:
# Value levels per pollutant, plus symmetric levels for the difference fields
# ('dif', 'regional_dif').
# NOTE(review): presumably contour levels for the plots - confirm where used.
levels_dict = {'PM25':np.arange(0., 20., .5), 'SO2':np.arange(0., 5., .1), 
               'NO2':np.arange(0., 5., .1), 'NOx':np.arange(0., 5., .1), 'O3':np.arange(0., 70., 1.),
               'dif':np.arange(-.3, .31, .01), 'regional_dif':np.arange(-1.5, 1.51, .01)}


# Labels of the three GEOS-Chem runs. They must match the `model_name`
# coordinate given to the concatenated dataset, because later cells select
# with poll_ds.sel(model_name=...). Previously this list was only built in
# commented-out code, so the cells that loop over it failed with a NameError.
model_names = ['egrid', 'NEI', 'MODEL']


# Stats on Observations vs. Models

## Interpolate data to get GC run data at observational points

In [None]:
### Sample the monthly-mean model fields at the EPA monitor locations
### (nearest grid cell).
interp_columns = ['Arithmetic Mean', 'Longitude', 'Latitude','model','species','date']

# Calendar months present in the observations, restricted to
# utils.month_string. Computed once: it does not depend on model or species.
obs_months = pd.DatetimeIndex(
    EPA_obs_df.loc[EPA_obs_df.date.dt.month.isin(utils.month_string)]['date'].values
).month.unique().values

interp_frames = []
for model in model_names:
    for species in list(utils.species_dict.keys()):
        epa_species = utils.species_dict[species]

        # Distinct monitor locations for this species, kept as (lat, lon)
        # PAIRS. De-duplicating latitudes and longitudes independently breaks
        # the pairing as soon as two monitors share a latitude or a longitude
        # (wrong coordinates are combined, or the arrays differ in length).
        sites = EPA_obs_df.loc[
            EPA_obs_df['species'] == epa_species, ['Latitude', 'Longitude']
        ].drop_duplicates()
        if sites.empty:
            continue
        lats_new = sites['Latitude'].values
        lons_new = sites['Longitude'].values

        # Monthly climatology of this run/species; computed once per
        # (model, species) instead of once per month.
        monthly_mean = poll_ds.sel(model_name = model)[f'{species}'].groupby('time.month').mean()

        # Indexers sharing the 'site' dimension make .sel pick one value per
        # (lat, lon) pair rather than the full lat x lon outer product.
        lat_indexer = xr.DataArray(lats_new, dims='site')
        lon_indexer = xr.DataArray(lons_new, dims='site')

        for month in obs_months:
            data = monthly_mean.sel(month = month)
            interp_data = data.sel(lat=lat_indexer, lon=lon_indexer, method='nearest').values

            interp_frames.append(pd.DataFrame({
                'Arithmetic Mean': interp_data,
                'Longitude': lons_new,
                'Latitude': lats_new,
                'model': model,
                'species': epa_species,
                'date': month,  # month number for now
            }))

# Build the result in one concat: appending inside the loop copies the whole
# frame on every iteration, and DataFrame.append no longer exists in pandas 2.
if interp_frames:
    interp_df = pd.concat(interp_frames, sort=False, ignore_index=True)
else:
    interp_df = pd.DataFrame(columns=interp_columns)

In [None]:
# Replace the month number in 'date' with the last day of that month in 2016.
# Done for the whole column at once instead of one .loc assignment per row, and
# without relying on the frame having a 0..N-1 index.
# The guard skips the conversion when the cell is re-run on an already
# converted column.
# NOTE(review): the column is now datetime64 rather than Python datetime
# objects - confirm nothing downstream depends on the object dtype.
if not pd.api.types.is_datetime64_any_dtype(interp_df['date']):
    month_end_2016 = {
        month: datetime(2016, month, calendar.monthrange(2016, month)[1])
        for month in range(1, 13)
    }
    interp_df['date'] = pd.to_datetime(interp_df['date'].astype(int).map(month_end_2016))

## Create a monthly EPA observational dataframe

In [7]:
# One point geometry per observation row, built from its longitude/latitude.
site_points = geopandas.points_from_xy(EPA_obs_df.Longitude, EPA_obs_df.Latitude)
gdf = geopandas.GeoDataFrame(EPA_obs_df, geometry=site_points)

# WKT text of every point; used as a plain-string location key.
geometries = gdf['geometry'].apply(lambda point: point.wkt).values

In [8]:
# Attach the WKT location key and index the frame by date (the 'date' column
# is kept as well), so that rows can be grouped by calendar month.
EPA_obs_df = EPA_obs_df.assign(geometry=geometries).set_index('date', drop=False)

In [9]:
# Monthly mean of the observations for every (location, species) pair.
#
# A single groupby pass hands us each pair's rows directly, instead of
# rebuilding the same boolean mask over the full frame three times for every
# geometry x species combination (most of which have no rows at all). Groups
# are visited in sorted (geometry, species) order.
monthly_columns = ['Arithmetic Mean','latitude','longitude', 'geometry','species', 'date']
monthly_frames = []

for (geometry, species), site_obs in EPA_obs_df.groupby(['geometry', 'species'], sort=True):
    # Month-end bins on the date index of this monitor/species only, so the
    # bins span that pair's first to last month, as before.
    by_month = site_obs.groupby(pd.Grouper(freq='M'))
    data = by_month['Arithmetic Mean'].mean()
    monthly_frames.append(pd.DataFrame({
        'Arithmetic Mean': data.values,
        'latitude': by_month['Latitude'].first().values,
        'longitude': by_month['Longitude'].first().values,
        'geometry': geometry,
        'species': species,
        'date': data.index,
    }))

# One concat at the end: DataFrame.append in a loop copies the whole frame on
# each call and was removed in pandas 2.
if monthly_frames:
    monthly_EPA_df = pd.concat(monthly_frames, sort=False, ignore_index=True)
else:
    monthly_EPA_df = pd.DataFrame(columns = monthly_columns)

## Add Region Names to the dataframes and save out

In [5]:
def _assign_region(df, lon_col, lat_col):
    """Add a 'Region' column to ``df`` from the boxes in ``utils.lat_lon_dict``.

    Every row starts with the placeholder 'a'. A row whose longitude and
    latitude fall inside a region's box (bounds included) gets that region's
    key. Regions are applied in list order, so a later region overwrites an
    earlier one where boxes overlap.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label; modified in place.
    lon_col, lat_col : str
        Names of the longitude and latitude columns in ``df``.
    """
    df['Region'] = 'a'
    for region in ['SE_lat_lon','NW_lat_lon','NE_lat_lon','MW_lat_lon','SW_lat_lon']:
        # Each entry is read as [lon_min, lon_max, lat_min, lat_max].
        bounds = utils.lat_lon_dict[region]
        # `between` includes both end points by default. The explicit
        # inclusive=True was dropped because boolean values for that argument
        # are deprecated and rejected by pandas 2.
        inside = (
            df[lon_col].between(bounds[0], bounds[1])
            & df[lat_col].between(bounds[2], bounds[3])
        )
        df.loc[inside, 'Region'] = region


# Add the region to the dataframes based on the lat_lon dictionary.
# monthly_EPA_df uses lower-case coordinate column names.
_assign_region(interp_df, 'Longitude', 'Latitude')
_assign_region(EPA_obs_df, 'Longitude', 'Latitude')
_assign_region(monthly_EPA_df, 'longitude', 'latitude')



In [6]:
# Write the three tables to CSV with identical options: dates formatted as
# YYYYMMDD and the index left out.
csv_outputs = {
    'interp_df.csv': interp_df,
    'EPA_obs_df.csv': EPA_obs_df,
    'EPA_monthly_obs_df.csv': monthly_EPA_df,
}
for csv_name, frame in csv_outputs.items():
    frame.to_csv(csv_name, date_format='%Y%m%d', index=False)