In [9]:
%matplotlib inline

%matplotlib inline
%load_ext autoreload
%autoreload 2

import numpy as np

import xarray as xr

import regionmask

import pandas as pd

import matplotlib.pyplot as plt
import cartopy.crs as ccrs
from matplotlib import cm
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
import cartopy.feature as cfeat
from scipy.interpolate import RectBivariateSpline
from datetime import datetime, timedelta

import calendar

import glob

import sys
sys.path.append('../')
import utils

np.seterr(invalid='ignore'); # disable a warning from matplotlib and cartopy

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Data

## GEOS Chem Runs

Using egrid, NEI, and our model emissions inventories for NO, NO2, SO2

In [2]:
egrid_path = '../../GCrundirs/type_NA-out_conc-yr_2016-gridmod_final-validation-egrid/OutputDir/'
NEI_path = '../../GCrundirs/type_NA-out_conc-yr_2016-gtropchem/merra2_05x0625_tropchem_na/OutputDir/'
MODEL_path = '../../GCrundirs/type_NA-out_conc-yr_2016-gridmod_final-validation/OutputDir/'
speciesconc_output = 'GEOSChem.SpeciesConc.2016*.nc4'
aerosol_output = 'GEOSChem.AerosolMass.2016*.nc4'

ds_egrid, ds_NEI, ds_MODEL = utils.import_GC_runs(egrid_path, NEI_path, MODEL_path, aerosol_output, speciesconc_output)



In [3]:
datasets = [
    ds_egrid,
    ds_NEI,
    ds_MODEL
]

In [4]:
ds = utils.combine_calculate_GC_runs(datasets)

In [5]:
xr.Dataset.to_netcdf(ds, 'GC_ds.nc4') #save the dataset 

In [2]:
ds = xr.open_dataset('./GC_ds.nc4')

## EPA Observational data
data is from https://aqs.epa.gov/aqsweb/airdata/download_files.html
for the year 2016
choosing O3, NO, SO2, PM25 (FEM/FRM)


In [4]:
EPA_obs_df = pd.concat(map(pd.read_csv, glob.glob('../../GEOS_CHEM/obs_data/daily*.csv')))
EPA_obs_df['date'] = pd.to_datetime(EPA_obs_df['Date Local'])
EPA_obs_df['date'] = EPA_obs_df['date'].dt.normalize() + timedelta(hours=12)
EPA_obs_df['Longitude'] = np.round(EPA_obs_df['Longitude'], decimals = 8)
EPA_obs_df['Latitude'] = np.round(EPA_obs_df['Latitude'], decimals = 8)

EPA_obs_df = EPA_obs_df.drop(columns = ['State Code','County Code','Site Num','Parameter Code','POC','Datum','Sample Duration','Date Local', 'Event Type',
       'Observation Count', 'Observation Percent','1st Max Value', '1st Max Hour', 'Address', 'County Name', 'City Name',
       'CBSA Name', 'Date of Last Change', 'Method Name'])

#only use the SO2 1 hour 2010 pollutant standard arithmetic means and drop the 3-hour 1971 arithmetic means
EPA_obs_df = EPA_obs_df.loc[~(EPA_obs_df['Pollutant Standard'] == 'SO2 3-hour 1971')]
EPA_obs_df = EPA_obs_df.rename(columns = {'Parameter Name':'species'})

#convert Ozone to ppb
EPA_obs_df.loc[EPA_obs_df['species'] == 'Ozone','Arithmetic Mean'] *= 1e3 #ppb
EPA_obs_df = EPA_obs_df.loc[~(EPA_obs_df['Arithmetic Mean'] <= 0)]

## Lists/Dicts of Variables

In [5]:
#define Lat and Lon of the nested grid US
levels_dict = {'PM25':np.arange(0., 20., .5), 'SO2':np.arange(0., 5., .1), 
               'NO2':np.arange(0., 5., .1), 'NOx':np.arange(0., 5., .1), 'O3':np.arange(0., 70., 1.),
               'dif':np.arange(-.3, .31, .01), 'regional_dif':np.arange(-1.5, 1.51, .01)}


model_names = []
for d in datasets:    
    model_names.append(d.attrs['name'])


NameError: name 'datasets' is not defined

In [6]:
model_names = ['egrid','NEI','MODEL']

# Stats on Observations vs. Models

## Interpolate data to get GC run data at observational points

In [None]:
### interpolate data
interp_df = pd.DataFrame(columns=['Arithmetic Mean', 'Longitude', 'Latitude','model','species','date'])

for model in model_names:
    for species in list(utils.species_dict.keys()):
        for month in pd.DatetimeIndex(EPA_obs_df.loc[EPA_obs_df.date.dt.month.isin(utils.month_string)]['date'].values).month.unique().values:
            #lat and lon in radians
            lats = np.array(ds['lat']) #*np.pi / 180) # in radians
            lons = np.array(ds['lon'])#*np.pi / 180) # in radians
            #data selected for date
            data = ds[f'{model}_{species}'].groupby('time.month').mean().sel(month = month)
            
            #new lat and lon in radians
            lats_new = EPA_obs_df.loc[(EPA_obs_df['species'] == utils.species_dict[species]) & (EPA_obs_df.date.dt.month.isin(utils.month_string))]['Latitude'].unique()#*np.pi / 180. # in radians
            lons_new = (EPA_obs_df.loc[(EPA_obs_df['species'] == utils.species_dict[species]) & (EPA_obs_df.date.dt.month.isin(utils.month_string))]['Longitude'].unique())#*np.pi / 180. # in radians
            
            #interpolation function
            #interp_spline= RectBivariateSpline(lats, lons, data)
            #interp_data = interp_spline(lats_new,lons_new,grid = False)
            interp_data = data.sel(lat=lats_new, lon=lons_new, method='nearest')
            
            tmp_df = pd.DataFrame({'Arithmetic Mean':interp_data, 'Longitude':lons_new, 'Latitude':lats_new, 'model': model, 'species': utils.species_dict[species], 'date': month})
            interp_df = interp_df.append(tmp_df, sort=False, ignore_index=True)

In [97]:
xtest = 38.3
ytest = -129.9

In [99]:
data.sel(lat=xtest, lon=ytest, method='nearest')

In [10]:
for i in range(0, len(interp_df)):
    interp_df.loc[i,('date')] = datetime(2016,interp_df['date'][i],calendar.monthrange(2016,interp_df['date'][i])[1])

In [11]:
interp_df.to_csv('interp_df.csv', date_format='%Y%m%d', index=False)
EPA_obs_df.to_csv('EPA_obs_df.csv', date_format='%Y%m%d', index=False)


In [14]:
EPA_obs_df.head()

Unnamed: 0,Latitude,Longitude,species,Pollutant Standard,Units of Measure,Arithmetic Mean,AQI,Method Code,Local Site Name,State Name,date
0,33.553056,-86.815,Sulfur dioxide,SO2 1-hour 2010,Parts per billion,0.413636,1.0,560.0,North Birmingham,Alabama,2016-01-01 12:00:00
1,33.553056,-86.815,Sulfur dioxide,SO2 1-hour 2010,Parts per billion,0.663636,1.0,560.0,North Birmingham,Alabama,2016-01-02 12:00:00
2,33.553056,-86.815,Sulfur dioxide,SO2 1-hour 2010,Parts per billion,0.863636,1.0,560.0,North Birmingham,Alabama,2016-01-03 12:00:00
3,33.553056,-86.815,Sulfur dioxide,SO2 1-hour 2010,Parts per billion,0.51,3.0,560.0,North Birmingham,Alabama,2016-01-04 12:00:00
4,33.553056,-86.815,Sulfur dioxide,SO2 1-hour 2010,Parts per billion,0.75,3.0,560.0,North Birmingham,Alabama,2016-01-05 12:00:00
