# Package Installations

In [1]:
#conda install -c conda-forge gdal

In [2]:
#conda install -c conda-forge geopandas

In [3]:
#conda install -c conda-forge earthpy

In [4]:
#conda install -c conda-forge cloudpathlib

In [5]:
#conda install -c conda-forge pyhdf

In [6]:
#conda install -c conda-forge profilehooks

In [7]:
!pip install netCDF4

In [8]:
#Import Packages. 
import sys
import os
import re
import warnings
import glob
import time
from datetime import datetime
from datetime import timedelta
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
#from shapely.geometry import mapping, box
# import geopandas as gpd
# import earthpy as et
# import earthpy.spatial as es
# import earthpy.plot as ep
# from osgeo import gdal
import pandas as pd
import pyproj

from cloudpathlib import S3Path, S3Client
from pyhdf.SD import SD, SDC

import boto3
import io
import pickle

# from profilehooks import profile

warnings.simplefilter('ignore')

# Prepping labels data

In [9]:
train_labels = pd.read_csv('train_labels.csv',delimiter='|')

In [10]:
def min_lon_polygon(row):
    bounds_str = row['wkt'].replace("POLYGON ((","").replace("))",",")
    bounds = bounds_str.split(",")
    lons=[]
    for b in bounds:
        lon_lat = b.strip().split(" ")
        if(len(lon_lat[0])>0):
            lons.append(float(lon_lat[0]))
    return min(lons)

def max_lon_polygon(row):
    bounds_str = row['wkt'].replace("POLYGON ((","").replace("))",",")
    bounds = bounds_str.split(",")
    lons=[]
    for b in bounds:
        lon_lat = b.strip().split(" ")
        if(len(lon_lat[0])>0):
            lons.append(float(lon_lat[0]))
    return max(lons)

In [11]:
def min_lat_polygon(row):
    bounds_str = row['wkt'].replace("POLYGON ((","").replace("))",",")
    bounds = bounds_str.split(",")
    lats=[]
    for b in bounds:
        lon_lat = b.strip().split(" ")
        if(len(b)>0 and len(lon_lat[1])>0):
            lats.append(float(lon_lat[1]))
    return min(lats)

def max_lat_polygon(row):
    bounds_str = row['wkt'].replace("POLYGON ((","").replace("))",",")
    bounds = bounds_str.split(",")
    lats=[]
    for b in bounds:
        lon_lat = b.strip().split(" ")
        if(len(b)>0 and len(lon_lat[1])>0):
            lats.append(float(lon_lat[1]))
    return max(lats)

In [12]:
def location_code(row):
    if(row['location']=='Los Angeles (SoCAB)'):
        return 'la'
    if(row['location']=='Delhi'):
        return 'dl'
    if(row['location']=='Taipei'):
        return 'tpe'

In [13]:
def utc_year(row):
    dt_str=row['datetime'][:row['datetime'].find("T")]
    dt= datetime.strptime(dt_str,"%Y-%m-%d")
    return dt.year
def utc_month(row):
    dt_str=row['datetime'][:row['datetime'].find("T")]
    dt= datetime.strptime(dt_str,"%Y-%m-%d")
    return dt.month
def utc_date(row):
    return row['datetime'][:row['datetime'].find("T")]

In [14]:
train_labels["min_lon"] = train_labels.apply(lambda row: min_lon_polygon(row), axis=1)
train_labels["max_lon"] = train_labels.apply(lambda row: max_lon_polygon(row), axis=1)
train_labels["min_lat"] = train_labels.apply(lambda row: min_lat_polygon(row), axis=1)
train_labels["max_lat"] = train_labels.apply(lambda row: max_lat_polygon(row), axis=1)
train_labels["loc"] = train_labels.apply(lambda row: location_code(row), axis=1)
train_labels["utc_date"] = train_labels.apply(lambda row: utc_date(row), axis=1)

In [15]:
train_labels[0:4]

Unnamed: 0,datetime,grid_id,value,location,tz,wkt,min_lon,max_lon,min_lat,max_lat,loc,utc_date
0,2018-02-01T08:00:00Z,3S31A,11.4,Los Angeles (SoCAB),Etc/GMT+8,POLYGON ((-117.9338248256995 33.79558357488509...,-117.978741,-117.933825,33.795584,33.832902,la,2018-02-01
1,2018-02-01T08:00:00Z,A2FBI,17.0,Los Angeles (SoCAB),Etc/GMT+8,POLYGON ((-117.3948356552278 33.98201108613195...,-117.439751,-117.394836,33.982011,34.019248,la,2018-02-01
2,2018-02-01T08:00:00Z,DJN0F,11.1,Los Angeles (SoCAB),Etc/GMT+8,POLYGON ((-117.6194144762577 34.09367183102137...,-117.66433,-117.619414,34.093672,34.130859,la,2018-02-01
3,2018-02-01T08:00:00Z,E5P9N,22.1,Los Angeles (SoCAB),Etc/GMT+8,POLYGON ((-117.4846671836398 33.98201108613195...,-117.529583,-117.484667,33.982011,34.019248,la,2018-02-01


# PM25 Satellite Metadata

In [16]:
pm25_sat_metadata = pd.read_csv('pm25_satellite_metadata.csv')

In [17]:
pm25_sat_metadata = pm25_sat_metadata[pm25_sat_metadata['product']=='misr']

In [18]:
pm25_sat_metadata[0:4]

Unnamed: 0,granule_id,time_start,time_end,product,location,split,us_url,eu_url,as_url,cksum,granule_size
6704,20180203T193400_misr_la_0.nc,2018-02-03T18:37:03.000Z,2018-02-03 19:34:00+00:00,misr,la,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,1711515017,36930406
6705,20180205T192128_misr_la_0.nc,2018-02-05T18:24:29.000Z,2018-02-05 19:21:28+00:00,misr,la,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,2333466395,35593605
6706,20180207T190854_misr_la_0.nc,2018-02-07T18:11:57.000Z,2018-02-07 19:08:54+00:00,misr,la,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,2932143265,30668976
6707,20180210T193931_misr_la_0.nc,2018-02-10T18:42:33.000Z,2018-02-10 19:39:31+00:00,misr,la,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,3564723192,42459135


In [19]:
pm25_sat_metadata = pm25_sat_metadata.rename(columns={"location": "loc"})

In [20]:
def utc_date_end(row):
    return row['time_end'][:row['time_end'].find(" ")]

In [21]:
pm25_sat_metadata['utc_date']=pm25_sat_metadata.apply(lambda row: utc_date_end(row), axis=1)

In [22]:
pm25_sat_metadata['time_end'].dtype

dtype('O')

In [23]:
pm25_sat_metadata.groupby(['loc','time_end']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,granule_id,time_start,product,split,us_url,eu_url,as_url,cksum,granule_size,utc_date
loc,time_end,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
dl,2017-11-06 06:29:11+00:00,1,1,1,1,1,1,1,1,1,1
dl,2017-11-08 06:17:01+00:00,1,1,1,1,1,1,1,1,1,1
dl,2017-11-15 06:23:44+00:00,1,1,1,1,1,1,1,1,1,1
dl,2017-11-22 06:30:23+00:00,1,1,1,1,1,1,1,1,1,1
dl,2017-11-24 06:18:08+00:00,1,1,1,1,1,1,1,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...
tpe,2021-07-28 02:57:27+00:00,1,1,1,1,1,1,1,1,1,1
tpe,2021-08-04 03:04:00+00:00,1,1,1,1,1,1,1,1,1,1
tpe,2021-08-11 03:10:39+00:00,1,1,1,1,1,1,1,1,1,1
tpe,2021-08-13 02:58:27+00:00,1,1,1,1,1,1,1,1,1,1


# Merge PM25 Satellite Metadata and Labels Data

In [24]:
#datetime(02/01 6am)

#time_end (02/01 noon)   -   datetime(02/02 6am)   YES

#time_end (02/02 noon)   -   datetime(02/02 6am)   NO

test=0
if(not test):
    pm25_sat_metadata = pm25_sat_metadata[pm25_sat_metadata['split']=='train']
    all_metadata_new = pd.merge(train_labels, pm25_sat_metadata, on=['loc'])
    all_metadata_new = all_metadata_new[(all_metadata_new['time_end'].astype('datetime64[ns]')<=
                                         all_metadata_new['datetime'].astype('datetime64[ns]') + timedelta(days=1)) &
                                        (all_metadata_new['time_end'].astype('datetime64[ns]') >=
                                         all_metadata_new['datetime'].astype('datetime64[ns]'))
                                       ]

else:
    all_metadata_new = pm25_sat_metadata
    all_metadata_new = all_metadata_new[(all_metadata_new['time_end'].astype('datetime64[ns]')<=
                                         all_metadata_new['datetime'].astype('datetime64[ns]') + timedelta(days=1)) &
                                        (all_metadata_new['time_end'].astype('datetime64[ns]') >=
                                         all_metadata_new['datetime'].astype('datetime64[ns]'))
                                       ]

In [25]:
len(pm25_sat_metadata)

788

In [26]:
len(train_labels)

34312

In [27]:
len(all_metadata_new)

8502

# Write AOD data averaged for every grid (of interest) boundary to S3 bucket

In [28]:
all_metadata = all_metadata_new

In [29]:
len(all_metadata)

8502

In [30]:
all_metadata.rename(columns={'datetime': 'pm25_reading_date', 'time_end': 'misr_reading_end'}, inplace=True)


In [31]:
all_metadata['datetime_dt'] = pd.to_datetime(all_metadata['pm25_reading_date'], errors='coerce')
all_metadata['misr_read_end_dt'] = pd.to_datetime(all_metadata['misr_reading_end'], errors='coerce')

year_2018_metadata = all_metadata[all_metadata['datetime_dt'].dt.year==2018]

In [32]:
len(year_2018_metadata)

1698

In [33]:
year_2019_metadata = all_metadata[all_metadata['datetime_dt'].dt.year==2019]

In [34]:
len(year_2019_metadata)

2872

In [35]:
year_2020_metadata = all_metadata[all_metadata['datetime_dt'].dt.year==2020]

In [36]:
len(year_2020_metadata)

3932

In [37]:
all_metadata['datetime_dt'].dt.year.unique()

array([2018, 2019, 2020])

In [38]:
all_metadata[0:6]

Unnamed: 0,pm25_reading_date,grid_id,value,location,tz,wkt,min_lon,max_lon,min_lat,max_lat,...,product,split,us_url,eu_url,as_url,cksum,granule_size,utc_date_y,datetime_dt,misr_read_end_dt
4524,2018-02-03T08:00:00Z,3S31A,27.2,Los Angeles (SoCAB),Etc/GMT+8,POLYGON ((-117.9338248256995 33.79558357488509...,-117.978741,-117.933825,33.795584,33.832902,...,misr,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,1711515017,36930406,2018-02-03,2018-02-03 08:00:00+00:00,2018-02-03 19:34:00+00:00
4901,2018-02-03T08:00:00Z,A2FBI,18.0,Los Angeles (SoCAB),Etc/GMT+8,POLYGON ((-117.3948356552278 33.98201108613195...,-117.439751,-117.394836,33.982011,34.019248,...,misr,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,1711515017,36930406,2018-02-03,2018-02-03 08:00:00+00:00,2018-02-03 19:34:00+00:00
5278,2018-02-03T08:00:00Z,DJN0F,27.3,Los Angeles (SoCAB),Etc/GMT+8,POLYGON ((-117.6194144762577 34.09367183102137...,-117.66433,-117.619414,34.093672,34.130859,...,misr,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,1711515017,36930406,2018-02-03,2018-02-03 08:00:00+00:00,2018-02-03 19:34:00+00:00
5655,2018-02-03T08:00:00Z,E5P9N,32.3,Los Angeles (SoCAB),Etc/GMT+8,POLYGON ((-117.4846671836398 33.98201108613195...,-117.529583,-117.484667,33.982011,34.019248,...,misr,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,1711515017,36930406,2018-02-03,2018-02-03 08:00:00+00:00,2018-02-03 19:34:00+00:00
6032,2018-02-03T08:00:00Z,FRITQ,45.3,Los Angeles (SoCAB),Etc/GMT+8,POLYGON ((-118.1584036467294 33.83290166381627...,-118.203319,-118.158404,33.832902,33.870203,...,misr,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,1711515017,36930406,2018-02-03,2018-02-03 08:00:00+00:00,2018-02-03 19:34:00+00:00
6409,2018-02-03T08:00:00Z,H96P6,12.8,Los Angeles (SoCAB),Etc/GMT+8,POLYGON ((-118.5177297603772 34.16803061743935...,-118.562646,-118.51773,34.168031,34.205185,...,misr,train,s3://drivendata-competition-airathon-public-us...,s3://drivendata-competition-airathon-public-eu...,s3://drivendata-competition-airathon-public-as...,1711515017,36930406,2018-02-03,2018-02-03 08:00:00+00:00,2018-02-03 19:34:00+00:00


In [39]:
year_2018_metadata_la = all_metadata[(all_metadata['datetime_dt'].dt.year==2018) & 
                                    (all_metadata['location']=='Los Angeles (SoCAB)')]

In [40]:
len(year_2018_metadata_la)

1113

# Extracting MISR Data

In [41]:
import netCDF4 as nc

In [42]:
vars = ['Latitude','Longitude',
 'Elevation','Year','Day_Of_Year','Month','Day','Hour','Minute','Land_Water_Retrieval_Type',
 'Aerosol_Optical_Depth','Aerosol_Optical_Depth_Uncertainty','Angstrom_Exponent_550_860nm',
 'Absorption_Aerosol_Optical_Depth','Nonspherical_Aerosol_Optical_Depth',
 'Small_Mode_Aerosol_Optical_Depth','Medium_Mode_Aerosol_Optical_Depth','Large_Mode_Aerosol_Optical_Depth']

In [43]:
def get_misr_data(file_name, vars):
    ds = nc.Dataset(file_name)
    output = []
    for var in vars:
        #print(var)
        mdata = ds.groups['4.4_KM_PRODUCTS'].variables[var][:].astype(np.double)
        data = np.ma.getdata(mdata)

        if var in ['Latitude','Longitude','Year','Day_Of_Year','Month','Day','Hour','Minute']:
            data[data==mdata.get_fill_value()]=np.nan

        flat_data = data.ravel()
        #print(data.shape)
        if len(output)==0:
            output = flat_data
        else:
            output = np.column_stack((output, flat_data))
    return pd.DataFrame(output, columns = vars)

In [44]:
def misr_avg_5km(misr_file_path, min_lon, max_lon, min_lat, max_lat, grid_id, 
                 misr_reading_end, pm25_reading_date, s3_cli):
    
    start_time = time.time()
    misr_file = S3Path(misr_file_path, client=s3_cli)
    file_name = misr_file.fspath
    
    misr_data = get_misr_data(file_name, vars)
    print("--- Time taken to get misr data for one file %s seconds ---" % (time.time() - start_time))

    
    misr_avg_5km_df = misr_data.loc[(misr_data['Longitude']>=float(min_lon)) & (misr_data['Latitude']>=float(min_lat))
                            & (misr_data['Longitude']<=float(max_lon)) & (misr_data['Latitude']<=float(max_lat))]
    print(len(misr_avg_5km_df))
    #aod_avg_5km_df_meaned = aod_avg_5km_df[aod_avg_5km_df.columns.tolist()].mean()
    misr_avg_5km_df['grid_id']=grid_id
    misr_avg_5km_df['misr_reading_end'] = misr_reading_end
    misr_avg_5km_df['pm25_reading_date'] = pm25_reading_date
    return misr_avg_5km_df#.to_frame().transpose()

In [None]:
start_time = time.time()

# all_5km_dfs = []

s3_cli = S3Client(no_sign_request=True)

for row in year_2019_metadata.iterrows():
    row_data = row[1]
    print(row_data['us_url'])
    if(".nc" in row_data['us_url']):
        misr_avg_5km_df = misr_avg_5km(row_data['us_url'], 
                                  row_data['min_lon'], row_data['max_lon'], row_data['min_lat'], row_data['max_lat'], 
                                  row_data['grid_id'],row_data['misr_reading_end'],row_data['pm25_reading_date'], s3_cli)
#         all_5km_dfs.append(misr_avg_5km_df)
# all_2018_misr_df = pd.concat(all_5km_dfs, axis=0)
        output_fname = 'misr/misr_'+str(row_data['grid_id'])+'_'+str(row_data['misr_reading_end'])+'.parquet'
        misr_avg_5km_df.to_parquet('s3://particulate-articulate-capstone/train/'+output_fname)

print("--- Time taken write 2019 grid level misr data to parquet - %s seconds ---" % (time.time() - start_time))

s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180203T193400_misr_la_0.nc
--- Time taken to get misr data for one file 2.6131463050842285 seconds ---
1
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180203T193400_misr_la_0.nc
--- Time taken to get misr data for one file 2.135622262954712 seconds ---
0
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180203T193400_misr_la_0.nc
--- Time taken to get misr data for one file 2.137969732284546 seconds ---
1
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180203T193400_misr_la_0.nc
--- Time taken to get misr data for one file 1.8318934440612793 seconds ---
1
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180203T193400_misr_la_0.nc
--- Time taken to get misr data for one file 1.8798468112945557 seconds ---
1
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180203T193400_misr_la_0.nc
--- Time taken to get misr dat

s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180214T191422_misr_la_0.nc
--- Time taken to get misr data for one file 2.3178179264068604 seconds ---
1
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180214T191422_misr_la_0.nc
--- Time taken to get misr data for one file 2.335273504257202 seconds ---
1
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180214T191422_misr_la_0.nc
--- Time taken to get misr data for one file 2.31870698928833 seconds ---
0
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180214T191422_misr_la_0.nc
--- Time taken to get misr data for one file 2.3249683380126953 seconds ---
1
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180214T191422_misr_la_0.nc
--- Time taken to get misr data for one file 2.3192434310913086 seconds ---
0
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180214T191422_misr_la_0.nc
--- Time taken to get misr data

s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180302T191237_misr_la_0.nc
--- Time taken to get misr data for one file 2.3315372467041016 seconds ---
1
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180302T191237_misr_la_0.nc
--- Time taken to get misr data for one file 2.3226993083953857 seconds ---
1
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180302T191237_misr_la_0.nc
--- Time taken to get misr data for one file 2.3350272178649902 seconds ---
1
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180302T191237_misr_la_0.nc
--- Time taken to get misr data for one file 2.3750016689300537 seconds ---
0
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180302T191237_misr_la_0.nc
--- Time taken to get misr data for one file 2.3687281608581543 seconds ---
1
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180302T191237_misr_la_0.nc
--- Time taken to get misr d

s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180314T193559_misr_la_0.nc
--- Time taken to get misr data for one file 2.3064162731170654 seconds ---
0
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180314T193559_misr_la_0.nc
--- Time taken to get misr data for one file 2.312077522277832 seconds ---
0
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180314T193559_misr_la_0.nc
--- Time taken to get misr data for one file 2.316472291946411 seconds ---
0
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180314T193559_misr_la_0.nc
--- Time taken to get misr data for one file 2.3446059226989746 seconds ---
1
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180314T193559_misr_la_0.nc
--- Time taken to get misr data for one file 2.3162269592285156 seconds ---
0
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180314T193559_misr_la_0.nc
--- Time taken to get misr dat

s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180325T191624_misr_la_0.nc
--- Time taken to get misr data for one file 2.284024477005005 seconds ---
1
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180325T191624_misr_la_0.nc
--- Time taken to get misr data for one file 2.2514126300811768 seconds ---
1
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180325T191624_misr_la_0.nc
--- Time taken to get misr data for one file 2.2868142127990723 seconds ---
1
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180325T191624_misr_la_0.nc
--- Time taken to get misr data for one file 2.317610740661621 seconds ---
1
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180325T191624_misr_la_0.nc
--- Time taken to get misr data for one file 2.3098268508911133 seconds ---
1
s3://drivendata-competition-airathon-public-us/pm25/train/misr/2018/20180325T191624_misr_la_0.nc
--- Time taken to get misr dat