In [1]:
import os
import re
import datetime
from pathlib import Path

import pandas as pd
import numpy as np
import shapefile
import pupygrib
import ogr
from osgeo import gdal
from netCDF4 import Dataset

In [2]:
class Analysis:
    ELEMENTS = ['O3', 'NO2', 'CO']
    HEIGHT_DEVIATION = 500
    DEGREE_DEVIATION = 0.1
    HEIGHT_MAX = 5000

    def __init__(self):
        self.project_folder = Path(os.path.abspath('')).parent
        self.data_folder = self.project_folder / 'data'
        self.shp_folder = self.data_folder / 'shp'
        self.nc_folder = self.data_folder / 'nc'
        
    def getDate(self, file_path):
        m = re.search(r"\d{8}", file_path)
        date = datetime.datetime.strptime(m.group(), '%Y%m%d')
        
        return date
        
    def process(self, file_, date):
        date_str = date.strftime("%Y%m%d")
        with shapefile.Reader(str(file_)) as shp_file:
            result_row = {}
            result_row['date'] = date_str

            records = shp_file.records()
            fields = shp_file.fields[1:]
            fields_names = [field[0] for field in fields]
            
            fields_names[3] = 'O3'  # rename Ozone to O3

            data = pd.DataFrame(
                np.array(records), columns=fields_names
            )
            data[['StdAltitu']] /= 3.281  # ft to meters

            points_x = np.empty(len(records))
            points_y = np.empty(len(records))

            shp = ogr.Open(str(file_.absolute()))
            layer = shp.GetLayer()
            i = 0
            while i < len(records):
                point = layer.GetFeature(i)
                geom = point.GetGeometryRef()
                points_x[i] = geom.GetPoint(0)[0]
                points_y[i] = geom.GetPoint(0)[1]
                i += 1

            data['lat'] = points_x
            data['long'] = points_y

            for element in self.ELEMENTS:
                data_clean = data.dropna(axis=1, how='all')
                if element not in data_clean.columns:
                    continue

                data_clean = data_clean.dropna(how='any', axis=0, subset=[element])

                clean_data_2km = pd.DataFrame()
                clean_data_3km = pd.DataFrame()
                clean_data_5km = pd.DataFrame()

                for index, row in data_clean.iterrows():
                    if row['StdAltitu'] > self.HEIGHT_MAX + self.HEIGHT_DEVIATION:
                        continue

                    nc_date = datetime.datetime.combine(date, datetime.datetime.min.time())
                    nc_date += datetime.timedelta(seconds=row['TimeCRef'])
                    nc_date_str = nc_date.strftime("%Y%m%d")

                    nc = 0
                    for file_ in self.nc_folder.iterdir():
                        if nc_date_str in file_.name and element in file_.name:
                            with Dataset(str(file_), 'r', format='NETCDF4') as nc_:
                                nc = nc_
                                el = nc.variables[element]
                                lons = nc.variables['lon'][:]
                                lats = nc.variables['lat'][:]
                                heights = nc.variables['height'][:]

                                # latitude index
                                if np.abs( lats - row['lat'] ).any() > self.DEGREE_DEVIATION:
                                    continue
                                else:
                                    lat_id = np.argmin( np.abs( lats - row['lat'] ) )
                                # longitude index
                                if np.abs( lons - row['long'] ).any() > self.DEGREE_DEVIATION:
                                    continue
                                else:
                                    lon_id = np.argmin( np.abs( lons - row['long'] ) )
                                # height index
                                if np.abs( heights - row['StdAltitu'] ).any() > self.HEIGHT_DEVIATION:
                                    continue
                                else:
                                    height_id = np.argmin( np.abs( heights - row['StdAltitu'] ) )

                                # time index
                                nc_hour = nc_date.strftime("%H")

                                if height_id == 5:  # if 2km
                                    clean_data_2km = clean_data_2km.append({
                                        'nc_data_'+element: float(el[np.int64(nc_hour), height_id, lat_id, lon_id]),
                                        element: row[element]
                                    }, ignore_index=True)
                                elif height_id == 6: # if 3km
                                    clean_data_3km = clean_data_3km.append({
                                        'nc_data_'+element: float(el[np.int64(nc_hour), height_id, lat_id, lon_id]),
                                        element: row[element]
                                    }, ignore_index=True)
                                elif height_id == 7: # if 5km
                                    clean_data_5km = clean_data_5km.append({
                                        'nc_data_'+element: float(el[np.int64(nc_hour), height_id, lat_id, lon_id]),
                                        element: row[element]
                                    }, ignore_index=True)  

                                else:
                                    continue

                    if not nc:
                        continue

                if not clean_data_2km.empty:
                    corr_2km = clean_data_2km['nc_data_' + element].corr(clean_data_2km[element])
                    result_row[element+'_2km'] = corr_2km
                    result_row['dots_'+element+'_2km'] = len(clean_data_2km.index)
                if not clean_data_3km.empty:
                    corr_3km = clean_data_3km['nc_data_' + element].corr(clean_data_3km[element])
                    result_row[element+'_3km'] = corr_3km
                    result_row['dots_'+element+'_3km'] = len(clean_data_3km.index)
                if not clean_data_5km.empty:
                    corr_5km = clean_data_5km['nc_data_' + element].corr(clean_data_5km[element])
                    result_row[element+'_5km'] = corr_5km
                    result_row['dots_'+element+'_5km'] = len(clean_data_5km.index)
        return result_row

    def analyse(self, date: datetime.date = None):
        # Get data
        result_data = pd.DataFrame([], columns=['date'])
        data = []
        if date:
            date_str = date.strftime("%Y%m%d")
            for file_ in self.shp_folder.iterdir():
                if date_str in file_.name and file_.suffix == '.shp':
                    result_row = self.process(file_, date)      
                    result_data = result_data.append(result_row, ignore_index=True)
        else:
            for file_ in self.shp_folder.iterdir():
                if file_.suffix == '.shp':
                    date = self.getDate(str(file_))
                    result_row = self.process(file_, date)      
                    result_data = result_data.append(result_row, ignore_index=True)
                
        return result_data

In [3]:
a = Analysis()
d = a.analyse()
print(d)

        date
0   20160112
1   20160114
2   20160215
3   20151130
4   20151202
5   20160113
6   20151201
7   20160215
8   20151201
9   20160113
10  20160216
