In [3]:
%load_ext jupyter_black

# Transform a time series of JSON files into a DataFrame
- working with geopandas
- datatypes
- masking
- I/O with the parquet file type


In [4]:
import json
import os
from datetime import datetime
from glob import glob
from pathlib import Path
from typing import Any, Iterator, NewType
from warnings import warn

import geopandas as gpd
import numpy as np
import pandas as pd

# Project root: the part of the working directory that precedes "/notebooks".
# NOTE: if the cwd does not contain "/notebooks" (kernel started elsewhere, or
# a non-"/" path separator) the split is a no-op and the cwd itself is used.
notebook_dir = os.getcwd().split("/notebooks")[0]
# directory the probsevere json files are read from and the parquet file is written to
archives = f"{notebook_dir}/archive"
# Static-typing alias for a parsed feature-collection dict. The value type is
# typing.Any -- the builtin function `any` is not a type.
FeatureCollection = NewType("FeatureCollection", dict[str, Any])

In [5]:
def iterpaths(path: str) -> Iterator[Path]:
    """
    Lazily yield a Path object for every file name matching ``path``.

    ``path`` is a glob pattern, so shell-style wildcards (``*``, ``?``,
    ``[seq]``) can be used to pick up several files in a directory at once.
    """
    # glob() returns a list of the path names that match the pattern.
    # Unlike fnmatch, file names starting with a dot are special cases that
    # are not matched by '*' and '?'.
    # glob() is called without recursive=True here, so '**' is not expanded
    # across subdirectories.
    for match in glob(path):
        yield Path(match)


def open_probsevere(globpath: str) -> pd.DataFrame:
    """
    Load every json file matching ``globpath`` into a single frame.

    Each file is parsed as a feature collection: its ``"features"`` entry is
    turned into a GeoDataFrame and its ``"validTime"`` entry (format
    ``%Y%m%d_%H%M%S %Z``) is parsed and stored in a ``validTime`` column.
    The per-file frames are concatenated and indexed by
    ``["validTime", "ID"]``.

    Files that are not valid json are skipped with a warning.

    Raises
    ------
    ValueError
        If no file matched ``globpath`` or none of the matches could be decoded.
    KeyError
        If a decoded file lacks the ``"features"`` or ``"validTime"`` key.
    """
    frames = []
    # glob order depends on the file system; sort so the row order is reproducible
    for path in sorted(iterpaths(globpath)):
        try:
            # only the read happens inside the `with`, so the handle is closed
            # before the (slower) frame construction starts
            with path.open("rb") as f:
                fc: FeatureCollection = json.load(f)
        except json.JSONDecodeError:
            warn(f"json error decoding {path}")
            continue
        df = gpd.GeoDataFrame.from_features(fc["features"])
        df["validTime"] = datetime.strptime(fc["validTime"], "%Y%m%d_%H%M%S %Z")
        frames.append(df)

    if not frames:
        # pd.concat would raise a ValueError here as well ("No objects to
        # concatenate"); raise the same type with a message naming the pattern
        raise ValueError(f"no readable json files matched {globpath!r}")

    return pd.concat(frames).set_index(["validTime", "ID"])

In [12]:
# takes roughly 10 seconds to read and process all of the files
df = open_probsevere(archives + "/probsevere/*.json")
# every property column comes back with the `object` dtype, so let's fix that
df.dtypes

geometry             geometry
MUCAPE                 object
MLCAPE                 object
MLCIN                  object
EBSHEAR                object
SRH01KM                object
MEANWIND_1-3kmAGL      object
MESH                   object
VIL_DENSITY            object
FLASH_RATE             object
FLASH_DENSITY          object
MAXLLAZ                object
P98LLAZ                object
P98MLAZ                object
MAXRC_EMISS            object
MAXRC_ICECF            object
WETBULB_0C_HGT         object
PWAT                   object
CAPE_M10M30            object
LJA                    object
SIZE                   object
AVG_BEAM_HGT           object
MOTION_EAST            object
MOTION_SOUTH           object
PS                     object
dtype: object

In [14]:
df.shape

(630751, 25)

In [13]:
# the columns ["MAXRC_EMISS", "MAXRC_ICECF", "AVG_BEAM_HGT", "geometry"] are either a string or object type and cannot be cast as a float
# this demonstrates boolean mask indexing: isin() flags each column whose name is in the list
mask = df.columns.isin(["MAXRC_EMISS", "MAXRC_ICECF", "AVG_BEAM_HGT", "geometry"])
# to use this method your mask should be the same shape as the thing you are going to mask
assert mask.shape == df.columns.shape
# the mask is just an array of bool values
mask

array([ True, False, False, False, False, False, False, False, False,
       False, False, False, False, False,  True,  True, False, False,
       False, False, False,  True, False, False, False])

In [15]:
# the mask can be inverted with ~, the bitwise NOT operator (True <-> False)
~mask

array([False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True, False, False,  True,  True,
        True,  True,  True, False,  True,  True,  True])

In [16]:
# again, since our mask is the same shape as our columns, we can use it to select the columns we want to convert to floating point
float_cols = df.columns[~mask]
float_cols

Index(['MUCAPE', 'MLCAPE', 'MLCIN', 'EBSHEAR', 'SRH01KM', 'MEANWIND_1-3kmAGL',
       'MESH', 'VIL_DENSITY', 'FLASH_RATE', 'FLASH_DENSITY', 'MAXLLAZ',
       'P98LLAZ', 'P98MLAZ', 'WETBULB_0C_HGT', 'PWAT', 'CAPE_M10M30', 'LJA',
       'SIZE', 'MOTION_EAST', 'MOTION_SOUTH', 'PS'],
      dtype='object')

In [17]:
# float_cols is used on both sides: to select the floatable columns and to set the float32 result back into the original dataframe
df[float_cols] = df[float_cols].astype(np.float32)
df.dtypes

geometry             geometry
MUCAPE                float32
MLCAPE                float32
MLCIN                 float32
EBSHEAR               float32
SRH01KM               float32
MEANWIND_1-3kmAGL     float32
MESH                  float32
VIL_DENSITY           float32
FLASH_RATE            float32
FLASH_DENSITY         float32
MAXLLAZ               float32
P98LLAZ               float32
P98MLAZ               float32
MAXRC_EMISS            object
MAXRC_ICECF            object
WETBULB_0C_HGT        float32
PWAT                  float32
CAPE_M10M30           float32
LJA                   float32
SIZE                  float32
AVG_BEAM_HGT           object
MOTION_EAST           float32
MOTION_SOUTH          float32
PS                    float32
dtype: object

In [7]:
# lets wrap all that in a function
def to_numeric(
    df: pd.DataFrame,
    dont_float: list[str] = ["MAXRC_EMISS", "MAXRC_ICECF", "AVG_BEAM_HGT", "geometry"],
    dtype=np.float32,
) -> pd.DataFrame:
    """
    Return a copy of ``df`` with every column not in ``dont_float`` cast to ``dtype``.

    Parameters
    ----------
    df
        Frame to convert. It is not modified.
    dont_float
        Names of the columns to leave untouched. The list is only read,
        never mutated, so sharing the default between calls is harmless.
    dtype
        Target dtype handed to ``DataFrame.astype``.

    Returns
    -------
    pd.DataFrame
        A new frame with the same columns and index as ``df``.

    Any error ``astype`` raises for a value that cannot be converted is
    propagated unchanged.
    """
    mask = df.columns.isin(dont_float)
    float_cols = df.columns[~mask]
    # astype with a {column: dtype} mapping builds a new frame instead of
    # assigning back into `df`, so the caller's frame keeps its dtypes
    return df.astype(dict.fromkeys(float_cols, dtype))

In [8]:
# the whole workflow written as one method chain
(
    # read every json file matched by the glob pattern into a single dataframe
    open_probsevere(archives + "/probsevere/*.json")
    # DataFrame.pipe hands the frame to our function so the chain can keep going
    .pipe(to_numeric)
    # finally write the converted frame out as a parquet file
    .to_parquet(archives + "/probsevere.parquet")
)

In [10]:
# previously it took nearly 15 seconds to open a days worth of data, format it and save the output file.
# the parquet file type reads much faster .2 seconds reading with pandas 1.6 seconds with geopandas because of the geometry object creation
df: gpd.GeoDataFrame = gpd.read_parquet(archives + "/probsevere.parquet")
df

Unnamed: 0_level_0,Unnamed: 1_level_0,geometry,MUCAPE,MLCAPE,MLCIN,EBSHEAR,SRH01KM,MEANWIND_1-3kmAGL,MESH,VIL_DENSITY,FLASH_RATE,...,MAXRC_ICECF,WETBULB_0C_HGT,PWAT,CAPE_M10M30,LJA,SIZE,AVG_BEAM_HGT,MOTION_EAST,MOTION_SOUTH,PS
validTime,ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2022-03-06 20:20:36,144463,"POLYGON ((-86.53000 37.45000, -86.49000 37.450...",642.0,620.0,-7.0,35.799999,89.0,30.600000,0.20,1.56,2.0,...,,10.6,1.6,171.0,0.0,376.0,3.59 kft / 1.10 km,17.049,-2.970,3.0
2022-03-06 20:20:36,144656,"POLYGON ((-84.80000 37.72000, -84.78000 37.710...",555.0,433.0,-1.0,36.200001,99.0,33.599998,0.11,1.37,4.0,...,,9.6,1.4,123.0,0.0,343.0,5.68 kft / 1.73 km,13.076,3.596,3.0
2022-03-06 20:20:36,144883,"POLYGON ((-83.73000 38.11000, -83.72000 38.100...",307.0,164.0,-32.0,30.799999,73.0,31.799999,0.00,0.66,0.0,...,,9.9,1.4,30.0,0.0,894.0,3.62 kft / 1.10 km,13.282,-2.977,1.0
2022-03-06 20:20:36,144910,"POLYGON ((-84.38000 39.42000, -84.36000 39.420...",1120.0,724.0,-2.0,54.799999,88.0,39.200001,0.04,0.62,0.0,...,1911Z 0.2/min (strong),7.3,1.0,234.0,0.0,78.0,1.96 kft / 0.60 km,10.986,-5.576,13.0
2022-03-06 20:20:36,144930,"POLYGON ((-81.54000 38.55000, -81.48000 38.550...",220.0,44.0,-52.0,29.500000,91.0,34.299999,0.00,0.46,1.0,...,,10.3,1.3,23.0,0.0,142.0,0.71 kft / 0.22 km,14.374,0.124,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-03-31 14:50:41,44622,"POLYGON ((-83.11000 33.21000, -83.07000 33.210...",525.0,164.0,-98.0,40.700001,250.0,44.900002,0.00,0.26,0.0,...,,11.0,1.8,190.0,0.0,72.0,1.96 kft / 0.60 km,13.602,-7.175,2.0
2022-03-31 14:50:41,44623,"POLYGON ((-82.40000 33.01000, -82.38000 33.010...",426.0,307.0,-13.0,53.799999,270.0,56.000000,0.00,0.78,0.0,...,,11.5,1.8,132.0,0.0,54.0,4.40 kft / 1.34 km,11.029,-6.008,4.0
2022-03-31 14:50:41,44624,"POLYGON ((-83.59000 31.82000, -83.52000 31.820...",724.0,240.0,-61.0,51.099998,303.0,48.400002,0.00,0.33,0.0,...,,12.2,1.9,223.0,0.0,65.0,4.07 kft / 1.24 km,11.029,-12.156,4.0
2022-03-31 14:50:41,44625,"POLYGON ((-84.84000 29.31000, -84.82000 29.300...",1715.0,1364.0,0.0,52.299999,243.0,46.000000,0.00,0.52,0.0,...,,12.2,2.2,316.0,0.0,61.0,6.46 kft / 1.97 km,9.599,-5.908,6.0


In [11]:
# rebind df to a copy; the x/y columns below are added to that copy
df = df.copy()
# if we wanted to index each storm by its geometric center,
# geopandas exposes that through the `centroid` property
cent = df["geometry"].centroid
# from which we can take the x and y coordinates
df["x"] = cent.x
df["y"] = cent.y
# append them to the existing index and drop the geometry column
# (the result is only displayed here, it is not assigned back to df)
df.set_index(["x", "y"], append=True).drop(columns=["geometry"])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,MUCAPE,MLCAPE,MLCIN,EBSHEAR,SRH01KM,MEANWIND_1-3kmAGL,MESH,VIL_DENSITY,FLASH_RATE,FLASH_DENSITY,...,MAXRC_ICECF,WETBULB_0C_HGT,PWAT,CAPE_M10M30,LJA,SIZE,AVG_BEAM_HGT,MOTION_EAST,MOTION_SOUTH,PS
validTime,ID,x,y,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1
2022-03-06 20:20:36,144463,-86.501830,37.358443,642.0,620.0,-7.0,35.799999,89.0,30.600000,0.20,1.56,2.0,0.13,...,,10.6,1.6,171.0,0.0,376.0,3.59 kft / 1.10 km,17.049,-2.970,3.0
2022-03-06 20:20:36,144656,-84.865939,37.577009,555.0,433.0,-1.0,36.200001,99.0,33.599998,0.11,1.37,4.0,0.37,...,,9.6,1.4,123.0,0.0,343.0,5.68 kft / 1.73 km,13.076,3.596,3.0
2022-03-06 20:20:36,144883,-84.038261,37.978728,307.0,164.0,-32.0,30.799999,73.0,31.799999,0.00,0.66,0.0,0.10,...,,9.9,1.4,30.0,0.0,894.0,3.62 kft / 1.10 km,13.282,-2.977,1.0
2022-03-06 20:20:36,144910,-84.362949,39.375179,1120.0,724.0,-2.0,54.799999,88.0,39.200001,0.04,0.62,0.0,0.00,...,1911Z 0.2/min (strong),7.3,1.0,234.0,0.0,78.0,1.96 kft / 0.60 km,10.986,-5.576,13.0
2022-03-06 20:20:36,144930,-81.550645,38.498519,220.0,44.0,-52.0,29.500000,91.0,34.299999,0.00,0.46,1.0,0.05,...,,10.3,1.3,23.0,0.0,142.0,0.71 kft / 0.22 km,14.374,0.124,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2022-03-31 14:50:41,44622,-83.097721,33.163732,525.0,164.0,-98.0,40.700001,250.0,44.900002,0.00,0.26,0.0,0.00,...,,11.0,1.8,190.0,0.0,72.0,1.96 kft / 0.60 km,13.602,-7.175,2.0
2022-03-31 14:50:41,44623,-82.425368,32.957489,426.0,307.0,-13.0,53.799999,270.0,56.000000,0.00,0.78,0.0,0.00,...,,11.5,1.8,132.0,0.0,54.0,4.40 kft / 1.34 km,11.029,-6.008,4.0
2022-03-31 14:50:41,44624,-83.549126,31.786214,724.0,240.0,-61.0,51.099998,303.0,48.400002,0.00,0.33,0.0,0.03,...,,12.2,1.9,223.0,0.0,65.0,4.07 kft / 1.24 km,11.029,-12.156,4.0
2022-03-31 14:50:41,44625,-84.860947,29.267088,1715.0,1364.0,0.0,52.299999,243.0,46.000000,0.00,0.52,0.0,0.00,...,,12.2,2.2,316.0,0.0,61.0,6.46 kft / 1.97 km,9.599,-5.908,6.0
