In [1]:
%load_ext jupyter_black

In [6]:
from datetime import datetime, timezone
from glob import glob
from typing import Iterator

import pandas as pd
import xarray as xr

In [48]:
%%time


def open_sat_data(glob_like_path: str) -> xr.Dataset:
    """Open every file matching a glob pattern and stack them along ``validTime``.

    Each file is opened individually, given a length-1 ``validTime`` dimension
    whose value is built from the file's ``year``/``month``/``day``/``hour``/
    ``minute``/``second`` global attributes, and the results are concatenated.

    Parameters
    ----------
    glob_like_path : str
        Pattern handed to ``glob`` to find the input files.

    Returns
    -------
    xr.Dataset
        The per-file datasets concatenated on ``validTime``; the coordinate
        holds POSIX timestamps (float seconds since the epoch).

    Raises
    ------
    FileNotFoundError
        If the pattern matches no files.
    """
    date_parts = ("year", "month", "day", "hour", "minute", "second")
    # glob() returns paths in arbitrary order; sort so the concatenation order
    # is the same on every run
    paths = sorted(glob(glob_like_path))
    if not paths:
        raise FileNotFoundError(f"no files match the pattern {glob_like_path!r}")

    def generate() -> Iterator[xr.Dataset]:
        for file in paths:
            # files are opened one at a time because they do not appear to
            # carry a validTime dimension of their own
            ds = xr.open_dataset(file, engine="netcdf4", chunks="auto", use_cftime=True)
            # build the timestamp from the year..second attributes; pin it to
            # UTC so the epoch value does not depend on the machine's local
            # time zone (a naive datetime's .timestamp() uses local time)
            # NOTE(review): assumes the file attributes are expressed in UTC - confirm
            ts = datetime(*(int(ds.attrs[x]) for x in date_parts), tzinfo=timezone.utc)
            # add a validTime dimension holding the timestamp and yield the dataset
            yield ds.expand_dims({"validTime": [ts.timestamp()]})

    # concatenate the per-file datasets on the validTime dimension
    return xr.concat(generate(), dim="validTime")


# load every satellite file matched by the archive pattern into one dataset
sat_file_pattern = "/workspaces/griblib/archive/*_nc"
ds = open_sat_data(sat_file_pattern)
ds

CPU times: user 1.88 s, sys: 20 ms, total: 1.9 s
Wall time: 1.9 s


Unnamed: 0,Array,Chunk
Bytes,2.21 MiB,62.86 kiB
Shape,"(36, 16093)","(1, 16093)"
Count,362 Tasks,36 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 2.21 MiB 62.86 kiB Shape (36, 16093) (1, 16093) Count 362 Tasks 36 Chunks Type float32 numpy.ndarray",16093  36,

Unnamed: 0,Array,Chunk
Bytes,2.21 MiB,62.86 kiB
Shape,"(36, 16093)","(1, 16093)"
Count,362 Tasks,36 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.21 MiB,62.86 kiB
Shape,"(36, 16093)","(1, 16093)"
Count,362 Tasks,36 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 2.21 MiB 62.86 kiB Shape (36, 16093) (1, 16093) Count 362 Tasks 36 Chunks Type float32 numpy.ndarray",16093  36,

Unnamed: 0,Array,Chunk
Bytes,2.21 MiB,62.86 kiB
Shape,"(36, 16093)","(1, 16093)"
Count,362 Tasks,36 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.21 MiB,62.86 kiB
Shape,"(36, 16093)","(1, 16093)"
Count,362 Tasks,36 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 2.21 MiB 62.86 kiB Shape (36, 16093) (1, 16093) Count 362 Tasks 36 Chunks Type float32 numpy.ndarray",16093  36,

Unnamed: 0,Array,Chunk
Bytes,2.21 MiB,62.86 kiB
Shape,"(36, 16093)","(1, 16093)"
Count,362 Tasks,36 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.21 MiB,62.86 kiB
Shape,"(36, 16093)","(1, 16093)"
Count,362 Tasks,36 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 2.21 MiB 62.86 kiB Shape (36, 16093) (1, 16093) Count 362 Tasks 36 Chunks Type float32 numpy.ndarray",16093  36,

Unnamed: 0,Array,Chunk
Bytes,2.21 MiB,62.86 kiB
Shape,"(36, 16093)","(1, 16093)"
Count,362 Tasks,36 Chunks
Type,float32,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,2.21 MiB,62.86 kiB
Shape,"(36, 16093)","(1, 16093)"
Count,362 Tasks,36 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 2.21 MiB 62.86 kiB Shape (36, 16093) (1, 16093) Count 362 Tasks 36 Chunks Type float32 numpy.ndarray",16093  36,

Unnamed: 0,Array,Chunk
Bytes,2.21 MiB,62.86 kiB
Shape,"(36, 16093)","(1, 16093)"
Count,362 Tasks,36 Chunks
Type,float32,numpy.ndarray


In [8]:
def to_datetime(df: pd.DataFrame, time_column="validTime") -> pd.DataFrame:
    """Return a copy of ``df`` with ``time_column`` cast to ``datetime64[s]``.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing ``time_column``; it is left unmodified.
    time_column : str, default "validTime"
        Name of the column to convert.

    Returns
    -------
    pd.DataFrame
        A new frame in which ``time_column`` holds datetimes; all other
        columns are carried over unchanged.
    """
    # .assign builds a new frame, so the caller's DataFrame is not mutated
    return df.assign(**{time_column: df[time_column].astype("datetime64[s]")})


# flatten the dataset into a table and move the index levels back into
# ordinary columns so validTime can be converted
sat_table = ds.to_dataframe().reset_index()
# turn the POSIX validTime values into datetime objects
sat_table = to_datetime(sat_table)
# rebuild the index from the time and position columns
sat_table = sat_table.set_index(["validTime", "MSL_alt", "GEO_lat", "GEO_lon"])
# discard rows in which every remaining column is NaN
df = sat_table.dropna(axis=0, how="all")
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,OCC_azi,TEC_cal,ELEC_dens
validTime,MSL_alt,GEO_lat,GEO_lon,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-01-09 12:47:50,0.105839,-21.061815,-17.407858,8.873467,100.379578,-132302.171875
2020-01-09 17:32:50,0.148841,38.609314,-108.179146,26.567747,49.669487,1928.172485
2020-01-09 16:40:24,0.168985,-50.168381,71.412903,175.372772,27.965189,43412.187500
2020-01-09 13:14:51,0.221322,-56.066246,92.076050,-132.743744,36.371437,11886.642578
2020-01-09 13:40:14,0.266322,45.914852,164.135376,-163.007645,69.038605,55002.234375
...,...,...,...,...,...,...
2020-01-09 15:57:54,835.093933,30.720913,-101.458092,-56.755951,0.625098,29550.847656
2020-01-09 12:31:29,835.285461,42.187160,-55.072266,-26.970804,0.214227,8663.916992
2020-01-09 17:37:06,835.338074,31.747385,-127.079155,-45.724815,0.502910,22299.529297
2020-01-09 17:32:50,835.372925,46.844166,-132.669479,18.745573,0.263199,10966.423828
