In [1]:
%load_ext autoreload
%autoreload 2
%load_ext jupyter_black

In [7]:
from __future__ import annotations
import typing

import pandas as pd
import polars as pl
from typing_extensions import Self
import os
import sevir as svr
from sevir.constants import DEFAULT_PATH_TO_SEVIR, DEFAULT_CATALOG

SEVIR_CATALOG = os.path.join(DEFAULT_PATH_TO_SEVIR, DEFAULT_CATALOG)
assert os.path.exists(SEVIR_CATALOG)
%timeit pl.read_csv(SEVIR_CATALOG, use_pyarrow=True)
%timeit pd.read_csv(SEVIR_CATALOG, low_memory=False)

967 ms ± 73.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
1.2 s ± 35.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Reading the SEVIR catalog with Polars

In [8]:
# Explicit dtypes for the columns whose type we want to pin: the identifier,
# file name and image type are read as strings, the timestamp as a datetime.
string_columns = (svr.ID, svr.FILE_NAME, svr.IMG_TYPE)
column_dtypes = {
    **dict.fromkeys(string_columns, pl.Utf8),
    svr.TIME_UTC: pl.Datetime,
}

# Load the catalog through the pyarrow CSV reader and preview the first rows.
df = pl.read_csv(SEVIR_CATALOG, dtypes=column_dtypes, use_pyarrow=True)
df.head()

id,file_name,file_index,img_type,time_utc,minute_offsets,episode_id,event_id,event_type,llcrnrlat,llcrnrlon,urcrnrlat,urcrnrlon,proj,size_x,size_y,height_m,width_m,data_min,data_max,pct_missing
str,str,i64,str,datetime[μs],str,str,str,str,f64,f64,f64,f64,str,i64,i64,f64,f64,f64,f64,f64
"""R1803250502768…","""vis/2018/SEVIR…",0,"""vis""",2018-03-25 05:00:00,"""-120:-115:-110…",,,,33.216708,-91.635132,36.336627,-87.070254,"""+proj=laea +la…",768,768,384000.0,384000.0,-0.003361,0.0056,0.0
"""R1803250502767…","""vis/2018/SEVIR…",1,"""vis""",2018-03-25 05:00:00,"""-120:-115:-110…",,,,33.084309,-91.849435,36.213723,-87.301535,"""+proj=laea +la…",768,768,384000.0,384000.0,-0.003361,0.0056,0.0
"""R1803250502772…","""vis/2018/SEVIR…",2,"""vis""",2018-03-25 05:00:00,"""-120:-115:-110…",,,,46.661866,-123.509928,50.883159,-120.009277,"""+proj=laea +la…",768,768,384000.0,384000.0,-0.00287,0.005548,0.0
"""R1803251650824…","""vis/2018/SEVIR…",3,"""vis""",2018-03-25 16:50:00,"""-120:-115:-110…",,,,40.883237,-85.350213,43.686191,-79.903987,"""+proj=laea +la…",768,768,384000.0,384000.0,0.029911,0.7856,0.0
"""R1803251650762…","""vis/2018/SEVIR…",4,"""vis""",2018-03-25 16:50:00,"""-120:-115:-110…",,,,44.946047,-123.972935,49.169436,-120.575175,"""+proj=laea +la…",768,768,384000.0,384000.0,0.02856,0.79261,0.0


# Filtering unwanted rows

In [9]:
# Keep only the image types of interest, then keep only the ids for which
# every one of those image types is present in the catalog.
img_types = ["vis", "ir069"]
df = df.filter(df[svr.IMG_TYPE].is_in(img_types))
assert df[svr.IMG_TYPE].n_unique() == len(
    set(img_types)
), "at least one requested image type is absent from the catalog"

# Count the *distinct* image types per id rather than the rows per id: an id
# listed twice under the same image type (and missing the other one) has two
# rows but is still incomplete, so a plain row count would wrongly keep it.
count = df.groupby(svr.ID).agg(pl.col(svr.IMG_TYPE).n_unique().alias("count"))
f_ids = count.filter(count["count"] == len(set(img_types)))[svr.ID]
df = df.filter(df[svr.ID].is_in(f_ids))
df.head()

id,file_name,file_index,img_type,time_utc,minute_offsets,episode_id,event_id,event_type,llcrnrlat,llcrnrlon,urcrnrlat,urcrnrlon,proj,size_x,size_y,height_m,width_m,data_min,data_max,pct_missing
str,str,i64,str,datetime[μs],str,str,str,str,f64,f64,f64,f64,str,i64,i64,f64,f64,f64,f64,f64
"""R1803250502768…","""vis/2018/SEVIR…",0,"""vis""",2018-03-25 05:00:00,"""-120:-115:-110…",,,,33.216708,-91.635132,36.336627,-87.070254,"""+proj=laea +la…",768,768,384000.0,384000.0,-0.003361,0.0056,0.0
"""R1803250502767…","""vis/2018/SEVIR…",1,"""vis""",2018-03-25 05:00:00,"""-120:-115:-110…",,,,33.084309,-91.849435,36.213723,-87.301535,"""+proj=laea +la…",768,768,384000.0,384000.0,-0.003361,0.0056,0.0
"""R1803250502772…","""vis/2018/SEVIR…",2,"""vis""",2018-03-25 05:00:00,"""-120:-115:-110…",,,,46.661866,-123.509928,50.883159,-120.009277,"""+proj=laea +la…",768,768,384000.0,384000.0,-0.00287,0.005548,0.0
"""R1803251650824…","""vis/2018/SEVIR…",3,"""vis""",2018-03-25 16:50:00,"""-120:-115:-110…",,,,40.883237,-85.350213,43.686191,-79.903987,"""+proj=laea +la…",768,768,384000.0,384000.0,0.029911,0.7856,0.0
"""R1803251650762…","""vis/2018/SEVIR…",4,"""vis""",2018-03-25 16:50:00,"""-120:-115:-110…",,,,44.946047,-123.972935,49.169436,-120.575175,"""+proj=laea +la…",768,768,384000.0,384000.0,0.02856,0.79261,0.0
