In [1]:
import pandas as pd
import zipfile
import glob

from tqdm.notebook import tqdm

# Pandas options
pd.set_option('display.max_colwidth', None)

Here we get a list of all the successfully downloaded files. We will check which data we have and will update our dataset accordingly.

In [2]:
# read all files in folder
path = "../../data/raw/ecallisto_ng_downloaded/"
files = glob.glob(path + "*.parquet")
files[:5]

['../../data/raw/ecallisto_ng_downloaded/australia_assa_02_2021-04-12 06-12-00_2021-04-12 06-13-00_None_None.parquet',
 '../../data/raw/ecallisto_ng_downloaded/australia_assa_02_2021-06-29 23-32-00_2021-06-29 23-33-00_None_None.parquet',
 '../../data/raw/ecallisto_ng_downloaded/swiss_landschlacht_01_2021-08-05 13-28-00_2021-08-05 13-29-00_None_None.parquet',
 '../../data/raw/ecallisto_ng_downloaded/swiss_landschlacht_01_2021-06-21 07-15-00_2021-06-21 07-16-00_None_None.parquet',
 '../../data/raw/ecallisto_ng_downloaded/alaska_haarp_62_2023-06-13 00-53-00_2023-06-13 00-54-00_None_None.parquet']

We split the filenames into a dataframe, which has the important information.

In [3]:
files = [f.split("/")[-1] for f in files]
files = pd.DataFrame(files, columns=["File Name"])
files["Instrument"] = files["File Name"].str.split("_").str[0:3].str.join("_")
files["Start"] = pd.to_datetime(files["File Name"].str.split("_").str[3], format="%Y-%m-%d %H-%M-%S")
files["End"] = pd.to_datetime(files["File Name"].str.split("_").str[4], format="%Y-%m-%d %H-%M-%S")
files

Unnamed: 0,File Name,Instrument,Start,End
0,australia_assa_02_2021-04-12 06-12-00_2021-04-12 06-13-00_None_None.parquet,australia_assa_02,2021-04-12 06:12:00,2021-04-12 06:13:00
1,australia_assa_02_2021-06-29 23-32-00_2021-06-29 23-33-00_None_None.parquet,australia_assa_02,2021-06-29 23:32:00,2021-06-29 23:33:00
2,swiss_landschlacht_01_2021-08-05 13-28-00_2021-08-05 13-29-00_None_None.parquet,swiss_landschlacht_01,2021-08-05 13:28:00,2021-08-05 13:29:00
3,swiss_landschlacht_01_2021-06-21 07-15-00_2021-06-21 07-16-00_None_None.parquet,swiss_landschlacht_01,2021-06-21 07:15:00,2021-06-21 07:16:00
4,alaska_haarp_62_2023-06-13 00-53-00_2023-06-13 00-54-00_None_None.parquet,alaska_haarp_62,2023-06-13 00:53:00,2023-06-13 00:54:00
...,...,...,...,...
64174,australia_assa_02_2021-02-14 09-09-00_2021-02-14 09-10-00_None_None.parquet,australia_assa_02,2021-02-14 09:09:00,2021-02-14 09:10:00
64175,alaska_haarp_62_2023-02-20 20-12-00_2023-02-20 20-13-00_None_None.parquet,alaska_haarp_62,2023-02-20 20:12:00,2023-02-20 20:13:00
64176,australia_assa_02_2021-09-26 05-53-00_2021-09-26 05-54-00_None_None.parquet,australia_assa_02,2021-09-26 05:53:00,2021-09-26 05:54:00
64177,alaska_haarp_62_2023-04-08 21-54-00_2023-04-08 21-55-00_None_None.parquet,alaska_haarp_62,2023-04-08 21:54:00,2023-04-08 21:55:00


Here we merge the downloaded data to with the dataframe. This will be our available data.

In [4]:
sunbursts = pd.read_csv("../../data/processed/sunburst_images_predownload.csv")

sunbursts["Start"] = sunbursts["Start"].astype("str")
sunbursts["End"] = sunbursts["End"].astype("str")

files["Start"] = files["Start"].astype("str")
files["End"] = files["End"].astype("str")

sunbursts = sunbursts.merge(files, on=["Instrument", "Start", "End"], how="right")
sunbursts

Unnamed: 0,Classification,Instrument,Start,End,File Name_x,Extension,File Name_y
0,no_burst,australia_assa_02,2021-04-12 06:12:00,2021-04-12 06:13:00,2021-04-12 06-12-00_2021-04-12 06-13-00_australia_assa_02_None_no_burst,png,australia_assa_02_2021-04-12 06-12-00_2021-04-12 06-13-00_None_None.parquet
1,no_burst,australia_assa_02,2021-06-29 23:32:00,2021-06-29 23:33:00,2021-06-29 23-32-00_2021-06-29 23-33-00_australia_assa_02_None_no_burst,png,australia_assa_02_2021-06-29 23-32-00_2021-06-29 23-33-00_None_None.parquet
2,no_burst,swiss_landschlacht_01,2021-08-05 13:28:00,2021-08-05 13:29:00,2021-08-05 13-28-00_2021-08-05 13-29-00_swiss_landschlacht_01_None_no_burst,png,swiss_landschlacht_01_2021-08-05 13-28-00_2021-08-05 13-29-00_None_None.parquet
3,no_burst,swiss_landschlacht_01,2021-06-21 07:15:00,2021-06-21 07:16:00,2021-06-21 07-15-00_2021-06-21 07-16-00_swiss_landschlacht_01_None_no_burst,png,swiss_landschlacht_01_2021-06-21 07-15-00_2021-06-21 07-16-00_None_None.parquet
4,no_burst,alaska_haarp_62,2023-06-13 00:53:00,2023-06-13 00:54:00,2023-06-13 00-53-00_2023-06-13 00-54-00_alaska_haarp_62_None_no_burst,png,alaska_haarp_62_2023-06-13 00-53-00_2023-06-13 00-54-00_None_None.parquet
...,...,...,...,...,...,...,...
64197,no_burst,australia_assa_02,2021-02-14 09:09:00,2021-02-14 09:10:00,2021-02-14 09-09-00_2021-02-14 09-10-00_australia_assa_02_None_no_burst,png,australia_assa_02_2021-02-14 09-09-00_2021-02-14 09-10-00_None_None.parquet
64198,no_burst,alaska_haarp_62,2023-02-20 20:12:00,2023-02-20 20:13:00,2023-02-20 20-12-00_2023-02-20 20-13-00_alaska_haarp_62_None_no_burst,png,alaska_haarp_62_2023-02-20 20-12-00_2023-02-20 20-13-00_None_None.parquet
64199,no_burst,australia_assa_02,2021-09-26 05:53:00,2021-09-26 05:54:00,2021-09-26 05-53-00_2021-09-26 05-54-00_australia_assa_02_None_no_burst,png,australia_assa_02_2021-09-26 05-53-00_2021-09-26 05-54-00_None_None.parquet
64200,no_burst,alaska_haarp_62,2023-04-08 21:54:00,2023-04-08 21:55:00,2023-04-08 21-54-00_2023-04-08 21-55-00_alaska_haarp_62_None_no_burst,png,alaska_haarp_62_2023-04-08 21-54-00_2023-04-08 21-55-00_None_None.parquet


Some cleanup.

In [5]:
sunbursts = sunbursts[["Classification", "Instrument", "Start", "End", "File Name_y"]].copy()
sunbursts.columns = ["Classification", "Instrument", "Start", "End", "File Name"]
sunbursts

Unnamed: 0,Classification,Instrument,Start,End,File Name
0,no_burst,australia_assa_02,2021-04-12 06:12:00,2021-04-12 06:13:00,australia_assa_02_2021-04-12 06-12-00_2021-04-12 06-13-00_None_None.parquet
1,no_burst,australia_assa_02,2021-06-29 23:32:00,2021-06-29 23:33:00,australia_assa_02_2021-06-29 23-32-00_2021-06-29 23-33-00_None_None.parquet
2,no_burst,swiss_landschlacht_01,2021-08-05 13:28:00,2021-08-05 13:29:00,swiss_landschlacht_01_2021-08-05 13-28-00_2021-08-05 13-29-00_None_None.parquet
3,no_burst,swiss_landschlacht_01,2021-06-21 07:15:00,2021-06-21 07:16:00,swiss_landschlacht_01_2021-06-21 07-15-00_2021-06-21 07-16-00_None_None.parquet
4,no_burst,alaska_haarp_62,2023-06-13 00:53:00,2023-06-13 00:54:00,alaska_haarp_62_2023-06-13 00-53-00_2023-06-13 00-54-00_None_None.parquet
...,...,...,...,...,...
64197,no_burst,australia_assa_02,2021-02-14 09:09:00,2021-02-14 09:10:00,australia_assa_02_2021-02-14 09-09-00_2021-02-14 09-10-00_None_None.parquet
64198,no_burst,alaska_haarp_62,2023-02-20 20:12:00,2023-02-20 20:13:00,alaska_haarp_62_2023-02-20 20-12-00_2023-02-20 20-13-00_None_None.parquet
64199,no_burst,australia_assa_02,2021-09-26 05:53:00,2021-09-26 05:54:00,australia_assa_02_2021-09-26 05-53-00_2021-09-26 05-54-00_None_None.parquet
64200,no_burst,alaska_haarp_62,2023-04-08 21:54:00,2023-04-08 21:55:00,alaska_haarp_62_2023-04-08 21-54-00_2023-04-08 21-55-00_None_None.parquet


We can see, that some data was classified multiple times? 

In [6]:
# Show sunbursts with duplicate file names
sunbursts[sunbursts["File Name"].duplicated(keep=False)].sort_values("File Name")

Unnamed: 0,Classification,Instrument,Start,End,File Name
7130,2,australia_assa_02,2021-07-24 00:14:00,2021-07-24 00:15:00,australia_assa_02_2021-07-24 00-14-00_2021-07-24 00-15-00_None_None.parquet
7129,3,australia_assa_02,2021-07-24 00:14:00,2021-07-24 00:15:00,australia_assa_02_2021-07-24 00-14-00_2021-07-24 00-15-00_None_None.parquet
19862,4,australia_assa_02,2021-09-08 00:08:00,2021-09-08 00:09:00,australia_assa_02_2021-09-08 00-08-00_2021-09-08 00-09-00_None_None.parquet
19863,2,australia_assa_02,2021-09-08 00:08:00,2021-09-08 00:09:00,australia_assa_02_2021-09-08 00-08-00_2021-09-08 00-09-00_None_None.parquet
33627,2,australia_assa_02,2021-09-08 00:09:00,2021-09-08 00:10:00,australia_assa_02_2021-09-08 00-09-00_2021-09-08 00-10-00_None_None.parquet
33626,4,australia_assa_02,2021-09-08 00:09:00,2021-09-08 00:10:00,australia_assa_02_2021-09-08 00-09-00_2021-09-08 00-10-00_None_None.parquet
16563,2,australia_assa_02,2021-09-08 00:10:00,2021-09-08 00:11:00,australia_assa_02_2021-09-08 00-10-00_2021-09-08 00-11-00_None_None.parquet
16562,4,australia_assa_02,2021-09-08 00:10:00,2021-09-08 00:11:00,australia_assa_02_2021-09-08 00-10-00_2021-09-08 00-11-00_None_None.parquet
2233,4,australia_assa_02,2021-09-08 00:11:00,2021-09-08 00:12:00,australia_assa_02_2021-09-08 00-11-00_2021-09-08 00-12-00_None_None.parquet
2234,2,australia_assa_02,2021-09-08 00:11:00,2021-09-08 00:12:00,australia_assa_02_2021-09-08 00-11-00_2021-09-08 00-12-00_None_None.parquet


We save the data list to a new csv.

In [7]:
sunbursts.to_csv("../../data/processed/sunburst_images.csv", index=False)

And zip the available data.

In [8]:
with zipfile.ZipFile("../../data/raw/ecallisto_ng_zipped.zip", "w", zipfile.ZIP_DEFLATED) as zipf:
    for i, file in tqdm(sunbursts.iterrows(), total=len(sunbursts), desc="Writing files"):
        zipf.write(path + file["File Name"], arcname=f"{file['Classification']}/{file['File Name']}")

Writing files:   0%|          | 0/64202 [00:00<?, ?it/s]