In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
from datetime import timedelta
from ecallisto_ng.data_fetching.get_data import extract_instrument_name, get_data
from ecallisto_ng.data_fetching.get_information import (
    get_tables,
    check_table_data_availability,
)
import random
from tqdm import tqdm

# Data Generation Radio Sunburst Detector
## Create images with bursts

In [2]:
np.random.seed(52)

In [3]:
burst_list = pd.read_excel("burst_list.xlsx").dropna(subset=["instruments"])

In [4]:
burst_list.loc[:, "instruments"] = burst_list.instruments.apply(extract_instrument_name)

In [5]:
burst_list

Unnamed: 0,date,time,type,instruments,time_start,time_end,date_start,date_end,datetime_start,datetime_end
0,20210119,02:42-02:42,3,australia_assa,02:42,02:42,20210119,20210119,2021-01-19 02:42:00,2021-01-19 02:42:00
1,20210120,12:37-12:37,3,austria_unigraz,12:37,12:37,20210120,20210120,2021-01-20 12:37:00,2021-01-20 12:37:00
2,20210120,12:37-12:37,3,humain,12:37,12:37,20210120,20210120,2021-01-20 12:37:00,2021-01-20 12:37:00
3,20210120,12:37-12:37,3,mrt1,12:37,12:37,20210120,20210120,2021-01-20 12:37:00,2021-01-20 12:37:00
4,20210120,12:37-12:37,3,southafrica_sansa,12:37,12:37,20210120,20210120,2021-01-20 12:37:00,2021-01-20 12:37:00
...,...,...,...,...,...,...,...,...,...,...
32464,20230707,10:15-10:16,4,germany_dlr,10:15,10:16,20230707,20230707,2023-07-07 10:15:00,2023-07-07 10:16:00
32465,20230707,10:15-10:16,4,norway_egersund,10:15,10:16,20230707,20230707,2023-07-07 10:15:00,2023-07-07 10:16:00
32466,20230707,10:15-10:16,4,swiss_heiterswil,10:15,10:16,20230707,20230707,2023-07-07 10:15:00,2023-07-07 10:16:00
32467,20230707,10:15-10:16,4,swiss_landschlacht,10:15,10:16,20230707,20230707,2023-07-07 10:15:00,2023-07-07 10:16:00


In [89]:
[x for x in get_tables() if "humain" in x]

['humain_59']

In [84]:
[x for x in get_tables() if "india" in x]

['india_gauri_59',
 'india_gauri_01',
 'india_nashik_59',
 'india_ooty_02',
 'india_ooty_01',
 'india_iiserp_01',
 'india_ooty_58',
 'india_udaipur_02',
 'india_udaipur_01',
 'india_nashik_01',
 'india_ooty_59']

In [7]:
### PARAMETERS ###
IMAGE_NUM_NON_BURST = 50000
IMAGE_NUM_BURST = 5000
IMAGE_LENGTH = timedelta(minutes=1)
PIXEL_PER_IMAGE_OVER_TIME = 200
PIXEL_PER_IMAGE_OVER_FREQUENCY = 200
INSTRUMENTS_TO_INCLUDE = [
    "australia_assa_01",
    "australia_assa_02",
    "australia_assa_56",
    "australia_assa_62",
    "alaska_haarp_63",
    "alaska_haarp_62" "germany_dlr_62",
    "germany_dlr_63",
    "austria_unigraz_01",
    "austria_unigraz_02",
]
###


# Filter burst list
def remove_id_from_instrument_name(instrument_name):
    return "_".join(instrument_name.split("_")[:-1])


instruments_to_include_sql_table_compatible = [
    remove_id_from_instrument_name(instrument) for instrument in INSTRUMENTS_TO_INCLUDE
]
# Drop duplicate list
instruments_to_include_sql_table_compatible = list(
    set(instruments_to_include_sql_table_compatible)
)
burst_list_filtered = burst_list[
    burst_list.instruments.isin(instruments_to_include_sql_table_compatible)
]
burst_list_filtered

Unnamed: 0,date,time,type,instruments,time_start,time_end,date_start,date_end,datetime_start,datetime_end
0,20210119,02:42-02:42,3,australia_assa,02:42,02:42,20210119,20210119,2021-01-19 02:42:00,2021-01-19 02:42:00
7,20210127,04:32-04:32,3,australia_assa,04:32,04:32,20210127,20210127,2021-01-27 04:32:00,2021-01-27 04:32:00
89,20210419,06:55-06:57,3,australia_assa,06:55,06:57,20210419,20210419,2021-04-19 06:55:00,2021-04-19 06:57:00
108,20210419,23:39-23:42,2,australia_assa,23:39,23:42,20210419,20210419,2021-04-19 23:39:00,2021-04-19 23:42:00
109,20210419,23:39-23:43,3,australia_assa,23:39,23:43,20210419,20210419,2021-04-19 23:39:00,2021-04-19 23:43:00
...,...,...,...,...,...,...,...,...,...,...
32421,20230707,00:53-00:54,3,australia_assa,00:53,00:54,20230707,20230707,2023-07-07 00:53:00,2023-07-07 00:54:00
32424,20230707,01:33-01:34,5,australia_assa,01:33,01:34,20230707,20230707,2023-07-07 01:33:00,2023-07-07 01:34:00
32427,20230707,01:36-01:36,3,australia_assa,01:36,01:36,20230707,20230707,2023-07-07 01:36:00,2023-07-07 01:36:00
32431,20230707,04:43-04:46,3,australia_assa,04:43,04:46,20230707,20230707,2023-07-07 04:43:00,2023-07-07 04:46:00


## Find good instruments

In [8]:
# Change it to true if you want to create some images of selected instrument (to e.g. select only nice instruments, manually.)
if False:
    n = 3
    burst_list_filtered = burst_list_filtered.groupby("instruments").sample(n=n)

In [9]:
burst_list_filtered

Unnamed: 0,date,time,type,instruments,time_start,time_end,date_start,date_end,datetime_start,datetime_end
0,20210119,02:42-02:42,3,australia_assa,02:42,02:42,20210119,20210119,2021-01-19 02:42:00,2021-01-19 02:42:00
7,20210127,04:32-04:32,3,australia_assa,04:32,04:32,20210127,20210127,2021-01-27 04:32:00,2021-01-27 04:32:00
89,20210419,06:55-06:57,3,australia_assa,06:55,06:57,20210419,20210419,2021-04-19 06:55:00,2021-04-19 06:57:00
108,20210419,23:39-23:42,2,australia_assa,23:39,23:42,20210419,20210419,2021-04-19 23:39:00,2021-04-19 23:42:00
109,20210419,23:39-23:43,3,australia_assa,23:39,23:43,20210419,20210419,2021-04-19 23:39:00,2021-04-19 23:43:00
...,...,...,...,...,...,...,...,...,...,...
32421,20230707,00:53-00:54,3,australia_assa,00:53,00:54,20230707,20230707,2023-07-07 00:53:00,2023-07-07 00:54:00
32424,20230707,01:33-01:34,5,australia_assa,01:33,01:34,20230707,20230707,2023-07-07 01:33:00,2023-07-07 01:34:00
32427,20230707,01:36-01:36,3,australia_assa,01:36,01:36,20230707,20230707,2023-07-07 01:36:00,2023-07-07 01:36:00
32431,20230707,04:43-04:46,3,australia_assa,04:43,04:46,20230707,20230707,2023-07-07 04:43:00,2023-07-07 04:46:00


### Create images of bursts

In [10]:
def get_data_save_as_img(
    instrument,
    start_datetime,
    end_datetime,
    time_bucket,
    agg_function="MAX",
    burst_type="no_burst",
    min_shape=(200, 200),
    data_folder="data",
):
    """
    Retrieves data for a specific instrument within a given time range, aggregates it using the specified function,
    normalizes the data, and saves it as an image file.

    Args:
        instrument (str): Name of the instrument for which data is to be retrieved.
        start_datetime (datetime.datetime): Start date and time of the data range.
        end_datetime (datetime.datetime): End date and time of the data range.
        time_bucket (str): Time granularity for data aggregation (e.g., '1H' for hourly, '30T' for every 30 minutes).
        agg_function (str, optional): Aggregation function to apply to the data. Defaults to 'MAX'.
        burst_type (str, optional): Label to be included in the file name. Defaults to 'no_burst'.
        data_folder (str, optional): Folder path where the data will be saved. Defaults to 'data'.
        min_shape (tuple, optional): Minimum shape of the image. Defaults to (200, 200).

    Returns:
        None

    Raises:
        None

    Examples:
        # Retrieve data for instrument 'instrument_name' from 'start_datetime' to 'end_datetime' and save it as an image
        get_data_save_as_img('instrument_name', start_datetime, end_datetime, '1H', 'MAX', 'no_burst', 'data')

    """
    sd_str = start_datetime.strftime("%Y-%m-%d %H:%M:%S")
    ed_str = end_datetime.strftime("%Y-%m-%d %H:%M:%S")

    # Generate path
    dir = os.path.join(data_folder, burst_type)
    if not os.path.exists(dir):
        os.makedirs(dir)

    # Create file path
    file_path = os.path.join(
        dir,
        sd_str.replace(":", "-")
        + "_"
        + ed_str.replace(":", "-")
        + "_"
        + instrument
        + "_"
        + str(time_bucket)
        + "_"
        + str(burst_type)
        + ".png",
    )
    if os.path.exists(file_path):
        print("File already exists.")
        return True

    df = get_data(
        instrument_name=instrument,
        start_datetime=sd_str,
        end_datetime=ed_str,
        timebucket=time_bucket,
        agg_function=agg_function,
    )

    img_data = df.to_numpy().astype(np.int16)
    if not img_data.shape[0] >= min_shape[0] and img_data.shape[1] >= min_shape[1]:
        print("Image shape is too small.")
        return False

    plt.imsave(file_path, img_data.T, cmap="gray")
    return True

    # return dataframe

In [11]:
image_num = 0

# Create a tqdm progress bar.
progress_bar = None
progress_bar = tqdm(total=IMAGE_NUM_BURST, desc="Processing Images", dynamic_ncols=True)

Processing Images:   0%|          | 0/5000 [00:00<?, ?it/s]

In [12]:
# Functionality to hide print statements
import os, sys


class HiddenPrints:
    def __enter__(self):
        self._original_stdout = sys.stdout
        sys.stdout = open(os.devnull, "w")

    def __exit__(self, exc_type, exc_val, exc_tb):
        sys.stdout.close()
        sys.stdout = self._original_stdout

### IMAGE GENERATION BURST

In [None]:
# Sort the burst list by instrument, type, and datetime_start to allow that the highest type is at the end and kept, when we drop duplicates (not sure if this is necessary)
burst_list_filtered = burst_list_filtered.sort_values(
    by=["instruments", "type", "datetime_start"], ascending=False
)
# Drop duplicates, keep the last (highest type)
burst_list_filtered = burst_list_filtered.drop_duplicates(
    subset=["instruments", "type", "datetime_start"], keep="last"
)
# Iterate through each row in the filtered burst list
burst_list_filtered = burst_list_filtered.sample(frac=1, random_state=42).reset_index(
    drop=True
)
# Iterate through each row in the filtered burst list
for index, burst_row in burst_list_filtered.iterrows():
    # Get the start and end datetime of the burst
    burst_start = burst_row.datetime_start
    burst_end = burst_row.datetime_end

    # Create a date range from the start to end datetime with a frequency of IMAGE_LENGTH, including the left endpoint
    burst_date_range = pd.date_range(
        burst_start, burst_end, freq=IMAGE_LENGTH, inclusive="left"
    )
    for date in burst_date_range:
        end_date = date + timedelta(minutes=1)
        # Iterate through each instrument table to include
        for instrument_table in INSTRUMENTS_TO_INCLUDE:
            if check_table_data_availability(
                instrument_table, str(date), str(end_date)
            ):
                # Attempt to retrieve the data and save it as an image
                # Parameters: instrument_table, start date, end date, x-limits, y-limits, burst category, data type
                try:
                    with HiddenPrints():
                        result = get_data_save_as_img(
                            instrument=instrument_table,
                            start_datetime=date,
                            end_datetime=end_date,
                            time_bucket=None,
                            agg_function=None,
                            burst_type=str(burst_row.type),
                            min_shape=(200, 200),
                            data_folder="data",
                        )
                    if result:
                        image_num += 1
                        progress_bar.update(1)
                        progress_bar.set_postfix(
                            {"Image": f"{instrument_table}_{date}_{end_date}"},
                            refresh=True,
                        )
                except Exception as e:
                    print(e)
            else:
                pass

        if image_num >= IMAGE_NUM_BURST:
            break
progress_bar.close()

In [181]:
# Cound number of files per folder in data:
import os

for folder in os.listdir("data"):
    print(f"{folder}: {len(os.listdir(f'data/{folder}'))}")

2: 197
6: 4496
1: 8
3: 1194
5: 19
4: 15
no_burst: 12319


### Create images of non burst

In [14]:
###
MIN_START_TIME = (
    burst_list_filtered.datetime_start.min()
)  # burst_list_filtered.datetime_start.apply(lambda dt: dt.replace(hour=8, minute=0, second=0)).min()
MAX_START_TIME = (
    burst_list_filtered.datetime_start.max() - IMAGE_LENGTH
)  # MIN_START_TIME + timedelta(hours=12) - IMAGE_LENGTH

print(MIN_START_TIME)
print("----")
print(MAX_START_TIME)


def random_date(start, end):
    """Generate a random datetime between `start` and `end`"""
    return start + timedelta(
        # Get a random amount of seconds between `start` and `end`
        minutes=random.randint(0, int((end - start).total_seconds() // 60)),
    )

2021-01-19 02:42:00
----
2023-07-07 05:28:00


In [23]:
# Initialize a counter for the number of images added
non_burst_img_processed = 0
# Create a tqdm progress bar.
progress_bar = None
progress_bar = tqdm(
    total=IMAGE_NUM_NON_BURST, desc="Processing Images", dynamic_ncols=True
)

Processing Images:   0%|          | 0/50000 [00:01<?, ?it/s]
Processing Images:   0%|          | 0/50000 [00:00<?, ?it/s]

In [51]:
random_date(MIN_START_TIME, MAX_START_TIME)

Timestamp('2021-09-09 10:32:00')

In [77]:
random_start_date = random_date(MIN_START_TIME, MAX_START_TIME)
random_start_date

Timestamp('2021-11-07 23:24:00')

In [55]:
# Continue processing until the required number of images is reached
while non_burst_img_processed < IMAGE_NUM_NON_BURST:
    # Generate a random start date between a minimum and maximum start time
    random_start_date = random_date(MIN_START_TIME, MAX_START_TIME)
    end_date = random_start_date + IMAGE_LENGTH
    # Iterate through each instrument table to include
    for instrument_table in INSTRUMENTS_TO_INCLUDE:
        if check_table_data_availability(
            instrument_table, str(random_start_date), str(end_date)
        ):
            # Remove ID from the instrument name to get the base name
            base_instrument_name = remove_id_from_instrument_name(instrument_table)

            # Filter the burst list for entries that match the current instrument's base name
            burst_list_for_instrument = burst_list_filtered[
                burst_list_filtered.instruments == base_instrument_name
            ]

            # Check that the random_start_date is not within any burst period for the current instrument
            non_burst_in_burst_df = burst_list_for_instrument[
                (burst_list_for_instrument.datetime_start <= random_start_date)
                & (random_start_date <= burst_list_for_instrument.datetime_end)
            ]

            # If the random start date falls within any burst period for the current instrument, continue to next iteration
            if len(non_burst_in_burst_df) > 0:
                continue
            else:
                try:
                    # Attempt to retrieve the data and save it as an image
                    # Parameters: instrument_table, start date, end date, x-limits, y-limits, burst category, data type
                    get_data_save_as_img(
                        instrument_table,
                        random_start_date,
                        random_start_date + timedelta(minutes=1),
                        time_bucket=None,
                        agg_function=None,
                        burst_type="no_burst",
                        min_shape=(200, 200),
                        data_folder="data",
                    )
                    # Increment the count of images processed
                    non_burst_img_processed += 1
                    progress_bar.update(1)
                    progress_bar.set_postfix(
                        {"Image": f"{instrument_table}_{random_start_date}"},
                        refresh=True,
                    )
                except ValueError as e:
                    # Handle exception: print the error and skip to next date
                    print(e)
                    pass

Processing Images:  10%|▉         | 4995/50000 [15:20:03<80:38:27,  6.45s/it, Image=australia_assa_02_2021-08-13 07:32:00]

File downloaded successfully


Processing Images:  10%|▉         | 4996/50000 [15:20:21<123:15:00,  9.86s/it, Image=australia_assa_62_2022-06-18 01:50:00]

File downloaded successfully


Processing Images:  10%|▉         | 4997/50000 [15:20:38<151:58:08, 12.16s/it, Image=australia_assa_62_2022-01-28 21:34:00]

File downloaded successfully


Processing Images:  10%|▉         | 4998/50000 [15:20:56<174:36:27, 13.97s/it, Image=australia_assa_62_2023-06-14 03:21:00]

File downloaded successfully


KeyboardInterrupt: 