In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
from datetime import timedelta
from ecallisto_ng.data_fetching.get_data import extract_instrument_name, get_data
from ecallisto_ng.data_fetching.get_information import get_tables, get_table_names_with_data_between_dates, check_table_data_availability
import random

# Data Generation Radio Sunburst Detector
## Create images with bursts

In [2]:
np.random.seed(52)

In [3]:
burst_list = pd.read_excel('burst_list.xlsx').dropna(subset=['instruments'])

In [4]:
burst_list.loc[:, 'instruments'] = burst_list.instruments.apply(extract_instrument_name)

In [5]:
burst_list

Unnamed: 0,date,time,type,instruments,time_start,time_end,date_start,date_end,datetime_start,datetime_end
0,20210119,02:42-02:42,3,australia_assa,02:42,02:42,20210119,20210119,2021-01-19 02:42:00,2021-01-19 02:42:00
1,20210120,12:37-12:37,3,austria_unigraz,12:37,12:37,20210120,20210120,2021-01-20 12:37:00,2021-01-20 12:37:00
2,20210120,12:37-12:37,3,humain,12:37,12:37,20210120,20210120,2021-01-20 12:37:00,2021-01-20 12:37:00
3,20210120,12:37-12:37,3,mrt1,12:37,12:37,20210120,20210120,2021-01-20 12:37:00,2021-01-20 12:37:00
4,20210120,12:37-12:37,3,southafrica_sansa,12:37,12:37,20210120,20210120,2021-01-20 12:37:00,2021-01-20 12:37:00
...,...,...,...,...,...,...,...,...,...,...
32464,20230707,10:15-10:16,4,germany_dlr,10:15,10:16,20230707,20230707,2023-07-07 10:15:00,2023-07-07 10:16:00
32465,20230707,10:15-10:16,4,norway_egersund,10:15,10:16,20230707,20230707,2023-07-07 10:15:00,2023-07-07 10:16:00
32466,20230707,10:15-10:16,4,swiss_heiterswil,10:15,10:16,20230707,20230707,2023-07-07 10:15:00,2023-07-07 10:16:00
32467,20230707,10:15-10:16,4,swiss_landschlacht,10:15,10:16,20230707,20230707,2023-07-07 10:15:00,2023-07-07 10:16:00


In [6]:
[x for x in get_tables() if 'australia' in x]

['australia_assa_02',
 'australia_assa_63',
 'australia_assa_01',
 'australia_assa_56',
 'australia_lmro_59',
 'australia_assa_57',
 'australia_assa_62',
 'australia_assa_60']

In [7]:
### PARAMETERS ###
IMAGE_LENGTH = timedelta(minutes=1)
PIXEL_PER_IMAGE_OVER_TIME = 200
PIXEL_PER_IMAGE_OVER_FREQUENCY = 200
INSTRUMENTS_TO_INCLUDE = ['australia_assa_01', ]
TOTAL_IMAGE_NUM = 1000
###
time_bucket = IMAGE_LENGTH.total_seconds

# Filter burst list
def remove_id_from_instrument_name(instrument_name):
    return '_'.join(instrument_name.split('_')[:-1])

instruments_to_include_sql_table_compatible = [remove_id_from_instrument_name(instrument) for instrument in INSTRUMENTS_TO_INCLUDE]
# Drop duplicate list
instruments_to_include_sql_table_compatible = list(set(instruments_to_include_sql_table_compatible))
burst_list_filtered = burst_list[burst_list.instruments.isin(instruments_to_include_sql_table_compatible)]
burst_list_filtered

Unnamed: 0,date,time,type,instruments,time_start,time_end,date_start,date_end,datetime_start,datetime_end
0,20210119,02:42-02:42,3,australia_assa,02:42,02:42,20210119,20210119,2021-01-19 02:42:00,2021-01-19 02:42:00
7,20210127,04:32-04:32,3,australia_assa,04:32,04:32,20210127,20210127,2021-01-27 04:32:00,2021-01-27 04:32:00
89,20210419,06:55-06:57,3,australia_assa,06:55,06:57,20210419,20210419,2021-04-19 06:55:00,2021-04-19 06:57:00
108,20210419,23:39-23:42,2,australia_assa,23:39,23:42,20210419,20210419,2021-04-19 23:39:00,2021-04-19 23:42:00
109,20210419,23:39-23:43,3,australia_assa,23:39,23:43,20210419,20210419,2021-04-19 23:39:00,2021-04-19 23:43:00
...,...,...,...,...,...,...,...,...,...,...
32421,20230707,00:53-00:54,3,australia_assa,00:53,00:54,20230707,20230707,2023-07-07 00:53:00,2023-07-07 00:54:00
32424,20230707,01:33-01:34,5,australia_assa,01:33,01:34,20230707,20230707,2023-07-07 01:33:00,2023-07-07 01:34:00
32427,20230707,01:36-01:36,3,australia_assa,01:36,01:36,20230707,20230707,2023-07-07 01:36:00,2023-07-07 01:36:00
32431,20230707,04:43-04:46,3,australia_assa,04:43,04:46,20230707,20230707,2023-07-07 04:43:00,2023-07-07 04:46:00


## Find good instruments

In [8]:
# Change it to true if you want to create some images of selected instrument (to e.g. select only nice instruments, manually.)
if False:
    n = 3
    burst_list_filtered = burst_list_filtered.groupby('instruments').sample(n=n)

In [9]:
burst_list_filtered

Unnamed: 0,date,time,type,instruments,time_start,time_end,date_start,date_end,datetime_start,datetime_end
0,20210119,02:42-02:42,3,australia_assa,02:42,02:42,20210119,20210119,2021-01-19 02:42:00,2021-01-19 02:42:00
7,20210127,04:32-04:32,3,australia_assa,04:32,04:32,20210127,20210127,2021-01-27 04:32:00,2021-01-27 04:32:00
89,20210419,06:55-06:57,3,australia_assa,06:55,06:57,20210419,20210419,2021-04-19 06:55:00,2021-04-19 06:57:00
108,20210419,23:39-23:42,2,australia_assa,23:39,23:42,20210419,20210419,2021-04-19 23:39:00,2021-04-19 23:42:00
109,20210419,23:39-23:43,3,australia_assa,23:39,23:43,20210419,20210419,2021-04-19 23:39:00,2021-04-19 23:43:00
...,...,...,...,...,...,...,...,...,...,...
32421,20230707,00:53-00:54,3,australia_assa,00:53,00:54,20230707,20230707,2023-07-07 00:53:00,2023-07-07 00:54:00
32424,20230707,01:33-01:34,5,australia_assa,01:33,01:34,20230707,20230707,2023-07-07 01:33:00,2023-07-07 01:34:00
32427,20230707,01:36-01:36,3,australia_assa,01:36,01:36,20230707,20230707,2023-07-07 01:36:00,2023-07-07 01:36:00
32431,20230707,04:43-04:46,3,australia_assa,04:43,04:46,20230707,20230707,2023-07-07 04:43:00,2023-07-07 04:46:00


### Create images of bursts

In [10]:
def get_data_save_as_img(instrument, start_datetime, end_datetime, time_bucket, agg_function='MAX', burst_type="no_burst", min_shape=(200, 200), data_folder="data"):
    """
    Retrieves data for a specific instrument within a given time range, aggregates it using the specified function,
    normalizes the data, and saves it as an image file.

    Args:
        instrument (str): Name of the instrument for which data is to be retrieved.
        start_datetime (datetime.datetime): Start date and time of the data range.
        end_datetime (datetime.datetime): End date and time of the data range.
        time_bucket (str): Time granularity for data aggregation (e.g., '1H' for hourly, '30T' for every 30 minutes).
        agg_function (str, optional): Aggregation function to apply to the data. Defaults to 'MAX'.
        burst_type (str, optional): Label to be included in the file name. Defaults to 'no_burst'.
        data_folder (str, optional): Folder path where the data will be saved. Defaults to 'data'.
        min_shape (tuple, optional): Minimum shape of the image. Defaults to (200, 200).

    Returns:
        None

    Raises:
        None

    Examples:
        # Retrieve data for instrument 'instrument_name' from 'start_datetime' to 'end_datetime' and save it as an image
        get_data_save_as_img('instrument_name', start_datetime, end_datetime, '1H', 'MAX', 'no_burst', 'data')

    """
    sd_str = start_datetime.strftime("%Y-%m-%d %H:%M:%S")
    ed_str = end_datetime.strftime("%Y-%m-%d %H:%M:%S")
    df = get_data(
        instrument_name=instrument, 
        start_datetime=sd_str,
        end_datetime=ed_str, 
        timebucket=time_bucket, 
        agg_function=agg_function
        )
    
    img_data = df.to_numpy().astype(np.int16)
    if not img_data.shape[0] >= min_shape[0] and img_data.shape[1] >= min_shape[1]: 
        raise ValueError("Image shape is too small.")
    # Generate path
    path = os.path.join(data_folder, burst_type)
    if not os.path.exists(path):
        os.makedirs(path)
    file_path = os.path.join(path, sd_str.replace(':', '-') + "_" + ed_str.replace(':', '-') + "_" + instrument + "_" + str(time_bucket) + ".png")
    plt.imsave(file_path, img_data.T, cmap="gray")

In [17]:
image_num = 0

In [18]:
# Iterate through each row in the filtered burst list
for index, burst_row in burst_list_filtered.sample(frac=1).iterrows():
    
    # Iterate through each instrument table to include
    for instrument_table in INSTRUMENTS_TO_INCLUDE:
        
        # Get the start and end datetime of the burst
        burst_start = burst_row.datetime_start
        burst_end = burst_row.datetime_end 
        
        # Create a date range from the start to end datetime with a frequency of IMAGE_LENGTH, including the left endpoint
        burst_date_range = pd.date_range(burst_start, burst_end, freq=IMAGE_LENGTH, inclusive='left')
        
        # Get the burst type as a string
        burst_category = str(burst_row.type)
        
        # Retrieve data for each date in the burst_date_range
        for date in burst_date_range:
            end_date = date + timedelta(minutes=1)
            if check_table_data_availability(instrument_table, str(date), str(end_date)):
                # Attempt to retrieve the data and save it as an image
                # Parameters: instrument_table, start date, end date, x-limits, y-limits, burst category, data type
                print(image_num, " ", instrument_table, " ----- ", date, " to ", end_date)
                get_data_save_as_img(
                    instrument=instrument_table, 
                    start_datetime=date, 
                    end_datetime=end_date, 
                    time_bucket=None, 
                    agg_function=None,
                    burst_type=burst_category,
                    min_shape=(200, 200),
                    data_folder='data'
                )
                image_num += 1
                if image_num >= TOTAL_IMAGE_NUM:
                    break
            else:                
                print(f"Skipping {instrument_table} from {date} to {date + timedelta(minutes=1)}")
                
        if image_num >= TOTAL_IMAGE_NUM:
            break
    if image_num >= TOTAL_IMAGE_NUM:
        break

Skipping australia_assa_01 from 2022-03-30 05:50:00 to 2022-03-30 05:51:00
Skipping australia_assa_01 from 2022-03-30 05:51:00 to 2022-03-30 05:52:00
Skipping australia_assa_01 from 2022-03-30 05:52:00 to 2022-03-30 05:53:00
Skipping australia_assa_01 from 2021-12-17 05:42:00 to 2021-12-17 05:43:00
0   australia_assa_01  -----  2021-09-26 23:46:00  to  2021-09-26 23:47:00
File downloaded successfully
Skipping australia_assa_01 from 2022-04-20 03:55:00 to 2022-04-20 03:56:00
Skipping australia_assa_01 from 2022-04-20 03:56:00 to 2022-04-20 03:57:00
Skipping australia_assa_01 from 2022-04-20 03:57:00 to 2022-04-20 03:58:00
Skipping australia_assa_01 from 2022-04-20 03:58:00 to 2022-04-20 03:59:00
Skipping australia_assa_01 from 2022-04-20 03:59:00 to 2022-04-20 04:00:00
Skipping australia_assa_01 from 2022-04-20 04:00:00 to 2022-04-20 04:01:00
Skipping australia_assa_01 from 2022-04-20 04:01:00 to 2022-04-20 04:02:00
Skipping australia_assa_01 from 2022-04-20 04:02:00 to 2022-04-20 04:03

ArrowInvalid: Could not open Parquet input source '<Buffer>': Parquet file size is 0 bytes

### Create images of non burst

In [None]:
HOW_MANY_IMAGES = 50
###
MIN_START_TIME = burst_list_filtered.datetime_start.min() #burst_list_filtered.datetime_start.apply(lambda dt: dt.replace(hour=8, minute=0, second=0)).min()
MAX_START_TIME = burst_list_filtered.datetime_start.max() - IMAGE_LENGTH #MIN_START_TIME + timedelta(hours=12) - IMAGE_LENGTH

print(MIN_START_TIME)
print("----")
print(MAX_START_TIME)
def random_date(start, end):
    """Generate a random datetime between `start` and `end`"""
    return start + timedelta(
        # Get a random amount of seconds between `start` and `end`
        minutes=random.randint(0, int((end - start).total_seconds() // 60)),
    )

In [16]:
# Initialize a counter for the number of images added
images_processed = 0

In [None]:
# Continue processing until the required number of images is reached
while images_processed < HOW_MANY_IMAGES:
    # Generate a random start date between a minimum and maximum start time
    random_start_date = random_date(MIN_START_TIME, MAX_START_TIME)

    # Iterate through each instrument table to include
    for instrument_table in INSTRUMENTS_TO_INCLUDE:
        
        # Remove ID from the instrument name to get the base name
        base_instrument_name = remove_id_from_instrument_name(instrument_table)
        
        # Filter the burst list for entries that match the current instrument's base name
        burst_list_for_instrument = burst_list_filtered[burst_list_filtered.instruments == base_instrument_name]
        
        # If the random start date falls within any burst period for the current instrument, continue to next iteration
        if any((burst_list_for_instrument.datetime_start <= random_start_date) & (random_start_date <= burst_list_for_instrument.datetime_end)):
            continue
        else:
            try:
                # Attempt to retrieve the data and save it as an image
                # Parameters: instrument_table, start date, end date, x-limits, y-limits, burst category, data type
                get_data_save_as_img(
                    instrument_table, 
                    random_start_date, 
                    random_start_date + timedelta(minutes=1), 
                    time_bucket=None, 
                    agg_function=None,
                    burst_type='no_burst',
                    min_shape=(200, 200),
                    data_folder='data'
                )
                # Increment the count of images processed
                images_processed += 1
            except ValueError as e:
                # Handle exception: print the error and skip to next date
                print(e)
                print(f"Skipping {instrument_table} from {random_start_date} to {random_start_date + timedelta(minutes=1)}")
                pass
