In [1]:
from burstextractor.burstlist import download_burst_data
from burstextractor.timeutils import extract_time, fix_typos_in_time, fix_24_hour_time, create_datetime
from burstextractor.data_utils import explode_instruments_long_clean_instruments, keep_only_type_I_to_VI
import pandas as pd
import numpy as np
from database_utils import extract_instrument_name

## Create Dataframe with the Burstlists

In [2]:
burst_list = download_burst_data([2021, 2022, 2023], months=range(1, 13), folder="ecallisto_files")

  data = pd.read_csv(
  data = pd.read_csv(


In [3]:
burst_list

Unnamed: 0,date,time,type,instruments
0,20210119,02:42-02:42,III,Australia-ASSA
1,20210120,12:37-12:37,III,"AUSTRIA-UNIGRAZ, [HUMAIN], MRT1, SOUTHAFRICA-S..."
2,20210127,04:32-04:32,III,"Australia-ASSA, INDIA-GAURI, SOUTHAFRICA-SANSA"
3,20210127,09:27-09:27,III,"AUSTRIA-UNIGRAZ, INDIA-GAURI, INDIA-OOTY, MRT1..."
4,20210218,18:04-18:04,III,"GREENLAND, MEXART, ROSWELL-NM"
...,...,...,...,...
5353,20230224,02:17-03:56,VI,"ALASKA-COHOE, Australia-ASSA, SSRT"
5354,20230224,04:16-04:25,III,"Australia-ASSA, INDIA-OOTY, INDIA-UDAIPUR, IND..."
5355,20230224,06:31-06:33,III,"Australia-ASSA, INDIA-OOTY, INDIA-UDAIPUR, SSRT"
5356,20230224,06:56-06:57,III,"Australia-ASSA, INDIA-OOTY, (SSRT)"


In [4]:
burst_list.sample(5)

Unnamed: 0,date,time,type,instruments
2973,20220620,21:41-21:41,III,"ALASKA-COHOE, ALASKA-HAARP, MEXICO-LANCE-A, ME..."
773,20210924,16:11-16:46,VI,"BIR, (SWISS-HEITERSWIL), (SWISS-Landschlacht)"
3344,20220721,01:21-01:28,II,"ALASKA-COHOE, ALASKA-HAARP, (ALMATY), Australi..."
2766,20220526,14:57-14:57,III,"Arecibo-Observatory, GLASGOW"
3972,20221007,23:12-23:14,III,"ALASKA-COHOE, ALASKA-HAARP, Australia-ASSA"


In [5]:
burst_list.shape

(5358, 4)

## Fix typos

In [6]:
extracted_digits = burst_list['time'].str.extract(r'(\d+).(\d+).(\d+).(\d+)', expand=True).astype(int)

In [7]:
impossible_times_bool = (extracted_digits[1] > 59) | (extracted_digits[3] > 59) | (extracted_digits[0] > 23) | (extracted_digits[2] > 23)
extracted_digits[impossible_times_bool]

Unnamed: 0,0,1,2,3
1693,6,6,6,88
2238,24,32,14,33
3876,23,59,24,0
3890,0,0,24,0
3920,3,50,24,0
4179,23,59,24,0
4467,23,58,24,0
4557,0,0,24,0
4576,0,0,24,0


In [8]:
burst_list[impossible_times_bool]

Unnamed: 0,date,time,type,instruments
1693,20220210,06:06-06:88,V,"ALMATY, Australia-ASSA, INDIA-OOTY, INDIA-UDAI..."
2238,20220421,24:32-14:33,III,"AUSTRIA-MICHELBACH, Arecibo-Observatory, GLASG..."
3876,20220926,23:59:24:00,III,"ALASKA-COHOE, ALASKA-HAARP, Australia-ASSA"
3890,20220929,00:00-24:00,,CTM
3920,20221001,03:50-24:00,CTM,*
4179,20221110,23:59-24:00,III,"ALASKA-COHOE, Australia-ASSA"
4467,20221215,23:58-24:00,III,Arecibo-Observatory
4557,20221221,00:00-24:00,CTM,*
4576,20221222,00:00-24:00,CTM,*


In [9]:
burst_list.loc[4179]

date                               20221110
time                            23:59-24:00
type                                    III
instruments    ALASKA-COHOE, Australia-ASSA
Name: 4179, dtype: object

In [10]:
burst_list.loc[4179+1]

date                 20221111
time              00:02-00:14
type                       VI
instruments    Australia-ASSA
Name: 4180, dtype: object

In [11]:
burst_list = fix_typos_in_time(burst_list)
burst_list = extract_time(burst_list)
burst_list = fix_24_hour_time(burst_list)
burst_list = create_datetime(burst_list)
burst_list = explode_instruments_long_clean_instruments(burst_list)
burst_list = keep_only_type_I_to_VI(burst_list)

In [12]:
burst_list.sample(5)

Unnamed: 0,date,time,type,instruments,time_start,time_end,date_start,date_end,datetime_start,datetime_end
23161,20230111,19:09-19:10,III,Arecibo-Observatory,19:09,19:10,20230111,20230111,2023-01-11 19:09:00,2023-01-11 19:10:00
21082,20221211,07:38-07:39,III,INDIA-GAURI,07:38,07:39,20221211,20221211,2022-12-11 07:38:00,2022-12-11 07:39:00
15941,20220802,22:38-22:38,III,ALASKA-COHOE,22:38,22:38,20220802,20220802,2022-08-02 22:38:00,2022-08-02 22:38:00
13412,20220626,15:55-15:56,III,BIR,15:55,15:56,20220626,20220626,2022-06-26 15:55:00,2022-06-26 15:56:00
21621,20221219,02:08-02:09,III,SSRT,02:08,02:09,20221219,20221219,2022-12-19 02:08:00,2022-12-19 02:09:00


## Fix wrong names

In [13]:
MIN_BURST_PER_INSTRUMENT = 30

In [14]:
low_appearance_instruments = burst_list.groupby('instruments').filter(lambda x: len(x) <= MIN_BURST_PER_INSTRUMENT).instruments.unique()
low_appearance_instruments

array(['INDIA-UAIPUR', 'SPAIN-ALCALA', 'ROSWELL-NW', 'Australia-LMRO', '',
       'INDIA-NASHIK', 'DENMARK. GLASGOW', 'SWISS-BLEN5M', 'SWISS-BLEN7M',
       'URUGUAY', 'INPE?', 'MRT', 'SWISS-BLEN7M-E', 'NORWAY-RANDABERG',
       'INDIA-GAURI?', 'MRO?', 'AUSTRTIA-MICHELBACH', '/INDIA-UDAIPUR',
       'l MONGOLIA-UB', 'SWISS-Landschlach', 'HUMAIN. SWISS-Landschlacht',
       'GASGOW', 'UNAM', 'GLSAGOW', 'INDOENSIA', 'DENMAARK', 'Humain',
       'SWISS-LandschlachtEGYPT-Alexandria', 'INDIA-UDAIPUR MRT1',
       'AUSTRIA-Krumbach', 'NORWAY-NY-AALESUND', 'SP', 'IAIN-PERALEJOS',
       'ALASKA-ANCHORAGE', 'HUAMAIN', 'NDIA-GAURI', 'ROMANIA', 'HUMAIn',
       'MRT1?', 'HUMAI', 'NDIA-UDAIPUR', 'LASKA-COHOE', 'NDIA-OOTY',
       'MEXARFT', 'POLAND', 'USTRIA-UNIGRAZ', 'MRT21',
       'THAILAND-Pathumthan', 'INDIAMONGOLIA-UB', 'SSRT-UDAIPUR', '*',
       'Australia-ASSAArecibo-Observatory',
       'SSRT {more like drifting chain of type I}', 'INDIAALMATY',
       'SSRT-GAURI', 'INDOALASKA-COHOE', 

In [15]:
import difflib

In [16]:
def find_closest_instrument(instrument):
    try:
        instruments = burst_list.instruments.unique().tolist()
        instruments = np.setdiff1d(instruments, low_appearance_instruments)
        close_instrument = difflib.get_close_matches(instrument, instruments, n=1, cutoff=0.7)[0]
        print(f"Looking for a close match for {instrument}. Found {close_instrument}")
        return close_instrument
    except IndexError:
        print(f"Could not find a close match for {instrument}. Returning NaN.")
        return pd.NA

In [17]:
burst_list['instruments'] = burst_list['instruments'].apply(lambda x: find_closest_instrument(x) if x in low_appearance_instruments else x)

Looking for a close match for INDIA-UAIPUR. Found INDIA-UDAIPUR
Looking for a close match for INDIA-UAIPUR. Found INDIA-UDAIPUR
Could not find a close match for SPAIN-ALCALA. Returning NaN.
Looking for a close match for ROSWELL-NW. Found ROSWELL-NM
Could not find a close match for SPAIN-ALCALA. Returning NaN.
Looking for a close match for ROSWELL-NW. Found ROSWELL-NM
Looking for a close match for Australia-LMRO. Found Australia-ASSA
Looking for a close match for Australia-LMRO. Found Australia-ASSA
Could not find a close match for . Returning NaN.
Could not find a close match for SPAIN-ALCALA. Returning NaN.
Could not find a close match for SPAIN-ALCALA. Returning NaN.
Could not find a close match for INDIA-NASHIK. Returning NaN.
Could not find a close match for SPAIN-ALCALA. Returning NaN.
Could not find a close match for SPAIN-ALCALA. Returning NaN.
Could not find a close match for DENMARK. GLASGOW. Returning NaN.
Could not find a close match for SPAIN-ALCALA. Returning NaN.
Could no

In [18]:
burst_list[burst_list['instruments'].isna()]

Unnamed: 0,date,time,type,instruments,time_start,time_end,date_start,date_end,datetime_start,datetime_end
267,20210426,13:56-13:56,III,,13:56,13:56,20210426,20210426,2021-04-26 13:56:00,2021-04-26 13:56:00
451,20210509,13:54-13:56,III,,13:54,13:56,20210509,20210509,2021-05-09 13:54:00,2021-05-09 13:56:00
811,20210522,01:00-02:48,VI,,01:00,02:48,20210522,20210522,2021-05-22 01:00:00,2021-05-22 02:48:00
868,20210522,06:05-06:05,III,,06:05,06:05,20210522,20210522,2021-05-22 06:05:00,2021-05-22 06:05:00
889,20210522,06:16-06:17,III,,06:16,06:17,20210522,20210522,2021-05-22 06:16:00,2021-05-22 06:17:00
...,...,...,...,...,...,...,...,...,...,...
25253,20230223,06:12-06:15,V,,06:12,06:15,20230223,20230223,2023-02-23 06:12:00,2023-02-23 06:15:00
25281,20230223,07:47-07:50,III,,07:47,07:50,20230223,20230223,2023-02-23 07:47:00,2023-02-23 07:50:00
25288,20230223,07:47-07:50,III,,07:47,07:50,20230223,20230223,2023-02-23 07:47:00,2023-02-23 07:50:00
25310,20230223,08:08-08:10,III,,08:08,08:10,20230223,20230223,2023-02-23 08:08:00,2023-02-23 08:10:00


In [19]:
burst_list = burst_list.dropna(subset=['instruments'])

In [20]:
burst_list['database_instrument_name'] = burst_list['instruments'].apply(lambda x: extract_instrument_name(x))

In [21]:
burst_list[burst_list.instruments.str.contains('UNIGRAZ')]

Unnamed: 0,date,time,type,instruments,time_start,time_end,date_start,date_end,datetime_start,datetime_end,database_instrument_name
1,20210120,12:37-12:37,III,AUSTRIA-UNIGRAZ,12:37,12:37,20210120,20210120,2021-01-20 12:37:00,2021-01-20 12:37:00,austria_unigraz
10,20210127,09:27-09:27,III,AUSTRIA-UNIGRAZ,09:27,09:27,20210127,20210127,2021-01-27 09:27:00,2021-01-27 09:27:00,austria_unigraz
94,20210419,07:53-07:54,I,AUSTRIA-UNIGRAZ,07:53,07:54,20210419,20210419,2021-04-19 07:53:00,2021-04-19 07:54:00,austria_unigraz
115,20210421,09:03-09:03,III,AUSTRIA-UNIGRAZ,09:03,09:03,20210421,20210421,2021-04-21 09:03:00,2021-04-21 09:03:00,austria_unigraz
227,20210424,10:22-10:25,III,AUSTRIA-UNIGRAZ,10:22,10:25,20210424,20210424,2021-04-24 10:22:00,2021-04-24 10:25:00,austria_unigraz
...,...,...,...,...,...,...,...,...,...,...,...
25336,20230223,10:39-10:40,III,AUSTRIA-UNIGRAZ,10:39,10:40,20230223,20230223,2023-02-23 10:39:00,2023-02-23 10:40:00,austria_unigraz
25349,20230223,10:54-10:55,III,AUSTRIA-UNIGRAZ,10:54,10:55,20230223,20230223,2023-02-23 10:54:00,2023-02-23 10:55:00,austria_unigraz
25362,20230223,12:25-12:32,III,AUSTRIA-UNIGRAZ,12:25,12:32,20230223,20230223,2023-02-23 12:25:00,2023-02-23 12:32:00,austria_unigraz
25375,20230223,13:21-13:21,III,AUSTRIA-UNIGRAZ,13:21,13:21,20230223,20230223,2023-02-23 13:21:00,2023-02-23 13:21:00,austria_unigraz


In [22]:
burst_list['type'] = burst_list['type'].replace({'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'V': 5, 'VI': 6}).astype(int)

In [23]:
burst_list[burst_list.instruments.str.contains('UNIGRAZ')]

Unnamed: 0,date,time,type,instruments,time_start,time_end,date_start,date_end,datetime_start,datetime_end,database_instrument_name
1,20210120,12:37-12:37,3,AUSTRIA-UNIGRAZ,12:37,12:37,20210120,20210120,2021-01-20 12:37:00,2021-01-20 12:37:00,austria_unigraz
10,20210127,09:27-09:27,3,AUSTRIA-UNIGRAZ,09:27,09:27,20210127,20210127,2021-01-27 09:27:00,2021-01-27 09:27:00,austria_unigraz
94,20210419,07:53-07:54,1,AUSTRIA-UNIGRAZ,07:53,07:54,20210419,20210419,2021-04-19 07:53:00,2021-04-19 07:54:00,austria_unigraz
115,20210421,09:03-09:03,3,AUSTRIA-UNIGRAZ,09:03,09:03,20210421,20210421,2021-04-21 09:03:00,2021-04-21 09:03:00,austria_unigraz
227,20210424,10:22-10:25,3,AUSTRIA-UNIGRAZ,10:22,10:25,20210424,20210424,2021-04-24 10:22:00,2021-04-24 10:25:00,austria_unigraz
...,...,...,...,...,...,...,...,...,...,...,...
25336,20230223,10:39-10:40,3,AUSTRIA-UNIGRAZ,10:39,10:40,20230223,20230223,2023-02-23 10:39:00,2023-02-23 10:40:00,austria_unigraz
25349,20230223,10:54-10:55,3,AUSTRIA-UNIGRAZ,10:54,10:55,20230223,20230223,2023-02-23 10:54:00,2023-02-23 10:55:00,austria_unigraz
25362,20230223,12:25-12:32,3,AUSTRIA-UNIGRAZ,12:25,12:32,20230223,20230223,2023-02-23 12:25:00,2023-02-23 12:32:00,austria_unigraz
25375,20230223,13:21-13:21,3,AUSTRIA-UNIGRAZ,13:21,13:21,20230223,20230223,2023-02-23 13:21:00,2023-02-23 13:21:00,austria_unigraz


In [25]:
burst_list.to_excel('burst_list.xlsx', index=False)

In [26]:
from database_functions import *
from database_utils import *
from tqdm import tqdm
import datetime

In [27]:
def insert_is_burst_status_between_dates_sql(tablename, start_date, end_date, type):
    """Insert is_burst status between two dates.

    Parameters
    ----------
    tablename : str
        The table name to insert the is_burst status for.
    start_date : `~datetime.datetime`
        The start date to insert the is_burst status for.
    end_date : `~datetime.datetime`
        The end date to insert the is_burst status for.
    type : int
        The type of burst to insert the burst_type status for. 


    Returns
    -------
    None

    Notes
    -----
    The function first finds the unique index data and the indices of the non-unique index data.
    It then combines the non-unique index data using the method specified by the `method` parameter.
    """
    start_date = start_date.strftime("%Y-%m-%d %H:%M:%S")
    end_date = end_date.strftime("%Y-%m-%d %H:%M:%S")
    with psycopg2.connect(CONNECTION) as conn:
        cursor = conn.cursor()
        cursor.execute(
            f"""
        UPDATE {tablename}
        SET burst_type = {type}
        WHERE datetime BETWEEN '{start_date}' AND '{end_date}'
        """
        )
        conn.commit()
        cursor.close()

In [28]:
MIN_DATE = datetime.datetime(2022, 1, 1)
burst_list_filtered = burst_list[burst_list['datetime_start'] >= MIN_DATE]

In [30]:
for table in tqdm(get_table_names_sql()):
    specific_instrument = table
    if table[-2:].isnumeric():
        table = table[:-3]
        burst_list_table = burst_list_filtered[burst_list_filtered['database_instrument_name'] == table].copy()
        if not 'burst_type' in get_column_names_sql(specific_instrument):
            add_new_column_default_value_sql(specific_instrument, 'burst_type', 'SMALLINT', 0)
        for i, row in burst_list_table.iterrows():
            insert_is_burst_status_between_dates_sql(specific_instrument, row.datetime_start, row.datetime_end, row.type)
    else:
        raise ValueError(f"Table name {table} is not in the correct format.")


100%|██████████| 3/3 [00:01<00:00,  2.74it/s]
