In [47]:
from burstextractor.burstlist import download_burst_data
from burstextractor.timeutils import extract_time, fix_typos_in_time, fix_24_hour_time, create_datetime
from burstextractor.data_utils import explode_instruments_long_clean_instruments, keep_only_type_I_to_VI
import pandas as pd
import numpy as np
import difflib

## Create Dataframe with the Burstlists

In [48]:
burst_list = download_burst_data([2021, 2022, 2023, 2024], months=range(1, 13), folder="ecallisto_files")

  data = pd.read_csv(
  data = pd.read_csv(


In [49]:
burst_list.sample(5)

Unnamed: 0,date,time,type,instruments
8861,20240229,14:46-15:07,VI,"AUSTRIA-UNIGRAZ, BIR, GERMANY-DLR, GLASGOW, GR..."
8060,20231204,21:34-21:35,III,Australia-ASSA
739,20210922,21:49-21:52,III,"ALASKA-COHOE, Australia-ASSA, MEXART"
8545,20240202,14:24-14:24,III,"ALGERIA-CRAAG, (AUSTRIA-UNIGRAZ), GLASGOW, GRE..."
5290,20230220,07:10-07:12,III,"ALMATY, Australia-ASSA, USTRIA-UNIGRAZ, HUMAIN..."


In [50]:
burst_list.shape

(11609, 4)

In [51]:
burst_list['time']

0        02:42-02:42
1        12:37-12:37
2        04:32-04:32
3        09:27-09:27
4        18:04-18:04
            ...     
11604    00:31-00:32
11605    01:37-01:37
11606    02:22-02:32
11607    04:20-04:26
11608    08:37-08:45
Name: time, Length: 11609, dtype: object

## Fix typos

In [52]:
extracted_digits = burst_list['time'].str.extract(r'(\d+).(\d+).(\d+).(\d+)', expand=True)
mask = ~extracted_digits.isna().any(axis=1)
extracted_digits = extracted_digits[mask].astype(int)
burst_list = burst_list[mask]

In [53]:
burst_list

Unnamed: 0,date,time,type,instruments
0,20210119,02:42-02:42,III,Australia-ASSA
1,20210120,12:37-12:37,III,"AUSTRIA-UNIGRAZ, [HUMAIN], MRT1, SOUTHAFRICA-S..."
2,20210127,04:32-04:32,III,"Australia-ASSA, INDIA-GAURI, SOUTHAFRICA-SANSA"
3,20210127,09:27-09:27,III,"AUSTRIA-UNIGRAZ, INDIA-GAURI, INDIA-OOTY, MRT1..."
4,20210218,18:04-18:04,III,"GREENLAND, MEXART, ROSWELL-NM"
...,...,...,...,...
11604,20241127,00:31-00:32,III,"ALASKA-COHOE, Australia-ASSA, TAIWAN-NCU"
11605,20241127,01:37-01:37,III,Australia-ASSA
11606,20241127,02:22-02:32,VI,"Australia-ASSA, INDIA-UDAIPUR, TAIWAN-NCU"
11607,20241127,04:20-04:26,III,"Australia-ASSA, INDIA-OOTY, INDIA-UDAIPUR, MON..."


In [54]:
impossible_times_bool = (extracted_digits[1] > 59) | (extracted_digits[3] > 59) | (extracted_digits[0] > 23) | (extracted_digits[2] > 23)
extracted_digits[impossible_times_bool]

Unnamed: 0,0,1,2,3
1693,6,6,6,88
2238,24,32,14,33
3876,23,59,24,0
3890,0,0,24,0
3920,3,50,24,0
4179,23,59,24,0
4467,23,58,24,0
4557,0,0,24,0
4576,0,0,24,0
6238,21,18,212,19


In [55]:
burst_list[impossible_times_bool]

Unnamed: 0,date,time,type,instruments
1693,20220210,06:06-06:88,V,"ALMATY, Australia-ASSA, INDIA-OOTY, INDIA-UDAI..."
2238,20220421,24:32-14:33,III,"AUSTRIA-MICHELBACH, Arecibo-Observatory, GLASG..."
3876,20220926,23:59:24:00,III,"ALASKA-COHOE, ALASKA-HAARP, Australia-ASSA"
3890,20220929,00:00-24:00,,CTM
3920,20221001,03:50-24:00,CTM,*
4179,20221110,23:59-24:00,III,"ALASKA-COHOE, Australia-ASSA"
4467,20221215,23:58-24:00,III,Arecibo-Observatory
4557,20221221,00:00-24:00,CTM,*
4576,20221222,00:00-24:00,CTM,*
6238,20230510,21:18-212:19,III,"(ALASKA-ANCHORAGE), ALASKA-COHOE, ALASKA-HAARP..."


In [56]:
burst_list = fix_typos_in_time(burst_list)
burst_list = extract_time(burst_list)
burst_list = fix_24_hour_time(burst_list)
burst_list = create_datetime(burst_list)

In [57]:
burst_list[impossible_times_bool]

Unnamed: 0,date,time,type,instruments,time_start,time_end,date_start,date_end,datetime_start,datetime_end
1693,20220210,06:06-06:08,V,"ALMATY, Australia-ASSA, INDIA-OOTY, INDIA-UDAI...",06:06,06:08,20220210,20220210,2022-02-10 06:06:00,2022-02-10 06:08:00
2238,20220421,14:32-14:33,III,"AUSTRIA-MICHELBACH, Arecibo-Observatory, GLASG...",14:32,14:33,20220421,20220421,2022-04-21 14:32:00,2022-04-21 14:33:00
3876,20220926,23:59-00:00,III,"ALASKA-COHOE, ALASKA-HAARP, Australia-ASSA",23:59,00:00,20220926,20220927,2022-09-26 23:59:00,2022-09-27 00:00:00
3890,20220929,00:00-00:00,,CTM,00:00,00:00,20220929,20220930,2022-09-29 00:00:00,2022-09-30 00:00:00
3920,20221001,03:50-00:00,CTM,*,03:50,00:00,20221001,20221002,2022-10-01 03:50:00,2022-10-02 00:00:00
4179,20221110,23:59-00:00,III,"ALASKA-COHOE, Australia-ASSA",23:59,00:00,20221110,20221111,2022-11-10 23:59:00,2022-11-11 00:00:00
4467,20221215,23:58-00:00,III,Arecibo-Observatory,23:58,00:00,20221215,20221216,2022-12-15 23:58:00,2022-12-16 00:00:00
4557,20221221,00:00-00:00,CTM,*,00:00,00:00,20221221,20221222,2022-12-21 00:00:00,2022-12-22 00:00:00
4576,20221222,00:00-00:00,CTM,*,00:00,00:00,20221222,20221223,2022-12-22 00:00:00,2022-12-23 00:00:00
6238,20230510,21:18-21:19,III,"(ALASKA-ANCHORAGE), ALASKA-COHOE, ALASKA-HAARP...",21:18,21:19,20230510,20230510,2023-05-10 21:18:00,2023-05-10 21:19:00


In [58]:
burst_list.to_excel('burst_list_unfiltered.xlsx')

In [59]:
burst_list = explode_instruments_long_clean_instruments(burst_list)
burst_list = keep_only_type_I_to_VI(burst_list)

## Fix wrong names

In [60]:
MIN_BURST_PER_INSTRUMENT = 5

In [61]:
low_appearance_instruments = burst_list.groupby('instruments').filter(lambda x: len(x) <= MIN_BURST_PER_INSTRUMENT).instruments.unique().copy()
low_appearance_instruments

array(['INDIA-UAIPUR', 'ROSWELL-NW', 'DENMARK. GLASGOW', 'INPE?', 'MRT',
       'MRO?', 'AUSTRTIA-MICHELBACH', '/INDIA-UDAIPUR', 'l MONGOLIA-UB',
       'SWISS-Landschlach', 'HUMAIN. SWISS-Landschlacht', 'GASGOW',
       'GLSAGOW', 'INDOENSIA', 'DENMAARK', 'Humain',
       'SWISS-LandschlachtEGYPT-Alexandria', 'INDIA-UDAIPUR MRT1',
       'NORWAY-NY-AALESUND', 'SP', 'IAIN-PERALEJOS', 'HUAMAIN',
       'NDIA-GAURI', 'HUMAIn', 'MRT1?', 'HUMAI', 'NDIA-UDAIPUR',
       'LASKA-COHOE', 'NDIA-OOTY', 'MEXARFT', 'POLAND', 'USTRIA-UNIGRAZ',
       'MRT21', 'THAILAND-Pathumthan', 'INDIAMONGOLIA-UB', 'SSRT-UDAIPUR',
       'Australia-ASSAArecibo-Observatory',
       'SSRT {more like drifting chain of type I}', 'INDIAALMATY',
       'SSRT-GAURI', 'INDOALASKA-COHOE', 'ROSWELL-NMNESIA',
       'Australia-ASSA {followed by blackout}', 'SSRTMalaysia-Banting',
       'INDONESIAINDIA-OOTY', 'ALMYTY', 'INDIA-OOTY?', 'SSRTFIN',
       'LAND-Siuntio', 'FINLAND-Siunti', 'SWISS-SCAN', 'WISS-HEITERSWIL',
     

In [62]:
burst_list.instruments.unique()

array(['Australia-ASSA', 'AUSTRIA-UNIGRAZ', 'HUMAIN', 'MRT1',
       'SOUTHAFRICA-SANSA', 'SWISS-Landschlacht', 'TRIEST', 'INDIA-GAURI',
       'INDIA-OOTY', 'GREENLAND', 'MEXART', 'ROSWELL-NM', 'KRIM',
       'GLASGOW', 'ALASKA-HAARP', 'ALMATY', 'INDIA-UDAIPUR', 'MRO',
       'INDIA-UAIPUR', 'INDONESIA', 'AUSTRIA-OE3FLB', 'HURBANOVO',
       'MONGOLIA-UB', 'MRT2', 'SWISS-HB9SCT', 'SWISS-IRSOL',
       'SWISS-MUHEN', 'SPAIN-PERALEJOS', 'BIR', 'DENMARK',
       'SWISS-HEITERSWIL', 'KASI', 'ALGERIA-CRAAG', 'SPAIN-ALCALA',
       'ROSWELL-NW', 'ALASKA-COHOE', 'AUSTRIA-MICHELBACH',
       'Australia-LMRO', '', 'INDIA-NASHIK', 'MRT3', 'DENMARK. GLASGOW',
       'SWISS-BLEN5M', 'SWISS-BLEN7M', 'URUGUAY', 'INPE', 'INPE?',
       'SPAIN-SIGUENZA', 'POLAND-Grotniki', 'MRT', 'GERMANY-DLR',
       'SWISS-BLEN7M-E', 'EGYPT-Alexandria', 'NORWAY-RANDABERG',
       'INDIA-GAURI?', 'SRI-Lanka', 'MRO?', 'AUSTRTIA-MICHELBACH',
       '/INDIA-UDAIPUR', 'l MONGOLIA-UB', 'SWISS-Landschlach',
       'Malays

In [63]:
def find_closest_instrument(instrument):
    """
    This function finds the closest instrument name match from a list,
    avoiding instruments in low_appearance_instruments.

    Parameters:
    - instrument (str): The name of the instrument you want to find a close match for.

    Returns:
    - str or pd.NA: Returns the closest matching instrument name or NaN if no match is found.
    """
    
    try:
        # Retrieve the unique list of instruments from the 'burst_list' dataframe.
        instruments = burst_list.instruments.unique().tolist()
        
        # Convert the list of instruments to a numpy array, excluding those 
        # present in the 'low_appearance_instruments' list.
        instruments = np.array([i for i in instruments if i not in low_appearance_instruments])
        
        # Use the 'get_close_matches' function from 'difflib' to find the closest match 
        # to the provided 'instrument' from the filtered list of 'instruments'. 
        # The function is set to return only one match with a similarity cutoff of 0.8.
        close_instrument = difflib.get_close_matches(instrument, instruments, n=1, cutoff=0.8)[0]
        
        # Print the found match for the user.
        print(f"Looking for a close match for {instrument}. Found {close_instrument}")
        return close_instrument
    
    except IndexError:
        # This block is reached if 'get_close_matches' does not find a suitable match 
        # (i.e., no matches above the cutoff of 0.8 similarity).
        print(f"Could not find a close match for {instrument}. Returning NaN.")
        return pd.NA


In [64]:
burst_list['instruments'] = burst_list['instruments'].apply(lambda x: find_closest_instrument(x) if x in low_appearance_instruments else x)

Looking for a close match for INDIA-UAIPUR. Found INDIA-UDAIPUR
Looking for a close match for INDIA-UAIPUR. Found INDIA-UDAIPUR
Looking for a close match for ROSWELL-NW. Found ROSWELL-NM
Looking for a close match for ROSWELL-NW. Found ROSWELL-NM
Could not find a close match for DENMARK. GLASGOW. Returning NaN.
Looking for a close match for INPE?. Found INPE
Looking for a close match for MRT. Found MRT3
Looking for a close match for MRO?. Found MRO
Looking for a close match for MRO?. Found MRO
Looking for a close match for MRO?. Found MRO
Looking for a close match for AUSTRTIA-MICHELBACH. Found AUSTRIA-MICHELBACH
Looking for a close match for AUSTRTIA-MICHELBACH. Found AUSTRIA-MICHELBACH
Looking for a close match for /INDIA-UDAIPUR. Found INDIA-UDAIPUR
Looking for a close match for l MONGOLIA-UB. Found MONGOLIA-UB
Looking for a close match for l MONGOLIA-UB. Found MONGOLIA-UB
Looking for a close match for SWISS-Landschlach. Found SWISS-Landschlacht
Looking for a close match for MRO?. Fo

In [65]:
burst_list[burst_list['instruments'].isna()]

Unnamed: 0,date,time,type,instruments,time_start,time_end,date_start,date_end,datetime_start,datetime_end
982,20210522,10:22-10:22,III,,10:22,10:22,20210522,20210522,2021-05-22 10:22:00,2021-05-22 10:22:00
7213,20220306,15:46-15:56,VI,,15:46,15:56,20220306,20220306,2022-03-06 15:46:00,2022-03-06 15:56:00
7234,20220307,13:36-13:36,III,,13:36,13:36,20220307,20220307,2022-03-07 13:36:00,2022-03-07 13:36:00
7239,20220307,14:50-14:50,III,,14:50,14:50,20220307,20220307,2022-03-07 14:50:00,2022-03-07 14:50:00
7347,20220309,13:19-13:19,III,,13:19,13:19,20220309,20220309,2022-03-09 13:19:00,2022-03-09 13:19:00
7818,20220328,11:23-11:38,II,,11:23,11:38,20220328,20220328,2022-03-28 11:23:00,2022-03-28 11:38:00
7855,20220328,12:49-12:50,III,,12:49,12:50,20220328,20220328,2022-03-28 12:49:00,2022-03-28 12:50:00
15846,20220723,20:33-20:35,III,,20:33,20:35,20220723,20220723,2022-07-23 20:33:00,2022-07-23 20:35:00
18015,20220920,11:20-11:24,III,,11:20,11:24,20220920,20220920,2022-09-20 11:20:00,2022-09-20 11:24:00
18155,20220921,10:06-10:08,III,,10:06,10:08,20220921,20220921,2022-09-21 10:06:00,2022-09-21 10:08:00


In [66]:
burst_list = burst_list.dropna(subset=['instruments'])

In [67]:
burst_list['type'] = burst_list['type'].replace({'I': 1, 'II': 2, 'III': 3, 'IV': 4, 'V': 5, 'VI': 6}).astype(int)

In [68]:
burst_list[burst_list.instruments.str.contains('Australia-ASSA') & burst_list.type.isin([5, 6])]

Unnamed: 0,date,time,type,instruments,time_start,time_end,date_start,date_end,datetime_start,datetime_end
113,20210420,03:19-03:21,6,Australia-ASSA,03:19,03:21,20210420,20210420,2021-04-20 03:19:00,2021-04-20 03:21:00
168,20210423,07:16-07:16,6,Australia-ASSA,07:16,07:16,20210423,20210423,2021-04-23 07:16:00,2021-04-23 07:16:00
332,20210506,03:00-03:04,6,Australia-ASSA,03:00,03:04,20210506,20210506,2021-05-06 03:00:00,2021-05-06 03:04:00
492,20210512,05:43-05:46,6,Australia-ASSA,05:43,05:46,20210512,20210512,2021-05-12 05:43:00,2021-05-12 05:46:00
534,20210513,06:28-06:30,6,Australia-ASSA,06:28,06:30,20210513,20210513,2021-05-13 06:28:00,2021-05-13 06:30:00
...,...,...,...,...,...,...,...,...,...,...
62626,20241119,23:14-23:35,6,Australia-ASSA,23:14,23:35,20241119,20241119,2024-11-19 23:14:00,2024-11-19 23:35:00
62648,20241120,07:43-07:56,6,Australia-ASSA,07:43,07:56,20241120,20241120,2024-11-20 07:43:00,2024-11-20 07:56:00
62689,20241123,02:15-02:28,6,Australia-ASSA,02:15,02:28,20241123,20241123,2024-11-23 02:15:00,2024-11-23 02:28:00
62817,20241125,02:58-03:20,6,Australia-ASSA,02:58,03:20,20241125,20241125,2024-11-25 02:58:00,2024-11-25 03:20:00


In [69]:
burst_list.to_excel('burst_list.xlsx', index=False)

# Manually fix the rest. 