This script is to perform the event burst statistics analyses from GIC data prepared by [Kellerman et al., 2021](https://zenodo.org/record/4444068)

### Imports and utility functions

In [26]:
import numpy as np
import pandas as pd
import glob
import os

import datetime
from datetime import datetime, timedelta

from scipy.integrate import trapz

from sunpy.net import Fido
from sunpy.net import attrs as a
from sunpy.timeseries import TimeSeries


In [27]:
# Note that these functions were copied from NERC_data_complexity.ipynb on November 27, 2023 and should be copied periodically as it is maintained there


def get_sw_one_minute(start_datetime,end_datetime):

    '''
    input datetimes must be in format '%Y/%m/%d %H:%M:%S'
    '''
    

#     sunpy_format = '%Y/%m/%d %H:%M'

#     trange = a.Time(start_datetime.strftime(sunpy_format), end_datetime.strftime(sunpy_format))
    trange = a.Time(start_datetime,end_datetime)
    dataset = a.cdaweb.Dataset('OMNI_HRO2_1MIN')
    result = Fido.search(trange, dataset)

    downloaded_files = Fido.fetch(result[0])
    print(downloaded_files)
    
    sw_data = TimeSeries(downloaded_files, concatenate=True)
    df_SW = sw_data.to_dataframe()
    df_SW['datetimes'] = df_SW.index
    strings_to_match = ['AE_INDEX','BX_GSE','BY_GSM','BZ_GSM']
    filtered_columns = df_SW.filter(items=strings_to_match, axis=1)
    df_SW_filtered = df_SW.drop(columns=df_SW.columns.difference(strings_to_match))

    print(df_SW_filtered.columns)
    datetimes_SW = [t.strftime('%Y/%m/%d %H:%M:%S') for t in df_SW_filtered.index]
    df_SW_filtered['datetimes'] = datetimes_SW



    return df_SW_filtered
    
    


def calculate_event_properties(eventID, sensorID, time_series, datetimes, threshold, min_consecutive_steps_event, min_consecutive_steps_eventbreak, positive_or_negative):
    '''
        eventID                             :               string identifying the event being analyzed
        sensorID                            :               string identifying the sensor being analyzed
        time_series                         :               the time series to be analyzed (GIC sensor readings)
        threshold                           :               level of GIC that determines an event
        min_consecutive_steps               :               number of consecutive time steps above threshold to define an event
        min_consecutive_steps_eventbreak    :               number of consecutive time steps below threshold to break an event
        positive_or_negative                :               'positive' if positive GIC values are to be examined, 'negative' if negative

    '''
    event_ctr = 0
    event_eventID = []
    event_sensorID = []
    event_start_idxs = []
    event_end_idxs = []
    event_start_times = []
    event_end_times = []
    event_sizes = []
    event_durations = []
    event_betweendistances = []
    start_idx = None
    end_idx = None
    area_under_curve = 0
#     event_duration = 0
#     event_distance = 0


    if positive_or_negative == 'positive':
        for i, value in enumerate(time_series):
                if value > threshold:
                    if start_idx is None:
                        start_idx = i
#                         print('start_idx = {}'.format(start_idx))
                    elif (end_idx is not None):
#                         if ( (i - end_idx) < min_consecutive_steps_eventbreak):
                        end_idx = None
            
                elif start_idx is not None:
                    if end_idx is None:
                        end_idx = i
#                         print('end_idx = {}'.format(end_idx))
                        
                    if ( (end_idx - start_idx) < min_consecutive_steps_event ) & ( (i - end_idx) >= min_consecutive_steps_eventbreak):
                        start_idx = None
                        continue
                    if ( ( end_idx - start_idx) >= min_consecutive_steps_event) & ( (i - end_idx) >= min_consecutive_steps_eventbreak): 
                        
                        event_ctr += 1
#                         print('\t i = {}, \n\t\t end_idx - start_idx = {} \n\t\t i-start_idx = {} \n\t\t i-end_idx = {}'.format (i, end_idx-start_idx, i - start_idx, i - end_idx))

                        event_eventID.append(eventID)
                        event_sensorID.append(sensorID)
        
                        event_start_idxs.append(start_idx)
                        event_end_idxs.append(end_idx)
                        event_start_times.append(datetimes.values[start_idx])
                        event_end_times.append(datetimes.values[end_idx])

                        # convert durations from indices to times
                        event_duration_time = ( pd.to_datetime(datetimes.values[end_idx]).to_pydatetime() -  pd.to_datetime(datetimes.values[start_idx]).to_pydatetime()  ).seconds

                        if event_ctr == 1:
                            event_betweendistance_time = np.nan
                        else: 
                            #difference between the start of the current event and the end of the previous event 
                            event_betweendistance_time = ( pd.to_datetime(datetimes.values[ event_start_idxs[event_ctr-1] ]).to_pydatetime() -  pd.to_datetime(datetimes.values[ event_end_idxs[event_ctr-2] ]).to_pydatetime()  ).seconds



                        event_durations.append(event_duration_time)
                        event_betweendistances.append(event_betweendistance_time)

                        # calculate the area under the event curve
                        time_series_event = time_series.values[start_idx:end_idx]
                        elaptimes_event = [(timestamp - pd.to_datetime(datetimes.values[start_idx]) ).total_seconds() for timestamp in pd.to_datetime(datetimes.values[start_idx:end_idx]) ]
                        valid_indices = np.isfinite(time_series_event)
                        elaptimes_valid = np.array(elaptimes_event)[valid_indices]
                        time_series_valid = time_series_event[valid_indices]

        #                 np.trapz(timeseries_valid[start_idx:end_idx], elaptimes_valid[start_idx:end_idx])
                        area_under_curve = trapz(time_series_valid-threshold, 
                                                 elaptimes_valid)
                        event_sizes.append(area_under_curve)

        #                 event_sizes.append(area_under_curve / event_duration)

                        start_idx = None
                        end_idx = None
                        area_under_curve = 0
                        event_duration_time = 0
                        event_betweendistance_time = 0 
                        elaptimes = []
                        valid_indices = []
                        elaptimes_valid = []
                        time_series_event = []
                        time_series_valid = []
            

                

    elif positive_or_negative == 'negative':
        threshold = threshold * -1.
        for i, value in enumerate(time_series):
            if value < threshold:
                if start_idx is None:
                    start_idx = i
#                         print('start_idx = {}'.format(start_idx))
                elif (end_idx is not None):
#                         if ( (i - end_idx) < min_consecutive_steps_eventbreak):
                    end_idx = None

            elif start_idx is not None:
                if end_idx is None:
                    end_idx = i
#                         print('end_idx = {}'.format(end_idx))

                if ( (end_idx - start_idx) < min_consecutive_steps_event ) & ( (i - end_idx) >= min_consecutive_steps_eventbreak):
                    start_idx = None
                    continue
                if ( ( end_idx - start_idx) >= min_consecutive_steps_event) & ( (i - end_idx) >= min_consecutive_steps_eventbreak): 

                    event_ctr += 1
#                         print('\t i = {}, \n\t\t end_idx - start_idx = {} \n\t\t i-start_idx = {} \n\t\t i-end_idx = {}'.format (i, end_idx-start_idx, i - start_idx, i - end_idx))

                    event_eventID.append(eventID)
                    event_sensorID.append(sensorID)

                    event_start_idxs.append(start_idx)
                    event_end_idxs.append(end_idx)
                    event_start_times.append(datetimes.values[start_idx])
                    event_end_times.append(datetimes.values[end_idx])

                    # convert durations from indices to times
                    event_duration_time = ( pd.to_datetime(datetimes.values[end_idx]).to_pydatetime() -  pd.to_datetime(datetimes.values[start_idx]).to_pydatetime()  ).seconds

                    if event_ctr == 1:
                        event_betweendistance_time = np.nan
                    else: 
                        #difference between the start of the current event and the end of the previous event 
                        event_betweendistance_time = ( pd.to_datetime(datetimes.values[ event_start_idxs[event_ctr-1] ]).to_pydatetime() -  pd.to_datetime(datetimes.values[ event_end_idxs[event_ctr-2] ]).to_pydatetime()  ).seconds



                    event_durations.append(event_duration_time)
                    event_betweendistances.append(event_betweendistance_time)

                    # calculate the area under the event curve
                    time_series_event = time_series.values[start_idx:end_idx]
                    elaptimes_event = [(timestamp - pd.to_datetime(datetimes.values[start_idx]) ).total_seconds() for timestamp in pd.to_datetime(datetimes.values[start_idx:end_idx]) ]
                    valid_indices = np.isfinite(time_series_event)
                    elaptimes_valid = np.array(elaptimes_event)[valid_indices]
                    time_series_valid = time_series_event[valid_indices]

    #                 np.trapz(timeseries_valid[start_idx:end_idx], elaptimes_valid[start_idx:end_idx])
                    area_under_curve = trapz( (time_series_valid*-1.) + threshold, 
                                             elaptimes_valid)
                    event_sizes.append(area_under_curve)

    #                 event_sizes.append(area_under_curve / event_duration)

                    start_idx = None
                    end_idx = None
                    area_under_curve = 0
                    event_duration_time = 0
                    event_betweendistance_time = 0 
                    elaptimes = []
                    valid_indices = []
                    elaptimes_valid = []
                    time_series_event = []
                    time_series_valid = []
            
            

#     # Convert the lists to pandas Series for easy analysis
#     events = pd.Series(events)
#     event_sizes = pd.Series(event_sizes)
#     event_durations = pd.Series(event_durations)
#     event_distances = pd.Series(event_distances)

    # convert to DF for easy analysis
    df_out = pd.DataFrame(columns=['event id',
                                   'sensor id',
                                   'event_start_idxs',
                                   'event_end_idxs',
                                   'event_start_times',
                                   'event_end_times',
                                   'event_sizes',
                                   'event_durations',
                                   'event_betweendistances'])
    df_out['event id'] = event_eventID
    df_out['sensor id'] = event_sensorID
    df_out['event_start_idxs'] = event_start_idxs
    df_out['event_end_idxs'] = event_end_idxs
    df_out['event_start_times'] = event_start_times
    df_out['event_end_times'] = event_end_times
    df_out['event_sizes'] = event_sizes
    df_out['event_durations'] = event_durations
    df_out['event_betweendistances'] = event_betweendistances
    

    return df_out
#     return events, event_start_idxs, event_end_idxs, event_sizes, event_durations, event_betweendistances


### Read in the sub-sampled QDC data (one-minute resolution)

In [24]:
qdc_directory = '/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/QDC_data/resampled/'

qdc_data_dir = qdc_directory
qdc_files = glob.glob( os.path.join(qdc_data_dir,'*.csv') )
qdc_files



['/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/QDC_data/resampled/Node_1_anon_gic_qdc_median_wdy10_wmin1800_fm3_wgic_resampled.csv',
 '/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/QDC_data/resampled/Node_2_anon_gic_qdc_median_wdy10_wmin1800_fm3_wgic_resampled.csv',
 '/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/QDC_data/resampled/Node_3_anon_gic_qdc_median_wdy10_wmin1800_fm3_wgic_resampled.csv',
 '/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/QDC_data/resampled/Node_6_anon_gic_qdc_median_wdy10_wmin1800_fm3_wgic_resampled.csv',
 '/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/QDC_data/resampled/Node_7_anon_gic_qdc_median_wdy10_wmin1800_fm3_wgic_resampled.csv',
 '/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/QDC_data/resampled/Node_5_anon_gic_qdc_median_wdy10_wmin1800_fm3_wgic_resampled.csv',
 '/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/QDC_data/resamp

### Full pipeline

In [31]:
# set details for SW event defintion
threshold_IMF = np.sqrt(47.7) # following Tindale (https://eprints.lse.ac.uk/90428/1/Watkins__dependence-of-solar.pdf)
#     18 # following https://eprints.lse.ac.uk/90428/1/Watkins__dependence-of-solar.pdf     
min_consecutive_steps_event_IMF = 10 # must remain above threshold for X consecutive time steps to be an event; 5 minutes is minimum event time following Tindale
min_consecutive_steps_eventbreak_IMF = 10 # must fall below threshold for X consecutive time steps to end an event

threshold_AE = 70 # following Consolini (https://agupubs.onlinelibrary.wiley.com/doi/epdf/10.1029/1998GL900073)
min_consecutive_steps_event_AE = 10 # must remain above threshold for X consecutive time steps to be an event
min_consecutive_steps_eventbreak_AE = 10 # must fall below threshold for X consecutive time steps to end an event


# set details for event definition
threshold = 0.5
# thresholds = [0.2, 0.5, 0.8, 1.1]
min_consecutive_steps_event = 10 # must remain above threshold for X consecutive time steps to be an event
min_consecutive_steps_eventbreak = 6 # must fall below threshold for X consecutive time steps to end an event

plotting_flag = 0


for sign_GIC in ['positive','negative']:
    
    
    for f in qdc_files:#[9:10]:
        # create dataframe to store results
        df_total = pd.DataFrame(columns=[#'event id',
                                       'event_start_idxs',
                                       'event_end_idxs',
                                       'event_start_times',
                                       'event_end_times',
                                       'event_sizes',
                                       'event_durations',
                                       'event_betweendistances'])
        
        df = pd.read_csv(f,index_col=False)

        print('\n\n\n\n----------------------------\nworking on node: {}'.format(f[84:90])) #f.find('event')
        print('\t--->{}'.format(f))

        # only need to do the solar wind calculation once
        if f[84:90] == 'Node_1':
            # Calculate solar wind variables, starting by getting one minute values
            df_SW_oneminute = get_sw_one_minute(df['DateTime'].values[0],df['DateTime'].values[-1])
            df_SW_oneminute['IMF magnitude'] = np.sqrt( df_SW_oneminute['BX_GSE']**2 + df_SW_oneminute['BY_GSM']**2 + df_SW_oneminute['BZ_GSM']**2 )


            # Calculate the event statistics for IMF and AE index data
            df_IMF_total = calculate_event_properties(f[97:-4], 'N/A', df_SW_oneminute['IMF magnitude'], df_SW_oneminute['datetimes'], threshold_IMF, min_consecutive_steps_event_IMF,  min_consecutive_steps_eventbreak_IMF,'positive')
            df_AE_total = calculate_event_properties(f[97:-4], 'N/A', df_SW_oneminute['AE_INDEX'], df_SW_oneminute['datetimes'], threshold_AE, min_consecutive_steps_event_AE,  min_consecutive_steps_eventbreak_AE,'positive')



#     for threshold in thresholds
        df_out = calculate_event_properties(f[97:-4], f[84:90], df['GIC'], df['DateTime'], threshold, min_consecutive_steps_event,  min_consecutive_steps_eventbreak,sign_GIC)
#         df_out = calculate_event_properties(f[97:-4], f[84:90], df[sensor_ID], df['datetimes'], threshold, min_consecutive_steps_event,  min_consecutive_steps_eventbreak,'negative')

        if plotting_flag == 1:

            # Calculate datetimes
            datetimes_viz = [pd.to_datetime(d).to_pydatetime() for d in df['datetimes']] 

            # Define specific datetimes for vertical lines
            datetimes_events_start = [datetimes_viz[d] for d in df_out['event_start_idxs'].values]
            datetimes_events_end = [datetimes_viz[d] for d in df_out['event_end_idxs'].values]

            plot_time_series_with_lines(datetimes_viz, df[sensor_ID], threshold, datetimes_events_start, datetimes_events_end, sensor_ID, df_AE_total)


            input("Press Enter to continue...")
            plt.close()

        df_total = pd.concat([df_total, df_out], axis=0)


        # save output from the event
        filesave_base = f[f.find('Node'):f.find('Node')+6].replace(" ", "").replace("_","")+'_'
        if not os.path.exists('/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/burst_event_data/QDC/'+str(threshold)+'/'+sign_GIC+'/'):
            os.makedirs('/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/burst_event_data/QDC/'+str(threshold)+'/'+sign_GIC+'/')
        df_total.to_csv( os.path.join('/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/burst_event_data/QDC/'+str(threshold)+'/'+sign_GIC+'/', filesave_base + 'GICevents.csv') )
        df_IMF_total.to_csv( os.path.join('/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/burst_event_data/QDC/'+str(threshold)+'/'+sign_GIC+'/', filesave_base + 'IMFevents.csv') )
        df_AE_total.to_csv( os.path.join('/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/burst_event_data/QDC/'+str(threshold)+'/'+sign_GIC+'/', filesave_base + 'AEevents.csv') )

                                         





----------------------------
working on node: Node_1
	--->/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/QDC_data/resampled/Node_1_anon_gic_qdc_median_wdy10_wmin1800_fm3_wgic_resampled.csv


Files Downloaded:   0%|          | 0/14 [00:00<?, ?file/s]

['/Users/ryanmc/sunpy/data/omni_hro2_1min_20180801_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20180901_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20181001_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20181101_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20181201_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20190101_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20190201_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20190301_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20190401_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20190501_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20190601_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20190701_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20190801_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20190901_v01.cdf']








Index(['AE_INDEX', 'BX_GSE', 'BY_GSM', 'BZ_GSM'], dtype='object')




----------------------------
working on node: Node_2
	--->/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/QDC_data/resampled/Node_2_anon_gic_qdc_median_wdy10_wmin1800_fm3_wgic_resampled.csv




----------------------------
working on node: Node_3
	--->/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/QDC_data/resampled/Node_3_anon_gic_qdc_median_wdy10_wmin1800_fm3_wgic_resampled.csv




----------------------------
working on node: Node_6
	--->/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/QDC_data/resampled/Node_6_anon_gic_qdc_median_wdy10_wmin1800_fm3_wgic_resampled.csv




----------------------------
working on node: Node_7
	--->/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/QDC_data/resampled/Node_7_anon_gic_qdc_median_wdy10_wmin1800_fm3_wgic_resampled.csv




----------------------------
working on node: Node_5
	--->/Users/ryanmc/Documents/Conf

Files Downloaded:   0%|          | 0/14 [00:00<?, ?file/s]

['/Users/ryanmc/sunpy/data/omni_hro2_1min_20180801_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20180901_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20181001_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20181101_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20181201_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20190101_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20190201_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20190301_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20190401_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20190501_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20190601_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20190701_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20190801_v01.cdf', '/Users/ryanmc/sunpy/data/omni_hro2_1min_20190901_v01.cdf']








Index(['AE_INDEX', 'BX_GSE', 'BY_GSM', 'BZ_GSM'], dtype='object')




----------------------------
working on node: Node_2
	--->/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/QDC_data/resampled/Node_2_anon_gic_qdc_median_wdy10_wmin1800_fm3_wgic_resampled.csv




----------------------------
working on node: Node_3
	--->/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/QDC_data/resampled/Node_3_anon_gic_qdc_median_wdy10_wmin1800_fm3_wgic_resampled.csv




----------------------------
working on node: Node_6
	--->/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/QDC_data/resampled/Node_6_anon_gic_qdc_median_wdy10_wmin1800_fm3_wgic_resampled.csv




----------------------------
working on node: Node_7
	--->/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/QDC_data/resampled/Node_7_anon_gic_qdc_median_wdy10_wmin1800_fm3_wgic_resampled.csv




----------------------------
working on node: Node_5
	--->/Users/ryanmc/Documents/Conf

In [21]:
df

Unnamed: 0,DateTime,GIC,GIC_QDC,IQR
0,2018-08-01 00:00:00,-0.65,,
1,2018-08-01 00:01:00,-0.61,,
2,2018-08-01 00:02:00,-0.58,,
3,2018-08-01 00:03:00,-0.61,,
4,2018-08-01 00:04:00,-0.61,,
...,...,...,...,...
601915,2019-09-22 23:55:00,-0.00,-0.0833,0.110
601916,2019-09-22 23:56:00,0.02,-0.0837,0.109
601917,2019-09-22 23:57:00,0.05,-0.0840,0.108
601918,2019-09-22 23:58:00,0.03,-0.0843,0.107


'Node_1'

In [19]:
filesave_base = f[f.find('Node'):f.find('Node')+6].replace(" ", "").replace("_","")+'_'
os.path.join('/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/burst_event_data/QDC/'+str(0.5)+'/'+'positive'+'/', filesave_base + 'GICevents.csv')




'/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/burst_event_data/QDC/0.5/positive/Node1_GICevents.csv'