This module queries the GOES event catalog and builds a dataframe containing each flare's onset, peak, and end times, as well as its location (optlocation), active region, peak flux, and flare class.

In [25]:
import pandas as pd
import datetime
from sunpy.instr.goes import get_goes_event_list
from sunpy.time import parse_time, TimeRange, is_time_in_given_format
from sunpy.instr.goes import flareclass_to_flux

In [4]:
#Query GOES event catalog to find all flares between 2005 and 2017
#(the upper bound is midnight on 2018-01-01, so all of 2017 is covered)
flare_2005, flare_2017 = (
    datetime.datetime.strptime(stamp, "%Y-%m-%dT%H:%M:00.000")
    for stamp in ('2005-01-01T00:00:00.000', '2018-01-01T00:00:00.000')
)

#format both bounds as the strings handed to TimeRange
t1_s = flare_2005.strftime('%Y/%m/%d %H:%M')
t2_s = flare_2017.strftime('%Y/%m/%d %H:%M')

#one catalog query covering the whole interval
g_evt_list = get_goes_event_list(TimeRange(t1_s, t2_s))

In [43]:
#Create new dataframe to hold new flares: an empty frame with one column per
#catalog field (fillna(0) is a no-op while the frame has no rows)
newFlr_df = pd.DataFrame(
    columns=[
        'FlrOnset', 'Flrmaxtime', 'Flrendtime', 'FlrPeakFlux', 'xrsclass',
        'optlocation', 'region',
        'TypeII', 'TypeIIDur', 'TypeIV', 'TypeIVDur',
        'cmeonset', 'cmespeed', 'cmewidth',
        'FlrIntFlux', 'FlrIntFlux2', 'tchianti', 'emchianti',
    ]
).fillna(0)

In [89]:
import numpy as np
import operator

#one list per dataframe column; every kept event appends exactly one entry to each
FlrOnset = []
Flrmaxtime = []
Flrendtime = []
region = []
FlrPeakFlux = []
xrsclass = []
optlocation = []

flare_types = ['C','M','X']

#times are stored as strings at minute resolution (seconds are written as 00.000)
flare_time_format = '%Y-%m-%dT%H:%M:00.000'

#looping through the GOES event list
for event in g_evt_list:

    #return the GOES flare class
    g_class = event['goes_class']

    #keep only C, M, or X class events (slicing, so an empty class string is skipped
    #instead of raising IndexError)
    if g_class[:1] not in flare_types:
        continue

    #time and location information
    flare_start = datetime.datetime.strftime(event['start_time'], flare_time_format)
    flare_peak = datetime.datetime.strftime(event['peak_time'], flare_time_format)
    flare_end = datetime.datetime.strftime(event['end_time'], flare_time_format)
    flare_loc = event['goes_location']
    active_region = event['noaa_active_region']

    #peak flux from the flare class. The catalog does not always record the class
    #properly (e.g. "C?"), so the conversion itself has to sit inside the try:
    #a failed conversion is stored as NaN instead of aborting the whole loop.
    try:
        val = flareclass_to_flux(g_class).to_value(unit=None) #remove astropy units from flare peak flux
    except Exception:
        val = np.nan

    #location string: hemisphere letter followed by the absolute value of the
    #coordinate, latitude first, e.g. N12E2 (values are not zero padded).
    #flare_loc[0] gives the E/W letter and flare_loc[1] the N/S letter; the else
    #branches make sure both letters are set for every event rather than being
    #carried over from a previous iteration.
    EW_loc = 'W' if flare_loc[0] >= 0 else 'E'
    NS_loc = 'N' if flare_loc[1] >= 0 else 'S'
    flrLoc = NS_loc + str(abs(flare_loc[1])) + EW_loc + str(abs(flare_loc[0]))

    #append to all lists together, after everything that can fail, so the
    #columns always stay the same length
    FlrOnset.append(flare_start)
    Flrmaxtime.append(flare_peak)
    Flrendtime.append(flare_end)
    region.append(active_region)
    FlrPeakFlux.append(val)
    xrsclass.append(g_class)
    optlocation.append(flrLoc)

#add to dataframe
newFlr_df['FlrOnset'] = FlrOnset
newFlr_df['Flrmaxtime'] = Flrmaxtime
newFlr_df['Flrendtime'] = Flrendtime
newFlr_df['region'] = region
newFlr_df['FlrPeakFlux'] = FlrPeakFlux
newFlr_df['xrsclass'] = xrsclass
newFlr_df['optlocation'] = optlocation

In [92]:
#write the flare table to disk (to_csv is called with its defaults, so the row index is written too)
newFlr_df.to_csv('NewFlares_2005_2017.csv')

Now get the GOES temperature, emission measure (EM), and integrated flux.

In [None]:
#create datetime objects from date strings
# NOTE(review): `df` is not assigned in any cell shown here; presumably it is the
# flare table written to NewFlares_2005_2017.csv (same column names) - confirm.
flare_max = df['Flrmaxtime']
flare_start = df['FlrOnset']
flare_end = df['Flrendtime']


def _to_timestamps(date_strings):
    """Parse '%Y-%m-%dT%H:%M:%S.000' strings into a list of pandas Timestamps."""
    return [
        pd.Timestamp(ts_input=datetime.datetime.strptime(text, "%Y-%m-%dT%H:%M:%S.000"))
        for text in date_strings
    ]


#convert flare max, start and end times to datetime objects.
#end_time is built here as well because the integration cell below slices the
#GOES data from start_time[j] to end_time[j]; no other active cell assigns it,
#so on a fresh kernel that lookup would fail.
max_time = _to_timestamps(flare_max)
start_time = _to_timestamps(flare_start)
end_time = _to_timestamps(flare_end)



The following cell gets the temperature, EM, and integrated flux values for the flares. Because the code crashes often due to the internet connection, it saves files at several checkpoints.

NOTE: some files contain data from previous files. This problem has not yet been investigated for lack of time. Be sure that you are not concatenating files with repeated data.

In [None]:
#read 1 minute GOES data

import calendar
import io
import urllib.request

import numpy as np
import pandas as pd
import requests
from bs4 import BeautifulSoup
from sunpy.lightcurve import LightCurve
from sunpy.instr.goes import calculate_temperature_em

#root of the monthly directories that hold the averaged GOES data
GOES_AVG_URL = "https://satdat.ngdc.noaa.gov/sem/goes/data/avg/"

#comment out after first run
int_xs = [] #short xray
int_xl = [] #long xray
intflux_s = []
intflux_l = []
t_chianti = []
em_chianti = []
pkflux = []

#points at which to save a new file
checkpoints = [500,1000,1500,2000,2500,3000,3500,4000,4500,5000,5500,6000,6500,7000,7500,8000,8500,9000,9620]

#choose points to check progress of codes
check_prog = []

#holds only the most recently downloaded month, so consecutive flares from the
#same month reuse one download instead of fetching the file again per flare
_month_cache = {}


def _month_end_day(year, month):
    """Return the last day of the month as a two digit string (leap years included)."""
    return '%02d' % calendar.monthrange(int(year), int(month))[1]


def _xrs_csv_url(year, month, sat_dir):
    """Build the url of the 1 minute xrs csv file for a satellite directory such as 'goes15/'."""
    return (GOES_AVG_URL + year + "/" + month + '/' + sat_dir + "csv/g" + sat_dir[4:6]
            + "_xrs_1m_" + year + month + "01" + "_" + year + month
            + _month_end_day(year, month) + ".csv")


def _read_url(url):
    """Download url and return its content decoded as utf-8; the connection is always closed."""
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8')


def _parse_xrs_csv(text, sat_num):
    """Turn the text of a monthly xrs csv file into a frame indexed by time_tag with columns xs, xl."""
    #the data starts on the line after the (last) row that begins with "time_tag"
    header_rows = [rownum for rownum, row in enumerate(text.split('\n')) if row[0:8] == "time_tag"]
    if not header_rows:
        raise ValueError("no time_tag header row found in GOES csv file")

    read_kwargs = {'skiprows': header_rows[-1] + 1, 'names': ['time_tag','xs','xl']}
    if sat_num != '12':
        #for every satellite except GOES 12 the two flux columns are read from positions 3 and 6
        read_kwargs['usecols'] = [0,3,6]

    table = pd.read_csv(io.StringIO(text), **read_kwargs)
    table['time_tag'] = pd.to_datetime(table['time_tag'], format="%Y-%m-%d %H:%M:%S.%f")
    return table.set_index('time_tag')


def _load_month(year, month):
    """Download one month of 1 minute data; return (frame, satellite directory actually used)."""
    #directory listing for the month, used to pick the satellite directory
    res = requests.get(GOES_AVG_URL + year + "/" + month + '/')
    soup = BeautifulSoup(res.content,'lxml')
    listing = pd.read_html(io.StringIO(str(soup.find_all('table'))))[1]
    names = list(listing.Name)

    #use the second to last entry of the listing; if its file cannot be read,
    #fall back to the entry before it
    try:
        sat_dir = names[-2]
        text = _read_url(_xrs_csv_url(year, month, sat_dir))
    except Exception:
        sat_dir = names[-3]
        text = _read_url(_xrs_csv_url(year, month, sat_dir))

    return _parse_xrs_csv(text, sat_dir[4:6]), sat_dir


def _get_month(year, month):
    """Return (frame, satellite directory) for the month, downloading only if it is not the cached one."""
    key = (year, month)
    if key not in _month_cache:
        loaded = _load_month(year, month)
        _month_cache.clear()
        _month_cache[key] = loaded
    return _month_cache[key]


def _flare_metrics(month_data, sat_dir, t_start, t_end):
    """Return (sum_xs, sum_xl, flx_s, flx_l, temp, em, peak) for one flare.

    month_data is the frame from _parse_xrs_csv, sat_dir the satellite directory
    it came from, t_start / t_end the datetime bounds of the flare.
    """
    #find just the subset of dates between the flare start time and end time
    # NOTE(review): only the month of the flare peak is loaded, so a flare that
    # crosses a month boundary is cut off at the edge of the file.
    t1_s = datetime.datetime.strftime(t_start, '%Y/%m/%d %H:%M')
    t2_s = datetime.datetime.strftime(t_end, '%Y/%m/%d %H:%M')
    data_subset = month_data.loc[t1_s:t2_s].copy()

    #sum over the time - 60 second time intervals
    # NOTE(review): rows flagged -99999.0 are not removed before these sums - confirm this is intended
    sum_xrsa = (60 * data_subset['xs']).sum()
    sum_xrsb = (60 * data_subset['xl']).sum()

    #integrated flux with background radiation subtracted; the first sample of
    #the interval is used as the background, so an empty interval gives NaN
    if len(data_subset) > 0:
        flx_s = (60 * (data_subset['xs'] - data_subset['xs'].iloc[0])).sum()
        flx_l = (60 * (data_subset['xl'] - data_subset['xl'].iloc[0])).sum()
    else:
        flx_s = np.nan
        flx_l = np.nan

    #flare peak flux - taken before the temperature step so that a failure there
    #cannot leave a stale value from an earlier flare
    flrpk = data_subset['xl'].max()

    ###temperature and emission measures
    try:
        #convert the 1 min GOES data into a GOES lightcurve object - required for temp and em calculation
        lc = LightCurve.create({"xrsa": data_subset['xs'], "xrsb":data_subset['xl']}, index = data_subset.index)

        #add the satellite id to the meta data of the lightcurve object e.g. "GOES 12";
        #it is taken from the directory the data was really read from
        lc.meta["TELESCOP"] = sat_dir[:4].upper() + ' ' + sat_dir[4:6]

        #keep only the rows that don't have an xray flux of -99999 (marks a bad value)
        lc.data.xrsa = lc.data.xrsa[lc.data.xrsa != -99999.0]
        lc.data.xrsb = lc.data.xrsb[lc.data.xrsb != -99999.0]

        #calculate the temperature and emission measure, keep the flare maximum of each
        lc_new = calculate_temperature_em(lc)
        temp = lc_new.data.temperature.max()
        em = lc_new.data.em.max()
    except Exception:
        temp = np.nan
        em = np.nan

    return (sum_xrsa, sum_xrsb, flx_s, flx_l, temp, em, flrpk)


_result_lists = (int_xs, int_xl, intflux_s, intflux_l, t_chianti, em_chianti, pkflux)
_nan_row = (np.nan,) * len(_result_lists)

#start_time and end_time must be lists parallel to the rows of df, defined before this cell runs.
#The loop starts at len(pkflux): with freshly reset lists that is 0; if the lists
#above were kept from an interrupted run it continues where that run stopped
#instead of appending the same flares a second time.
for j in range(len(pkflux), len(start_time)):
    if j in check_prog:
        print(j)

    row = _nan_row
    if isinstance(df['Flrmaxtime'][j],str) and isinstance(start_time[j],datetime.date): #check for nan and correct dates
        flare_dt = datetime.datetime.strptime(df['Flrmaxtime'][j],'%Y-%m-%dT%H:%M:00.000')
        year = flare_dt.strftime('%Y')
        month = flare_dt.strftime('%m')

        #looked up outside the try so that a missing end_time list is reported
        #as an error instead of silently turning every flare into NaN
        t_start = start_time[j]
        t_end = end_time[j]

        try:
            month_data, sat_dir = _get_month(year, month)
            row = _flare_metrics(month_data, sat_dir, t_start, t_end)
        except Exception as err:
            #download or parsing failed: keep going with NaN for this flare, but say so
            print('flare %d: no GOES data (%r)' % (j, err))
            row = _nan_row

    #exactly one entry per list per flare keeps all the lists aligned with df
    for store, value in zip(_result_lists, row):
        store.append(value)

    #save at every checkpoint and also after the last flare
    if j in checkpoints or j == len(start_time) - 1:
        temp_df = pd.DataFrame(
            {
                'IntFlux_s': int_xs,
                'IntFlux_l': int_xl,
                'Intbkg_s': intflux_s,
                'Intbkg_l': intflux_l,
                'tchianti': t_chianti,
                'emchianti': em_chianti,
            },
            columns=['IntFlux_s','IntFlux_l','Intbkg_s','Intbkg_l','tchianti','emchianti'],
        )

        file_str = 'new_events_' + str(j) + 'n.csv'
        temp_df.to_csv(file_str) #change number
        print(j)



This cell needs to contain some code to bring together the incremental files to make final_df

In [None]:
# #fixing indices
# intfluxl = final_df['IntFlux_l'][:]
# intfluxs = final_df['IntFlux_s'][:]
# t_chianti = final_df['tchianti'][:]
# em_chianti = final_df['emchianti'][:]

# intfluxl.index = range(0,len(intfluxl))
# intfluxs.index = range(0,len(intfluxs))
# t_chianti.index = range(0,len(t_chianti))
# em_chianti.index = range(0,len(em_chianti))

# df['FlrIntFlux'] = intfluxl
# df['FlrIntFlux2'] = intfluxs
# df['tchianti'] = t_chianti
# df['emchianti'] = em_chianti


In [None]:
# NOTE(review): final_df is not assigned in any active cell shown here (the cell
# above is commented out); it has to be built from the checkpoint files first.
final_df.to_csv('new_events_df.csv') 

Only run the next cell if you need to calculate the flare peak flux because it is very slow

In [None]:
# ####ONLY RUN THIS IF YOU NEED THE FLARE PEAK FLUX####

# #from datetime import timedelta, datetime #, strptime, strftime
# from sunpy.instr.goes import get_goes_event_list
# from sunpy.time import parse_time, TimeRange, is_time_in_given_format
# from sunpy.instr.goes import flareclass_to_flux
# from sunpy.lightcurve import GOESLightCurve
# from datetime import timedelta
# import datetime
# import time

# flare_max = df['Flrmaxtime']

# goes_class = []
# start_time = []
# end_time = []

# x = -1
# for g in range(len(flare_max)):
#     time.sleep(0.1)
#     if isinstance(g,str) == True: #make sure its not nan
#         x = x+1
#         error = 0 #to check whether or not we were able to match peak times (refer to nested for loop)
#         flare_dt = datetime.datetime.strptime(g, "%Y-%m-%dT%H:%M:00.000") 

#         #convert to the format that needed for sunny TimeFrame function - add some time to the start and end times
#         t1 = flare_dt - timedelta(seconds = (60*60*12))   
#         t2 = flare_dt + timedelta(seconds = (60*60*12))   

#         t1_s = datetime.datetime.strftime(t1, '%Y/%m/%d %H:%M')  
#         t2_s = datetime.datetime.strftime(t2, '%Y/%m/%d %H:%M')

#         #query the goes event list for all flares before and after the flare peak time of the event you are interested in
#         g_evt_list = get_goes_event_list(TimeRange(t1_s, t2_s))

#         for i in g_evt_list:
#             if i['peak_time'] == flare_dt: #match peak times in our data and the goes catalog
#                 gc = i['goes_class']
#                 et = i['end_time']
#                 st = i['start_time']
#                 flx = flareclass_to_flux(gc)
#                 goes_class.append(flx)
#                 end_time.append(et)
#                 start_time.append(st)
#                 error = 1
                
#         if error == 0: #only if peak times didn't match
#             goes_class.append(np.nan)
#             end_time.append(np.nan)
#             start_time.append(np.nan)
        
        
        
#     else:
#         goes_class.append(np.nan)
#         end_time.append(np.nan)
#         start_time.append(np.nan)
        
# #sep_new['Importance'] = goes_class