# Drag Database V3

Satellite drag database to develop a new machine learning algorithm that incorporates altitude.

## Datasets

All at 5 minute cadence

- Grace B
    - add geomagnetic coordinates
- Omni
- FISM2
- MSIS profiles

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import os, sys
import pandas as pd
import numpy as np
import xarray as xr
import glob

# for converting to
# geomagnetic coord
import aacgmv2

# add the DataIO repository to the module search path
# and import data_io from it
file_path = 'D:\\GitHub\\DataIO\\'
sys.path.append(os.path.dirname(file_path))
import data_io as dio

In [7]:
# time window to read in (date strings)
sdate = '2018-01-01'
edate = '2024-01-01'

# number of years handed to the loaders: the span expressed in
# 365.2-day years, plus one, truncated to an integer
span = pd.to_datetime(edate) - pd.to_datetime(sdate)
ldate = int(span.total_seconds() / (365.2 * 86400) + 1)

In [8]:
# read the satellite density data for the chosen window
sat = 'GC'
gr_d, gr_u, gr_m = dio.toleos_den.load_toleos(
    sat=sat,
    sdate=sdate,
    edate=edate,
)

# first and last satellite timestamps; used below to trim the
# other data sets to the same period
gr_times = gr_d['DateTime']
d_min, d_max = gr_times.min(), gr_times.max()

In [12]:
# print the metadata object returned alongside the satellite data
print(gr_m)

# Column  2:         Time (hh:mm:ss.sss)
# Column  3:         Time system: UTC or GPS (differs per mission)
# Column  4:  f10.3  Altitude (m), GRS80
# Column  5:   f8.3  Geodetic longitude (deg), GRS80
# Column  6:   f7.3  Geodetic latitude (deg), GRS80
# Column  7:   f6.3  Local solar time (h)
# Column  8:   f7.3  Argument of latitude (deg)
# Column  9:  e15.8  Density derived from accelerometer measurements (kg/m3)
# Column 10:  e15.8  Running orbit average of density (kg/m3)
# Column 11:   f4.1  Flag for density: 0 = nominal data, 1 = anomalous data (-)
# Column 12:   f4.1  Flag for running orbit average density: 0 = nominal data, 1 = anomalous data (-)



In [13]:
# read the 5-minute OMNI data for `ldate` years starting at `sdate`
om_d, om_m = dio.load_omni(res='5m', sdate=sdate, nd=ldate)

# keep only the satellite time span, padded by 5 minutes on each side
om_pad = pd.DateOffset(minutes=5)
om_in_span = (om_d['DateTime'] >= d_min - om_pad) & (om_d['DateTime'] <= d_max + om_pad)
om_d = om_d[om_in_span]

In [14]:
# read the FISM2 data
fi_d, fi_m = dio.load_fism2()

# keep only the satellite time span, padded by 5 minutes on each side
fi_pad = pd.DateOffset(minutes=5)
fi_in_span = (fi_d['DateTime'] >= d_min - fi_pad) & (fi_d['DateTime'] <= d_max + fi_pad)
fi_d = fi_d[fi_in_span]

## Combine the DataFrames together

Use a time delta (matching tolerance) of 2.5 minutes, i.e. half the 5 minute cadence of the OMNI and FISM2 data sets. 

When combining the grace data use a time delta of 50 seconds (the largest difference in the grace cadence)


In [15]:
# matching tolerance for the OMNI/FISM2 nearest-time merge
tol = pd.Timedelta(minutes=2.5)

In [16]:
# build the combined OMNI + FISM2 table

# keep the FISM2 timestamp under its own column name and index by it
fi_d = fi_d.rename(columns={'DateTime': 'DateTime_fism2'})
fi_d = fi_d.set_index('DateTime_fism2', drop=False)

# index OMNI by its timestamp (the column itself is kept)
om_d = om_d.set_index('DateTime', drop=False)

# nearest-in-time match of FISM2 onto every OMNI row, within `tol`;
# afterwards the OMNI timestamp column gets a source-specific name
database = pd.merge_asof(
    left=om_d,
    right=fi_d,
    left_index=True,
    right_index=True,
    direction='nearest',
    tolerance=tol,
).rename(columns={'DateTime': 'DateTime_omni'})

om_d.shape
database.shape

(272909, 75)

In [17]:
# attach the satellite data to the OMNI/FISM2 table

# keep the satellite timestamp under its own column name and index by it
gr_d = gr_d.rename(columns={'DateTime': 'DateTime_gr'})
gr_d = gr_d.set_index('DateTime_gr', drop=False)

# tighter matching tolerance for the satellite samples
tol = pd.Timedelta(seconds=50)

database = pd.merge_asof(
    left=database,
    right=gr_d,
    left_index=True,
    right_index=True,
    direction='nearest',
    tolerance=tol,
)

om_d.shape
database.shape

(272909, 88)

In [18]:
# get storm times

# read the storm list: whitespace-separated text, first line skipped,
# three timestamp columns per storm (t_st, t_dst, t_en)
storm_txt = 'D:\\GitHub\\SatDrag\\data\\storms_drag_epochs_no_overlap.txt'
storm_time = pd.read_csv(
    storm_txt,
    header=None,
    skiprows=1,
    # sep=r'\s+' is the documented equivalent of the deprecated
    # delim_whitespace flag; the deprecated infer_datetime_format
    # argument is dropped
    sep=r'\s+',
    names=['t_st', 't_dst', 't_en'],
    parse_dates=[0, 1, 2],
)

# range of storm start times (only the last expression is displayed)
storm_time['t_st'].min()
storm_time['t_st'].max()

  storm_time = pd.read_csv(storm_txt, header=None, skiprows=1,
  storm_time = pd.read_csv(storm_txt, header=None, skiprows=1,


Timestamp('2017-12-02 01:00:00')

In [20]:
# show the last rows of the storm list
storm_time.tail()

Unnamed: 0,t_st,t_dst,t_en
438,2017-10-11 05:00:00,2017-10-14 05:00:00,2017-10-18 21:00:00
439,2017-10-23 22:00:00,2017-10-26 14:00:00,2017-10-27 15:00:00
440,2017-11-05 06:00:00,2017-11-08 01:00:00,2017-11-12 22:00:00
441,2017-11-18 14:00:00,2017-11-21 06:00:00,2017-11-24 17:00:00
442,2017-12-02 01:00:00,2017-12-04 20:00:00,2017-12-08 22:00:00


## Add storm times to the dataframe

In [21]:
# list the columns of the merged table
database.columns

Index(['DateTime_omni', 'Year', 'DOY', 'Hour', 'Minute', 'IMF_id', 'SW_id',
       'IMF_pt', 'SW_pt', 'Per_int', 'Timeshift', 'RMS_Timeshift',
       'RMS_PhaseFrontNormal', 'Time_btwn_observations', 'B', 'Bx_GSEGSM',
       'By_GSE', 'Bz_GSE', 'By_GSM', 'Bz_GSM', 'RMS_SD_B',
       'RMS_SD_field_vector', 'Vsw', 'Vx_GSE', 'Vy_GSE', 'Vz_GSE', 'Prho',
       'Tp', 'dynP', 'Esw', 'Beta', 'AlfvenMach', 'X(s/c), GSE', 'Y(s/c), GSE',
       'Z(s/c), GSE', 'BSN location, Xgse', 'BSN location, Ygse',
       'BSN location, Zgse', 'AE', 'AL', 'AU', 'SYM_D index', 'SYM_H index',
       'ASY_D index', 'ASY_H index', 'PC index', 'Na_Np Ratio',
       'MagnetosonicMach', 'Goes Proton flux (>10 MeV)',
       'Goes Proton flux (>30 MeV)', 'Goes Proton flux (>60 MeV)',
       'DateTime_fism2', '225_00', '600_01', '1300_02', '2500_03', '5100_04',
       '11250_05', '18950_06', '25700_07', '30500_08', '43000_09', '59500_10',
       '72400_11', '72400_12', '85550_13', '85550_14', '85550_15', '94400_16',
 

In [22]:
# flag every row that falls inside one of the listed storms
#   'storm'       :  1 for t_st <= DateTime < t_en,   otherwise -1
#   'storm phase' :  1 for t_st <= DateTime < t_dst,
#                    2 for t_dst <= DateTime < t_en,  otherwise -1

# turn the time index back into a regular 'DateTime' column
database = database.reset_index()
database['storm'] = -1
database['storm phase'] = -1

for _, storm in storm_time.iterrows():
    t_start, t_mid, t_end = storm['t_st'], storm['t_dst'], storm['t_en']

    from_start = database['DateTime'] >= t_start
    from_mid = database['DateTime'] >= t_mid
    until_mid = database['DateTime'] < t_mid
    until_end = database['DateTime'] < t_end

    database.loc[from_start & until_end, 'storm'] = 1
    database.loc[from_start & until_mid, 'storm phase'] = 1
    database.loc[from_mid & until_end, 'storm phase'] = 2

In [23]:
# list the columns again after the index reset and storm flags
database.columns

Index(['DateTime', 'DateTime_omni', 'Year', 'DOY', 'Hour', 'Minute', 'IMF_id',
       'SW_id', 'IMF_pt', 'SW_pt', 'Per_int', 'Timeshift', 'RMS_Timeshift',
       'RMS_PhaseFrontNormal', 'Time_btwn_observations', 'B', 'Bx_GSEGSM',
       'By_GSE', 'Bz_GSE', 'By_GSM', 'Bz_GSM', 'RMS_SD_B',
       'RMS_SD_field_vector', 'Vsw', 'Vx_GSE', 'Vy_GSE', 'Vz_GSE', 'Prho',
       'Tp', 'dynP', 'Esw', 'Beta', 'AlfvenMach', 'X(s/c), GSE', 'Y(s/c), GSE',
       'Z(s/c), GSE', 'BSN location, Xgse', 'BSN location, Ygse',
       'BSN location, Zgse', 'AE', 'AL', 'AU', 'SYM_D index', 'SYM_H index',
       'ASY_D index', 'ASY_H index', 'PC index', 'Na_Np Ratio',
       'MagnetosonicMach', 'Goes Proton flux (>10 MeV)',
       'Goes Proton flux (>30 MeV)', 'Goes Proton flux (>60 MeV)',
       'DateTime_fism2', '225_00', '600_01', '1300_02', '2500_03', '5100_04',
       '11250_05', '18950_06', '25700_07', '30500_08', '43000_09', '59500_10',
       '72400_11', '72400_12', '85550_13', '85550_14', '85550_15', '

In [24]:
# discard rows whose satellite timestamp is missing
database = database.dropna(subset=['DateTime_gr'])
print(database.shape)

(269327, 91)


## Add AACGM Coordinates

In [18]:
# pull position and time out of the table as plain arrays
lat, lon = (database[name].to_numpy() for name in ('lat', 'lon'))
# altitude scaled by 1/1000 (listed in metres in the column
# description printed above, so this gives kilometres)
alt = database['alt'].to_numpy() / 1000.
dt = pd.to_datetime(database['DateTime_gr'].to_numpy())



In [20]:
# convert sample by sample: each (lat, lon, alt, time) tuple is passed
# positionally to aacgmv2; the three returned values per sample are
# stored below as mlat, mlon and mlt
x = [aacgmv2.get_aacgm_coord(*sample) for sample in zip(lat, lon, alt, dt)]

unable to perform conversion at 10.4, -22.8 509.7 km, 2002-04-04 12:10:00 using method 4 <AACGM_v2_Convert returned error code -1>. Recall that AACGMV2 is undefined near the equator.
unable to perform conversion at 5.4, -19.9 483.9 km, 2002-04-04 23:55:00 using method 4 <AACGM_v2_Convert returned error code -1>. Recall that AACGMV2 is undefined near the equator.
unable to perform conversion at 4.1, -13.9 510.2 km, 2002-04-06 11:25:00 using method 4 <AACGM_v2_Convert returned error code -1>. Recall that AACGMV2 is undefined near the equator.
unable to perform conversion at 11.7, -10.9 483.3 km, 2002-04-06 23:10:00 using method 4 <AACGM_v2_Convert returned error code -1>. Recall that AACGMV2 is undefined near the equator.
unable to perform conversion at 10.6, -9.8 510.3 km, 2002-04-07 11:05:00 using method 4 <AACGM_v2_Convert returned error code -1>. Recall that AACGMV2 is undefined near the equator.
unable to perform conversion at 5.1, -6.9 482.5 km, 2002-04-07 22:50:00 using method 4 <

In [24]:
# stack the per-sample results into an (n, 3) array and store each
# column in the table under its own name
x = np.array(x)
for col_idx, col_name in enumerate(('mlat', 'mlon', 'mlt')):
    database[col_name] = x[:, col_idx]

In [25]:
# preview the first rows, including the new mlat/mlon/mlt columns
database.head()

Unnamed: 0,DateTime,DateTime_omni,Year,DOY,Hour,Minute,IMF_id,SW_id,IMF_pt,SW_pt,...,dens_x,dens_mean,flag_0,flag_1,DateTime_gr,storm,storm phase,mlat,mlon,mlt
0,2002-04-04 00:50:00,2002-04-04 00:50:00,2002,94,0,50,71.0,71.0,5.0,5.0,...,1.344974e-12,1.242133e-12,0.0,0.0,2002-04-04 00:50:00,1,2,-71.403237,-126.574355,11.594145
1,2002-04-04 00:55:00,2002-04-04 00:55:00,2002,94,0,55,71.0,71.0,5.0,4.0,...,1.46832e-12,1.239372e-12,0.0,0.0,2002-04-04 00:55:00,1,2,-52.080051,-136.756996,10.997453
2,2002-04-04 01:00:00,2002-04-04 01:00:00,2002,94,1,0,71.0,71.0,5.0,5.0,...,1.643068e-12,1.23661e-12,0.0,0.0,2002-04-04 01:00:00,1,2,-33.379533,-141.350071,10.773452
3,2002-04-04 01:05:00,2002-04-04 01:05:00,2002,94,1,5,71.0,71.0,5.0,5.0,...,1.700921e-12,1.234232e-12,0.0,0.0,2002-04-04 01:05:00,1,2,-18.508417,-144.383796,10.653469
4,2002-04-04 01:10:00,2002-04-04 01:10:00,2002,94,1,10,71.0,71.0,5.0,5.0,...,1.745759e-12,1.231855e-12,0.0,0.0,2002-04-04 01:10:00,1,2,18.645368,-146.158078,10.617515


## Save data

In [25]:
# NOTE(review): this is a plain string, not an f-string, so '{sat}' stays
# literally in the file name -- confirm whether f'...' was intended; the
# read cells below use the same literal path, so change them together
fn = 'D:\\data\\SatDensities\\satdrag_database_grace_{sat}_v3.hdf5'

# renumber the rows from zero (old index discarded), then overwrite the
# HDF5 file with the table
database = database.reset_index(drop=True)
database.to_hdf(
    fn,
    key='database',
    mode='w',
    format='table',
    complevel=7,
)

In [26]:
# read the saved table back in
# NOTE(review): '{sat}' is literal here (no f-string) -- confirm intent
database = pd.read_hdf(
    'D:\\data\\SatDensities\\satdrag_database_grace_{sat}_v3.hdf5'
)

In [27]:
# list the columns of the reloaded table
database.columns

Index(['DateTime', 'DateTime_omni', 'Year', 'DOY', 'Hour', 'Minute', 'IMF_id',
       'SW_id', 'IMF_pt', 'SW_pt', 'Per_int', 'Timeshift', 'RMS_Timeshift',
       'RMS_PhaseFrontNormal', 'Time_btwn_observations', 'B', 'Bx_GSEGSM',
       'By_GSE', 'Bz_GSE', 'By_GSM', 'Bz_GSM', 'RMS_SD_B',
       'RMS_SD_field_vector', 'Vsw', 'Vx_GSE', 'Vy_GSE', 'Vz_GSE', 'Prho',
       'Tp', 'dynP', 'Esw', 'Beta', 'AlfvenMach', 'X(s/c), GSE', 'Y(s/c), GSE',
       'Z(s/c), GSE', 'BSN location, Xgse', 'BSN location, Ygse',
       'BSN location, Zgse', 'AE', 'AL', 'AU', 'SYM_D index', 'SYM_H index',
       'ASY_D index', 'ASY_H index', 'PC index', 'Na_Np Ratio',
       'MagnetosonicMach', 'Goes Proton flux (>10 MeV)',
       'Goes Proton flux (>30 MeV)', 'Goes Proton flux (>60 MeV)',
       'DateTime_fism2', '225_00', '600_01', '1300_02', '2500_03', '5100_04',
       '11250_05', '18950_06', '25700_07', '30500_08', '43000_09', '59500_10',
       '72400_11', '72400_12', '85550_13', '85550_14', '85550_15', '

In [29]:
# show the last four rows of the reloaded table
database.tail(4)

Unnamed: 0,DateTime,DateTime_omni,Year,DOY,Hour,Minute,IMF_id,SW_id,IMF_pt,SW_pt,...,lat,lst,arglat,dens_x,dens_mean,flag_0,flag_1,DateTime_gr,storm,storm phase
269323,2020-12-31 23:40:00,2020-12-31 23:40:00,2020,366,23,40,51.0,51.0,5.0,4.0,...,-75.583,12.883,255.573,1.315893e-13,9.573114e-14,0.0,0.0,2020-12-31 23:40:00,-1,-1
269324,2020-12-31 23:45:00,2020-12-31 23:45:00,2020,366,23,45,51.0,51.0,5.0,3.0,...,-85.351,23.788,274.608,1.067437e-13,9.630565e-14,0.0,0.0,2020-12-31 23:45:00,-1,-1
269325,2020-12-31 23:50:00,2020-12-31 23:50:00,2020,366,23,50,51.0,51.0,5.0,3.0,...,-66.521,0.466,293.629,5.837071e-14,9.688016e-14,0.0,0.0,2020-12-31 23:50:00,-1,-1
269326,2020-12-31 23:55:00,2020-12-31 23:55:00,2020,366,23,55,51.0,51.0,5.0,3.0,...,-47.568,0.547,312.64,3.284377e-14,9.697857e-14,0.0,0.0,2020-12-31 23:55:00,-1,-1


### Add GOES XRS

In [40]:
# start from the table saved on disk
# NOTE(review): plain string, not an f-string, so '{sat}' is literal in
# the file name -- confirm whether f'...' was intended
fn = 'D:\\data\\SatDensities\\satdrag_database_grace_{sat}_v3.hdf5'
database = pd.read_hdf(
    fn
)

In [43]:
# GOES satellites to add and the matching tolerance for the time merge
g_sat = ['g11', 'g15']
g_tol = pd.Timedelta('1 minute')

# columns we are interested in comparing
gcol = ['xrsa_flux', 'xrsb_flux']

# data dir
d_dir = "D:/data/GOES/XRS/"

# read in netcdf files with xarray and merge each satellite's columns
# onto the database by nearest time within g_tol
for g_name in g_sat:
    p = glob.glob(f"{d_dir}*{g_name}*.nc")
    if len(p) != 1:
        # exactly one file per satellite is expected; say so instead of
        # skipping silently, so a missing satellite is noticed
        print(f"skipping {g_name}: expected 1 file in {d_dir}, found {len(p)}")
        continue

    # the context manager closes the netcdf file once the selected
    # columns have been copied into a DataFrame
    with xr.open_dataset(p[0]) as gdr:
        gdf = gdr.to_dataframe()[gcol].add_suffix(f'_{g_name}')

    # move 'time' out of the index and expose it as the merge key
    gdf = gdf.reset_index()
    gdf['DateTime'] = pd.to_datetime(gdf['time'])
    gdf = gdf.drop('time', axis=1)

    database = pd.merge_asof(
        left=database,
        right=gdf,
        left_on='DateTime',
        right_on='DateTime',
        direction='nearest',
        tolerance=g_tol,
    )
    # release the per-satellite frame before reading the next file
    del gdf
    

In [44]:
# list the columns after the GOES XRS merge
database.columns

Index(['DateTime', 'DateTime_omni', 'Year', 'DOY', 'Hour', 'Minute', 'IMF_id',
       'SW_id', 'IMF_pt', 'SW_pt', 'Per_int', 'Timeshift', 'RMS_Timeshift',
       'RMS_PhaseFrontNormal', 'Time_btwn_observations', 'B', 'Bx_GSEGSM',
       'By_GSE', 'Bz_GSE', 'By_GSM', 'Bz_GSM', 'RMS_SD_B',
       'RMS_SD_field_vector', 'Vsw', 'Vx_GSE', 'Vy_GSE', 'Vz_GSE', 'Prho',
       'Tp', 'dynP', 'Esw', 'Beta', 'AlfvenMach', 'X(s/c), GSE', 'Y(s/c), GSE',
       'Z(s/c), GSE', 'BSN location, Xgse', 'BSN location, Ygse',
       'BSN location, Zgse', 'AE', 'AL', 'AU', 'SYM_D index', 'SYM_H index',
       'ASY_D index', 'ASY_H index', 'PC index', 'Na_Np Ratio',
       'MagnetosonicMach', 'Goes Proton flux (>10 MeV)',
       'Goes Proton flux (>30 MeV)', 'Goes Proton flux (>60 MeV)',
       'DateTime_fism2', '225_00', '600_01', '1300_02', '2500_03', '5100_04',
       '11250_05', '18950_06', '25700_07', '30500_08', '43000_09', '59500_10',
       '72400_11', '72400_12', '85550_13', '85550_14', '85550_15', '

In [45]:
# overwrite the HDF5 file at `fn` with the GOES-augmented table
database.to_hdf(
    fn,
    key='database',
    mode='w',
    format='table',
    complevel=7,
)


In [19]:
# NOTE(review): scratch cell with an earlier execution count (In [19]) than
# the GOES merge cell; that cell deletes `gdf` and drops its 'time' column,
# so this presumably only ran against older kernel state -- confirm or remove
g_df = gdf.reset_index()
g_df['DateTime'] = pd.to_datetime(g_df['time'].to_numpy())