#### Getting and saving all the data from Murphy+25
for downstream experiments

In [None]:
import numpy as np
import pandas as pd
import h5py as h5

In [1]:
rnd = 17 # random state

In [None]:
def dat_create(dat, col, log_col, lt_col, y_col, t_col):

    x_dat = dat[col+t_col+[y_col]].dropna().copy()

    if log_col:
       for i in log_col:
            try:
                x_dat[i] = np.log10(x_dat[i])
            except:
                print(f'Could not log column {i}')
    
    if lt_col:
        for i in lt_col:
            try:
                if dat[i].max() > 24:
                    x_dat[f'cos_{i}'] = np.cos(dat[i]*2*np.pi/360.)
                    x_dat[f'sin_{i}'] = np.sin(dat[i]*2*np.pi/360.)
                else:
                    x_dat[f'cos_{i}'] = np.cos(dat[i]*2*np.pi/24.)
                    x_dat[f'sin_{i}'] = np.sin(dat[i]*2*np.pi/24.)    
            except:
                print(f'Could not add {i} as a cos/sin time column')
    
    x_dat = x_dat[~x_dat.isin([np.nan, np.inf, -np.inf]).any(axis=1)].dropna()
    y_dat = x_dat[y_col].copy()
    x_dat = x_dat.drop(columns=y_col)    
    
    return x_dat, y_dat

In [4]:
target_dat='./satdrag_database_grace_B.hdf5'
df_orig = pd.read_hdf(target_dat)

In [5]:
df_orig.columns

Index(['DateTime', 'DateTime_omni', 'B', 'Bx_GSEGSM', 'By_GSE', 'Bz_GSE',
       'By_GSM', 'Bz_GSM', 'Vsw', 'Vx_GSE', 'Vy_GSE', 'Vz_GSE', 'Prho', 'Tp',
       'dynP', 'Esw', 'AE', 'AL', 'AU', 'SYM_D index', 'SYM_H index',
       'ASY_D index', 'ASY_H index', 'PC index', 'Goes Proton flux (>10 MeV)',
       'Goes Proton flux (>30 MeV)', 'Goes Proton flux (>60 MeV)',
       'DateTime_fism2', '225_00', '600_01', '1300_02', '2500_03', '5100_04',
       '11250_05', '18950_06', '25700_07', '30500_08', '43000_09', '59500_10',
       '72400_11', '72400_12', '85550_13', '85550_14', '85550_15', '94400_16',
       '94400_17', '94400_18', '98100_19', '100700_20', '103850_21',
       '113000_22', 'DateTime_si', 'F10', 'F81', 'S10', 'S81c', 'M10', 'M81c',
       'Y10', 'Y81c', 'DateTime_gr', 'CenterLat', 'SatLat', 'SatLon',
       'SatHeight', 'SatLT', 'SatDipoleLat', 'SatMagLon', 'SatMagLT',
       'SatDensity', '400kmDensity', '410kmDensity', 'NRLMSISe00atSat',
       'DenUncertainty', 'NumPts', '

In [6]:
"""
full_col = ['B', 'Bx_GSEGSM', 'By_GSM', 'Bz_GSM', 'Vsw', 'Vx_GSE', 'Vy_GSE', 'Vz_GSE', 'Prho', 'Tp',
       'dynP', 'Esw', 'AE', 'AL', 'AU', 'SYM_D index', 'SYM_H index',
       'ASY_D index', 'ASY_H index', 'PC index',
        '225_00', '600_01', '1300_02', '2500_03', '5100_04',
       '11250_05', '18950_06', '25700_07', '30500_08', '43000_09', '59500_10',
       '72400_11', '72400_12', '85550_13', '85550_14', '85550_15', '94400_16',
       '94400_17', '94400_18', '98100_19', '100700_20', '103850_21',
       '113000_22', 'F10', 'F81', 'S10', 'S81c', 'M10', 'M81c',
       'Y10', 'Y81c', 'SatLat']
"""

# Note: we remove integrated solar indices since there are very few of them in the dataset
full_col = ['B', 'Bx_GSEGSM', 'By_GSM', 'Bz_GSM', 'Vsw', 'Vx_GSE', 'Vy_GSE', 'Vz_GSE', 'Prho', 'Tp',
       'dynP', 'Esw', 'AE', 'AL', 'AU', 'SYM_D index', 'SYM_H index',
       'ASY_D index', 'ASY_H index', 'PC index',
        '225_00', '600_01', '1300_02', '2500_03', '5100_04',
       '11250_05', '18950_06', '25700_07', '30500_08', '43000_09', '59500_10',
       '72400_11', '72400_12', '85550_13', '85550_14', '85550_15', '94400_16',
       '94400_17', '94400_18', '98100_19', '100700_20', '103850_21',
       '113000_22', 'SatLat']

full_log_col=['225_00', '600_01', '1300_02', '2500_03', '5100_04',
       '11250_05', '18950_06', '25700_07', '30500_08', '43000_09', '59500_10',
       '72400_11', '72400_12', '85550_13', '85550_14', '85550_15', '94400_16',
       '94400_17', '94400_18', '98100_19', '100700_20', '103850_21']

extra_vars = ['storm','storm phase']

y_col='400kmDensity'
t_col=['DateTime']

fgeo_col = ['SYM_H index', 'AE', 'SatLat','1300_02', '43000_09', '85550_13', '94400_18']
fgeo_log_col = ['1300_02', '43000_09', '85550_13', '94400_18'] 

lt_col=['SatMagLT']

In [78]:
# Check to see what are the limiting factors for dataset size (i.e. no NaNs)
for nme in full_col:
    print(nme, df_orig[nme].dropna().shape[0])

B 1005050
Bx_GSEGSM 1005050
By_GSM 1005050
Bz_GSM 1005050
Vsw 982486
Vx_GSE 982486
Vy_GSE 982486
Vz_GSE 982486
Prho 982486
Tp 982347
dynP 984172
Esw 981772
AE 1043137
AL 1043137
AU 1043137
SYM_D index 1043137
SYM_H index 1043137
ASY_D index 1043137
ASY_H index 1043137
PC index 1037748
225_00 999073
600_01 999073
1300_02 999073
2500_03 999073
5100_04 999073
11250_05 999073
18950_06 999073
25700_07 999073
30500_08 999073
43000_09 999073
59500_10 999073
72400_11 999073
72400_12 999073
85550_13 999073
85550_14 999073
85550_15 999073
94400_16 999073
94400_17 999073
94400_18 999073
98100_19 999073
100700_20 999073
103850_21 999073
113000_22 999073
F10 3622
F81 3622
S10 3622
S81c 3622
M10 3622
M81c 3622
Y10 3622
Y81c 3622
SatLat 948517


NOTE: to get original FISM2-GEO dataset, use `fgeo_col` and `fgeo_log_col`. Otherwise, pass in the columns for whatever stuff you need.

In [7]:
col = fgeo_col # full_col
log_col = fgeo_log_col # full_log_col
fn = 'FI_GEO_RFdat_AIMFAHR_data.h5' # 'FULL_RF_data.h5' 

kcol = [col,[y_col],t_col,lt_col]
kflt = [item for sublist in kcol for item in sublist]
df = df_orig[kflt].dropna()
reg_x, reg_y = dat_create(dat=df,col=col,log_col=log_col,lt_col=lt_col,y_col=y_col,t_col=t_col)
reg_y = reg_y*(10**12) #normalize density cause it's really small

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [8]:
reg_x.shape, reg_y.shape

((829967, 47), (829967,))

In [9]:
# now train/test split
train_x, test_x, train_y, test_y = train_test_split(reg_x, reg_y, 
                                                        test_size=0.3, 
                                                        random_state=rnd)

In [10]:
train_d = train_x.join([train_y], how='left')
test_d = test_x.join([test_y], how='left')

In [11]:
df_oos = pd.read_hdf('./satdrag_database_grace_A.hdf5')
df_1 = df_oos[kflt].dropna()
reg_x, reg_y = dat_create(dat=df_1,col=col,log_col=log_col,lt_col=lt_col,y_col=y_col,t_col=t_col)
reg_y = reg_y*(10**12)
oos_d = reg_x.join([reg_y], how='left')

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [12]:
df_oos2 = pd.read_hdf('./satdrag_database_grace_CHAMP_SI_int.hdf5')
df_2 = df_oos2[kflt].dropna()
reg_x, reg_y = dat_create(dat=df_2,col=col,log_col=log_col,lt_col=lt_col,y_col=y_col,t_col=t_col)
reg_y = reg_y*(10**12)
oos2_d = reg_x.join([reg_y], how='left')

  result = getattr(ufunc, method)(*inputs, **kwargs)


In [13]:
fn = os.path.join('./',fn)

train_d.to_hdf(fn, key="train_d", mode="a", format="table", data_columns=True)
test_d.to_hdf(fn, key="test_d", mode="a", format="table", data_columns=True)
oos_d.to_hdf(fn, key="oos_d", mode="a", format="table", data_columns=True)
oos2_d.to_hdf(fn, key="oos2_d", mode="a", format="table", data_columns=True)

  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attribute_name(name)
  check_attrib