# Density RF Model 

Barebones notebook to reproduce the Murphy et al. [2024] density model. 

Only the model is reproduced here; no testing (residuals, hyperparameters, permutation importance, etc.) is performed. That testing was done in a separate set of analyses, and omitting it keeps this notebook simple. 

In [39]:
import pandas as pd
import numpy as np
import time
import gc
import pickle
import os

from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import median_absolute_error
from sklearn.metrics import r2_score

In [2]:
# Seed reused for the train/test split and the forest itself, so that
# re-running the notebook regenerates the same model.
rnd = 17

# Keyword arguments passed straight to RandomForestRegressor(**rf_params).
rf_params = dict(
    n_estimators=500,
    max_depth=None,
    min_samples_split=2,
    min_samples_leaf=5,
    warm_start=False,
    oob_score=True,
    random_state=rnd,
    max_features=0.5,
    n_jobs=10,
)

In [71]:
def dat_create(dat, col, log_col, lt_col, y_col, t_col):
    """
    Build the feature frame and target values from a source DataFrame.

    Parameters
    ----------
    dat : pandas.DataFrame
        Source data. Must contain the columns in `col`, `t_col` and `y_col`.
    col : list of str
        Feature columns.
    log_col : list of str or None
        Columns to replace with their base-10 logarithm. A column that
        cannot be logged (e.g. it is not one of the selected columns) is
        reported with a printed message and skipped.
    lt_col : list of str or None
        Cyclic columns of `dat` to encode as `cos_<name>` / `sin_<name>`.
        A period of 360 is used when the column maximum exceeds 24,
        otherwise a period of 24. A column that cannot be encoded is
        reported with a printed message and skipped.
    y_col : str
        Target column.
    t_col : list of str
        Extra columns carried along in the returned feature frame
        (they are not removed here).

    Returns
    -------
    x_dat : pandas.DataFrame
        `col` + `t_col` (+ cos/sin columns), with rows containing NaN or
        +/-inf removed.
    y_dat : pandas.Series
        The `y_col` values for the rows kept in `x_dat`.
    """

    # rows with missing features/target/time columns are dropped up front
    x_dat = dat[col+t_col+[y_col]].dropna().copy()

    # Best-effort transforms: a failing column is reported and skipped.
    # `except Exception` (rather than a bare `except`) keeps that behaviour
    # while letting KeyboardInterrupt/SystemExit propagate.
    for i in (log_col or []):
        try:
            x_dat[i] = np.log10(x_dat[i])
        except Exception:
            print(f'Could not log column {i}')

    for i in (lt_col or []):
        try:
            # values above 24 are taken to be on a 360 cycle,
            # otherwise on a 24 cycle
            period = 360. if dat[i].max() > 24 else 24.
            angle = dat[i]*2*np.pi/period
            # assignment aligns on the index, so rows absent from x_dat
            # are ignored and rows with missing values become NaN
            x_dat[f'cos_{i}'] = np.cos(angle)
            x_dat[f'sin_{i}'] = np.sin(angle)
        except Exception:
            print(f'Could not add {i} as a cos/sin time column')

    # remove rows that picked up NaN/inf from the transforms above
    x_dat = x_dat[~x_dat.isin([np.nan, np.inf, -np.inf]).any(axis=1)].dropna()
    y_dat = x_dat[y_col].copy()
    x_dat = x_dat.drop(columns=y_col)

    return x_dat, y_dat

In [5]:
def rf_model(col=['1300_02', 'SYM_H index','SatLat'],
             y_col='400kmDensity',
             t_col=['DateTime'],
             log_col=['1300_02'],
             lt_col=['SatMagLT'],
             rf_params=rf_params,
             target_dat='D:\\data\\SatDensities\\satdrag_database_grace_B.hdf5',
             oos_dat='D:\\data\\SatDensities\\satdrag_database_grace_A.hdf5',
             oos_dat2='D:\\data\\SatDensities\\satdrag_database_grace_CHAMP_SI_int.hdf5',
             n_repeats=10):
    """
    Fit a random forest regressor and predict on train, test and two
    out-of-sample data sets.

    Parameters
    ----------
    col : list of str
        Feature columns.
    y_col : str
        Target column. It is multiplied by 10**12 before fitting.
    t_col : list of str
        Columns carried through to the returned frames but not used as
        model features.
    log_col : list of str or None
        Columns that are log10-transformed by `dat_create`.
    lt_col : list of str or None
        Cyclic columns that `dat_create` encodes as cos/sin pairs.
    rf_params : dict
        Keyword arguments for `RandomForestRegressor`. Its
        'random_state' entry also seeds the train/test split.
    target_dat : str
        HDF file used for the train/test split.
    oos_dat, oos_dat2 : str
        HDF files used only for out-of-sample prediction.
    n_repeats : int
        Accepted for compatibility with callers; not used in this function.

    Returns
    -------
    tuple
        (rfr, train_d, test_d, oos_d, oos2_d, dat_dic): the fitted
        regressor, four DataFrames holding features, target, `t_col`
        columns and a '<y_col>_pred' prediction column, and a dict
        recording the column configuration.
    """

    rnd = rf_params['random_state']

    # record of the column configuration, returned with the model
    dat_dic = {'feature_cols':col,
               'target_cols':y_col,
               'time_cols':t_col,
               'log_col':log_col,
               'lt_col':lt_col}

    # keep only the columns the model needs from the target data set;
    # `lt_col or []` lets lt_col be None, which dat_create also accepts
    kcol = [col, [y_col], t_col, lt_col or []]
    kflt = [item for sublist in kcol for item in sublist]
    df = pd.read_hdf(target_dat)
    df = df[kflt].dropna()

    reg_x, reg_y = dat_create(dat=df,col=col,log_col=log_col,lt_col=lt_col,
                              y_col=y_col,t_col=t_col)
    reg_y = reg_y*(10**12)

    # create data set from out of sample data
    df_oos = pd.read_hdf(oos_dat)
    oos_x, oos_y = dat_create(dat=df_oos,col=col,log_col=log_col,lt_col=lt_col,
                              y_col=y_col,t_col=t_col)
    oos_y = oos_y*(10**12)
    oos_t = oos_x[t_col]
    oos_x = oos_x.drop(columns=t_col)

    # second out of sample data set
    df_oos2 = pd.read_hdf(oos_dat2)
    oos_x2, oos_y2 = dat_create(dat=df_oos2,col=col,log_col=log_col,lt_col=lt_col,
                                y_col=y_col,t_col=t_col)
    oos_y2 = oos_y2*(10**12)
    oos_t2 = oos_x2[t_col]
    oos_x2 = oos_x2.drop(columns=t_col)

    # release the raw frames before the memory-heavy fit;
    # gc.collect must be *called* to have any effect
    del df
    del df_oos
    del df_oos2
    gc.collect()

    # create train test splits
    train_x, test_x, train_y, test_y = train_test_split(reg_x, reg_y,
                                                        test_size=0.3,
                                                        random_state=rnd)

    # set aside the t_col columns so they are not used as features
    train_t = train_x[t_col].copy()
    test_t = test_x[t_col].copy()

    train_x = train_x.drop(columns=t_col)
    test_x = test_x.drop(columns=t_col)

    print('Train and fit model')

    start = time.time()
    print("Time elapsed working on RandomForest")

    rfr = RandomForestRegressor(**rf_params)
    rfr.fit(train_x, train_y)

    end = time.time()
    print("Time consumed in working: ",end - start)

    # predictions for every data set
    predictions = rfr.predict(test_x)
    pre_oos = rfr.predict(oos_x)
    pre_oos2 = rfr.predict(oos_x2)
    pre_tr = rfr.predict(train_x)

    # combine features, target and t_col columns into single dataframes
    train_d = train_x.join([train_y,train_t], how='left')
    test_d = test_x.join([test_y,test_t], how='left')
    oos_d = oos_x.join([oos_y,oos_t], how='left')
    oos2_d = oos_x2.join([oos_y2,oos_t2], how='left')

    # add predictions to the dataframes
    train_d[y_col+'_pred'] = pre_tr
    test_d[y_col+'_pred'] = predictions
    oos_d[y_col+'_pred'] = pre_oos
    oos2_d[y_col+'_pred'] = pre_oos2

    return rfr, train_d, test_d, oos_d, oos2_d, dat_dic

In [6]:
def rf_run(y_col='400kmDensity',
           lt_col=['SatMagLT'],
           pre_f = False,
           app_f = False
           ):
    """
    Run a set of random forest models and pickle the results.

    Parameters
    ----------
    y_col : str
        Target column, forwarded to `rf_model`.
    lt_col : list of str
        Cyclic columns, forwarded to `rf_model`.
    pre_f : str or False
        Optional prefix for the output file name.
    app_f : str or False
        Optional suffix for the output file name (before '.pkl').

    Returns
    -------
    None.

    For each model that is run, the tuple returned by `rf_model` is
    pickled to '<pre_f><label>_RFdat<app_f>.pkl' in the output directory
    for subsequent analysis.

    """

    # out_dir
    o_dir = 'D:\\data\\SatDensities\\'

    # forwarded to rf_model, which accepts but does not use it
    n_repeats = 5
    # columns that are not used in the model but are returned
    # to make subsequent analysis easier
    t_col = ['DateTime','storm','storm phase']

    # columns to log for fism and geo datasets
    fi_log = ['1300_02', '43000_09', '85550_13', '94400_18']

    # solar indice columns
    si_col = ['F10', 'F81', 'S10', 'S81c', 'M10', 'M81c', 'Y10', 'Y81c', 'SatLat']

    # fism2 columns
    fi_col = ['1300_02', '43000_09', '85550_13', '94400_18', 'SatLat']

    # fism2 and geo columns
    fgeo_col = ['1300_02', '43000_09', '85550_13', '94400_18', 'SYM_H index', 'AE', 'SatLat']

    # every available feature set, keyed by its label
    feature_sets = {'SI': si_col, 'FI': fi_col, 'FI_GEO': fgeo_col}

    # labels of the models actually run; only FI_GEO is reproduced here
    run_labels = ['FI_GEO']

    for d_in in run_labels:
        col = feature_sets[d_in]

        print(d_in)

        rf_dat = rf_model(y_col=y_col, lt_col=lt_col,
                          col=col, t_col=t_col, log_col=fi_log,
                          n_repeats=n_repeats)

        # build '<pre_f><label>_RFdat<app_f>.pkl'
        fn = f'{d_in}_RFdat'
        if pre_f:
            fn = f'{pre_f}{fn}'
        if app_f:
            fn = f'{fn}{app_f}'

        fn = f'{fn}.pkl'
        fn = os.path.join(o_dir,fn)

        with open(fn, 'wb') as f:
            pickle.dump(rf_dat, f)

        # free the model and frames before the next iteration;
        # gc.collect must be *called* to have any effect
        del rf_dat
        gc.collect()

        

In [7]:
# Train the FI_GEO model and pickle the results. app_f appends '_AIMFAHR'
# to the output file name, giving FI_GEO_RFdat_AIMFAHR.pkl.
# The recorded run below took roughly 600 s to fit.
rf_run(app_f='_AIMFAHR')

FI_GEO


  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)
  result = getattr(ufunc, method)(*inputs, **kwargs)


Train and fit model
Time elapsed working on RandomForest
Time consumed in working:  598.991756439209


In [8]:
# Reload the tuple pickled by rf_run:
# (rfr, train_d, test_d, oos_d, oos2_d, dat_dic).
# Backslashes are escaped, as in the other paths in this notebook, so the
# literal contains no invalid escape sequences (\d, \S, \F).
new = pd.read_pickle("D:\\data\\SatDensities\\FI_GEO_RFdat_AIMFAHR.pkl")


In [9]:
# Last element of the tuple is dat_dic: the column configuration used for the model.
new[-1]

{'feature_cols': ['1300_02',
  '43000_09',
  '85550_13',
  '94400_18',
  'SYM_H index',
  'AE',
  'SatLat'],
 'target_cols': '400kmDensity',
 'time_cols': ['DateTime', 'storm', 'storm phase'],
 'log_col': ['1300_02', '43000_09', '85550_13', '94400_18'],
 'lt_col': ['SatMagLT']}

In [10]:
# Second-to-last element is oos2_d: the second out-of-sample set
# (built from oos_dat2) with its '<y_col>_pred' prediction column.
new[-2].head()

Unnamed: 0,1300_02,43000_09,85550_13,94400_18,SYM_H index,AE,SatLat,cos_SatMagLT,sin_SatMagLT,400kmDensity,DateTime,storm,storm phase,400kmDensity_pred
105120,7.648391,10.081208,9.545769,9.306003,-5.0,23.0,43.7912,-0.58071,-0.81411,3.659148,2003-01-01 00:00:00,1,2,4.082639
105121,7.650381,10.061229,9.536066,9.306486,-4.0,20.0,24.33522,-0.504256,-0.863554,4.745748,2003-01-01 00:05:00,1,2,4.586757
105122,7.650341,10.06138,9.536143,9.30649,-4.0,26.0,6.12783,-0.444752,-0.895654,5.470819,2003-01-01 00:10:00,1,2,4.619495
105123,7.65022,10.061382,9.535996,9.30639,-3.0,23.0,-15.32528,-0.369925,-0.929062,5.239831,2003-01-01 00:15:00,1,2,3.989298
105124,7.650038,10.061252,9.535645,9.306199,-3.0,27.0,-33.78976,-0.274488,-0.961591,5.05054,2003-01-01 00:20:00,1,2,3.744127


In [11]:
# Load the raw file that rf_model uses as its oos_dat2 default, to compare
# against the processed out-of-sample frame shown above.
dat = pd.read_hdf('D:\\data\\SatDensities\\satdrag_database_grace_CHAMP_SI_int.hdf5')

In [None]:
# Look up one raw value for the first row shown in new[-2].head().
# NOTE(review): the saved output (9.5457...) equals the logged '85550_13'
# value shown for this row above, not '43000_09', and this cell has no
# execution count, so the output looks stale. Re-run to confirm.
dat.loc[105120,'43000_09']

9.545769348360656

In [34]:
# Element 1 is train_d: the training split with its predictions.
# Rows appear in shuffled order because they come from train_test_split.
new[1].head()

Unnamed: 0,1300_02,43000_09,85550_13,94400_18,SYM_H index,AE,SatLat,cos_SatMagLT,sin_SatMagLT,400kmDensity,DateTime,storm,storm phase,400kmDensity_pred
200511,7.490862,9.911153,9.455465,9.228625,-15.0,105.0,-60.03617,-0.494025,0.869448,0.987248,2004-06-27 05:15:00,-1,-1,0.958129
242348,7.465065,9.924064,9.463639,9.232483,-6.0,18.0,-60.03854,-0.870029,0.493001,3.009913,2004-11-19 11:40:00,-1,-1,2.395532
994822,7.696125,9.986703,9.501317,9.272544,5.0,22.0,-87.11732,0.919029,0.394191,2.751377,2012-01-15 05:50:00,-1,-1,2.378069
779234,6.966885,9.780912,9.396972,9.17152,-10.0,78.0,25.74049,0.544046,0.839055,0.329241,2009-12-27 16:10:00,-1,-1,0.294765
623312,6.596638,9.738198,9.379986,9.153376,3.0,19.0,-20.07795,0.625324,-0.780365,0.132017,2008-07-04 06:40:00,-1,-1,0.140145


In [8]:
# `est` is not defined anywhere in this notebook (leftover kernel state);
# the fitted forest is the first element of the tuple loaded into `new`.
rfr = new[0]

In [10]:
# Dump the estimator's state to check the hyperparameters and the
# feature names the model was fit on.
rfr.__getstate__()

{'estimator': DecisionTreeRegressor(),
 'n_estimators': 500,
 'estimator_params': ('criterion',
  'max_depth',
  'min_samples_split',
  'min_samples_leaf',
  'min_weight_fraction_leaf',
  'max_features',
  'max_leaf_nodes',
  'min_impurity_decrease',
  'random_state',
  'ccp_alpha'),
 'base_estimator': 'deprecated',
 'bootstrap': True,
 'oob_score': True,
 'n_jobs': 10,
 'random_state': 17,
 'verbose': 0,
 'warm_start': False,
 'class_weight': None,
 'max_samples': None,
 'criterion': 'squared_error',
 'max_depth': None,
 'min_samples_split': 2,
 'min_samples_leaf': 5,
 'min_weight_fraction_leaf': 0.0,
 'max_features': 0.5,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'ccp_alpha': 0.0,
 'feature_names_in_': array(['1300_02', '43000_09', '85550_13', '94400_18', 'SYM_H index', 'AE',
        'SatLat', 'cos_SatMagLT', 'sin_SatMagLT'], dtype=object),
 'n_features_in_': 9,
 'n_outputs_': 1,
 'estimator_': DecisionTreeRegressor(),
 'estimators_': [DecisionTreeRegressor(max_feature

In [35]:
import sys, os

# add the directory holding the data_io module to the import path,
# then import it as dio
file_path = 'D:\\GitHub\\DataIO\\'
sys.path.append(os.path.dirname(file_path))
import data_io as dio

In [64]:
# Date window for the inputs to generate
sdate, edate = '2024-05-10', '2024-05-15'

# 5-minute OMNI data and its accompanying second return value
om_d, om_m = dio.load_omni(res='5m', sdate=sdate, edate=edate)

# keep OMNI rows inside the window, padded by 5 minutes on both ends
# (bounds are inclusive)
d_min, d_max = pd.to_datetime(sdate), pd.to_datetime(edate)
om_d = om_d[om_d['DateTime'].between(d_min - pd.DateOffset(minutes=5),
                                     d_max + pd.DateOffset(minutes=5))]

In [65]:
# read FISM2 data for the same sdate/edate window as the OMNI data
fi_d, fi_m = dio.load_fism2_daily(sdate=sdate, edate=edate)

In [66]:
# keep only the model driver columns plus the timestamp from each source
fi_d = fi_d.loc[:, ['1300_02', '43000_09', '85550_13', '94400_18', 'DateTime']]
om_d = om_d.loc[:, ['SYM_H index', 'AE', 'DateTime']]

In [67]:
# Index both frames by their own timestamp (keeping it as a column too) so
# the two source times survive the merge under distinct names.
fi_d = (fi_d.rename(columns={'DateTime': 'DateTime_fism2'})
            .set_index('DateTime_fism2', drop=False))
om_d = om_d.set_index('DateTime', drop=False)

# Attach the nearest FISM2 row to each OMNI row, but only when the two
# timestamps are within 2.5 minutes of each other; otherwise the FISM2
# columns are left as NaN/NaT.
# NOTE(review): if load_fism2_daily returns one row per day, only the OMNI
# rows within 2.5 minutes of a FISM2 timestamp get FISM2 values.
# Confirm that this tolerance is intended.
tol = pd.Timedelta('2.5 minute')
database = pd.merge_asof(left=om_d, right=fi_d,
                         left_index=True, right_index=True,
                         direction='nearest',
                         tolerance=tol).rename(columns={'DateTime': 'DateTime_omni'})

In [70]:
# Quick look at the merged frame; rows with no FISM2 match show NaN/NaT.
database.head(2)

Unnamed: 0_level_0,SYM_H index,AE,DateTime_omni,1300_02,43000_09,85550_13,94400_18,DateTime_fism2
DateTime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-05-09 23:55:00,7.0,,2024-05-09 23:55:00,,,,,NaT
2024-05-10 00:00:00,6.0,117.0,2024-05-10 00:00:00,118508700.0,14107830000.0,4022602000.0,2340119000.0,2024-05-10


In [76]:
# Column specification for building model inputs from `database`.
# It mirrors the stored configuration shown by new[-1], except that
# 'SatLat' is left out of feature_cols.
# NOTE(review): the fitted model's feature_names_in_ also list 'SatLat',
# 'cos_SatMagLT' and 'sin_SatMagLT'. Confirm where those are supplied
# before predicting.
cols = dict(
    feature_cols=['1300_02', '43000_09', '85550_13', '94400_18',
                  'SYM_H index', 'AE'],
    target_cols='400kmDensity',
    time_cols=['DateTime', 'storm', 'storm phase'],
    log_col=['1300_02', '43000_09', '85550_13', '94400_18'],
    lt_col=['SatMagLT'],
)

In [77]:
# Build the log-transformed driver inputs from `database`.
# There is no density target here, so 'DateTime_omni' stands in for both
# y_col and t_col; dat_create drops y_col from the returned features and
# the returned target is discarded.
# 'SatMagLT' is not a column of `database`, so dat_create prints
# "Could not add SatMagLT as a cos/sin time column" and adds no cos/sin columns.
reg_x, _ = dat_create(dat=database,col=cols['feature_cols'],
                      log_col=cols['log_col'],
                      lt_col=cols['lt_col'],
                      y_col='DateTime_omni',t_col=['DateTime_omni'])


Could not add SatMagLT as a cos/sin time column


In [80]:
# Save the prepared inputs as a compressed, table-format HDF file under
# the key 'inputs'. Backslashes are escaped, as in the other paths in this
# notebook, so the literal contains no invalid escape sequences (\d, \S, \F).
reg_x.to_hdf("D:\\data\\SatDensities\\FI_GEO_RFdat_AIMFAHR_inputs_MayStorm.hdf",format='table',key='inputs',complevel=3)