In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import phik
from IPython.display import display
from phik import report
from phik.report import plot_correlation_matrix
from ydata_profiling import ProfileReport
from ydata_profiling.utils.cache import cache_file

%matplotlib inline

# Display the value of every top-level expression in a cell, not only the
# last one (IPython's default setting is "last_expr").
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



In [2]:
# Load the satellite drag database, keeping only the columns profiled below.
fn = 'D:\\data\\VL_sdrag\\satdrag_database_zlib.hdf5'
columns = [
    '400kmDensity',
    'SYM/H_INDEX_nT',
    '1-M_AE_nT',
    'DAILY_SUNSPOT_NO_',
    'DAILY_F10.7_',
    'SOLAR_LYMAN-ALPHA_W/m^2',
    'mg_index (core to wing ratio (unitless))',
    'irradiance (W/m^2/nm)',
    'storm',
    'storm phase',
]

# Append the row-to-row change in density as `d_diff`, then discard every row
# that contains a NaN (this includes the first row, whose diff is undefined).
df = (
    pd.read_hdf(fn, columns=columns)
    .assign(d_diff=lambda frame: frame['400kmDensity'].diff())
    .dropna()
)


In [3]:
# List the columns that were loaded, plus the derived d_diff column.
df.columns

Index(['400kmDensity', 'SYM/H_INDEX_nT', '1-M_AE_nT', 'DAILY_SUNSPOT_NO_',
       'DAILY_F10.7_', 'SOLAR_LYMAN-ALPHA_W/m^2',
       'mg_index (core to wing ratio (unitless))', 'irradiance (W/m^2/nm)',
       'storm', 'storm phase', 'd_diff'],
      dtype='object')

In [4]:
# Preview the first rows of the cleaned frame.
df.head()

Unnamed: 0,400kmDensity,SYM/H_INDEX_nT,1-M_AE_nT,DAILY_SUNSPOT_NO_,DAILY_F10.7_,SOLAR_LYMAN-ALPHA_W/m^2,mg_index (core to wing ratio (unitless)),irradiance (W/m^2/nm),storm,storm phase,d_diff
1,1.276178e-12,-2.0,92.0,70.0,94.5,0.00697,0.26771,0.005673,1,1,-2.12424e-13
2,1.230694e-12,-2.0,130.0,70.0,94.5,0.00697,0.26771,0.005673,1,1,-4.5484e-14
3,1.220807e-12,-3.0,85.0,70.0,94.5,0.00697,0.26771,0.005673,1,1,-9.887e-15
4,1.304111e-12,-3.0,75.0,70.0,94.5,0.00697,0.26771,0.005673,1,1,8.3304e-14
5,1.465795e-12,-3.0,98.0,70.0,94.5,0.00697,0.26771,0.005673,1,1,1.61684e-13


In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) per column.
df.describe()

Unnamed: 0,400kmDensity,SYM/H_INDEX_nT,1-M_AE_nT,DAILY_SUNSPOT_NO_,DAILY_F10.7_,SOLAR_LYMAN-ALPHA_W/m^2,mg_index (core to wing ratio (unitless)),irradiance (W/m^2/nm),storm,storm phase,d_diff
count,4670940.0,4670940.0,4670940.0,4670940.0,4670940.0,4670940.0,4670940.0,4670940.0,4670940.0,4670940.0,4670940.0
mean,1.484615e-12,-11.41293,176.3493,47.52017,97.61624,0.006825335,0.2685655,0.005515092,0.1216672,0.4610534,-1.02021e-19
std,1.470658e-12,19.14744,214.1171,50.3869,54.29083,0.0007783128,0.004569255,0.0005460839,0.9925711,1.343722,1.700765e-13
min,7.457587e-20,-490.0,1.0,0.0,65.1,0.00588,0.26296,0.004873058,-1.0,-1.0,-1.175625e-11
25%,4.975017e-13,-18.0,39.0,9.0,71.5,0.006174,0.2647066,0.005050188,-1.0,-1.0,-3.6112e-14
50%,9.641593e-13,-8.0,88.0,31.0,85.2,0.006577,0.26704,0.005328221,1.0,1.0,4.7815e-16
75%,1.931388e-12,-1.0,236.0,73.0,111.1,0.007319,0.2714976,0.005854947,1.0,2.0,3.793913e-14
max,2.63181e-11,151.0,4192.0,281.0,999.9,0.009751,0.28494,0.00734935,1.0,2.0,1.223247e-11


In [6]:
# (rows, columns) of the frame after the NaN rows were dropped.
df.shape

(4670940, 11)

In [7]:
# Correlation measures ydata-profiling should compute for each report.
# This dict is reused by the per-condition reports further down.
correlations = {
    "auto": {"calculate": True},
    "pearson": {"calculate": True},
    "spearman": {"calculate": True},
    "kendall": {"calculate": True},
    "phi_k": {"calculate": True},
    "cramers": {"calculate": True},
}

# Profile a random subset rather than every row. The seed is fixed so the
# report and the correlations displayed below are reproducible from run to
# run, and the sample size is capped at the frame length so a smaller dataset
# does not make DataFrame.sample raise.
profile = ProfileReport(
    df.sample(n=min(1_000_000, len(df)), random_state=42),
    correlations=correlations,
)
profile.to_file("all_data.html")

# phi_k correlation of every column with the 400 km density.
cc = profile.description_set["correlations"]
cc["phi_k"]["400kmDensity"]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  discretized_df.loc[:, column] = self._discretize_column(
  (2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2)))


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

400kmDensity                                1.000000
SYM/H_INDEX_nT                              0.599569
1-M_AE_nT                                   0.286666
DAILY_SUNSPOT_NO_                           0.614090
DAILY_F10.7_                                0.507440
SOLAR_LYMAN-ALPHA_W/m^2                     0.680973
mg_index (core to wing ratio (unitless))    0.663269
irradiance (W/m^2/nm)                       0.685398
storm                                       0.196851
storm phase                                 0.177687
d_diff                                      0.325193
Name: 400kmDensity, dtype: float64

# Profiles broken down by storm condition (storm, quiet, main phase, recovery phase)

In [15]:
# storm/quiet data 
st_t = (df['storm']>0.95) & (df['storm']<1.05) # storm times
qt_t = (df['storm']<0) # quiet times
mn_p = (df['storm phase']>0.95) & (df['storm phase']<1.05) # main phase
rc_p = (df['storm phase']>1.95) & (df['storm phase']<2.05) # recovery phase

In [16]:
# sanity check that the data is the correct size
print(df[st_t].shape)
print(df[qt_t].shape)
print(df[mn_p].shape)
print(df[rc_p].shape)
print(df.shape)

df[st_t].shape[0]+df[qt_t].shape[0]
df[qt_t].shape[0]+df[mn_p].shape[0]+df[rc_p].shape[0]

(2619620, 11)
(2051320, 11)
(1034367, 11)
(1585253, 11)
(4670940, 11)


4670940

4670940

In [17]:
st_ii = [st_t, qt_t, mn_p, rc_p]  # boolean masks to loop over
st_tt = ['storm', 'quiet', 'main', 'recovery']  # report name for each mask

# Write one profile report per condition and show how each column correlates
# (phi_k) with the 400 km density under that condition.
for sind, stype in zip(st_ii, st_tt):
    subset = df[sind]

    # Fixed seed so each report is reproducible; the sample size is capped at
    # the subset length because DataFrame.sample raises when asked for more
    # rows than exist (without replacement).
    profile = ProfileReport(
        subset.sample(n=min(1_000_000, len(subset)), random_state=42),
        correlations=correlations,
    )
    profile.to_file(f'{stype}_output.html')

    print(f'{stype}_output.html')
    cc = profile.description_set["correlations"]
    # Display explicitly: a bare expression inside a loop is only shown
    # because ast_node_interactivity is set to "all" in the first cell.
    display(cc["phi_k"]["400kmDensity"])

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  discretized_df.loc[:, column] = self._discretize_column(
  (2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2)))


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

storm_output.html


400kmDensity                                1.000000
SYM/H_INDEX_nT                              0.599561
1-M_AE_nT                                   0.252881
DAILY_SUNSPOT_NO_                           0.608755
DAILY_F10.7_                                0.491249
SOLAR_LYMAN-ALPHA_W/m^2                     0.674732
mg_index (core to wing ratio (unitless))    0.660126
irradiance (W/m^2/nm)                       0.681807
storm phase                                 0.021816
d_diff                                      0.242041
Name: 400kmDensity, dtype: float64

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  discretized_df.loc[:, column] = self._discretize_column(
  (2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2)))


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

quiet_output.html


400kmDensity                                1.000000
SYM/H_INDEX_nT                              0.378453
1-M_AE_nT                                   0.379510
DAILY_SUNSPOT_NO_                           0.578110
DAILY_F10.7_                                0.562134
SOLAR_LYMAN-ALPHA_W/m^2                     0.794366
mg_index (core to wing ratio (unitless))    0.767470
irradiance (W/m^2/nm)                       0.788961
d_diff                                      0.361020
Name: 400kmDensity, dtype: float64

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  discretized_df.loc[:, column] = self._discretize_column(
  (2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2)))


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

main_output.html


400kmDensity                                1.000000
SYM/H_INDEX_nT                              0.617623
1-M_AE_nT                                   0.301015
DAILY_SUNSPOT_NO_                           0.631958
DAILY_F10.7_                                0.508198
SOLAR_LYMAN-ALPHA_W/m^2                     0.692969
mg_index (core to wing ratio (unitless))    0.678561
irradiance (W/m^2/nm)                       0.691599
d_diff                                      0.263169
Name: 400kmDensity, dtype: float64

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  discretized_df.loc[:, column] = self._discretize_column(
  (2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2)))


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

recovery_output.html


400kmDensity                                1.000000
SYM/H_INDEX_nT                              0.617370
1-M_AE_nT                                   0.194703
DAILY_SUNSPOT_NO_                           0.591789
DAILY_F10.7_                                0.478831
SOLAR_LYMAN-ALPHA_W/m^2                     0.657249
mg_index (core to wing ratio (unitless))    0.644341
irradiance (W/m^2/nm)                       0.665928
d_diff                                      0.230440
Name: 400kmDensity, dtype: float64

# What does the 'all' profile look like when we correct for the storm/quiet class imbalance?

In [20]:
# Build a class-balanced frame: the same number of storm rows and quiet rows.
# The per-class size is capped at the smaller class so sampling cannot fail
# and the two classes stay equal in size; the seed makes the draw repeatable.
n_per_class = min(500_000, int(st_t.sum()), int(qt_t.sum()))
reg_df = pd.concat([
    df[st_t].sample(n=n_per_class, random_state=42),
    df[qt_t].sample(n=n_per_class, random_state=42),
])
reg_df.describe()

Unnamed: 0,400kmDensity,SYM/H_INDEX_nT,1-M_AE_nT,DAILY_SUNSPOT_NO_,DAILY_F10.7_,SOLAR_LYMAN-ALPHA_W/m^2,mg_index (core to wing ratio (unitless)),irradiance (W/m^2/nm),storm,storm phase,d_diff
count,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0,1000000.0
mean,1.448488e-12,-10.740491,169.020393,46.338448,96.671341,0.006805,0.268475,0.0055,0.0,0.302266,1.675198e-16
std,1.453106e-12,18.423993,208.216124,49.952263,52.488133,0.000773,0.004557,0.000543,1.000001,1.347381,1.628727e-13
min,4.683731e-16,-487.0,1.0,0.0,65.1,0.00588,0.26296,0.004873,-1.0,-1.0,-7.241425e-12
25%,4.805903e-13,-17.0,37.0,5.0,71.2,0.006154,0.264674,0.005036,-1.0,-1.0,-3.532432e-14
50%,9.31384e-13,-8.0,82.0,30.0,84.2,0.006554,0.266948,0.005316,0.0,0.0,4.572e-16
75%,1.872047e-12,-1.0,223.0,71.0,109.9,0.007295,0.271344,0.005841,1.0,2.0,3.70282e-14
max,2.4308e-11,143.0,4174.0,281.0,999.9,0.009751,0.28494,0.007349,1.0,2.0,8.884224e-12


In [21]:

# Profile the class-balanced storm/quiet sample and write the HTML report.
profile = ProfileReport(reg_df, correlations=correlations)
profile.to_file('qt_st_output.html')

print('qt_st_output.html')
# phi_k correlation of every column with the 400 km density.
cc = profile.description_set["correlations"]
cc["phi_k"]["400kmDensity"]

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

  discretized_df.loc[:, column] = self._discretize_column(
  (2 * xtie * ytie) / m + x0 * y0 / (9 * m * (size - 2)))


Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

qt_st_output.html


400kmDensity                                1.000000
SYM/H_INDEX_nT                              0.569963
1-M_AE_nT                                   0.259532
DAILY_SUNSPOT_NO_                           0.631548
DAILY_F10.7_                                0.513907
SOLAR_LYMAN-ALPHA_W/m^2                     0.699166
mg_index (core to wing ratio (unitless))    0.681083
irradiance (W/m^2/nm)                       0.705264
storm                                       0.213384
storm phase                                 0.191931
d_diff                                      0.266869
Name: 400kmDensity, dtype: float64