In [None]:
import pandas as pd
from ydata_profiling import ProfileReport
from ydata_profiling.utils.cache import cache_file

import phik
from phik.report import plot_correlation_matrix
from phik import report

import matplotlib.pyplot as plt

%matplotlib inline

# Display the value of every top-level expression in a cell, not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"



In [None]:
# Load the satellite drag database into a DataFrame.
#
# Alternative source restricted to a column subset, currently disabled:
#fn = 'D:\\data\\VL_sdrag\\satdrag_database_zlib.hdf5'
#columns = ['400kmDensity','SYM/H_INDEX_nT', '1-M_AE_nT', 'DAILY_SUNSPOT_NO_', 'DAILY_F10.7_', 
#       'SOLAR_LYMAN-ALPHA_W/m^2', 'mg_index (core to wing ratio (unitless))',
#       'irradiance (W/m^2/nm)', 'storm', 'storm phase']
#df = pd.read_hdf(fn,columns=columns)

# Active source: the grace_B file, read with all of its columns.
fn = 'D:\\data\\SatDensities\\satdrag_database_grace_B.hdf5'
df = pd.read_hdf(path_or_buf=fn)

In [None]:
# List every column available in the loaded database
df.columns

In [None]:
# Candidate column subsets for profiling. Every subset ends with the
# '400kmDensity' column so its correlations with the other columns can be read.

# 'solar' subset: the F10/F81, S10/S81c, M10/M81c and Y10/Y81c columns.
columns_solar = [
    'F10', 'F81',
    'S10', 'S81c',
    'M10', 'M81c',
    'Y10', 'Y81c',
    '400kmDensity',
]

# 'fism' subset: the numerically named columns
# (presumably FISM wavelength bands - TODO confirm).
columns_fism = [
    '225', '600', '1300', '2500', '5100', '11250',
    '18950', '25700', '30500', '43000', '59500', '72400',
    '85550', '94400', '98100', '100700', '103850', '113000',
    '400kmDensity',
]

# 'swgeo' subset: the B*/V*/Prho/Tp/dynP/Esw and AE/AL/AU/SYM_H columns
# together with the two storm flag columns.
columns_swgeo = [
    'Bx_GSEGSM', 'By_GSM', 'Bz_GSM',
    'Vsw', 'Vx_GSE', 'Vy_GSE', 'Vz_GSE',
    'Prho', 'Tp', 'dynP', 'Esw',
    'AE', 'AL', 'AU', 'SYM_H index',
    '400kmDensity', 'storm', 'storm phase',
]

# Full list: the swgeo, fism and solar columns (each without its trailing
# density/storm entries) followed once by '400kmDensity', 'storm', 'storm phase'.
columns = (
    columns_swgeo[:-3]
    + columns_fism[:-1]
    + columns_solar[:-1]
    + columns_swgeo[-3:]
)

# Subset actually profiled in the rest of the notebook.
# NOTE: columns_fism carries no 'storm' / 'storm phase' columns.
pro_df = df[columns_fism]
pro_df.columns

In [None]:
# Discard every row with a missing value in any selected column, then preview
pro_df = pro_df.dropna(axis=0, how='any')
pro_df.head(5)


In [None]:
# Summary statistics for each column of pro_df
pro_df.describe()

In [None]:
# (rows, columns) of pro_df
pro_df.shape

In [None]:
# Row count used when sampling data for the profile reports
sample_sz = 1_000

In [None]:
# Enable every correlation measure in the profile report.
correlations = {
    name: {"calculate": True}
    for name in ("auto", "pearson", "spearman", "kendall", "phi_k", "cramers")
}

# Profile a random sample of pro_df. The sample size comes from the shared
# sample_sz setting instead of a separate hard-coded value, and is capped at
# the number of rows available so sampling without replacement cannot fail
# on a small frame.
n_rows = min(sample_sz, len(pro_df))
profile = ProfileReport(pro_df.sample(n_rows), correlations=correlations)
profile.to_file("all_data.html")

# description_set is a plain dict in older ydata_profiling releases and an
# object exposing a `correlations` attribute in newer ones; support both.
desc = profile.description_set
cc = desc["correlations"] if isinstance(desc, dict) else desc.correlations

# phi_k correlation of each profiled column with '400kmDensity'
cc["phi_k"]["400kmDensity"]

# Look at the profiles for the storm breakdown

In [None]:
# storm/quiet data
# Boolean row masks over pro_df for each storm condition.
#
# pro_df may have been built from a column subset without the storm flags
# (columns_fism and columns_solar have none), in which case pro_df['storm']
# raises KeyError. Fall back to the full frame, restricted to the rows that
# survive in pro_df so the masks stay aligned with it.
# NOTE(review): the fallback assumes df's index labels are unique - confirm.
storm_cols = ['storm', 'storm phase']
if set(storm_cols).issubset(pro_df.columns):
    storm_flags = pro_df[storm_cols]
else:
    storm_flags = df.loc[pro_df.index, storm_cols]

storm_flag = storm_flags['storm']
storm_phase = storm_flags['storm phase']

# Flags are matched with a +/-0.05 tolerance instead of exact equality.
st_t = (storm_flag > 0.95) & (storm_flag < 1.05)    # storm times
qt_t = (storm_flag < 0)                             # quiet times
mn_p = (storm_phase > 0.95) & (storm_phase < 1.05)  # main phase
rc_p = (storm_phase > 1.95) & (storm_phase < 2.05)  # recovery phase

In [None]:
# Sanity check on subset sizes: print the shape of each masked subset
# (storm, quiet, main phase, recovery phase) followed by the full frame.
subset_shapes = [pro_df[mask].shape for mask in (st_t, qt_t, mn_p, rc_p)]
for subset_shape in subset_shapes:
    print(subset_shape)
print(pro_df.shape)

# Row totals for storm + quiet and for quiet + main + recovery
# (presumably both should match the row count of pro_df - confirm).
storm_rows, quiet_rows, main_rows, recovery_rows = (
    subset_shape[0] for subset_shape in subset_shapes
)
storm_rows + quiet_rows
quiet_rows + main_rows + recovery_rows

In [None]:
from IPython.display import display

st_ii = [st_t, qt_t, mn_p, rc_p]  # list of row masks to loop over
st_tt = ['storm', 'quiet', 'main', 'recovery']  # matching names for the output files

# Write one profile report per storm condition and show its phi_k
# correlations with '400kmDensity'.
for sind, stype in zip(st_ii, st_tt):
    subset = pro_df[sind]
    if subset.empty:
        # Nothing to sample or profile for this condition.
        print(f'{stype}: no rows selected, skipping')
        continue

    # Cap the sample at the rows available; sampling without replacement
    # raises when asked for more rows than the subset holds.
    n_rows = min(sample_sz, len(subset))
    profile = ProfileReport(subset.sample(n_rows), correlations=correlations)

    out_file = f'{stype}_output.html'
    profile.to_file(out_file)
    print(out_file)

    # description_set is a plain dict in older ydata_profiling releases and an
    # object exposing a `correlations` attribute in newer ones; support both.
    desc = profile.description_set
    cc = desc["correlations"] if isinstance(desc, dict) else desc.correlations

    # A bare expression inside a loop body is never echoed by the notebook
    # (only top-level expressions are), so display the result explicitly.
    display(cc["phi_k"]["400kmDensity"])

# What does the 'all' profile look like when we deal with class imbalance?

In [None]:
# Balanced frame: the same number of storm-time and quiet-time rows.
# Integer division keeps the per-class size an int; `sample_sz/2` is a float
# under Python 3, and `sample` needs an integer row count.
half_sz = sample_sz // 2
reg_df = pd.concat([
    pro_df[st_t].sample(half_sz),
    pro_df[qt_t].sample(half_sz),
])
reg_df.describe()

In [None]:

# Profile the class-balanced storm/quiet frame and write the report.
balanced_report = 'qt_st_output.html'
profile = ProfileReport(reg_df, correlations=correlations)
profile.to_file(balanced_report)

print(balanced_report)

# description_set is a plain dict in older ydata_profiling releases and an
# object exposing a `correlations` attribute in newer ones; support both.
desc = profile.description_set
cc = desc["correlations"] if isinstance(desc, dict) else desc.correlations

# phi_k correlation of each profiled column with '400kmDensity'
cc["phi_k"]["400kmDensity"]