In [None]:
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.dates as mdates
import numpy as np
import pandas as pd
import os
import glob
import random 


from datetime import date, datetime, timedelta, time

from sunpy.net import Fido
from sunpy.net import attrs as a
from sunpy.timeseries import TimeSeries



Steps
1. Choose event
2. Get GIC data and clean
3. Get MAG data and clean
4. Get solar wind and geo index data
5. Integrate based on datetime 
6. Output integrated data files for GIC and MAG separately


### Utility Functions

In [None]:
def nerc_string_time_reader(df,columnLabel):
    '''
        Parse the time strings held in one column of a NERC dataframe.

        df = dataframe with NERC data readings
        columnLabel = column label for the time strings

        Returns a list of datetime objects, one per row, in row order.
        Strings must follow the 'month/day/year hour:minute:second AM|PM' layout.
    '''

    nerc_time_layout = '%m/%d/%Y %I:%M:%S %p'

    parsed_times = []
    for raw_stamp in df[columnLabel].values:
        parsed_times.append(datetime.strptime(raw_stamp, nerc_time_layout))

    return parsed_times


def nerc_gic_data_cleaning(dir_location, df_GIC, min_observations=2000, min_unique_values=10):
    '''
        Build the list of bad stations to remove from the DF of GIC observations.

        A station is flagged when
            - its Device ID is listed in a 'gic_monitor_missing_data*' quality report
              found in dir_location, or
            - its column holds fewer than min_observations non-NaN readings, or
            - its non-NaN readings take fewer than min_unique_values distinct values
              (e.g., a constant signal)

        dir_location = directory searched for the quality report file(s)
        df_GIC = dataframe of GIC observations; every column is checked
        min_observations = minimum number of non-NaN readings a column must have
        min_unique_values = minimum number of distinct non-NaN readings a column must have

        Returns a list (no duplicates) of Device ID strings / column labels.
        IDs taken from a quality report are returned even if df_GIC has no such column.
    '''

    running_bad_station_list = []

    def _flag(station):
        # keep first-seen order and avoid listing a station twice
        if station not in running_bad_station_list:
            running_bad_station_list.append(station)

    # Remove Stations with error quality reports.
    # Every matching report is read (sorted for a reproducible order); a missing report
    # is reported instead of crashing on an undefined file name.
    report_files = sorted(glob.glob(os.path.join(dir_location,'gic_monitor_missing_data*')))
    if not report_files:
        print('no gic_monitor_missing_data* quality report found in {}'.format(dir_location))

    for file_errors in report_files:
        bad_gic_data = pd.read_csv(file_errors)
        reported_ids = bad_gic_data['Device ID'].dropna().unique().tolist()
        print('bad gic stations for this event = \n{}'.format(reported_ids))

        for l in reported_ids:
            _flag(str(l))

    # Remove Stations with low numbers of observations or whose observations are constant
    for c in df_GIC.columns.to_list():

        observed = df_GIC[c].dropna()

        if len(observed) < min_observations:
            _flag(c)
        elif observed.nunique() < min_unique_values:
            print('station {} has < {} unique data points'.format(c, min_unique_values))
            _flag(c)

    return running_bad_station_list


def nerc_gic_data_reshaping(df):
    '''
        Re-index one station's readings by parsed datetime and label the
        measurement column with the station's device ID.

        df must have the following columns:
            GICDeviceID
            SampleDateTime
            GICMeasured

        Returns a dataframe indexed by 'datetimes' in which GICMeasured has been
        renamed to str(first GICDeviceID value) and the GICDeviceID and
        SampleDateTime columns have been dropped.
    '''
    timestamps = pd.Series(nerc_string_time_reader(df,'SampleDateTime'), name='datetimes')

    # attach the parsed times as a column, then promote that column to the index
    indexed = pd.concat([df, timestamps], axis=1).set_index('datetimes')

    # the first device ID in the frame names the measurement column
    station_label = str(indexed['GICDeviceID'].values[0])

    return (
        indexed
        .rename(columns={"GICMeasured": station_label})
        .drop(columns=['GICDeviceID','SampleDateTime'])
    )

In [None]:

# Get the current working directory
# (the relative paths used later, 'event_list.txt' and 'data/...', are resolved against it)
cwd = os.getcwd()

# Print the current working directory
print("Current working directory: {0}".format(cwd))


### Choose event and get files

In [None]:
# Row of event_list.txt to process
event_no = 8

# Each row is read as strings; the cells below use column 1 (start), column 2 (end) and
# column 3 (event directory name).
# ndmin=2 keeps the [row, column] indexing valid even when the file lists a single event
# (np.loadtxt would otherwise return a 1-D array).
event_list = np.loadtxt('event_list.txt', dtype=str, ndmin=2)

for i in range(len(event_list)):
    print('\t events list item {}:{}'.format(i,event_list[i]))

# Directory holding the GIC files for the chosen event; the trailing '' keeps the
# trailing path separator of the original 'data/<event>/GIC/' string
files_dir_gic = os.path.join('data', event_list[event_no,3], 'GIC', '')


print(files_dir_gic)

### Get GIC data

In [None]:
# Create a dataframe of datetimes spanning the event and that will be used as the common index for all time series

# Layout of the start/end strings in the event list.
# Deliberately not called `format`, which would shadow the builtin of the same name.
event_time_format = '%Y-%m-%dT%H:%M'

event_start = datetime.strptime(event_list[event_no,1], event_time_format)
# one day is added to the listed end time
event_end = datetime.strptime(event_list[event_no,2], event_time_format) + timedelta(days=1)

# 10 s grid from event_start to event_end
event_times = pd.date_range(event_start,event_end,freq='10s').to_pydatetime()

# single-column frame; 'datetimes' is the key later merges join on
df_event_dates = pd.DataFrame(event_times,columns=['datetimes'])
df_event_dates


In [None]:
# loop over files in the directory for a given event and create merged DFs for GICs

df_GIC = df_event_dates

# Files whose *name* contains one of these tokens are not station time series and are skipped
# (the 'gic_monitor_missing_data*' report is read separately by nerc_gic_data_cleaning)
skip_tokens = ('missing', 'monitor', 'magnetometers')

# sorted() makes the column order of df_GIC reproducible between runs and machines
for f in sorted(glob.glob(os.path.join(files_dir_gic,'*csv'))):

    # Test the file name only, so a directory called e.g. 'monitoring' cannot skip every file
    if any(token in os.path.basename(f) for token in skip_tokens):
        print('-------> skipping file = {}'.format(f))
        continue
    print(f)

    # Read and reshape the data
    df_loop = pd.read_csv(f)

    df_loop = nerc_gic_data_reshaping(df_loop)

    # Merge into full dataframe; the left join keeps every row of the common datetime grid
    df_GIC = pd.merge(df_GIC, df_loop, on='datetimes',how='left')

In [None]:
# Display the merged frame: the 'datetimes' key column plus the columns merged in from each station file
df_GIC

In [None]:
# Labels of every column after the first one ('datetimes')
list(df_GIC.columns[1:])

In [None]:
import plotly.graph_objects as go

# Quick visual check: scatter two randomly chosen stations against time
fig=go.Figure()

# Draw only from station columns; 'datetimes' is the time axis, not a station
station_columns = [c for c in df_GIC.columns.to_list() if c != 'datetimes']

# random.sample picks distinct stations (and copes with fewer than two being available)
for rnd_st in random.sample(station_columns, k=min(2, len(station_columns))):

    print('plotting station = {}'.format(rnd_st))
    fig.add_trace(go.Scatter(x=df_GIC['datetimes'],
                             y=df_GIC[rnd_st],
                             mode='markers',
                             name=str(rnd_st)))  # label the trace so the legend identifies the station

fig.show()

### Apply filtering of GIC data


In [None]:
# Collect the stations flagged by the quality report and by the observation-count / uniqueness checks
bad_stations = nerc_gic_data_cleaning(dir_location=files_dir_gic, df_GIC=df_GIC)
print(bad_stations)


In [None]:

# 'datetimes' is the time key, not a station, so it is left out of the station counts
print('Number of stations prior to removal of bad data = {}'.format(df_GIC.columns.drop('datetimes', errors='ignore').size))

# errors='ignore': a Device ID from the quality report may have no column in df_GIC, and the
# columns are already gone when this cell is re-run; neither case should raise a KeyError.
# drop() already returns a new frame, so no explicit copy is needed.
df_GIC = df_GIC.drop(columns=bad_stations, errors='ignore')
print('Number of stations after removal of bad data = {}'.format(df_GIC.columns.drop('datetimes', errors='ignore').size))


### Align with solar wind and geo indices

In [None]:
# Time layout used for the strings handed to a.Time
sunpy_format = '%Y/%m/%d %H:%M'

# Search window = the event window defined earlier
search_start = event_start.strftime(sunpy_format)
search_end = event_end.strftime(sunpy_format)
trange = a.Time(search_start, search_end)

# CDAWeb dataset to query
dataset = a.cdaweb.Dataset('OMNI_HRO2_5MIN')
result = Fido.search(trange, dataset)

# Download the first block of search results and show what was fetched
downloaded_files = Fido.fetch(result[0])
print(downloaded_files)

In [None]:
# Show every column when a dataframe is displayed
pd.set_option('display.max_columns', None)

# Load the downloaded files as one concatenated time series and convert it to a dataframe
sw_data = TimeSeries(downloaded_files, concatenate=True)

# Expose the index as a 'datetimes' column so it can serve as the merge key
df_sw = sw_data.to_dataframe().assign(datetimes=lambda frame: frame.index)

print(df_sw.columns)

In [None]:
# Filter out the filling values in the solar wind + geomag indices data

# One fill value -> NaN mapping per group of columns
dict_mag = {9999.990234375: np.nan}
dict_vel = {99999.8984375: np.nan}
dict_den = {999.989990234375: np.nan}
dict_pre = {99.98999786376953: np.nan}
dict_ind = {99999: np.nan}

# Column -> mapping, listed in the order the replacements are applied
fill_maps_by_column = [
    ('F', dict_mag),
    ('BX_GSE', dict_mag),
    ('BY_GSM', dict_mag),
    ('BZ_GSM', dict_mag),
    ('flow_speed', dict_vel),
    ('proton_density', dict_den),
    ('Pressure', dict_pre),
    ('SYM_H', dict_ind),
    ('AE_INDEX', dict_ind),
    ('AL_INDEX', dict_ind),
    ('AU_INDEX', dict_ind),
]

for sw_column, fill_map in fill_maps_by_column:
    df_sw = df_sw.replace({sw_column: fill_map})

### Merge solar wind, geo indices, and GIC

In [None]:
# Left join on 'datetimes': every row of the GIC frame is kept and the solar wind / index
# columns are attached wherever the timestamps match
df_final = df_GIC.merge(df_sw, on='datetimes', how='left')
print(df_final)

### Visualize as check on process

In [None]:
# Station column shown in the bottom panel
plot_ID = '10119'

# Fail early with an actionable message if the station is absent
# (e.g., not part of this event, or removed as a bad station)
if plot_ID not in df_final.columns:
    raise KeyError('station {} is not a column of df_final; choose plot_ID from the remaining station columns'.format(plot_ID))

# One entry per solar wind / index panel: (y label, [(column, color), ...])
sw_panels = [
    ('|B| [nT]', [('F', 'black')]),
    ('B_GSM [nT]', [('BX_GSE', 'tab:blue'), ('BY_GSM', 'tab:orange'), ('BZ_GSM', 'tab:green')]),
    ('V [km/s]', [('flow_speed', 'black')]),
    ('Np [#/cc]', [('proton_density', 'black')]),
    ('P [nPa]', [('Pressure', 'black')]),
    ('SYM-H [nT]', [('SYM_H', 'black')]),
    ('AE [nT]', [('AE_INDEX', 'black')]),
]

# One extra panel at the bottom for the GIC station; all panels share the time axis
fig, axarr = plt.subplots(len(sw_panels) + 1, sharex=True)
plt.subplots_adjust(hspace = .001)

axarr[0].set_xlim([event_start,event_end])

for panel_ax, (panel_label, panel_series) in zip(axarr, sw_panels):
    panel_ax.set_ylabel(panel_label)
    for column, color in panel_series:
        panel_ax.scatter(df_final['datetimes'], df_final[column], color=color, s=8)

gic_ax = axarr[-1]
gic_ax.set_ylabel('GIC')
gic_ax.scatter(df_final['datetimes'], df_final[plot_ID], color='black', lw=0.8)

# Hour ticks (00, 06, 12, 18) on the shared axis
gic_ax.xaxis.set_major_locator(mdates.HourLocator([0,6,12,18]))
gic_ax.xaxis.set_major_formatter(mdates.DateFormatter('%H:00'))
gic_ax.tick_params(axis='x', which='major')

# Second bottom axis carrying one date label per day, pushed onto its own line below the hour labels.
# A literal '\n' is used instead of the '%n' directive: '%n' is a platform strftime extension that
# is not among Python's documented format codes, so it is not portable.
xaxis_copy = gic_ax.secondary_xaxis('bottom')
xaxis_copy.xaxis.set_major_locator(mdates.DayLocator(interval=1))
xaxis_copy.xaxis.set_major_formatter(mdates.DateFormatter('\n %Y-%m-%d'))
xaxis_copy.tick_params(axis='x', which='major')

fig.set_size_inches(12,10)
fig.patch.set_facecolor('white')

plt.show()

In [None]:
# NOTE(review): this cell only displays a path string and writes nothing. The path points at
# .../data/, while the save cell below writes to .../dev/integrated_data/ — confirm which is intended.
'/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/data/integrated_data_'+event_list[event_no][3]+'.csv'

### Save the dataframe

In [None]:
# Output directory: defaults to the original location; set the GIC_OUTPUT_DIR environment
# variable to write somewhere else without editing the notebook
output_dir = os.environ.get(
    'GIC_OUTPUT_DIR',
    '/Users/ryanmc/Documents/Conferences/Jack_Eddy_Symposium_2022/dev/integrated_data',
)

# Create the directory if needed so to_csv does not fail on a missing folder
os.makedirs(output_dir, exist_ok=True)

# One file per event, named after the event's directory name (column 3 of the event list)
output_file = os.path.join(output_dir, 'integrated_data_'+event_list[event_no][3]+'.csv')
df_final.to_csv(output_file)
print('saved integrated data to {}'.format(output_file))




