This file covers some of the basic things that we had to do to the data to get it to a usable state, including filtering out invalid values and resampling to match the timescale for the hourly OMNI dataset.


This code imports the packages we need to do data processing and reads a csv file.

In [2]:
import numpy as np
import pandas as pd
# Hourly data covering 10/6/2018 through 10/6/2019, downloaded from
# https://cdaweb.sci.gsfc.nasa.gov/index.html/
#
# Heads-up about files exported from that site: they come with extra
# non-data text at the very top and the very bottom of the document, and
# pandas does not skip that text on its own.
df = pd.read_csv(filepath_or_buffer='testData2.csv')

# Every column header that was pulled; left as the final expression of the
# cell so the notebook displays it.
df.columns

Index(['EPOCH_TIME_yyyy-mm-ddThh:mm:ss.sssZ', 'HELIOGRAPHIC_LATITUDE_deg',
       'HELIOGRAPHIC_LONGITUDE_deg', 'BR_(RTN)_nT', 'BT_(RTN)_nT',
       'BN_(RTN)_nT', 'FIELD_MAGNITUDE_AVG._nT', 'BULK_FLOW_SPEED_km/s',
       'ELEVATION_ANGLE_Deg', 'AZIMUTH_ANGLE_Deg', 'ION_DENSITY_N/cm3',
       'TEMPERATURE_Deg_K'],
      dtype='object')

In [3]:
# Summary statistics for each numeric column. The minima of about -1e31 (and
# the huge negative means / standard deviations they cause) in the output are
# placeholder values for invalid samples, not measurements; the later cells
# filter such rows out before using the data.
df.describe() #here are some descriptive statistics about the data

Unnamed: 0,HELIOGRAPHIC_LATITUDE_deg,HELIOGRAPHIC_LONGITUDE_deg,BR_(RTN)_nT,BT_(RTN)_nT,BN_(RTN)_nT,FIELD_MAGNITUDE_AVG._nT,BULK_FLOW_SPEED_km/s,ELEVATION_ANGLE_Deg,AZIMUTH_ANGLE_Deg,ION_DENSITY_N/cm3,TEMPERATURE_Deg_K
count,8761.0,8761.0,8761.0,8761.0,8761.0,8761.0,8761.0,8761.0,8761.0,8761.0,8761.0
mean,0.048956,180.774272,-5.923980999999999e+29,-5.923980999999999e+29,-5.923980999999999e+29,-5.923980999999999e+29,-1.5180919999999998e+29,-1.5180919999999998e+29,-1.5180919999999998e+29,-1.5409199999999999e+29,-1.5409199999999999e+29
std,5.119186,103.02834,2.3608649999999998e+30,2.3608649999999998e+30,2.3608649999999998e+30,2.3608649999999998e+30,1.22279e+30,1.22279e+30,1.22279e+30,1.231807e+30,1.231807e+30
min,-7.3,0.0,-1.0000000000000001e+31,-1.0000000000000001e+31,-1.0000000000000001e+31,-1.0000000000000001e+31,-1.0000000000000001e+31,-1.0000000000000001e+31,-1.0000000000000001e+31,-1.0000000000000001e+31,-1.0000000000000001e+31
25%,-5.1,92.6,-2.8,-1.9,-1.3,3.2,341.0,-2.2,-1.3,3.4,31076.0
50%,0.1,181.5,-1.0,0.6,-0.2,4.1,382.0,-1.1,0.0,5.1,51926.0
75%,5.2,268.9,1.5,2.4,0.9,5.2,447.0,0.2,1.2,7.5,90767.0
max,7.3,360.0,10.0,11.8,14.4,18.0,752.0,6.9,10.9,58.0,527626.0


In [5]:
# Preview the first rows to check that the timestamps advance in one-hour
# steps and that the columns were parsed as expected.
df.head()

Unnamed: 0,EPOCH_TIME_yyyy-mm-ddThh:mm:ss.sssZ,HELIOGRAPHIC_LATITUDE_deg,HELIOGRAPHIC_LONGITUDE_deg,BR_(RTN)_nT,BT_(RTN)_nT,BN_(RTN)_nT,FIELD_MAGNITUDE_AVG._nT,BULK_FLOW_SPEED_km/s,ELEVATION_ANGLE_Deg,AZIMUTH_ANGLE_Deg,ION_DENSITY_N/cm3,TEMPERATURE_Deg_K
0,2018-10-06T00:00:00.000Z,6.5,295.7,1.7,0.5,-2.6,3.3,366.0,0.9,-2.3,8.5,21400.0
1,2018-10-06T01:00:00.000Z,6.5,295.7,0.6,0.2,-2.1,2.4,373.0,-0.1,-1.6,12.0,19325.0
2,2018-10-06T02:00:00.000Z,6.5,295.8,0.9,1.5,0.1,2.2,369.0,-1.5,-1.7,12.5,14626.0
3,2018-10-06T03:00:00.000Z,6.5,295.8,0.4,0.3,0.9,1.6,367.0,-1.9,-1.7,13.3,15408.0
4,2018-10-06T04:00:00.000Z,6.5,295.9,-0.7,-1.2,-0.1,1.8,367.0,-2.2,-1.2,10.5,22965.0


This is the code that I used to merge the positon and solar wind data from the Artemis mission from 08/01/2019 to 09/01/2019
and average the solar wind data into hourly data, I had to filter out entries that were essentially null.

In [1]:
import numpy as np
import pandas as pd
# Merge the Artemis position and solar-wind files, drop rows whose ion density
# is a fill value, and average what is left into hourly bins.
ARTEMIS_WIND_TIME = 'EPOCH_TIME_yyyy-mm-ddThh:mm:ss.sssZ'
ARTEMIS_POS_TIME = 'EPOCH_yyyy-mm-ddThh:mm:ss.sssZ'

# Invalid samples are stored as -1.0E+31. Compare against a threshold instead
# of testing exact equality with that literal: read_csv's default float
# converter is not guaranteed to round-trip, so a parsed fill value can differ
# from -1.0E+31 in the last bit and slip through an `!=` test.
ARTEMIS_FILL_LIMIT = -1.0e30

posFrame = pd.read_csv('ArtemisPosition.csv')
windFrame = pd.read_csv('ArtemisSolarWind.csv')

# Discard every column whose header starts with "Unnamed".
windFrame = windFrame.loc[:, ~windFrame.columns.str.contains('^Unnamed')]

# Parse each timestamp column once, then derive the calendar date that serves
# as the merge key.
windFrame[ARTEMIS_WIND_TIME] = pd.to_datetime(windFrame[ARTEMIS_WIND_TIME])
windFrame['date'] = windFrame[ARTEMIS_WIND_TIME].dt.date
posFrame[ARTEMIS_POS_TIME] = pd.to_datetime(posFrame[ARTEMIS_POS_TIME])
posFrame['date'] = posFrame[ARTEMIS_POS_TIME].dt.date

# NOTE(review): joining on the calendar date pairs every wind sample with every
# position row of the same day. That is only what we want if the position file
# has a single row per day - confirm against ArtemisPosition.csv.
mergedFrame = pd.merge(windFrame, posFrame, on='date')
mergedFrame = mergedFrame.drop(['date', ARTEMIS_POS_TIME], axis=1)

# `~(x <= limit)` keeps NaN rows, exactly as the former `!=` comparison did.
# NOTE(review): only the ion density column is checked for fill values; other
# columns may still contain them and would distort the hourly means - confirm.
mergedFrame = mergedFrame[~(mergedFrame['Ion_Density_n_cc'] <= ARTEMIS_FILL_LIMIT)]

# One-hour bins keyed on the wind timestamp. A Timedelta is used instead of the
# 'H' alias, which is deprecated in current pandas releases.
hourlyMergedFrame = mergedFrame.resample(pd.Timedelta(hours=1), on=ARTEMIS_WIND_TIME).mean()

This is the code that was used to process the solar wind data from the OMNI dataset from 07/15/2019 to 09/15/2019. The data is already at an hourly resolution, but I had to filter out bad values.

In [2]:
#this dataframe represents the position and solar wind data from the OMNI dataset from 07/15/2019 to 09/15/2019
#the data is at an hourly resolution, I had to filter out bad values
import numpy as np
import pandas as pd
# Invalid samples are stored as -1.0E+31. A threshold test is used instead of
# exact equality with that literal because read_csv's default float converter
# is not guaranteed to round-trip, so a parsed fill value can differ from
# -1.0E+31 in the last bit and survive an `!=` filter.
OMNI_FILL_LIMIT = -1.0e30

omniFrame = pd.read_csv('OnmiForArtemis.csv')

# Discard every column whose header starts with "Unnamed".
omniFrame = omniFrame.loc[:, ~omniFrame.columns.str.contains('^Unnamed')]

# Keep only rows with a usable bulk flow speed. `~(x <= limit)` keeps NaN
# rows, exactly as the former `!=` comparison did.
omniFrame = omniFrame[~(omniFrame['BULK_FLOW_SPEED_km_s'] <= OMNI_FILL_LIMIT)]

This is the data to clean and merge the solar wind and position data for the maven mission from 08/01/2015 to 09/01/2015
We will also need to convert the position to a scalar instead of a vector and the temperature to K instead of eV (I'm unsure if this conversion is valid)

In [3]:
import numpy as np
import pandas as pd
# Merge the MAVEN position and solar-wind files, drop rows whose ion density
# is a fill value, collapse the velocity vector into a scalar speed, and
# average what is left into hourly bins.
MAVEN_WIND_TIME = 'EPOCH__yyyy-mm-ddThh:mm:ss.sssZ'
MAVEN_POS_TIME = 'EPOCH_yyyy-mm-ddThh:mm:ss.sssZ'
MAVEN_VELOCITY_COLUMNS = ['VELOCITY_X_VECTOR_km_s', 'VELOCITY_Y_VECTOR_km_s', 'VELOCITY_Z_VECTOR_km_s']

# Invalid samples are stored as -1.0E+31. Compare against a threshold instead
# of testing exact equality with that literal: read_csv's default float
# converter is not guaranteed to round-trip, so a parsed fill value can differ
# from -1.0E+31 in the last bit and slip through an `!=` test.
MAVEN_FILL_LIMIT = -1.0e30

posFrame = pd.read_csv('MavenPosition.csv')
windFrame = pd.read_csv('MavenSolarWind.csv')

# Discard every column whose header starts with "Unnamed".
windFrame = windFrame.loc[:, ~windFrame.columns.str.contains('^Unnamed')]

# Parse each timestamp column once, then derive the calendar date that serves
# as the merge key.
windFrame[MAVEN_WIND_TIME] = pd.to_datetime(windFrame[MAVEN_WIND_TIME])
windFrame['date'] = windFrame[MAVEN_WIND_TIME].dt.date
posFrame[MAVEN_POS_TIME] = pd.to_datetime(posFrame[MAVEN_POS_TIME])
posFrame['date'] = posFrame[MAVEN_POS_TIME].dt.date

# NOTE(review): joining on the calendar date pairs every wind sample with every
# position row of the same day. That is only what we want if the position file
# has a single row per day - confirm against MavenPosition.csv.
mergedFrame = pd.merge(windFrame, posFrame, on='date')
mergedFrame = mergedFrame.drop(['date', MAVEN_POS_TIME], axis=1)

# `~(x <= limit)` keeps NaN rows, exactly as the former `!=` comparison did.
mergedFrame = mergedFrame[~(mergedFrame['ION_DENSITY_N_CC'] <= MAVEN_FILL_LIMIT)]

# Scalar speed = Euclidean norm of the three velocity components, row by row.
# NOTE(review): the components themselves are not checked for fill values, so a
# row with a valid density but a fill velocity would yield an absurd speed -
# confirm whether that can occur in this file.
mergedFrame['BULK_FLOW_VELOCITY_km_s'] = np.linalg.norm(mergedFrame[MAVEN_VELOCITY_COLUMNS].values, axis=1)

# The eV temperature is dropped here, not converted; the vector components are
# no longer needed once the scalar speed exists.
mergedFrame = mergedFrame.drop(['ION_TEMP_eV'] + MAVEN_VELOCITY_COLUMNS, axis=1)

# One-hour bins keyed on the wind timestamp. A Timedelta is used instead of the
# 'H' alias, which is deprecated in current pandas releases.
hourlyMergedFrame = mergedFrame.resample(pd.Timedelta(hours=1), on=MAVEN_WIND_TIME).mean()
#hourlyMergedFrame.to_csv(r'MavenMerged.csv')

This is the code to process the Omni data for the Maven mission data above.

In [4]:
import numpy as np
import pandas as pd
# Invalid samples are stored as -1.0E+31. A threshold test is used instead of
# exact equality with that literal because read_csv's default float converter
# is not guaranteed to round-trip, so a parsed fill value can differ from
# -1.0E+31 in the last bit and survive an `!=` filter.
OMNI_MAVEN_FILL_LIMIT = -1.0e30

omniFrame = pd.read_csv('OmniForMaven.csv')

# Discard every column whose header starts with "Unnamed".
omniFrame = omniFrame.loc[:, ~omniFrame.columns.str.contains('^Unnamed')]

# A row is kept only when both the bulk flow speed and the ion density are
# usable. `~(x <= limit)` keeps NaN rows, exactly as the former `!=`
# comparisons did.
speedIsValid = ~(omniFrame['BULK_FLOW_SPEED_km_s'] <= OMNI_MAVEN_FILL_LIMIT)
densityIsValid = ~(omniFrame['ION_DENSITY_N_CC'] <= OMNI_MAVEN_FILL_LIMIT)
omniFrame = omniFrame[speedIsValid & densityIsValid]

This is the code that was used to create the first dataset for a ML model using a naive approach for time shift.

In [37]:
import numpy as np
import pandas as pd
import datetime as dt
# Build the first model dataset: pair every hourly Artemis row with the OMNI
# row measured a fixed number of hours earlier, then derive the position
# differences between the two observation points.
MODEL_TIME_COLUMN = 'EPOCH_TIME_yyyy-mm-ddThh:mm:ss.sssZ'  # same header in both files

# Naive, constant time shift between the OMNI and the Artemis measurement.
# Defined once so the shift applied and the value recorded cannot drift apart.
TIME_OFFSET_HOURS = 10

# Invalid samples are stored as -1.0E+31. Compare against a threshold instead
# of testing exact equality with that literal: read_csv's default float
# converter is not guaranteed to round-trip, so a parsed fill value can differ
# from -1.0E+31 in the last bit and slip through an `!=` test.
MODEL_FILL_LIMIT = -1.0e30

aF = pd.read_csv('ArtemisMerged.csv')
oF = pd.read_csv('OnmiForArtemis.csv')

# Ion density is not used by this model, so it is removed from both sources.
aF = aF.drop(['Ion_Density_n_cc'], axis=1)
oF = oF.loc[:, ~oF.columns.str.contains('^Unnamed')]
oF = oF.drop(['ION_DENSITY_N_cm3'], axis=1)

aF[MODEL_TIME_COLUMN] = pd.to_datetime(aF[MODEL_TIME_COLUMN])
oF[MODEL_TIME_COLUMN] = pd.to_datetime(oF[MODEL_TIME_COLUMN])

# new_time = Artemis timestamp minus the offset, i.e. the OMNI timestamp this
# Artemis row is matched with in the merge below.
aF['new_time'] = aF[MODEL_TIME_COLUMN] - pd.Timedelta(hours=TIME_OFFSET_HOURS)
aF['Time_offset_hours'] = TIME_OFFSET_HOURS
aF = aF.drop([MODEL_TIME_COLUMN], axis=1)

# Right join: every Artemis row survives the merge; rows without an OMNI
# partner carry NaN and are removed by dropna() further down.
mF = pd.merge(oF, aF, how='right', left_on=MODEL_TIME_COLUMN, right_on='new_time')

mF['HGI_Lat_Diff'] = abs(mF['HELIOGRAPHIC_LATITUDE_deg'] - mF['HGI_LAT_deg'])

# Longitude is an angle, so the separation has to wrap at 360 degrees:
# 359 deg vs 1 deg is 2 deg apart, not 358. The raw gap is folded into
# [0, 360) and then mirrored around 180 to give the shorter way round.
lonGap = abs(mF['HELIOGRAPHIC_LONGITUDE_deg'] - mF['HGI_LON_deg']) % 360
mF['HGI_Lon_Diff'] = 180 - abs(lonGap - 180)

# Radial distance relative to 1 AU.
mF['Distance_Traveled_AU'] = mF['Distance_From_Sun_AU'] - 1

mF = mF.drop(
    [
        MODEL_TIME_COLUMN,
        'HELIOGRAPHIC_LATITUDE_deg',
        'HELIOGRAPHIC_LONGITUDE_deg',
        'Distance_From_Sun_AU',
        'HGI_LAT_deg',
        'HGI_LON_deg',
        'new_time',
    ],
    axis=1,
)
mF = mF.rename(columns={'BULK_FLOW_SPEED_km_s': 'OMNI_VELOCITY_kms', 'Ion_Velocity_km_s': 'ARTEMIS_VELOCITY_kms'})

# Drop every row that has any missing value. The defaults already mean
# axis=0 / how='any'; passing how= and thresh= together, as this line used to,
# raises TypeError on pandas >= 2.0.
mF = mF.dropna()

# Remove rows whose OMNI speed is a fill value.
mF = mF[~(mF['OMNI_VELOCITY_kms'] <= MODEL_FILL_LIMIT)]
#mF.to_csv('DataForFirstModel.csv')