In [2]:
import random
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import datetime

# Transition Marker
### Data Import and Formatting
First the manually created transition list is imported alongside the magnetometer measurements taken _via_ MESSENGER's MAG instrument. Dates for the crossings and the magnetometer measurements are converted to datetime format. Then a 10 minute stop gap is added to the start and end of each transition period in order to prevent transition regions from being sampled in the training data. 

In [3]:
print("Importing Data...")

# Manually compiled list of boundary crossings, read from the working directory.
crossing_list = pd.read_csv("Mercury_Boundary_Crossing_List.csv", sep=",", index_col=False)

# Width, in minutes, of the stop gap added on each side of every crossing.
stopgap_minutes = 10
stopgap = datetime.timedelta(minutes=stopgap_minutes)
crossing_date_format = "%Y-%m-%d %H:%M:%S"

# Parse both boundary columns to datetimes, then widen each crossing window:
# the start moves earlier and the end moves later by the stop gap.
crossing_list["Start_Date"] = (
    pd.to_datetime(crossing_list["Start_Date"], format=crossing_date_format) - stopgap
)
crossing_list["End_Date"] = (
    pd.to_datetime(crossing_list["End_Date"], format=crossing_date_format) + stopgap
)



Importing Data...


In [4]:
# Discard the crossing-type label and keep the remaining columns as a plain array.
# NOTE(review): later cells index each row as [0] = start and [1] = end, which
# presumes Start_Date and End_Date are the first two remaining columns - confirm.
crossing_list = crossing_list.drop(columns="Type").to_numpy()

In [None]:
print("Importing Messenger Data...")

# Load the validation split of the MESSENGER data from the working directory.
messenger_csv = "validation_list.csv"
df = pd.read_csv(messenger_csv)

# Parse the timestamp column so it can be compared against the crossing windows.
messenger_date_format = "%Y-%m-%d %H:%M:%S"
df["Date"] = pd.to_datetime(df["Date"], format=messenger_date_format)

Importing Messenger Data...


### Transition Labeling
A boolean column is added to the magnetometer dataset. Each row will have a value of 0 if no crossing or stop gap is present at that timepoint, and a value of 1 if there is a crossing/stop gap present. 

To create these labels, each entry in the manually created crossing list is iterated through. If the start or end date of a crossing is present in the magnetometer data, the rows between the start of the crossing stop gap and the end of the crossing stop gap are given a value of 1. 

In [None]:
# Start with every row marked as "not in a transition"; the labeling step
# further down flips the rows that fall inside a crossing window.
df["Transition"] = np.zeros(len(df), dtype=bool)

In [None]:
print("Labeling Data...")

# Labeling date ranges where transitions occurred (with a 10 minute buffer either side)
for crossing in crossing_list:
    window_start, window_end = crossing[0], crossing[1]

    # A crossing is only labelled when its padded start or padded end matches
    # a timestamp in the data exactly; crossings with neither are skipped.
    start_sampled = (df["Date"] == window_start).any()
    end_sampled = (df["Date"] == window_end).any()
    if not (start_sampled or end_sampled):
        continue

    # One .loc call writes into df itself. The previous chained form
    # (df["Transition"][mask] = 1) assigns through an intermediate object,
    # which triggers SettingWithCopyWarning and leaves df unchanged when
    # pandas Copy-on-Write is active. True keeps the column's bool dtype.
    in_window = df["Date"].between(window_start, window_end)
    df.loc[in_window, "Transition"] = True

# The crossing list is not needed after labeling; release it.
del crossing_list

# Class Balancing 
### Identifying the Smallest Class
First the dataset is split into 3 parts corresponding to locations where MESSENGER was in the magnetosphere, magnetosheath and solar wind respectively and was not within 10 minutes of a transition. Then the location MESSENGER spent the least amount of time in is identified. 

In [None]:
# Number of rows recorded for each Location value across the whole dataframe.
df.loc[:, "Location"].value_counts()

In [None]:
# Evaluate both conditions against the FULL dataframe so the integer positions
# returned below are valid for df.iloc. Taking positions from a frame that was
# first filtered on Transition would number the rows relative to that subset,
# and df.iloc would then pick different (possibly transition) rows.
outside_transition = (df["Transition"] == 0).to_numpy()
location_codes = df["Location"].to_numpy()

# Row positions in df, per Location value, that lie outside every transition window.
All_0s = np.flatnonzero(outside_transition & (location_codes == 0))
All_1s = np.flatnonzero(outside_transition & (location_codes == 1))
All_2s = np.flatnonzero(outside_transition & (location_codes == 2))

# Size of the smallest class; this is the per-class sample size used for balancing.
classBalance = np.min([len(All_0s), len(All_1s), len(All_2s)])





### Random Sampling
To remove the above class imbalance, the two largest datasets are randomly sampled to produce an index the same size as the smallest dataset.

In [9]:
# Fixed seed so the balanced sample is identical on every Restart & Run All.
SAMPLING_SEED = 42
_sampler = random.Random(SAMPLING_SEED)


def _downsample(positions, size):
    """Return `positions` reduced to `size` randomly chosen entries.

    If `positions` already holds exactly `size` entries it is returned
    unchanged. Otherwise `size` entries are drawn without replacement and
    returned as a list.
    """
    if len(positions) == size:
        return positions
    # int() guards against `size` arriving as a NumPy integer scalar.
    return _sampler.sample(list(positions), int(size))


# Bring every class down to the common size held in classBalance.
train_0s_index = _downsample(All_0s, classBalance)
train_1s_index = _downsample(All_1s, classBalance)
train_2s_index = _downsample(All_2s, classBalance)

In [17]:
# Sanity check: every position drawn for class 0 should carry Location == 0.
df["Location"].iloc[train_0s_index].value_counts()

0    1932267
Name: Location, dtype: int64

In [18]:
# Sanity check: every position drawn for class 1 should carry Location == 1.
df["Location"].iloc[train_1s_index].value_counts()

1    1932267
Name: Location, dtype: int64

In [19]:
# Sanity check: every position drawn for class 2 should carry Location == 2.
df["Location"].iloc[train_2s_index].value_counts()

2    1932267
Name: Location, dtype: int64

### Extracting the Class Balanced, Transitionless Data

If we take a subset of the entire MESSENGER data using the above indices, we get an array which has an equal number of observations in the Solar Wind, Magnetosheath and Magnetosphere and no observations within 10 minutes of a transition. The dataframe is pared down to this subset using .iloc, sorted by date and exported to a csv. 

In [20]:
# Stack the sampled row positions of the three classes into a single array.
balanced_positions = np.concatenate(
    [np.array(positions) for positions in (train_0s_index, train_1s_index, train_2s_index)]
)

# Select those rows by position and restore chronological order.
balanced_sample = df.iloc[balanced_positions].sort_values(by=["Date"])

# Write the result next to the notebook, without the dataframe index.
balanced_sample.to_csv(
    "discontinusous_tansitionless_class_balanced_random_sample.csv",
    index=False,
)