In [None]:
# Mount Google Drive inside the Colab VM so the project files are reachable
# under /content/drive.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Work from the shared project folder on the mounted drive.
%cd /content/drive/MyDrive/'2024 UW ENGINE Capstone'
# Root of all data folders. The trailing slash is required: load_data and the
# CSV writer build paths by concatenating directly onto this string.
BASE_PATH = "/content/drive/MyDrive/2024 UW ENGINE Capstone/data/"

/content/drive/.shortcut-targets-by-id/1oyb6sI_Gh8b_o0AIjqSCq4UljphkuJTI/2024 UW ENGINE Capstone


In [None]:
import pandas as pd
import os
from tqdm import tqdm
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): both calls install the same blanket "ignore" filter, and the
# filter also hides pandas warnings raised by the cells below -- consider
# narrowing it to specific categories.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

In [None]:
def load_data(source, file_name, datatype="raw", sep=","):
    """Read one CSV file from the project data tree into a DataFrame.

    The file is looked up at ``<BASE_PATH><source>/<datatype>/<file_name>``.

    Args:
        source: Name of the data-source folder directly under BASE_PATH.
        file_name: Name of the CSV file inside the ``datatype`` folder.
        datatype: Sub-folder of ``source`` to read from (default ``"raw"``).
        sep: Field delimiter forwarded to ``pd.read_csv``.

    Returns:
        The DataFrame produced by ``pd.read_csv``.
    """
    csv_path = "{}{}/{}/{}".format(BASE_PATH, source, datatype, file_name)
    frame = pd.read_csv(csv_path, sep=sep)
    return frame

In [None]:
def is_exist_nan_and_fill(df, name):
    """Return ``df`` with NaN gaps filled by linear interpolation down the rows.

    Only columns that contain at least one NaN are interpolated.
    ``limit_direction='both'`` lets leading and trailing gaps be filled as
    well. The input frame is never modified: when filling is needed the work
    is done on a copy, otherwise ``df`` itself is returned.

    Args:
        df: DataFrame to check and, if necessary, fill.
        name: Label of the data set, used in the error message.

    Returns:
        A DataFrame without NaN values.

    Raises:
        AssertionError: if NaN values remain after interpolation (for example
            in a column that holds no valid value at all).
    """
    has_nan = df.isna().any()
    nan_cols = list(has_nan[has_nan].index)

    if nan_cols:
        # Copy first: callers usually pass a selection of a larger frame, and
        # assigning into it would write through to (or warn about) the parent.
        df = df.copy()
        df[nan_cols] = df[nan_cols].interpolate(method='linear', axis=0, limit_direction='both')

    still_nan = df.columns[df.isna().any()].tolist()
    assert not still_nan, f'{name} data still contains NaN after interpolation in columns {still_nan}'

    return df

In [None]:
def prepare_magnetic_pertubation_dataset(omni_data, sm_data, clockwise_angle_data, year,
                                         expected_omni_rows=1472544, min_clock_angle_rows=105120):
    """Left-join one SuperMAG chunk with OMNI and clock-angle rows and append it to the year's CSV.

    Args:
        omni_data: DataFrame holding ``DateTime`` and the OMNI columns selected
            below.
        sm_data: DataFrame chunk holding ``datetime``, the station coordinate
            columns and ``dbn_nez`` / ``dbe_nez``.
        clockwise_angle_data: DataFrame keyed by a ``DateTime`` column; its
            remaining columns are carried into the output.
        year: Inserted into the output file name ``ML/dataset_{year}.csv``
            under BASE_PATH.
        expected_omni_rows: Exact row count required of the OMNI table, or
            ``None`` to skip the check. The default equals 5113 * 288 --
            presumably one row per 5 minutes for 2010-2023; confirm against
            the data source.
        min_clock_angle_rows: Minimum row count required of the clock-angle
            table, or ``None`` to skip the check. The default equals 365 * 288.

    Returns:
        Number of rows appended to the CSV; always equal to ``len(sm_data)``.

    Raises:
        AssertionError: if a size check fails, if the joined frame contains
            NaN (e.g. a timestamp without a match on the right-hand side), or
            if the joins changed the number of rows.
    """
    sm_data_features = sm_data[['datetime', 'glon', 'glat', 'mlon', 'mlat', 'mlt', 'dbn_nez', 'dbe_nez']]
    omni_data_features = omni_data[['DateTime', 'BX_GSE', 'BY_GSM', 'BZ_GSM', 'flow_speed', 'proton_density', 'T', 'Pressure', 'SYM_H']]

    num_expected = len(sm_data_features)

    sm_data_features = is_exist_nan_and_fill(sm_data_features, 'superMAG')
    omni_data_features = is_exist_nan_and_fill(omni_data_features, 'OMNI')
    if expected_omni_rows is not None:
        assert len(omni_data_features) == expected_omni_rows, (
            f'OMNI data has {len(omni_data_features)} rows, expected {expected_omni_rows}')
    clockwise_angle_data = is_exist_nan_and_fill(clockwise_angle_data, 'CLOCKWISE_ANGLE')
    if min_clock_angle_rows is not None:
        assert len(clockwise_angle_data) >= min_clock_angle_rows, (
            f'clock-angle data has {len(clockwise_angle_data)} rows, expected at least {min_clock_angle_rows}')

    sm_ljoin_omni = sm_data_features.merge(omni_data_features, left_on='datetime', right_on='DateTime', how='left')
    # Drop intermediates as soon as they are merged to keep peak memory down.
    del sm_data_features, omni_data_features
    dataset = sm_ljoin_omni.merge(clockwise_angle_data, left_on='datetime', right_on='DateTime', how='left')
    del sm_ljoin_omni

    # Both right-hand tables contribute a 'DateTime' key, which pandas
    # suffixes to DateTime_x / DateTime_y; 'datetime' already carries the value.
    dataset = dataset.drop(columns=['DateTime_x', 'DateTime_y'])
    num_instances = len(dataset)

    nan_columns = dataset.columns[dataset.isna().any()].tolist()
    assert not nan_columns, f'joined dataset for {year} contains NaN in columns {nan_columns}'
    assert num_expected == num_instances, (
        f'join changed the row count for {year}: expected {num_expected}, got {num_instances}')

    output_file = BASE_PATH + f"ML/dataset_{year}.csv"
    # Append so that successive chunks accumulate in one file; the header is
    # written only by the chunk that creates the file.
    dataset.to_csv(output_file, mode='a', header=not os.path.exists(output_file), index=False)

    del dataset

    return num_instances

In [None]:
def create_dataset(chunk_size=200000, years=range(2010, 2024)):
    """Build ``ML/dataset_<year>.csv`` for each requested year.

    For every year the SuperMAG file ``resample_<year>.csv`` is split into
    row chunks, and each chunk is joined with the OMNI and clock-angle tables
    and appended to the year's CSV by ``prepare_magnetic_pertubation_dataset``.

    Args:
        chunk_size: Maximum number of SuperMAG rows processed per chunk.
        years: Iterable of years to process. Defaults to 2010 through 2023.

    Raises:
        AssertionError: if the number of rows written for a year differs from
            the number of rows in that year's SuperMAG file.
    """
    # load OMNI data
    omni_data = load_data("OMNI", 'OMNI_2010_2023.csv', datatype='clean')

    for year in years:
        print(f"Processing {year} data ...")
        # Load data file
        sm_data = load_data("SuperMAG & NERC", f'resample_{year}.csv')
        num_expected = len(sm_data)
        sm_data_list = [sm_data[start:start + chunk_size] for start in range(0, num_expected, chunk_size)]
        del sm_data

        clockwise_angle_data = load_data("OMNI", f'IMF_CLOCK_ANGLE_{year}.csv', datatype='clean')

        # The chunks are appended to this file, so a copy left over from an
        # earlier (possibly interrupted) run would end up with duplicated
        # rows. Remove it only after the inputs loaded successfully.
        # Must match the path written by prepare_magnetic_pertubation_dataset.
        output_file = BASE_PATH + f"ML/dataset_{year}.csv"
        if os.path.exists(output_file):
            os.remove(output_file)

        total_instance = 0

        for sm_data_chunk in tqdm(sm_data_list):
            total_instance += prepare_magnetic_pertubation_dataset(omni_data, sm_data_chunk, clockwise_angle_data, year)

        del clockwise_angle_data
        del sm_data_list

        assert total_instance == num_expected, (
            f'{year}: wrote {total_instance} rows, expected {num_expected}')

In [None]:
# Build the per-year ML CSV files.
# NOTE(review): the saved output below starts at 2018 although create_dataset
# loops over 2010-2023 -- presumably the range was edited between runs;
# re-run to confirm the output matches the code.
create_dataset()

Processing 2018 data ...


100%|██████████| 127/127 [10:56<00:00,  5.17s/it]


Processing 2019 data ...


100%|██████████| 112/112 [09:34<00:00,  5.13s/it]


Processing 2020 data ...


100%|██████████| 114/114 [09:43<00:00,  5.12s/it]


Processing 2021 data ...


100%|██████████| 110/110 [09:27<00:00,  5.16s/it]


Processing 2022 data ...


100%|██████████| 120/120 [10:11<00:00,  5.10s/it]


Processing 2023 data ...


100%|██████████| 38/38 [03:13<00:00,  5.08s/it]


In [None]:
# NOTE(review): "datasest" differs from the "dataset_{year}.csv" name written
# by prepare_magnetic_pertubation_dataset -- presumably an older export;
# confirm which file is intended before changing the path.
dataset_2012 = pd.read_csv(BASE_PATH + "ML/datasest_2012.csv")

In [None]:
# Show the loaded frame (pandas elides the middle rows).
# NOTE(review): the leading "Unnamed: 0.1" / "Unnamed: 0" columns in the saved
# output look like index columns from earlier CSV round-trips -- confirm this
# file was produced with index=False before using it for training.
dataset_2012

Unnamed: 0.1,Unnamed: 0,datetime,glon,glat,mlon,mlat,mlt,dbn_nez,dbe_nez,BX_GSE,BY_GSM,BZ_GSM,flow_speed,proton_density,T,Pressure,SYM_H,CLOCK_ANGLE_GSM
0,0.0,2012-05-02 12:50:00,17.35,59.9,95.946266,56.883396,14.30632,2.95894,2.313076,3.39,-2.45,-2.15,280.0,3.11,6849.0,0.49,-15,215.66
1,1.0,2012-05-02 12:55:00,17.35,59.9,95.946266,56.883396,14.377602,2.861759,2.509415,3.85,-2.36,-1.91,272.6,4.16,14775.0,0.62,-16,232.94
2,2.0,2012-05-02 13:00:00,17.35,59.9,95.946266,56.883396,14.448345,2.495336,3.134742,3.76,-2.25,-1.93,274.0,4.94,13738.0,0.74,-16,227.0
3,3.0,2012-05-02 13:05:00,17.35,59.9,95.946266,56.883396,14.518598,3.45088,2.865015,3.83,-2.01,-1.99,274.2,5.18,14400.0,0.78,-16,228.68
4,4.0,2012-05-02 13:10:00,17.35,59.9,95.946266,56.883396,14.588467,1.871775,4.020354,3.74,-2.05,-2.08,275.5,5.4,15476.0,0.82,-15,223.06
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1662240,62232.0,2012-10-24 23:40:00,123.26,66.75,-164.56088,62.00877,7.989081,34.588468,-53.161952,-0.79,2.49,-2.31,333.1,3.46,24181.0,0.77,-3,129.1
1662241,62233.0,2012-10-24 23:45:00,123.26,66.75,-164.56088,62.00877,8.071655,34.597927,-53.80355,-1.2,2.78,-1.7,331.0,3.42,23810.0,0.76,-2,120.7
1662242,62234.0,2012-10-24 23:50:00,123.26,66.75,-164.56088,62.00877,8.154383,34.018217,-52.935619,-1.13,2.76,-1.55,330.3,3.77,31258.0,0.82,-1,119.28
1662243,62235.0,2012-10-24 23:55:00,123.26,66.75,-164.56088,62.00877,8.237288,33.186641,-51.043111,-0.55,2.79,-2.0,337.6,3.44,22652.0,0.78,-1,126.08
