In [None]:
import numpy as np
import pandas as pd
import os
import glob
import datetime

In [None]:
# Mount Google Drive inside the Colab runtime so that files stored on Drive
# become reachable through the local filesystem under the mount point.
from google.colab import drive

drive_mount_point = '/content/drive'
# force_remount=True requests a fresh mount even if one is already present.
drive.mount(drive_mount_point, force_remount=True)

Mounted at /content/drive


In [None]:
# Locations of the raw data on the mounted Drive.
#
# inp_files: every per-file input CSV in the centered 20-minute input folder.
# tar_file:  the single CSV that holds the SWICS targets for all years.
#
# glob.glob() returns matches in arbitrary, filesystem-dependent order. The
# files are later stacked into one table whose rows are addressed by position,
# so the list is sorted to make the row order reproducible between runs.
# NOTE(review): this assumes lexicographic file-name order is also the
# chronological order of the files - confirm against the actual file names.
inp_files = sorted(glob.glob('/content/drive/MyDrive/ace_level2_98_11/centered_20min_input/*.csv'))
tar_file = '/content/drive/MyDrive/ace_level2_98_11/SWICS_all_years.csv'

In [None]:
# Load every input CSV and stack them into a single frame.
#
# The files are read into a list first and concatenated with ONE pd.concat
# call: growing a DataFrame with concat inside the loop re-copies all rows
# accumulated so far on every iteration, and seeding the result with an empty
# DataFrame relies on concat's deprecated handling of empty entries.
#
# Rows keep the order of `inp_files` and each file keeps its own row labels
# (the index is not reset), exactly as before.
inp_frames = [
    pd.read_csv(file, engine="python", encoding="UTF-8")
    for file in inp_files
]

# pd.concat raises on an empty list, so fall back to an empty frame when no
# input file was found (this matches the result of the previous loop).
inp_df = pd.concat(inp_frames) if inp_frames else pd.DataFrame()

In [None]:
# Read the target table from the CSV path stored in `tar_file`.
tar_df = pd.read_csv(
    tar_file,
    encoding="UTF-8",
    engine="python",
)

In [None]:
# Quick visual check of the target table (large frames are shown truncated).
tar_df

Unnamed: 0,datetime,C6to5,O7to6,FetoO,datetime_re,Vp_avg,label1,label2
0,1998-02-04 00:09:16.416,1.76360,0.367830,0.042351,1998-02-04 00:00:00,,,
1,1998-02-04 02:10:26.976,1.44520,0.365160,0.151770,1998-02-04 02:20:00,,,
2,1998-02-04 04:11:38.400,0.93360,0.185370,0.538620,1998-02-04 04:20:00,,,
3,1998-02-04 06:12:49.824,0.87715,0.371970,0.481570,1998-02-04 06:20:00,,,
4,1998-02-04 08:14:01.248,1.50650,0.307440,0.575380,1998-02-04 08:20:00,,,
...,...,...,...,...,...,...,...,...
59174,2011-08-21 13:40:48.000,0.47252,0.050033,0.092960,2011-08-21 13:40:00,412.823697,slow,slow
59175,2011-08-21 15:40:50.304,0.57866,0.033956,0.140230,2011-08-21 15:40:00,404.887535,slow,slow
59176,2011-08-21 17:40:50.880,0.57597,0.029067,0.081298,2011-08-21 17:40:00,403.272516,slow,slow
59177,2011-08-21 19:40:53.184,1.02060,0.070335,0.115320,2011-08-21 19:40:00,400.375427,slow,slow


In [None]:
# Quick visual check of the stacked input table (large frames are shown truncated).
inp_df

Unnamed: 0,datetime,Vp,Alpha_ratio,entropy,Alfvenicity,label1,label2,Np,Tp,coulomb num,sigma_r
0,1998-02-04 0:00,,,,,,,,,,
1,1998-02-04 0:20,,,,,,,,,,
2,1998-02-04 0:40,,,,,,,,,,
3,1998-02-04 1:00,,,,,,,,,,
4,1998-02-04 1:20,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...
16772,2011-08-21 22:40,402.700667,,,,slow,,,90045.46667,,
16773,2011-08-21 23:00,397.862000,,,,slow,,,97787.30769,,
16774,2011-08-21 23:20,423.855294,,,,slow,,,63540.81250,,
16775,2011-08-21 23:40,403.462778,,,,slow,,,57720.86667,,


In [None]:
# Convert the timestamp columns of both tables from text to pandas datetimes
# so that input and target rows can later be matched on equal timestamps.
for frame, time_col in ((inp_df, 'datetime'), (tar_df, 'datetime_re')):
    frame[time_col] = pd.to_datetime(frame[time_col])

In [None]:
# Timestamp series used to align the two tables.
inp_time = inp_df['datetime']

# The first target row is skipped (the target values are sliced the same way
# when the arrays are built).
# NOTE(review): presumably because a window centred on the very first input
# row would need rows that precede the start of the input table - confirm.
tar_time = tar_df['datetime_re'].iloc[1:]

In [None]:
# Build, for every target timestamp, a window of consecutive input rows
# centred on the input row that carries the same timestamp.
#
# Produces:
#   idxs        - position in the input table of each target timestamp
#   inp_array   - input feature matrix, one row per input timestamp
#   tar_array   - target matrix (first target row dropped, like `tar_time`)
#   inp_windows - float array (n_targets, window length, n_features),
#                 NaN wherever no input row is available
#
# NOTE(review): windows are cut by ROW POSITION, so this assumes the input
# rows are in chronological order with a constant time step - confirm.

# Number of rows taken on each side of the centre row (3 -> 7-row windows).
HALF_WINDOW = 3
WINDOW_LEN = 2 * HALF_WINDOW + 1

# Timestamp -> row position. If a timestamp occurs more than once in the
# input, the last occurrence wins. A target timestamp that is absent from the
# input raises KeyError on the next line, so misaligned tables fail loudly.
time_to_idx = {t: i for i, t in enumerate(inp_time)}
idxs = np.array([time_to_idx[t] for t in tar_time], dtype=int)

inp_data_col = ['Vp', 'Alpha_ratio', 'entropy', 'Alfvenicity', 'Np', 'Tp', 'coulomb num']
inp_df['Alfvenicity'] = inp_df['Alfvenicity'].abs() # abs(Alfvenicity)
tar_data_col = ['C6to5', 'O7to6', 'FetoO']

inp_array = inp_df[inp_data_col].values
tar_array = tar_df[tar_data_col].values[1:]

inp_windows = np.full((len(idxs), WINDOW_LEN, len(inp_data_col)), np.nan, dtype=float)

n_inp_rows = len(inp_array)
for i, idx in enumerate(idxs):
    # Window bounds in input-row coordinates; they may stick out of the table.
    start = idx - HALF_WINDOW
    stop = idx + HALF_WINDOW + 1

    # Clip to the rows that really exist. A negative start must not reach the
    # slice (it would wrap around to the end of the array and yield an empty
    # slice), and a stop past the end would yield a short slice; either case
    # used to fail when assigned into the fixed-size window.
    src_start = max(start, 0)
    src_stop = min(stop, n_inp_rows)

    # Copy the available rows to their place inside the window; positions
    # without a source row keep the NaN fill and count as missing data.
    inp_windows[i, src_start - start:src_stop - start] = inp_array[src_start:src_stop]

In [None]:
# Expected layout: (number of target timestamps, rows per window, number of input features).
inp_windows.shape

(59178, 7, 7)

In [None]:
# Expected layout: (number of target timestamps, number of target columns).
tar_array.shape

(59178, 3)

In [None]:
# Number of target timestamps; should equal the first axis of inp_windows and tar_array.
len(tar_time)

59178

In [None]:
# Samples whose target row contains at least one NaN value.
tar_has_nan = np.isnan(tar_array).any(axis=1)
tar_missing = set(np.flatnonzero(tar_has_nan))

# Samples in which at least one input feature is NaN in 5 or more rows of
# its window: count NaNs along the window axis for every (sample, feature)
# pair, then flag the sample if any feature reaches the threshold.
nan_rows_per_feature = np.isnan(inp_windows).sum(axis=1)
inp_too_sparse = (nan_rows_per_feature >= 5.).any(axis=1)
inp_missing = set(np.flatnonzero(inp_too_sparse))

In [None]:
# A sample is discarded when either its target or its input window is flagged.
missing_idx = tar_missing.union(inp_missing)

# Remaining sample positions, in ascending order.
valid_idx = [row for row in range(len(tar_array)) if row not in missing_idx]

In [None]:
# Number of samples that survive the missing-data filter.
len(valid_idx)

39738

In [None]:
# Keep only the samples that passed the missing-data filter; the same
# positions are applied to windows, targets and timestamps so the three
# arrays stay aligned row by row.
inp, tar = inp_windows[valid_idx], tar_array[valid_idx]
time = tar_time.values[valid_idx]

In [None]:
# Report the shape of each array that makes up the final dataset
# (labels are: input data, target data, time data).
for shape_label, dataset_part in (
    ("입력 데이터(inp) shape:", inp),
    ("타겟 데이터(tar) shape:", tar),
    ("시간 데이터(time) shape:", time),
):
    print(shape_label, dataset_part.shape)

입력 데이터(inp) shape: (39738, 7, 7)
타겟 데이터(tar) shape: (39738, 3)
시간 데이터(time) shape: (39738,)


In [None]:
# Write the three aligned arrays into one compressed .npz archive; each
# dictionary key becomes the name under which the array is stored.
dataset_arrays = {'inp': inp, 'tar': tar, 'time': time}
np.savez_compressed('full_dataset.npz', **dataset_arrays)


In [None]:
# Read the archive back as a sanity check on what was written.
# NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which
# can execute arbitrary code for an untrusted file. It is kept here to leave
# behaviour unchanged; consider dropping it if none of the stored arrays has
# object dtype.
loaded_dataset = np.load('full_dataset.npz', allow_pickle=True)

# Names of the stored arrays, followed by the shapes of the inputs and targets.
print([array_name for array_name in loaded_dataset.keys()])
for array_name in ('inp', 'tar'):
    print(loaded_dataset[array_name].shape)

['inp', 'tar', 'time']
(39738, 7, 7)
(39738, 3)
