# Preprocessing — EEMD/CEEMDAN on Monthly (L2) Tide Data

Build decomposed IMFs for each station using EEMD and CEEMDAN.
- Data: L2 monthly tide series (`Data/Monthly_*.txt`)
- Parameters: trials=2048 (stable), epsilon=0.6 (CEEMDAN noise strength), parabolic extrema detection
- Outputs: `Data_mid/<Station>_EEMD_<trials>_<epsilon>.parquet`, `Data_mid/<Station>_CEEMDAN_<trials>_<epsilon>.parquet` (e.g. `Anheung_CEEMDAN_2048_0.6.parquet`)

In [9]:
# Imports and paths
import os
from pathlib import Path
from datetime import datetime

import numpy as np
import pandas as pd
from tqdm import tqdm

# Input directory searched for Monthly_*.txt files, and output directory for
# the decomposed IMF parquet files (created here if it does not exist yet).
DATA_DIR = Path('Data')
MID_DIR = Path('Data_mid')
MID_DIR.mkdir(parents=True, exist_ok=True)

# Decomposition parameters
TRIALS = 2048   # Note: user mentioned '2024'; using 2048 for stability and filename consistency
EPSILON = 0.6   # CEEMDAN noise strength
# Value assigned to each decomposer's EMD.extrema_detection attribute
# (the notebook header describes this setting as parabolic extrema).
EXTREMA_DETECTION = 'parabol'
RANDOM_SEED = 42
# Seeds NumPy's *global* random generator only.
# NOTE(review): confirm that PyEMD's ensemble noise actually draws from this
# global state; if it uses its own generator this call does not make the
# decompositions reproducible.
np.random.seed(RANDOM_SEED)


In [10]:
# PyEMD imports. When the package is missing, print the install hint (per the
# message below, the pip distribution is named "EMD-signal") and re-raise so
# the notebook stops here. Only ImportError is intercepted: any other failure
# while importing is not a "package not found" problem and should surface
# without the misleading hint.
try:
    from PyEMD import CEEMDAN, EEMD
except ImportError:
    print('PyEMD not found. Install with: pip install EMD-signal')
    raise


In [11]:
# Data loader for Monthly_*.txt (YYYY MM DD hh mm  value)
def load_monthly_txt(path: Path) -> pd.DataFrame:
    """Parse a ``Monthly_*.txt`` file into a date-sorted DataFrame.

    Every data line is whitespace-separated: the first five fields are
    ``YYYY MM DD hh mm`` and the last field is the value. Lines that are
    blank, have fewer than six fields, or whose fields do not parse
    (headers, comments, impossible calendar dates) are skipped.

    Args:
        path: Text file to read. Undecodable bytes are ignored.

    Returns:
        DataFrame with columns ``date`` and ``value``, sorted by ``date``
        ascending with a fresh 0..n-1 index. Empty when no line parses.
    """
    rows = []
    with open(path, 'r', encoding='utf-8', errors='ignore') as f:
        for ln in f:
            parts = ln.split()
            # Five timestamp fields plus a separate value field are required;
            # with fewer, parts[-1] would be a timestamp field (e.g. the
            # minute), not a measurement.
            if len(parts) < 6:
                continue
            try:
                y, mo, d, hh, mm = map(int, parts[:5])
                # Built inside the try so an out-of-range date skips the line
                # instead of aborting the whole load.
                stamp = datetime(y, mo, d, hh, mm)
                val = float(parts[-1])
            except (ValueError, OverflowError):
                # Skip non-data lines
                continue
            rows.append((stamp, val))
    df = pd.DataFrame(rows, columns=['date', 'value'])
    return df.sort_values('date').reset_index(drop=True)

def list_stations(data_dir: Path = DATA_DIR):
    """Find the monthly tide files in *data_dir* and derive station names.

    Args:
        data_dir: Directory searched (non-recursively) for ``Monthly_*.txt``.

    Returns:
        ``(files, stations)``: the matching paths in sorted order, and for
        each path its stem with ``'Monthly_'`` removed, in the same order.
    """
    monthly_paths = sorted(data_dir.glob('Monthly_*.txt'))
    names = []
    for path in monthly_paths:
        names.append(path.stem.replace('Monthly_', ''))
    return monthly_paths, names


In [12]:
# Decomposition wrappers
def run_ceemdan(values: np.ndarray, trials: int = TRIALS, epsilon: float = EPSILON,
                seed: int = RANDOM_SEED):
    """Decompose *values* with CEEMDAN and return the resulting components.

    Args:
        values: 1-D signal to decompose.
        trials: Number of noise-perturbed trials in the ensemble.
        epsilon: CEEMDAN noise strength.
        seed: Seed for the decomposer's noise generator. Pass ``None`` to
            leave it unseeded (non-repeatable results).

    Returns:
        Array of components, one per row (first axis indexes the component).
    """
    ce = CEEMDAN(trials=trials, epsilon=epsilon)
    ce.EMD.extrema_detection = EXTREMA_DETECTION
    # Per the PyEMD documentation the ensemble noise comes from the
    # decomposer's own generator, so it is seeded via noise_seed(); seeding
    # NumPy's global state alone does not make the result repeatable.
    if seed is not None:
        ce.noise_seed(seed)
    return ce.ceemdan(values)

def run_eemd(values: np.ndarray, trials: int = TRIALS, seed: int = RANDOM_SEED):
    """Decompose *values* with EEMD and return the resulting components.

    Args:
        values: 1-D signal to decompose.
        trials: Number of noise-perturbed trials in the ensemble.
        seed: Seed for the decomposer's noise generator. Pass ``None`` to
            leave it unseeded (non-repeatable results).

    Returns:
        Array of components, one per row (first axis indexes the component).
    """
    ee = EEMD(trials=trials)
    ee.EMD.extrema_detection = EXTREMA_DETECTION
    # Per the PyEMD documentation the ensemble noise comes from the
    # decomposer's own generator, so it is seeded via noise_seed(); seeding
    # NumPy's global state alone does not make the result repeatable.
    if seed is not None:
        ee.noise_seed(seed)
    return ee.eemd(values)

def imfs_to_df(eIMFs: np.ndarray, dates: pd.Series) -> pd.DataFrame:
    """Arrange decomposition components as columns of a date-indexed frame.

    Args:
        eIMFs: 2-D array with one component per row.
        dates: Timestamps, one per sample (per column of *eIMFs*).

    Returns:
        DataFrame whose index is the datetime-converted *dates* and whose
        columns are ``IMF_0`` .. ``IMF_{k-1}`` for the ``k`` rows of *eIMFs*.
    """
    n_components = eIMFs.shape[0]
    column_names = [f'IMF_{k}' for k in range(n_components)]
    time_index = pd.to_datetime(dates).values
    # Transpose so samples run down the rows and components across columns.
    return pd.DataFrame(eIMFs.T, index=time_index, columns=column_names)


## Run and Save
Iterate over stations and write EEMD/CEEMDAN results to `Data_mid/`.
Warning: CEEMDAN with 2048 trials can be time-consuming.

In [13]:
# Discover the input files once; `files` drives the decomposition loop below.
files, stations = list_stations()
# Notebook display: number of files found and the first five station names.
len(files), stations[:5]


(21, ['Anheung', 'Boryeong', 'Busan', 'Chujado', 'Gadeokdo'])

In [14]:
# Set to a small number for a quick dry-run (e.g., 1 or 2). Use None for all.
RUN_LIMIT = None  # e.g., 2

# Apply the limit up front so the progress bar total matches the work done.
# Zero or a negative limit selects nothing.
selected = files if RUN_LIMIT is None else files[:max(RUN_LIMIT, 0)]

processed = []  # stations whose EEMD and CEEMDAN outputs were both written
failed = []     # (station, method) pairs for decompositions that raised
for p in tqdm(selected, desc='Decomposing'):
    station = p.stem.replace('Monthly_', '')
    df = load_monthly_txt(p)
    if df.empty:
        print('Skip empty:', station)
        continue
    s = df['value'].values.astype(float)
    t = df['date']
    station_ok = True

    # EEMD. EPSILON is not passed to EEMD; it stays in the file name so the
    # output paths remain the same as before.
    # The broad catch is deliberate best-effort: one failing station must not
    # abort a run that takes tens of minutes. Failures are recorded below.
    try:
        eimfs_eemd = run_eemd(s, trials=TRIALS)
        df_eemd = imfs_to_df(eimfs_eemd, t)
        out_eemd = MID_DIR / f'{station}_EEMD_{TRIALS}_{EPSILON}.parquet'
        df_eemd.to_parquet(out_eemd)
    except Exception as e:
        print('EEMD failed:', station, e)
        failed.append((station, 'EEMD'))
        station_ok = False

    # CEEMDAN (attempted even if EEMD failed; the two are independent)
    try:
        eimfs_ce = run_ceemdan(s, trials=TRIALS, epsilon=EPSILON)
        df_ce = imfs_to_df(eimfs_ce, t)
        out_ce = MID_DIR / f'{station}_CEEMDAN_{TRIALS}_{EPSILON}.parquet'
        df_ce.to_parquet(out_ce)
    except Exception as e:
        print('CEEMDAN failed:', station, e)
        failed.append((station, 'CEEMDAN'))
        station_ok = False

    # Only count a station as processed when both outputs were written.
    if station_ok:
        processed.append(station)

if failed:
    print('Failed decompositions:', failed)

processed[:5], len(processed)


Decomposing: 100%|██████████| 21/21 [28:01<00:00, 80.07s/it]


(['Anheung', 'Boryeong', 'Busan', 'Chujado', 'Gadeokdo'], 21)

## Quick verification
Load one saved file to verify its schema.

In [8]:
# Preview one output if any processed
from glob import glob
outs = sorted(glob(str(MID_DIR / '*_CEEMDAN_*.parquet')))
if outs:
    # A bare expression inside an `if` block is not auto-displayed by the
    # notebook (only a cell's final top-level expression is), so print it.
    print(outs[0])
    print(pd.read_parquet(outs[0]).head())
else:
    print('No outputs found yet.')
