In [1]:
# Mount Google Drive at /content/drive (Colab-only helper); the parquet path
# configured below points into this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import numpy as np
import pandas as pd
import numpy.lib.stride_tricks as st

# --------------------------------------------------
# Paths and basic configuration
# --------------------------------------------------
# Parquet file holding the pre-processed observations (on the mounted Drive)
PATH = "/content/drive/MyDrive/Datamining-TSC-Project/new_processed_data.parquet"

# Notebook-wide settings, looked up by key in the cells below
CFG = dict(
    time_col="time",
    window=36,             # sliding window length (hours)
    trend_h=12,            # recent hours for trend checks
    ma_hours=[3, 6, 12],   # moving average windows
)

# --------------------------------------------------
# Angle utility functions
# --------------------------------------------------
def wrap360(x):
    """Wrap angle(s) given in degrees into the half-open range [0, 360).

    Only ``%`` and ``+`` are applied, so scalars and NumPy arrays both work.
    """
    full_turn = 360.0
    reduced = x % full_turn
    # Second pass: for tiny negative inputs the first modulo can round to
    # exactly 360.0; shifting by one turn and reducing again maps that to 0.
    return (reduced + full_turn) % full_turn

def angle_diff_deg(a, b):
    """Signed smallest difference ``a - b`` in degrees, in [-180, 180).

    Works element-wise on NumPy arrays as well as on scalars.
    """
    half_turn = 180.0
    # Shift by half a turn so the modulo result is centred on zero afterwards
    shifted = a - b + half_turn
    return shifted % 360.0 - half_turn

def wave_dir_convert(old_wave_dir):
    """Map a wave direction ``d`` (degrees) to ``270 - d``, wrapped into [0, 360).

    The notebook uses this to bring wave directions into the same convention
    as the wind direction before the two are compared.
    """
    flipped = 270.0 - old_wave_dir
    return wrap360(flipped)

# --------------------------------------------------
# Moving average feature generator
# --------------------------------------------------
def add_moving_averages(df, cols, ma_hours):
    """Attach trailing rolling-mean columns to ``df`` and return it.

    For every column in ``cols`` and every window length ``h`` in ``ma_hours``
    a column named ``"{col}_ma{h}"`` is written. Windows are counted in rows
    and need ``h`` observations, so the first ``h - 1`` rows of each new
    column are NaN. ``df`` is modified in place; the same object is returned.
    """
    pairs = [(name, span) for name in cols for span in ma_hours]
    for name, span in pairs:
        roller = df[name].rolling(window=span, min_periods=span)
        df[f"{name}_ma{span}"] = roller.mean()
    return df

# --------------------------------------------------
# Load and clean data
# --------------------------------------------------
df = pd.read_parquet(PATH)

df["time"] = pd.to_datetime(df["time"])
df = df.sort_values("time").drop_duplicates("time").reset_index(drop=True)

# Source column name -> short name used throughout the notebook.
# The mapping is the single source of truth for both the column selection and
# the rename, and its order fixes the column order of the working frame.
RENAME_MAP = {
    "Wind speed": "ws",
    "Wind Direction": "wd",
    "Wave Period": "tp",
    "Wave Direction": "wdir",
    "Wave Height": "hs",
    "Wave Power": "pwr",
    "Pressure": "mslp",
    "temperature": "temp",
    "Surge Height": "surge",
    "Total Water Level": "twl",
    "Wave Steepness": "steep",
}

# Keep only relevant columns and rename them (returns a new frame)
df = df[["time", *RENAME_MAP]].rename(columns=RENAME_MAP)

# --------------------------------------------------
# Wind–wave direction alignment features
# --------------------------------------------------
wd = df["wd"].to_numpy(np.float32)
# Wave direction expressed in the wind-direction convention
wdir = wave_dir_convert(df["wdir"].to_numpy(np.float32)).astype(np.float32, copy=False)

# Angle difference between wind and wave directions
dwd_deg = angle_diff_deg(wd, wdir).astype(np.float32)
dwd_rad = np.deg2rad(dwd_deg).astype(np.float32)

# Encode the direction difference with sin/cos and drop the raw directions
df = df.assign(
    dwd_sin=np.sin(dwd_rad).astype(np.float32),
    dwd_cos=np.cos(dwd_rad).astype(np.float32),
).drop(columns=["wd", "wdir"])

# --------------------------------------------------
# Add moving average features
# --------------------------------------------------
ma_cols = [
    "hs", "ws", "pwr", "mslp",
    "temp", "surge", "twl", "steep", "tp"
]
df = add_moving_averages(df, ma_cols, CFG["ma_hours"])

# --------------------------------------------------
# Sliding window statistics
# --------------------------------------------------
W = CFG["window"]
H = CFG["trend_h"]

hs   = df["hs"].to_numpy(np.float32)
pwr  = df["pwr"].to_numpy(np.float32)
mslp = df["mslp"].to_numpy(np.float32)
ws   = df["ws"].to_numpy(np.float32)
dwd_cos = df["dwd_cos"].to_numpy(np.float32)

# Rolling windows as zero-copy views of shape (len(df) - W + 1, W)
hs_w   = st.sliding_window_view(hs,   W)
pwr_w  = st.sliding_window_view(pwr,  W)
mslp_w = st.sliding_window_view(mslp, W)
ws_w   = st.sliding_window_view(ws,   W)
dwd_cos_w = st.sliding_window_view(dwd_cos, W)

# Window-based severity metrics (mean + 2*std, population std)
hs_metric36  = hs_w.mean(axis=1)  + 2.0 * hs_w.std(axis=1)
pwr_metric36 = pwr_w.mean(axis=1) + 2.0 * pwr_w.std(axis=1)

# --------------------------------------------------
# Train / validation / test split by time
# --------------------------------------------------
TRAIN_END = np.datetime64("2015-01-01")   # train:  start < TRAIN_END
VAL_END   = np.datetime64("2020-01-01")   # val:    TRAIN_END <= start < VAL_END

# Each window is assigned to a split by the timestamp of its FIRST row.
# NOTE(review): a window starting just before a cut-off therefore extends up
# to W - 1 rows into the next split; confirm this overlap is acceptable.
start_times = df["time"].iloc[:len(hs_metric36)].to_numpy()

train_mask = start_times < TRAIN_END
val_mask   = (start_times >= TRAIN_END) & (start_times < VAL_END)
test_mask  = start_times >= VAL_END

# --------------------------------------------------
# Percentile-based severity thresholds (train only)
# --------------------------------------------------
SEVERITY_PCTS = [75, 92, 98, 99.5]

# np.percentile propagates NaN and cannot work on an empty selection; either
# case would silently turn every label into class 0, so stop here instead.
if not train_mask.any():
    raise ValueError("No window starts before TRAIN_END; cannot derive severity thresholds.")
if np.isnan(hs_metric36[train_mask]).any() or np.isnan(pwr_metric36[train_mask]).any():
    raise ValueError("NaN in train-window severity metrics; clean 'hs'/'pwr' before thresholding.")

hs_p75, hs_p92, hs_p98, hs_p995 = np.percentile(
    hs_metric36[train_mask], SEVERITY_PCTS
)
pwr_p75, pwr_p92, pwr_p98, pwr_p995 = np.percentile(
    pwr_metric36[train_mask], SEVERITY_PCTS
)

# Map continuous values to severity classes
def severity(x, p75, p92, p98, p995):
    """Bucket values into integer severity classes 0..4.

    Class 0: ``x < p75``; class 1: ``[p75, p92)``; class 2: ``[p92, p98)``;
    class 3: ``[p98, p995)``; class 4: ``x >= p995``. Values that satisfy no
    band (including NaN) stay at 0. The result is an ``int8`` array shaped
    like ``x``.
    """
    bands = ((p75, p92), (p92, p98), (p98, p995), (p995, None))
    labels = np.zeros_like(x, dtype=np.int8)
    # Bands are applied in ascending order, so a later band overwrites an earlier one
    for level, (lower, upper) in enumerate(bands, start=1):
        inside = x >= lower
        if upper is not None:
            inside = inside & (x < upper)
        labels[inside] = level
    return labels

# Severity class per window for wave height and for wave power
sev_hs = severity(hs_metric36, hs_p75, hs_p92, hs_p98, hs_p995)
sev_pwr = severity(pwr_metric36, pwr_p75, pwr_p92, pwr_p98, pwr_p995)

# Base severity: the worse (higher) of the two classes
base = np.maximum(sev_hs, sev_pwr)

# --------------------------------------------------
# Trend-based reinforcement rules (train only)
# --------------------------------------------------
# Row-level (not window-level) train selection for the per-variable thresholds
train_hours_mask = df["time"] < "2015-01-01"

# Upper-tail thresholds for wave height, wave power and wind speed
hs_th, pwr_th, ws_th = (
    np.percentile(series[train_hours_mask], 92) for series in (hs, pwr, ws)
)
# Lower-tail threshold for pressure, upper-quartile threshold for alignment
mslp_th = np.percentile(mslp[train_hours_mask], 20)
align_th = np.percentile(dwd_cos[train_hours_mask], 75)

# For each window, count how many of the five conditions hold for at least
# `need` of the last H rows; the result is an integer in 0..5 per window.
cnt = sum(
    (hits.sum(axis=1) >= need).astype(int)
    for hits, need in (
        (hs_w[:, -H:] >= hs_th, 6),
        (pwr_w[:, -H:] >= pwr_th, 6),
        (ws_w[:, -H:] >= ws_th, 6),
        (mslp_w[:, -H:] <= mslp_th, 6),
        (dwd_cos_w[:, -H:] >= align_th, 4),
    )
)

# Final labels are the base severity; `cnt` is computed but not applied here
y = base.copy()

# --------------------------------------------------
# Sanity checks
# --------------------------------------------------
print("Columns:", df.columns.tolist())
print("Class dist:")
for name, m in zip(("train", "val", "test"), (train_mask, val_mask, test_mask)):
    shares = pd.Series(y[m]).value_counts(normalize=True).sort_index()
    print(name, shares.to_dict())


Columns: ['time', 'ws', 'tp', 'hs', 'pwr', 'mslp', 'temp', 'surge', 'twl', 'steep', 'dwd_sin', 'dwd_cos', 'hs_ma3', 'hs_ma6', 'hs_ma12', 'ws_ma3', 'ws_ma6', 'ws_ma12', 'pwr_ma3', 'pwr_ma6', 'pwr_ma12', 'mslp_ma3', 'mslp_ma6', 'mslp_ma12', 'temp_ma3', 'temp_ma6', 'temp_ma12', 'surge_ma3', 'surge_ma6', 'surge_ma12', 'twl_ma3', 'twl_ma6', 'twl_ma12', 'steep_ma3', 'steep_ma6', 'steep_ma12', 'tp_ma3', 'tp_ma6', 'tp_ma12']
Class dist:
train {0: 0.725734690152414, 1: 0.1796986705606766, 2: 0.07084131909585957, 3: 0.017933740987496578, 4: 0.005791579203553284}
val {0: 0.7295774005111354, 1: 0.17819003285870755, 2: 0.06863818912011684, 3: 0.015950164293537787, 4: 0.007644213216502373}
test {0: 0.735375345217173, 1: 0.18250291009517722, 2: 0.060712573893593226, 3: 0.016889964165886836, 4: 0.0045192066281697215}


Experiment: trying DELTA6 (raw features plus 6-hour difference features)

In [None]:
import numpy as np
import numpy.lib.stride_tricks as st

from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

# --------------------------------------------------
# Reproducibility
# --------------------------------------------------
SEED = 42
np.random.seed(SEED)

# Sliding window length (set by the labelling cell; coerced to a plain int)
W = int(W)

# --------------------------------------------------
# Base (raw) feature set
# --------------------------------------------------
BASE_COLS = [
    "ws", "tp", "mslp", "surge", "twl", "steep",
    "temp",
    "dwd_sin", "dwd_cos"
]

# --------------------------------------------------
# Delta feature setup (simple trend information)
# --------------------------------------------------
DELTA_H = 6
DELTA_BASE = ["ws", "mslp", "surge", "twl", "steep", "temp"]

# Create delta features if they do not already exist
# (the guard keeps the cell idempotent when it is re-run)
for c in DELTA_BASE:
    dname = f"{c}_d{DELTA_H}"
    if dname not in df.columns:
        df[dname] = df[c] - df[c].shift(DELTA_H)

DELTA_COLS = [f"{c}_d{DELTA_H}" for c in DELTA_BASE]

# Final feature list: RAW + delta features
COLS = BASE_COLS + DELTA_COLS
print("Using features (hs/pwr excluded, temperature included) + DELTA6:")
print(COLS)

# --------------------------------------------------
# Build sliding windows: (num_windows, W, num_features)
# --------------------------------------------------
arrs = [df[c].to_numpy(np.float32) for c in COLS]
X_list = [st.sliding_window_view(a, window_shape=W) for a in arrs]
# np.stack already yields float32; copy=False avoids duplicating the tensor
X = np.stack(X_list, axis=-1).astype(np.float32, copy=False)

# Labels and split masks must describe exactly the same windows as X.
# Fail loudly instead of silently slicing if the labelling cell used a
# different window length or frame.
if not (len(y) == len(train_mask) == len(val_mask) == len(test_mask) == len(X)):
    raise ValueError(
        f"Window/label misalignment: X has {len(X)} windows, y has {len(y)}; "
        "re-run the labelling cell with the same W."
    )
y_ = np.asarray(y)
train_m = np.asarray(train_mask)
val_m   = np.asarray(val_mask)
test_m  = np.asarray(test_mask)

# --------------------------------------------------
# Drop windows containing NaN or inf values
# (delta features naturally introduce NaNs at the start)
# --------------------------------------------------
finite_mask = np.isfinite(X).all(axis=(1, 2))
if not finite_mask.all():
    print(f"Dropping {(~finite_mask).sum()} windows due to NaN/inf.")
    X = X[finite_mask]
    y_ = y_[finite_mask]
    train_m = train_m[finite_mask]
    val_m   = val_m[finite_mask]
    test_m  = test_m[finite_mask]

# --------------------------------------------------
# Flatten windows for Linear SVM (2D input required)
# --------------------------------------------------
# Flattened layout is (window position, feature); X is already float32,
# so copy=False keeps X2 a view instead of a second full copy.
X2 = X.reshape(X.shape[0], -1).astype(np.float32, copy=False)

# Train / validation / test split
X_train, y_train = X2[train_m], y_[train_m]
X_val,   y_val   = X2[val_m],   y_[val_m]
X_test,  y_test  = X2[test_m],  y_[test_m]

n_classes = int(np.max(y_train)) + 1
print("Shapes:",
      "\n  X_train:", X_train.shape,
      "\n  X_val:  ", X_val.shape,
      "\n  X_test: ", X_test.shape,
      "\n  #classes:", n_classes)

# --------------------------------------------------
# Standardization (fit on TRAIN only to avoid leakage)
# --------------------------------------------------
scaler = StandardScaler(with_mean=True, with_std=True)
X_train_s = scaler.fit_transform(X_train)
X_val_s   = scaler.transform(X_val)
X_test_s  = scaler.transform(X_test)
print("Standardization: ON (train-only).")

# --------------------------------------------------
# Class weights to handle imbalance
# --------------------------------------------------
classes = np.unique(y_train)
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weight = {int(c): float(w) for c, w in zip(classes, cw)}
print("Class weights:", class_weight)

# --------------------------------------------------
# Linear SVM configuration
# --------------------------------------------------
C = 1.0
max_iter = 8000

# Heuristic choice for dual formulation
# (dual=True is preferred when n_samples <= n_features)
dual_choice = (X_train_s.shape[0] <= X_train_s.shape[1])

# Arguments shared by the first attempt and the fallback
svc_kwargs = dict(
    C=C,
    class_weight=class_weight,
    random_state=SEED,
    max_iter=max_iter,
)

try:
    clf = LinearSVC(dual=dual_choice, **svc_kwargs)
    clf.fit(X_train_s, y_train)
except Exception as e:
    # Fallback: retry once with the opposite dual flag if fit() raises.
    # Non-convergence only emits a ConvergenceWarning and does NOT end up
    # here; a failure of the retry propagates to the caller.
    print(f"dual={dual_choice} failed, retrying with opposite. Error:", repr(e))
    clf = LinearSVC(dual=(not dual_choice), **svc_kwargs)
    clf.fit(X_train_s, y_train)

# --------------------------------------------------
# Evaluation helper
# --------------------------------------------------
def report(name, Xs, ys, model=None):
    """Print macro-F1, the per-class report and the confusion matrix for one split.

    Parameters
    ----------
    name : str
        Heading printed above the metrics.
    Xs : array-like
        Feature matrix handed to ``predict``.
    ys : array-like
        True labels for ``Xs``.
    model : fitted estimator with ``predict``, optional
        Classifier to evaluate. When omitted, the module-level ``clf`` is
        looked up at call time, which is the original behaviour.
    """
    est = clf if model is None else model
    yp = est.predict(Xs)
    print(f"\n{name}")
    # zero_division=0 matches classification_report below: a class that is
    # never predicted scores 0 without raising an UndefinedMetricWarning
    print("F1-macro:", f1_score(ys, yp, average="macro", zero_division=0))
    print(classification_report(ys, yp, zero_division=0))
    print("Confusion matrix:")
    print(confusion_matrix(ys, yp))

# --------------------------------------------------
# Final evaluation on each split
# --------------------------------------------------
report("TRAIN", X_train_s, y_train, model=clf)
report("VAL",   X_val_s,   y_val,   model=clf)
report("TEST",  X_test_s,  y_test,  model=clf)


Using features (hs/pwr excluded, temperature included) + DELTA6:
['ws', 'tp', 'mslp', 'surge', 'twl', 'steep', 'temp', 'dwd_sin', 'dwd_cos', 'ws_d6', 'mslp_d6', 'surge_d6', 'twl_d6', 'steep_d6', 'temp_d6']
Dropping 6 windows due to NaN/inf.
Shapes: 
  X_train: (262962, 540) 
  X_val:   (43824, 540) 
  X_test:  (43813, 540) 
  #classes: 5
Standardization: ON (train-only).
Class weights: {0: 0.27557651497288377, 1: 1.1130902241317278, 2: 2.823146706747544, 3: 11.151908396946565, 4: 34.532107682206174}

TRAIN
F1-macro: 0.6131243596244593
              precision    recall  f1-score   support

           0       0.96      0.97      0.96    190845
           1       0.74      0.63      0.68     47249
           2       0.51      0.65      0.57     18629
           3       0.45      0.44      0.45      4716
           4       0.48      0.35      0.41      1523

    accuracy                           0.87    262962
   macro avg       0.63      0.61      0.61    262962
weighted avg       0.87  

Experiment: DELTA6 and MA6 combined (raw features plus 6-hour moving averages and 6-hour difference features)

In [None]:
import numpy as np
import numpy.lib.stride_tricks as st

from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight

# --------------------------------------------------
# Reproducibility
# --------------------------------------------------
SEED = 42
np.random.seed(SEED)

# Sliding window length (set by the labelling cell; coerced to a plain int)
W = int(W)

# --------------------------------------------------
# Base (raw) feature set
# --------------------------------------------------
BASE_COLS = [
    "ws", "tp", "mslp", "surge", "twl", "steep",
    "temp",
    "dwd_sin", "dwd_cos"
]

# --------------------------------------------------
# Moving-average feature setup ("lite": one window length only)
# --------------------------------------------------
MA_H = 6
MA_BASE = ["ws", "mslp", "surge", "twl", "steep", "temp"]

# Create moving-average features if they do not already exist
# (same definition as add_moving_averages: trailing mean over MA_H rows)
for c in MA_BASE:
    mname = f"{c}_ma{MA_H}"
    if mname not in df.columns:
        df[mname] = df[c].rolling(window=MA_H, min_periods=MA_H).mean()

MA_COLS = [f"{c}_ma{MA_H}" for c in MA_BASE]

# --------------------------------------------------
# Delta feature setup (simple trend information)
# --------------------------------------------------
DELTA_H = 6
DELTA_BASE = ["ws", "mslp", "surge", "twl", "steep", "temp"]

# Create delta features if they do not already exist
# (the guard keeps the cell idempotent when it is re-run)
for c in DELTA_BASE:
    dname = f"{c}_d{DELTA_H}"
    if dname not in df.columns:
        df[dname] = df[c] - df[c].shift(DELTA_H)

DELTA_COLS = [f"{c}_d{DELTA_H}" for c in DELTA_BASE]

# Final feature list: RAW + moving-average + delta features.
# This cell is the combined MA6 + DELTA6 experiment; without MA_COLS it would
# just repeat the DELTA6-only run.
COLS = BASE_COLS + MA_COLS + DELTA_COLS
print("Using features (hs/pwr excluded, temperature included) + LITE MA6 + DELTA6:")
print(COLS)

# --------------------------------------------------
# Build sliding windows: (num_windows, W, num_features)
# --------------------------------------------------
arrs = [df[c].to_numpy(np.float32) for c in COLS]
X_list = [st.sliding_window_view(a, window_shape=W) for a in arrs]
# np.stack already yields float32; copy=False avoids duplicating the tensor
X = np.stack(X_list, axis=-1).astype(np.float32, copy=False)

# Labels and split masks must describe exactly the same windows as X.
# Fail loudly instead of silently slicing if the labelling cell used a
# different window length or frame.
if not (len(y) == len(train_mask) == len(val_mask) == len(test_mask) == len(X)):
    raise ValueError(
        f"Window/label misalignment: X has {len(X)} windows, y has {len(y)}; "
        "re-run the labelling cell with the same W."
    )
y_ = np.asarray(y)
train_m = np.asarray(train_mask)
val_m   = np.asarray(val_mask)
test_m  = np.asarray(test_mask)

# --------------------------------------------------
# Drop windows containing NaN or inf values
# (moving-average and delta features introduce NaNs at the start)
# --------------------------------------------------
finite_mask = np.isfinite(X).all(axis=(1, 2))
if not finite_mask.all():
    print(f"Dropping {(~finite_mask).sum()} windows due to NaN/inf.")
    X = X[finite_mask]
    y_ = y_[finite_mask]
    train_m = train_m[finite_mask]
    val_m   = val_m[finite_mask]
    test_m  = test_m[finite_mask]

# --------------------------------------------------
# Flatten windows for Linear SVM (2D input required)
# --------------------------------------------------
# Flattened layout is (window position, feature); X is already float32,
# so copy=False keeps X2 a view instead of a second full copy.
X2 = X.reshape(X.shape[0], -1).astype(np.float32, copy=False)

# Train / validation / test split
X_train, y_train = X2[train_m], y_[train_m]
X_val,   y_val   = X2[val_m],   y_[val_m]
X_test,  y_test  = X2[test_m],  y_[test_m]

n_classes = int(np.max(y_train)) + 1
print("Shapes:",
      "\n  X_train:", X_train.shape,
      "\n  X_val:  ", X_val.shape,
      "\n  X_test: ", X_test.shape,
      "\n  #classes:", n_classes)

# --------------------------------------------------
# Standardization (fit on TRAIN only to avoid leakage)
# --------------------------------------------------
scaler = StandardScaler(with_mean=True, with_std=True)
X_train_s = scaler.fit_transform(X_train)
X_val_s   = scaler.transform(X_val)
X_test_s  = scaler.transform(X_test)
print("Standardization: ON (train-only).")

# --------------------------------------------------
# Class weights to handle imbalance
# --------------------------------------------------
classes = np.unique(y_train)
cw = compute_class_weight(class_weight="balanced", classes=classes, y=y_train)
class_weight = {int(c): float(w) for c, w in zip(classes, cw)}
print("Class weights:", class_weight)

# --------------------------------------------------
# Linear SVM configuration
# --------------------------------------------------
C = 1.0
max_iter = 8000

# Heuristic choice for dual formulation
# (dual=True is preferred when n_samples <= n_features)
dual_choice = (X_train_s.shape[0] <= X_train_s.shape[1])

# Arguments shared by the first attempt and the fallback
svc_kwargs = dict(
    C=C,
    class_weight=class_weight,
    random_state=SEED,
    max_iter=max_iter,
)

try:
    clf = LinearSVC(dual=dual_choice, **svc_kwargs)
    clf.fit(X_train_s, y_train)
except Exception as e:
    # Fallback: retry once with the opposite dual flag if fit() raises.
    # Non-convergence only emits a ConvergenceWarning and does NOT end up
    # here; a failure of the retry propagates to the caller.
    print(f"dual={dual_choice} failed, retrying with opposite. Error:", repr(e))
    clf = LinearSVC(dual=(not dual_choice), **svc_kwargs)
    clf.fit(X_train_s, y_train)

# --------------------------------------------------
# Evaluation helper
# --------------------------------------------------
def report(name, Xs, ys, model=None):
    """Print macro-F1, the per-class report and the confusion matrix for one split.

    Parameters
    ----------
    name : str
        Heading printed above the metrics.
    Xs : array-like
        Feature matrix handed to ``predict``.
    ys : array-like
        True labels for ``Xs``.
    model : fitted estimator with ``predict``, optional
        Classifier to evaluate. When omitted, the module-level ``clf`` is
        looked up at call time, which is the original behaviour.
    """
    est = clf if model is None else model
    yp = est.predict(Xs)
    print(f"\n{name}")
    # zero_division=0 matches classification_report below: a class that is
    # never predicted scores 0 without raising an UndefinedMetricWarning
    print("F1-macro:", f1_score(ys, yp, average="macro", zero_division=0))
    print(classification_report(ys, yp, zero_division=0))
    print("Confusion matrix:")
    print(confusion_matrix(ys, yp))

# --------------------------------------------------
# Final evaluation on each split
# --------------------------------------------------
report("TRAIN", X_train_s, y_train, model=clf)
report("VAL",   X_val_s,   y_val,   model=clf)
report("TEST",  X_test_s,  y_test,  model=clf)


Using features (hs/pwr excluded, temperature included) + LITE MA6 + DELTA6:
['ws', 'tp', 'mslp', 'surge', 'twl', 'steep', 'temp', 'dwd_sin', 'dwd_cos', 'ws_ma6', 'mslp_ma6', 'surge_ma6', 'twl_ma6', 'steep_ma6', 'temp_ma6', 'ws_d6', 'mslp_d6', 'surge_d6', 'twl_d6', 'steep_d6', 'temp_d6']
Dropping 6 windows due to NaN/inf.
Shapes: 
  X_train: (262962, 756) 
  X_val:   (43824, 756) 
  X_test:  (43813, 756) 
  #classes: 5
Standardization: ON (train-only).
Class weights: {0: 0.27557651497288377, 1: 1.1130902241317278, 2: 2.823146706747544, 3: 11.151908396946565, 4: 34.532107682206174}

TRAIN
F1-macro: 0.6133133741058392
              precision    recall  f1-score   support

           0       0.96      0.97      0.96    190845
           1       0.74      0.63      0.68     47249
           2       0.51      0.65      0.57     18629
           3       0.46      0.44      0.45      4716
           4       0.48      0.35      0.41      1523

    accuracy                           0.87    2629