In [1]:
import numpy as np
import pandas as pd
import glob
import os
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [14]:
# Load two example sessions up front so their shapes and labels can be
# inspected before the bulk load. Paths are relative to the notebook's
# working directory; adjust them if the data lives elsewhere.
X1, y1 = (
    np.load('deep_learning_data/mo_ew2_accdata_21_10_139-1336_x.npy'),
    pd.read_csv('deep_learning_data/mo_ew2_accdata_21_10_139-1336_y.csv'),
)
X2, y2 = (
    np.load('deep_learning_data/mo_ew3_accdata_16_12_1419-1449_x.npy'),
    pd.read_csv('deep_learning_data/mo_ew3_accdata_16_12_1419-1449_y.csv'),
)

In [15]:
# Sanity-check the two sample sessions: feature array shape, the set of
# label values, and the label table shape (to compare row counts with X).
for x_name, y_name, feats, labels in (("X1", "y1", X1, y1), ("X2", "y2", X2, y2)):
    print(f"{x_name} shape:", feats.shape)
    print(f"{y_name} unique:", np.unique(labels))
    print(f"{y_name} shape:", labels.shape)

X1 shape: (1186, 24, 7)
y1 unique: [-1.  0.  1.]
y1 shape: (1185, 1)
X2 shape: (1182, 24, 7)
y2 unique: [-1.  0.  1.]
y2 shape: (1181, 1)


In [16]:
# Discover every feature file, then derive the matching label path from it.
# Pairing by shared filename prefix is safer than sorting the two globs
# independently: "_x.npy" and "_y.csv" names do not always sort into the same
# order (e.g. "a_x.npy" < "a_xz_x.npy" but "a_xz_y.csv" < "a_y.csv"), and a
# count-only check cannot detect one missing plus one stray label file.
X_SUFFIX, Y_SUFFIX = '_x.npy', '_y.csv'

x_files = sorted(glob.glob('deep_learning_data/*_x.npy'))
y_files = [xf[:-len(X_SUFFIX)] + Y_SUFFIX for xf in x_files]

# Fail loudly on incomplete pairs instead of silently mislabeling sessions.
missing_y = [yf for yf in y_files if not os.path.isfile(yf)]
orphan_y = sorted(set(glob.glob('deep_learning_data/*_y.csv')) - set(y_files))
if missing_y or orphan_y:
    raise FileNotFoundError(
        f"Unpaired session files - missing label files: {missing_y}; "
        f"label files without a feature file: {orphan_y}"
    )

print(f"Found {len(x_files)} dataset pairs")

Found 44 dataset pairs


In [18]:
# Guard: every feature file needs a label file (this checks counts only).
n_feature_files, n_label_files = len(x_files), len(y_files)
assert n_feature_files == n_label_files, "Mismatch in number of X vs Y files!"

In [19]:
def load_data(x_path, y_path, header='infer'):
    """Load one session's feature windows and labels, trimmed to equal length.

    Parameters
    ----------
    x_path : str
        Path to a ``.npy`` file holding the feature windows; the first axis
        is treated as the window axis.
    y_path : str
        Path to a CSV file holding the labels, one row per label.
    header : int, None or 'infer', default 'infer'
        Forwarded to ``pandas.read_csv``. The default keeps the previous
        behavior (first CSV line is consumed as the column header). Pass
        ``header=None`` for label files that have no header line.

    Returns
    -------
    (X, y) : tuple of numpy.ndarray
        The first ``n`` windows and the first ``n`` labels, where ``n`` is
        the smaller of the two lengths. A single-column label file yields a
        1-D ``y``.

    Notes
    -----
    NOTE(review): the sessions inspected earlier in this notebook each show
    exactly one fewer label than windows. If the label CSVs have no header
    line, the default ``header='infer'`` drops the first label and shifts all
    labels by one window - confirm against the raw files and pass
    ``header=None`` if so.
    """
    X = np.load(x_path)
    y = pd.read_csv(y_path, header=header).to_numpy()
    # Drop only the column axis of a single-column file. A bare squeeze()
    # would also collapse a one-row file to a 0-d scalar, on which len()
    # raises TypeError.
    if y.ndim == 2 and y.shape[1] == 1:
        y = y[:, 0]
    # Lengths can differ between the two files; keep the common leading part.
    n = min(len(X), len(y))
    return X[:n], y[:n]

# Read every (features, labels) pair and keep the per-session arrays in
# two parallel lists.
all_X = []
all_y = []
for xf, yf in zip(x_files, y_files):
    X, y = load_data(xf, yf)
    all_X += [X]
    all_y += [y]

# Stack the sessions along the leading (window) axis; axis 0 is the default.
X_all = np.concatenate(all_X)
y_all = np.concatenate(all_y)

print(X_all.shape, y_all.shape, sep="\n")

(53836, 24, 7)
(53836,)


In [20]:
# Summarize the combined dataset: session count, array shapes, label values
# and per-class counts.
# Labels are shifted by +1 before counting because np.bincount rejects
# negative values and the labels include -1.
label_counts = np.bincount(y_all.astype(int) + 1)
summary_lines = [
    f"Total number of sessions: {len(x_files)}",
    f"Final X shape: {X_all.shape}",
    f"Final y shape: {y_all.shape}",
    f"Unique labels in y: {np.unique(y_all)}",
    f"Label distribution: {label_counts}",
]
print("\n".join(summary_lines))

Total number of sessions: 44
Final X shape: (53836, 24, 7)
Final y shape: (53836,)
Unique labels in y: [-1.  0.  1.]
Label distribution: [17954 17943 17939]


In [22]:
# Stratified 70% / 15% / 15% train / validation / test split, done in two
# stages with a fixed seed so the partitions are reproducible.
# NOTE(review): the split is per window, so windows from the same recording
# session can land in different partitions. If consecutive windows overlap
# or are strongly correlated this may inflate validation/test scores -
# consider a session-level (grouped) split; confirm with the data owner.

# Stage 1: hold out 15% of all windows as the test set.
X_temp, X_test, y_temp, y_test = train_test_split(
    X_all,
    y_all,
    test_size=0.15,
    random_state=42,
    stratify=y_all,
)

# Stage 2: take 15% of the *original* total out of the remaining 85%.
val_ratio = 0.15 / 0.85
X_train, X_val, y_train, y_val = train_test_split(
    X_temp,
    y_temp,
    test_size=val_ratio,
    random_state=42,
    stratify=y_temp,
)

shape_report = "\n".join([
    "Train / Val / Test shapes:",
    f"  X_train: {X_train.shape}, y_train: {y_train.shape}",
    f"  X_val:   {X_val.shape},   y_val:   {y_val.shape}",
    f"  X_test:  {X_test.shape},  y_test:  {y_test.shape}",
])
print(shape_report)

Train / Val / Test shapes:
  X_train: (37684, 24, 7), y_train: (37684,)
  X_val:   (8076, 24, 7),   y_val:   (8076,)
  X_test:  (8076, 24, 7),  y_test:  (8076,)


In [23]:
# Z-score normalization: statistics come from the training split only, so
# nothing about the validation/test windows influences the scaling.
# Collapse every axis except the last so each row is one sample of the
# last-axis channels: (N, T, C) -> (N*T, C).
train_flat = X_train.reshape(-1, X_train.shape[-1])
scaler = StandardScaler().fit(train_flat)  # fit() returns the fitted scaler

def zscore_normalize(X):
    """Scale ``X`` with the module-level ``scaler`` and keep its shape.

    ``X`` is flattened to 2-D with its last axis as the column axis, passed
    through ``scaler.transform``, and reshaped back to ``X.shape``.
    """
    n_channels = X.shape[-1]
    return scaler.transform(X.reshape(-1, n_channels)).reshape(X.shape)

# Apply the train-fitted scaling to all three splits. The right-hand side
# reads the un-normalized arrays before any of the names are rebound.
X_train, X_val, X_test = (
    zscore_normalize(part) for part in (X_train, X_val, X_test)
)

print("Done! Your data is now ready for modeling.")

Done! Your data is now ready for modeling.
