In [4]:
import os, glob, json
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

def find_project_root() -> str:
    """
    Locate the project root by searching for 'backend/ml/dataset/raw'.

    The current working directory is checked first, followed by up to four
    of its ancestors (nearest first). The first directory that contains the
    'backend/ml/dataset/raw' folder is returned. When none of them does, the
    current working directory is returned as a fallback.
    """
    start_dir = os.getcwd()

    # Build the ordered search path: cwd, then each ancestor exactly once.
    search_path = [start_dir]
    ancestor = start_dir
    for _ in range(4):
        ancestor = os.path.dirname(ancestor)
        # dirname() of a filesystem root returns the root itself, so the
        # membership test also stops duplicates once the top is reached.
        if not ancestor or ancestor in search_path:
            continue
        search_path.append(ancestor)

    marker_parts = ("backend", "ml", "dataset", "raw")
    return next(
        (
            directory
            for directory in search_path
            if os.path.isdir(os.path.join(directory, *marker_parts))
        ),
        start_dir,
    )

# Resolve the dataset folders relative to the detected project root and make
# sure the output folder exists before anything is written to it.
PROJECT_ROOT = find_project_root()
_DATASET_PARTS = ("backend", "ml", "dataset")
RAW_DIR = os.path.join(PROJECT_ROOT, *_DATASET_PARTS, "raw")
PROCESSED_DIR = os.path.join(PROJECT_ROOT, *_DATASET_PARTS, "processed")
os.makedirs(PROCESSED_DIR, exist_ok=True)

# ---------------------------
# 1. Load & Concatenate Raw CSVs
# ---------------------------
# Every *.csv directly inside RAW_DIR is read, in sorted filename order.
raw_files = sorted(glob.glob(os.path.join(RAW_DIR, "*.csv")))
if not raw_files:
    raise FileNotFoundError(f"No raw CSV files found in {RAW_DIR}")

# Malformed lines are skipped by pandas instead of aborting the whole load.
dfs = [pd.read_csv(csv_path, on_bad_lines='skip') for csv_path in raw_files]

# Stack all files into one frame with a fresh 0..N-1 index.
data = pd.concat(dfs, ignore_index=True)
print(f"Loaded {len(dfs)} files → {data.shape[0]} rows")

# ---------------------------
# 2. Basic Feature Engineering
# ---------------------------
# Example features and label only — adapt to your dataset columns if needed.
features = ["Press_Time", "Release_Time", "Hold_Time"]

# Derived column: release timestamp minus press timestamp.
data["Hold_Time"] = data["Release_Time"] - data["Press_Time"]
hold_time = data["Hold_Time"]

# Feature matrix with one row per record and one column per entry in `features`.
X = data[features].values

# Example label: 1 where Hold_Time is strictly above its median, otherwise 0.
above_median = hold_time > hold_time.median()
y = above_median.astype(int).values

# ---------------------------
# 3. Scale + Split + Reshape to sequences
# ---------------------------
# NOTE(review): the scaler is fit on every row before the train/val/test
# split, so held-out rows contribute to the mean/scale applied to the
# training inputs. Left unchanged here so the statistics written to
# feature_config.json stay the same — confirm this is intended.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

SEQ_LEN = 32
# N rows contain N - SEQ_LEN + 1 complete windows of length SEQ_LEN. The
# "+ 1" keeps the final window, i.e. the one that ends on the last row.
num_samples = X_scaled.shape[0] - SEQ_LEN + 1
if num_samples < 1:
    # Fail early with a clear message instead of letting an empty array reach
    # train_test_split, which would raise a less obvious ValueError.
    raise ValueError(
        f"Need at least {SEQ_LEN} rows to build one sequence, "
        f"got {X_scaled.shape[0]}"
    )

# Stride-1 sliding windows: X_seq has shape (num_samples, SEQ_LEN, n_features)
# and y_seq has shape (num_samples, SEQ_LEN), i.e. one label per timestep.
window_starts = range(num_samples)
X_seq = np.array([X_scaled[i:i + SEQ_LEN] for i in window_starts])
y_seq = np.array([y[i:i + SEQ_LEN] for i in window_starts])

# 70% train; the remaining 30% is then halved into 15% val and 15% test.
# NOTE(review): train_test_split shuffles by default and neighbouring windows
# share SEQ_LEN - 1 rows, so near-identical windows can end up in different
# splits. Consider a chronological (unshuffled) split if the held-out scores
# are meant to reflect unseen data.
X_train, X_temp, y_train, y_temp = train_test_split(X_seq, y_seq, test_size=0.3, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

# ---------------------------
# 4. Save arrays & config
# ---------------------------
# One .npy file per split and per role (inputs X / labels y), all written
# into PROCESSED_DIR.
arrays_to_save = (
    ("X_train.npy", X_train),
    ("y_train.npy", y_train),
    ("X_val.npy", X_val),
    ("y_val.npy", y_val),
    ("X_test.npy", X_test),
    ("y_test.npy", y_test),
)
for filename, array in arrays_to_save:
    np.save(os.path.join(PROCESSED_DIR, filename), array)

# Store the window length, feature order and fitted scaler statistics next
# to the arrays so the same preprocessing can be reproduced later.
config_path = os.path.join(PROCESSED_DIR, "feature_config.json")
with open(config_path, "w") as f:
    feature_config = {
        "seq_len": SEQ_LEN,
        "features": features,
        "scaler_mean": scaler.mean_.tolist(),
        "scaler_scale": scaler.scale_.tolist()
    }
    json.dump(feature_config, f, indent=2)

print("✅ Preprocessing complete. Files saved in:", PROCESSED_DIR)


Loaded 6 files → 8948 rows
✅ Preprocessing complete. Files saved in: c:\Users\geloq\OneDrive\Desktop\pd-keyboard-app\backend\ml\dataset\processed
