In [None]:
# --- First cell for all notebooks: ---
# Works both locally and in Google Colab.
#
# Defines:
#   IN_COLAB  - True when the `google.colab` package can be imported.
#   BASE_PATH - project root directory used by the later cells.
# Side effect: makes <BASE_PATH>/Models/Chaima importable via sys.path.

import sys
import os


# Detect if running in Google Colab: a successful import of `google.colab`
# is taken to mean "we are in Colab".
try:
    import google.colab  # noqa: F401  (imported only as an environment probe)
    IN_COLAB = True
except ImportError:
    IN_COLAB = False

if IN_COLAB:
    # Mount Google Drive in Colab
    from google.colab import drive
    drive.mount('/content/drive')

    # Set BASE_PATH assuming the repo is cloned directly under /content
    BASE_PATH = "/content/Real-Time-Anomaly-Detection-in-IoMT-AD-Project"
else:
    # Local environment: BASE_PATH = project root, i.e. three levels up from
    # the current working directory (expected to be Models/Chaima/notebooks).
    BASE_PATH = os.path.abspath(os.path.join(os.getcwd(), "..", "..", ".."))

print(f"📁 BASE_PATH detected as: {BASE_PATH}")

# Add Models/Chaima folder to Python path for imports.
# Guarded so that re-running this cell does not keep appending duplicates.
_models_dir = os.path.join(BASE_PATH, "Models", "Chaima")
if _models_dir not in sys.path:
    sys.path.append(_models_dir)
print("✅ PYTHONPATH updated. Models/Chaima folder included.")

In [None]:
import json

from dataset_loader import load_splits
from preprocess import preprocess
from feature_selection import feature_selection

# Step 1 - read the raw train / validation / test splits located via BASE_PATH.
(train, val, test) = load_splits(BASE_PATH)

# Step 2 - preprocess the three splits together; besides the processed
# train / val / test data this also yields `features`.
(X_train_s, X_val_s, X_test_s, features) = preprocess(
    train,
    val,
    test,
    BASE_PATH,
)

# Step 3 - run feature selection on the processed training data only.
selected_features = feature_selection(
    X_train_s,
    features,
    BASE_PATH,
)

In [None]:
import numpy as np

# Folder that receives the intermediate artifacts; created if missing.
# Path components are passed separately so os.path.join picks the separator.
INTERMEDIATE_DIR = os.path.join(BASE_PATH, "Dataset", "intermediate")
os.makedirs(INTERMEDIATE_DIR, exist_ok=True)

# Save preprocessed arrays (one .npy file per split)
np.save(os.path.join(INTERMEDIATE_DIR, "X_train_s.npy"), X_train_s)
np.save(os.path.join(INTERMEDIATE_DIR, "X_val_s.npy"), X_val_s)
np.save(os.path.join(INTERMEDIATE_DIR, "X_test_s.npy"), X_test_s)

# Save all feature names.
# The encoding is explicit so the file is written the same way locally and in
# Colab instead of depending on the platform's default text encoding.
with open(os.path.join(INTERMEDIATE_DIR, "features.json"), "w", encoding="utf-8") as f:
    json.dump(features, f, indent=4)

# Load selected features (for display) from the JSON file under Spark/.
# NOTE(review): presumably written by feature_selection() - confirm.
with open(os.path.join(BASE_PATH, "Spark", "selected_features.json"), encoding="utf-8") as f:
    sel = json.load(f)

print(f"Number of selected features: {len(sel)}")
print(f"First 10 features: {sel[:10]}")


In [None]:
# Re-read the selected-feature list from disk and show how many entries it has
# together with a preview of the first ten.
# The encoding is explicit so the read does not depend on the platform default.
with open(os.path.join(BASE_PATH, "Spark", "selected_features.json"), encoding="utf-8") as f:
    sel = json.load(f)

# Last expression of the cell -> rendered as the cell output.
len(sel), sel[:10]


(31,
 ['flow_duration',
  'Header_Length',
  'Protocol Type',
  'Duration',
  'Rate',
  'fin_flag_number',
  'syn_flag_number',
  'rst_flag_number',
  'psh_flag_number',
  'ack_flag_number'])