In [1]:
!rm -rf spark-3.3.2-bin-hadoop3 spark-3.3.2-bin-hadoop3.tgz

!wget -O spark-3.3.2-bin-hadoop3.tgz https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz

!tar -xzf spark-3.3.2-bin-hadoop3.tgz

!pip install -q findspark pyspark tensorflow numpy matplotlib

--2025-08-10 17:21:17--  https://archive.apache.org/dist/spark/spark-3.3.2/spark-3.3.2-bin-hadoop3.tgz
Resolving archive.apache.org (archive.apache.org)... 65.108.204.189, 2a01:4f9:1a:a084::2
Connecting to archive.apache.org (archive.apache.org)|65.108.204.189|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 299360284 (285M) [application/x-gzip]
Saving to: ‘spark-3.3.2-bin-hadoop3.tgz’


2025-08-10 17:21:41 (12.7 MB/s) - ‘spark-3.3.2-bin-hadoop3.tgz’ saved [299360284/299360284]



In [2]:
!apt-get install openjdk-11-jdk -qq > /dev/null

In [3]:
import os

# Point the process at the Java 11 install and the unpacked Spark distribution.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-11-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.3.2-bin-hadoop3",
    }
)

import findspark

# Run findspark before pyspark is imported; it is expected to locate Spark
# through the SPARK_HOME value set above.
findspark.init()

from pyspark.sql import SparkSession

# Local-mode session using every available core.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName("SignalAnomalyDetection")
    .getOrCreate()
)

In [4]:
from google.colab import drive

# Attach Google Drive under /content/drive so files on it can be read by path.
drive.mount("/content/drive")

Mounted at /content/drive


In [32]:
from pyspark.sql.functions import col, unix_timestamp
from pyspark.ml.feature import VectorAssembler, MinMaxScaler
from pyspark.sql.types import DoubleType

# Names of the 60 signal columns: signal_1 .. signal_60.
signal_cols = ["signal_" + str(idx) for idx in range(1, 61)]
double_signals = [col(name).cast(DoubleType()) for name in signal_cols]

# Read the CSV (first row is the header), keep the timestamp plus the signal
# columns cast to double, and add the timestamp parsed to epoch seconds.
df = (
    spark.read
    .option("header", True)
    .csv("/content/drive/My Drive/Colab Notebooks/Book1.csv")
    .select(col("Timestamp"), *double_signals)
    .withColumn(
        "timestamp_secs",
        unix_timestamp("Timestamp", "MM/dd/yyyy HH:mm:ss:SSSSSS"),
    )
)

# Pack the signal columns into a single vector column.
assembler = VectorAssembler(
    inputCols=signal_cols,
    outputCol="features_raw",
)
df_vec = assembler.transform(df)

# Fit a min-max scaler on the assembled vectors and apply it to the same frame.
scaler = MinMaxScaler(
    inputCol="features_raw",
    outputCol="features",
)
scaler_model = scaler.fit(df_vec)
df_scaled = scaler_model.transform(df_vec)

In [6]:
# Print up to 10 rows of the scaled frame as a quick sanity check.
df_scaled.limit(10).show()

+--------------------+-------------------+------------------+-------------------+------------+------------------+-------------------+--------------------+-------------------+--------------------+------------+------------+-------------------+-------------------+-------------------+-------------------+-------------------+------------+------------+-------------------+------------------+-------------------+--------------------+------------+------------+-------------------+------------------+-------------------+-------------------+-------------------+------------+--------------------+------------------+-------------------+-------------------+------------------+------------+------------+-------------------+------------------+-------------------+-------------------+------------+------------+-------------------+-------------------+-------------------+------------------+------------+--------------------+-------------------+-------------------+-------------------+-------------------+------------

In [33]:
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, DoubleType
import pandas as pd

SEED = 42             # fixed seed so the search and the final retrain are repeatable
TRAIN_FRACTION = 0.8  # leading share of rows used for training; the rest validates


def build_autoencoder(input_dim, learning_rate):
    """Build and compile the dense autoencoder used for the signal vectors.

    Layer widths are input_dim -> 40 -> 20 -> 10 -> 20 -> 40 -> input_dim, with
    ReLU on the hidden layers and sigmoid on the output. The model is compiled
    with Adam and mean-squared-error loss.

    Args:
        input_dim: Number of features per sample.
        learning_rate: Learning rate passed to the Adam optimizer.

    Returns:
        The compiled ``keras.Model``.
    """
    # Reset Keras global state first, then seed: every build therefore starts
    # from the same seed, so retraining the winning configuration is intended
    # to repeat its search run (exact determinism may still depend on hardware).
    tf.keras.backend.clear_session()
    keras.utils.set_random_seed(SEED)

    inputs = keras.Input(shape=(input_dim,))
    hidden = inputs
    for units in (40, 20, 10, 20, 40):
        hidden = layers.Dense(units, activation="relu")(hidden)
    outputs = layers.Dense(input_dim, activation="sigmoid")(hidden)

    model = keras.Model(inputs=inputs, outputs=outputs)
    model.compile(
        optimizer=keras.optimizers.Adam(learning_rate=learning_rate),
        loss="mse",
    )
    return model


# Collect the scaled feature vectors on the driver as one 2-D NumPy array.
df_pandas = df_scaled.select("features").toPandas()
features_np = np.array([row.toArray() for row in df_pandas["features"]])

# Split by row order (no shuffling): the leading rows train, the rest validate.
split_idx = int(TRAIN_FRACTION * len(features_np))
train_data = features_np[:split_idx]
val_data = features_np[split_idx:]
assert features_np.ndim == 2 and len(train_data) > 0 and len(val_data) > 0, (
    "need a 2-D feature matrix with rows on both sides of the train/validation split"
)
n_features = features_np.shape[1]

# Hyper-parameter combinations to evaluate.
param_list = [
    {"epochs": 20, "batch_size": 32, "learning_rate": 0.001},
    {"epochs": 20, "batch_size": 64, "learning_rate": 0.001},
    {"epochs": 20, "batch_size": 128, "learning_rate": 0.001},
    {"epochs": 20, "batch_size": 32, "learning_rate": 0.0005},
    {"epochs": 20, "batch_size": 64, "learning_rate": 0.0005},
]

best_loss = float("inf")
best_params = None

for params in param_list:
    autoencoder = build_autoencoder(n_features, params["learning_rate"])

    # The autoencoder reconstructs its own input, so inputs double as targets.
    history = autoencoder.fit(
        train_data, train_data,
        epochs=params["epochs"],
        batch_size=params["batch_size"],
        shuffle=True,
        validation_data=(val_data, val_data),
        verbose=0
    )

    # NOTE(review): this is the best epoch's validation loss, whereas the model
    # saved at the end of the cell holds the weights from the last epoch.
    val_loss = min(history.history["val_loss"])
    print(f"Params: {params}, Min Val Loss: {val_loss}")

    if val_loss < best_loss:
        best_loss = val_loss
        best_params = params

print(f"\nBest Params: {best_params} with Loss: {best_loss}")

# A NaN loss never compares lower than the running best, so if every run
# diverged there is no winner to retrain.
if best_params is None:
    raise RuntimeError("no hyper-parameter set produced a finite validation loss")

# Retrain from scratch with the winning configuration, showing progress.
autoencoder = build_autoencoder(n_features, best_params["learning_rate"])
autoencoder.fit(
    train_data, train_data,
    epochs=best_params["epochs"],
    batch_size=best_params["batch_size"],
    shuffle=True,
    validation_data=(val_data, val_data),
    verbose=1
)

autoencoder.save("/content/best_autoencoder.keras")
print("Model saved at /content/best_autoencoder.keras")

Params: {'epochs': 5, 'batch_size': 32, 'learning_rate': 0.001}, Min Val Loss: 0.023620935156941414
Params: {'epochs': 5, 'batch_size': 64, 'learning_rate': 0.001}, Min Val Loss: 0.02386954054236412
Params: {'epochs': 5, 'batch_size': 32, 'learning_rate': 0.0005}, Min Val Loss: 0.023577477782964706

Best Params: {'epochs': 5, 'batch_size': 32, 'learning_rate': 0.0005} with Loss: 0.023577477782964706
Epoch 1/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 13ms/step - loss: 0.0258 - val_loss: 0.0254
Epoch 2/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 11ms/step - loss: 0.0255 - val_loss: 0.0248
Epoch 3/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0249 - val_loss: 0.0243
Epoch 4/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - loss: 0.0244 - val_loss: 0.0241
Epoch 5/5
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - loss: 0.0243 - val_loss: 0.0240
Model sav

In [43]:
from google.colab import drive

# Make sure Drive is attached; when it already is, mount() only prints a notice.
drive.mount("/content/drive")

# Write the trained autoencoder to Drive in the .keras format.
drive_model_path = "/content/drive/My Drive/best_autoencoder.keras"
autoencoder.save(drive_model_path)
print("Model saved to Drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Model saved to Drive
