In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense
from tqdm import tqdm

In [2]:
# List all available devices detected by TensorFlow
print("Available devices:")
devices = tf.config.list_physical_devices()
for device in devices:
    print(device)

Available devices:
PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')


## Load and prepare data

In [3]:
# Load the data
data = pd.read_csv("../dataset.csv")

In [4]:
# Separate features and target
X = data.drop("Label", axis=1)
y = data["Label"]

In [5]:
# Convert categorical data to numeric if necessary
X = pd.get_dummies(X, sparse=True)

In [6]:
# chunk_size = 10000  # Adjust based on your system's memory
# chunks = []
# for chunk in tqdm(pd.read_csv('../dataset.csv', chunksize=chunk_size)):
#     # Process each chunk
#     chunk = pd.get_dummies(chunk, sparse=True)
#     chunks.append(chunk)

# # Concatenate all processed chunks
# X = pd.concat(chunks, ignore_index=True)

In [7]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [8]:
# Standardize the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

  if np.may_share_memory(array, array_orig):
  if np.may_share_memory(array, array_orig):


## Build an AutoEncoder

In [9]:
# Define the encoder
input_layer = Input(shape=(X_train_scaled.shape[1],))
encoded = Dense(128, activation="relu")(input_layer)
encoded = Dense(64, activation="relu")(encoded)

In [10]:
# Define the decoder
decoded = Dense(128, activation="relu")(encoded)
decoded = Dense(X_train_scaled.shape[1], activation="sigmoid")(decoded)

In [11]:
# Define the autoencoder
autoencoder = Model(input_layer, decoded)

# Encoder for predictions
encoder = Model(input_layer, encoded)

# Prediction model
predictor = Dense(1, activation="sigmoid")(encoded)
prediction_model = Model(input_layer, predictor)

# Compile models
autoencoder.compile(optimizer="adam", loss="mse")
prediction_model.compile(
    optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]
)

In [12]:
# Display the model summary
autoencoder.summary()
prediction_model.summary()

## Train and evaluate

In [16]:
# Train the autoencoder
autoencoder.fit(
    X_train_scaled, X_train_scaled, epochs=30, batch_size=256, validation_split=0.2
)

Epoch 1/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 59ms/step - loss: 0.8007 - val_loss: 0.8012
Epoch 2/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - loss: 0.8007 - val_loss: 0.8011
Epoch 3/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 48ms/step - loss: 0.8007 - val_loss: 0.8011
Epoch 4/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - loss: 0.8007 - val_loss: 0.8011
Epoch 5/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 50ms/step - loss: 0.8007 - val_loss: 0.8011
Epoch 6/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 51ms/step - loss: 0.8007 - val_loss: 0.8011
Epoch 7/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 49ms/step - loss: 0.8007 - val_loss: 0.8011
Epoch 8/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 45ms/step - loss: 0.8007 - val_loss: 0.8010
Epoch 9/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x31d487810>

In [17]:
# Train the prediction model
prediction_model.fit(
    X_train_scaled, y_train, epochs=30, batch_size=256, validation_split=0.2
)

Epoch 1/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - accuracy: 0.9993 - loss: 0.0013 - val_accuracy: 0.9887 - val_loss: 0.1474
Epoch 2/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 1.0000 - loss: 7.9383e-05 - val_accuracy: 0.9875 - val_loss: 0.1482
Epoch 3/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 1.0000 - loss: 6.8381e-05 - val_accuracy: 0.9875 - val_loss: 0.1475
Epoch 4/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 1.0000 - loss: 6.8023e-05 - val_accuracy: 0.9875 - val_loss: 0.1468
Epoch 5/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 1.0000 - loss: 6.5381e-05 - val_accuracy: 0.9875 - val_loss: 0.1461
Epoch 6/30
[1m25/25[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 1.0000 - loss: 6.4966e-05 - val_accuracy: 0.9875 - val_loss: 0.1456
Epoch 7/30
[1

<keras.src.callbacks.history.History at 0x31d643350>

In [18]:
# Evaluate the prediction model
test_loss, test_acc = prediction_model.evaluate(X_test_scaled, y_test)
print("Test Accuracy: {:.2f}%".format(test_acc * 100))

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.4809 - loss: 3.0862
Test Accuracy: 49.50%
