In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from data_processing import clean_combined_data  

# Load the cleaned, combined dataset via the project's data_processing helper.
combined_data = clean_combined_data()

# Features are every column except the target ('label') and 'timestamp';
# the target vector is the 'label' column itself.
X = combined_data.drop(['label', 'timestamp'], axis=1)
y = combined_data.loc[:, 'label']

# Hold out 30% of the rows for testing; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same fitted transform to both splits so no test information leaks in.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [3]:
import tensorflow as tf
from tensorflow.keras import Sequential # type: ignore
from tensorflow.keras.layers import Dense, Dropout # type: ignore
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard # type: ignore
from sklearn.metrics import classification_report, accuracy_score
import numpy as np

# Ensure that data is a numpy array (features as float32, labels as-is).
X_train_scaled = np.array(X_train_scaled, dtype='float32')
y_train = np.array(y_train)
X_test_scaled = np.array(X_test_scaled, dtype='float32')
y_test = np.array(y_test)

# Encode the labels as contiguous integers 0..num_classes-1, as required by
# the sparse categorical cross-entropy loss used for the network below.
# np.unique returns the sorted distinct labels together with, for every row,
# the index of its label in that sorted array -- exactly the encoding the
# mapping dict describes, but computed in one vectorised pass instead of a
# Python-level lookup per row.
unique_labels, y_train = np.unique(y_train, return_inverse=True)
label_mapping = {old_label: new_label for new_label, old_label in enumerate(unique_labels)}

# The encoding is defined by the training labels only. A label that occurs
# solely in the test split has no code, so fail loudly (KeyError, matching
# what a dict lookup would raise) rather than let searchsorted silently
# assign it a neighbouring class index.
unseen_labels = np.setdiff1d(y_test, unique_labels)
if unseen_labels.size:
    raise KeyError(
        f"y_test contains labels that are absent from y_train: {unseen_labels.tolist()}"
    )
# unique_labels is sorted, so the insertion index of a known label is its code.
y_test = np.searchsorted(unique_labels, y_test)

print("Unique labels in y_train:", np.unique(y_train))
print("Unique labels in y_test:", np.unique(y_test))

# One output unit per distinct training label.
num_classes = len(unique_labels)
print("Number of unique classes:", num_classes)

# Seed the Python, NumPy and TensorFlow random generators before any weights
# are created, so weight initialisation, dropout masks and batch shuffling
# are repeatable between runs (some GPU kernels may still be
# non-deterministic).
tf.keras.utils.set_random_seed(42)

# Build neural network: a small fully connected classifier.
# The input width is declared with an explicit Input object rather than an
# `input_shape=` argument on the first Dense layer, which newer Keras
# versions warn against for Sequential models.
model = Sequential([
    tf.keras.Input(shape=(X_train_scaled.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.4),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    # One softmax unit per class -> a probability distribution over classes.
    Dense(num_classes, activation='softmax'),
])

# Sparse categorical cross-entropy takes integer class ids directly, so the
# labels do not need to be one-hot encoded.
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Stop training once the validation loss has gone two epochs without
# improving, and roll the model back to the best weights seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True,
)

# Write TensorBoard event files (with per-epoch histograms) under ./logs.
tensorboard = TensorBoard(log_dir='./logs', histogram_freq=1)

training_callbacks = [early_stopping, tensorboard]

# Train for at most 10 epochs, holding out 20% of the training arrays for
# validation.
model.fit(
    X_train_scaled,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
    callbacks=training_callbacks,
)


# Score the trained network on the held-out test split. Index 1 of the
# returned list is what gets reported as accuracy below.
nn_results = model.evaluate(X_test_scaled, y_test)

# Turn the per-class softmax outputs into hard predictions by taking the
# most probable class for every row.
class_probabilities = model.predict(X_test_scaled)
y_pred_nn = np.argmax(class_probabilities, axis=1)

nn_accuracy = nn_results[1]
print(f'Neural Network Accuracy: {nn_accuracy:.4f}')
print(classification_report(y_test, y_pred_nn))



Unique labels in y_train: [ 0  1  2  3  4  5  6  7  8  9 10 11]
Unique labels in y_test: [ 0  1  2  3  4  5  6  7  8  9 10 11]
Number of unique classes: 12
Epoch 1/10
113074/113074 ━━━━━━━━━━━━━━━━━━━━ 411s 4ms/step - accuracy: 0.8494 - loss: 0.4735 - val_accuracy: 0.8777 - val_loss: 0.3750
Epoch 2/10
113074/113074 ━━━━━━━━━━━━━━━━━━━━ 389s 3ms/step - accuracy: 0.8691 - loss: 0.4014 - val_accuracy: 0.8788 - val_loss: 0.3684
Epoch 3/10
113074/113074 ━━━━━━━━━━━━━━━━━━━━ 390s 3ms/step - accuracy: 0.8708 - loss: 0.3949 - val_accuracy: 0.8793 - val_loss: 0.3665
Epoch 4/10
113074/113074 ━━━━━━━━━━━━━━━━━━━━ 400s 4ms/step - accuracy: 0.8716 - loss: 0.3929 - val_accuracy: 0.8798 - val_loss: 0.3646
Epoch 5/10
113074/113074 ━━━━━━━━━━━━━━━━━━━━ 390s 3ms/step - accuracy: 0.8723 - loss: 0.3899 - val_accuracy: 0.8805 - val_loss: 0

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

# Random Forest
# 50 trees capped at depth 15, trained in parallel on every available core;
# the fixed random_state makes the forest reproducible.
rf_params = dict(
    n_estimators=50,
    max_depth=15,
    n_jobs=-1,
    random_state=42,
)
rf_clf = RandomForestClassifier(**rf_params)

# The forest is fitted on the unscaled feature frame.
# NOTE(review): y_train / y_test here are whatever the preceding cells left
# in the kernel -- confirm the label-encoding cell has been run first.
rf_clf.fit(X_train, y_train)

# Predict and evaluate
y_pred_rf = rf_clf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred_rf)
print(f'Random Forest Accuracy: {rf_accuracy:.4f}')
print(classification_report(y_test, y_pred_rf))


Random Forest Accuracy: 0.8809
              precision    recall  f1-score   support

           0       0.74      0.88      0.81    359238
           1       0.93      0.83      0.88     87592
           2       0.57      0.07      0.13     76676
           3       0.71      0.00      0.01     22897
           4       0.62      0.01      0.01     20140
           5       0.74      0.91      0.82    222327
           6       0.99      1.00      0.99    870840
           7       1.00      1.00      1.00    128693
           8       0.78      0.87      0.82    118360
           9       0.76      0.41      0.54     16772
          10       0.75      0.19      0.31     12493
          11       0.67      0.16      0.25      2371

    accuracy                           0.88   1938399
   macro avg       0.77      0.53      0.55   1938399
weighted avg       0.87      0.88      0.86   1938399



In [5]:
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import classification_report, accuracy_score

# Gaussian Naive Bayes model, fitted on the unscaled feature frame.
# fit() returns the estimator itself, so construction and training chain.
gnb_clf = GaussianNB().fit(X_train, y_train)

# Predict and evaluate
# NOTE(review): the printed label says "Bayesian Network" although the
# estimator is GaussianNB -- consider renaming it wherever the label is used.
y_pred_gnb = gnb_clf.predict(X_test)
gnb_accuracy = accuracy_score(y_test, y_pred_gnb)
print(f'Bayesian Network Accuracy: {gnb_accuracy:.4f}')
# zero_division=0 reports 0.0 (without a warning) for classes that were
# never predicted.
print(classification_report(y_test, y_pred_gnb, zero_division=0))


Bayesian Network Accuracy: 0.7750
              precision    recall  f1-score   support

           0       0.65      0.53      0.59    359238
           1       0.62      0.61      0.62     87592
           2       0.15      0.10      0.12     76676
           3       0.00      0.00      0.00     22897
           4       0.00      0.00      0.00     20140
           5       0.55      0.93      0.69    222327
           6       0.97      0.98      0.97    870840
           7       0.92      0.98      0.95    128693
           8       0.57      0.54      0.56    118360
           9       0.35      0.11      0.17     16772
          10       0.00      0.00      0.00     12493
          11       0.13      0.01      0.01      2371

    accuracy                           0.77   1938399
   macro avg       0.41      0.40      0.39   1938399
weighted avg       0.75      0.77      0.76   1938399



In [6]:
import pandas as pd

# Collect each model's test-set accuracy as (display name, score) pairs.
model_accuracies = [
    ('Neural Network', nn_results[1]),
    ('Random Forest', accuracy_score(y_test, y_pred_rf)),
    ('Bayesian Network', accuracy_score(y_test, y_pred_gnb)),
]

# Reshape the pairs into column-oriented form for the summary frame.
results = {
    'Model': [model_name for model_name, accuracy in model_accuracies],
    'Accuracy': [accuracy for model_name, accuracy in model_accuracies],
}

results_df = pd.DataFrame(results)
print(results_df)


              Model  Accuracy
0    Neural Network  0.880196
1     Random Forest  0.880920
2  Bayesian Network  0.774960
