# Experiment 1: Problem of unbalanced data

### One class classification vs Binary classification

In [None]:
# Import dependencies
import os
import sys
from pathlib import Path

# NOTE: every statement inside this `try` is guarded by the broad handler at
# the bottom. If any of them raises, the error is only printed, the remaining
# statements are skipped, and the names they would have bound stay undefined
# for the cells that follow.
try:
    # Get the current working directory
    current_dir = os.getcwd()

    # Set the root directory two levels above the current directory
    # (os.path.dirname is applied twice)
    root_dir = os.path.dirname(os.path.dirname(current_dir))

    # Add the root directory to sys.path so Python can find the utils module
    sys.path.append(str(root_dir))

    # Standard libraries
    import numpy as np
    import pandas as pd
    import itertools
    import h5py

    # Data processing and visualization
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import signal, stats
    import pywt
    from tqdm import tqdm

    # Machine learning
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import OneClassSVM
    from sklearn.ensemble import IsolationForest

    from imblearn.over_sampling import SMOTE

    # Custom utilities (resolved through the root_dir entry appended to sys.path above)
    from utils.feature_extraction import transform_data
    from utils.load_data import load_data
    from utils.result_utils import create_results_df, record_result


    # Evaluation metrics
    from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
    # NOTE(review): matplotlib.pyplot is already imported above; this repeat is harmless
    import matplotlib.pyplot as plt


    print("Dependencies loaded successfully ✅")
except Exception as e:
    # Report the failure without re-raising; execution continues past this cell
    print(f"Error loading dependencies: {e}")

In [None]:
# Load the dataset; load_data() returns three values, unpacked as X, y, y_binary.
# Label encoding: 0 == good | 1 == bad
# NOTE(review): presumably this encoding describes y_binary (later cells treat
# class 0 as normal and class 1 as anomaly) — confirm against utils.load_data.
X, y, y_binary = load_data()

### Experiment 1.1: One Class Classification (OneClassSVM)

In [None]:
# Experiment 1.1 — train a One-Class SVM on normal samples only and evaluate
# it as an anomaly detector on a held-out, stratified test split.

# Split data into training (60%) and testing (40%) sets (test_size=0.4),
# stratified on the binary label so both splits keep the same class ratio.
trainX, testX, trainy, testy = train_test_split(
    X, y_binary, test_size=0.4, random_state=42, stratify=y_binary
)

# Extract features from raw data for both training and testing sets
trainX_tr, trainy_tr = transform_data(trainX, trainy, label_type='binary')
testX_tr, testy_tr = transform_data(testX, testy, label_type='binary')

# Initialize One-Class SVM model
# gamma='scale' derives gamma from the number of features and their variance
# nu=0.01 is an upper bound on the fraction of training errors and a lower
# bound on the fraction of support vectors
model = OneClassSVM(gamma='scale', nu=0.01)

# Filter training data to only include normal samples (class 0)
trainX_0 = trainX_tr[trainy_tr == 0]
# Train the model on normal samples only
model.fit(trainX_0)

# Make predictions on test data
# OneClassSVM returns: 1 for inliers (normal), -1 for outliers (anomalies)
yhat = model.predict(testX_tr)

# Convert ground truth labels to match OneClassSVM output format:
# -1 for anomalies (class 1), 1 for normal (class 0).
# The labels returned by transform_data (testy_tr) are used because they are
# the ones paired with testX_tr, which produced yhat; the training mask above
# relies on the same pairing for trainy_tr / trainX_tr.
testy_converted = np.where(np.asarray(testy_tr) == 1, -1, 1)

# Fixed label order shared by the metrics and the plot annotations, so the
# matrix is always 2x2 and its rows/columns match the tick labels below.
eval_labels = [-1, 1]
eval_names = ['Anomaly (-1)', 'Normal (1)']

# Calculate F1 score for anomaly detection (using -1 as the positive class)
score = f1_score(testy_converted, yhat, pos_label=-1)
print('F1 Score: %.3f' % score)

# Calculate confusion matrix
cm = confusion_matrix(testy_converted, yhat, labels=eval_labels)

# Display confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=eval_names,
            yticklabels=eval_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix - OneClassSVM')
plt.show()

# Display classification report
print(classification_report(testy_converted, yhat,
                            labels=eval_labels, target_names=eval_names))

### Experiment 1.2: One Class Classification (Isolation Forest)

In [None]:
# Experiment 1.2 — train an Isolation Forest on normal samples only and
# evaluate it as an anomaly detector on a held-out, stratified test split.

# Split data into training (60%) and testing (40%) sets (test_size=0.4),
# stratified on the binary label so both splits keep the same class ratio.
trainX, testX, trainy, testy = train_test_split(
    X, y_binary, test_size=0.4, random_state=42, stratify=y_binary
)

# Extract features from raw data for both training and testing sets
trainX_tr, trainy_tr = transform_data(trainX, trainy, label_type='binary')
testX_tr, testy_tr = transform_data(testX, testy, label_type='binary')

# Initialize Isolation Forest model
# contamination=0.01 sets the expected proportion of outliers, which fixes
# the decision threshold on the anomaly scores.
# random_state is pinned (same seed as the split) so that the randomly built
# trees, and therefore the reported metrics, are reproducible between runs.
model = IsolationForest(contamination=0.01, random_state=42)

# Filter training data to only include normal samples (class 0)
trainX_0 = trainX_tr[trainy_tr == 0]
# Train the model on normal samples only
model.fit(trainX_0)

# Make predictions on test data
# Isolation Forest returns: 1 for inliers (normal), -1 for outliers (anomalies)
yhat = model.predict(testX_tr)

# Convert ground truth labels to match Isolation Forest output format:
# -1 for anomalies (class 1), 1 for normal (class 0).
# The labels returned by transform_data (testy_tr) are used because they are
# the ones paired with testX_tr, which produced yhat; the training mask above
# relies on the same pairing for trainy_tr / trainX_tr.
testy_converted = np.where(np.asarray(testy_tr) == 1, -1, 1)

# Fixed label order shared by the metrics and the plot annotations, so the
# matrix is always 2x2 and its rows/columns match the tick labels below.
eval_labels = [-1, 1]
eval_names = ['Anomaly (-1)', 'Normal (1)']

# Calculate F1 score for anomaly detection (using -1 as the positive class)
score = f1_score(testy_converted, yhat, pos_label=-1)
print('F1 Score: %.3f' % score)

# Calculate confusion matrix
cm = confusion_matrix(testy_converted, yhat, labels=eval_labels)

# Display confusion matrix
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=eval_names,
            yticklabels=eval_names)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.title('Confusion Matrix - Isolation Forest')
plt.show()

# Display classification report
print(classification_report(testy_converted, yhat,
                            labels=eval_labels, target_names=eval_names))

### Experiment 1.3: Binary Classification (Random Forest)