# Experiment 1: Problem of unbalanced data

### One class classification vs Binary classification

In [None]:
# Import dependencies
import os
import sys
from pathlib import Path

try:
    # Get the current working directory
    current_dir = os.getcwd()

    # Set the root directory two levels above the current directory (the parent of its parent)
    root_dir = os.path.dirname(os.path.dirname(current_dir))

    # Add the root directory to sys.path so Python can find the utils module
    sys.path.append(str(root_dir))

    # Standard libraries
    import numpy as np
    import pandas as pd
    from IPython.display import display
    import itertools
    import h5py

    # Data processing and visualization
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import signal, stats
    import pywt
    from tqdm import tqdm

    # Machine learning
    from sklearn.model_selection import train_test_split
    from sklearn.preprocessing import StandardScaler
    from sklearn.svm import OneClassSVM
    from sklearn.ensemble import IsolationForest,RandomForestClassifier

    from imblearn.over_sampling import SMOTE
    from imblearn.under_sampling import RandomUnderSampler

    # Custom utilities
    from utils.feature_extraction import transform_data
    from utils.load_data import load_data
    from utils.result_utils import create_results_df, record_result


    from sklearn.metrics import f1_score, confusion_matrix, ConfusionMatrixDisplay, classification_report
    import matplotlib.pyplot as plt


    print("Dependencies loaded successfully ✅")
except Exception as e:
    print(f"Error loading dependencies: {e}")

In [None]:
# Load the dataset. Label convention: 0 == good | 1 == bad
# NOTE(review): presumably X holds the raw samples, y the original labels and
# y_binary the 0/1 labels used by the experiments below; confirm against utils.load_data.
X, y, y_binary = load_data()

### Experiment 1.1: One Class Classification (OneClassSVM)

In [None]:
# Hold out 25% of the data for testing (75% / 25% split), stratified on the
# binary labels so both sets keep the same class distribution
trainX, testX, trainy, testy = train_test_split(X, y_binary, test_size=0.25, random_state=42, stratify=y_binary)

# Extract features from raw data for both training and testing sets
trainX_tr, trainy_tr = transform_data(trainX, trainy, label_type='binary')
testX_tr, testy_tr = transform_data(testX, testy, label_type='binary')

# Print class distribution of the (untransformed) test labels
unique, counts = np.unique(testy, return_counts=True)
print("Class distribution of testy:")
for class_label, count in zip(unique, counts):
    print(f"Class {class_label}: {count} samples ({count/len(testy)*100:.2f}%)")

# Initialize One-Class SVM model
# gamma='scale' automatically scales gamma based on feature variance
# nu=0.01 is an upper bound on the fraction of training errors and a lower
# bound on the fraction of support vectors
model = OneClassSVM(gamma='scale', nu=0.01)

# One-class training: fit on normal samples (class 0) only
trainX_0 = trainX_tr[trainy_tr == 0]
model.fit(trainX_0)

# Make predictions on test data
# OneClassSVM returns: 1 for inliers (normal), -1 for outliers (anomalies)
yhat = model.predict(testX_tr)

# Convert the predictions to the ground-truth label convention:
# -1 (outlier) -> 1 (anomaly), 1 (inlier) -> 0 (normal)
yhat_converted = np.where(yhat == -1, 1, 0)

# Keep the converted predictions under an experiment-specific name so later
# cells can still access them after `yhat` / `yhat_converted` are reused
yhat_1_1 = yhat_converted

# F1 score with the anomaly class (label 1) as the positive class
score_1_1 = f1_score(testy_tr, yhat_1_1, pos_label=1)
print('F1 Score: %.3f' % score_1_1)

# Confusion matrix with the anomaly class first:
# rows = true [1, 0], columns = predicted [1, 0]
cm_1_1 = confusion_matrix(testy_tr, yhat_1_1, labels=[1, 0])

disp = ConfusionMatrixDisplay(confusion_matrix=cm_1_1, display_labels=['Anomaly (1)', 'Normal (0)'])
disp.plot()
plt.show()


### Experiment 1.2: One Class Classification (Isolation Forest)

In [None]:
# Hold out 25% of the data for testing (75% / 25% split), stratified on the
# binary labels so both sets keep the same class distribution
trainX, testX, trainy, testy = train_test_split(X, y_binary, test_size=0.25, random_state=42, stratify=y_binary)

# Extract features from raw data for both training and testing sets
trainX_tr, trainy_tr = transform_data(trainX, trainy, label_type='binary')
testX_tr, testy_tr = transform_data(testX, testy, label_type='binary')

# Initialize Isolation Forest model
# contamination=0.01 is the assumed proportion of outliers; it sets the
# threshold on the anomaly scores. random_state fixes the tree construction.
model = IsolationForest(contamination=0.01, random_state=42)

# One-class training: fit on normal samples (class 0) only
trainX_0 = trainX_tr[trainy_tr == 0]
model.fit(trainX_0)

# Make predictions on test data
# Isolation Forest returns: 1 for inliers (normal), -1 for outliers (anomalies)
yhat = model.predict(testX_tr)

# Convert the predictions to the ground-truth label convention:
# -1 (outlier) -> 1 (anomaly), 1 (inlier) -> 0 (normal)
yhat_converted = np.where(yhat == -1, 1, 0)

# Keep the converted predictions under an experiment-specific name so later
# cells can still access them after `yhat` / `yhat_converted` are reused
yhat_1_2 = yhat_converted

# F1 score with the anomaly class (label 1) as the positive class
score_1_2 = f1_score(testy_tr, yhat_1_2, pos_label=1)
print('F1 Score: %.3f' % score_1_2)

# Confusion matrix with the anomaly class first:
# rows = true [1, 0], columns = predicted [1, 0]
cm_1_2 = confusion_matrix(testy_tr, yhat_1_2, labels=[1, 0])

disp = ConfusionMatrixDisplay(confusion_matrix=cm_1_2, display_labels=['Anomaly (1)', 'Normal (0)'])
disp.plot()
plt.show()

### Experiment 1.3: Binary Classification (Random Forest)

In [None]:
# Cell-local imports, grouped at the top of the cell
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, f1_score
# imblearn's Pipeline (not sklearn's) is required because the pipeline contains resampling steps
from imblearn.pipeline import Pipeline

# Hold out 25% of the data for testing (75% / 25% split), stratified on the
# binary labels so both sets keep the same class distribution
trainX, testX, trainy, testy = train_test_split(X, y_binary, test_size=0.25, random_state=42, stratify=y_binary)

# Extract features once, up front, so the grid search works on the already
# transformed data instead of repeating the transformation for every fit
trainX_tr, trainy_tr = transform_data(trainX, trainy, label_type='binary')
testX_tr, testy_tr = transform_data(testX, testy, label_type='binary')

# Parameter grid for the sampling steps and the Random Forest
# (keys are "<pipeline step name>__<parameter name>")
param_grid = {
    'rus__sampling_strategy': [0.2, 0.25, 0.3],
    'smote__k_neighbors': [3, 5, 7],
    'rf__n_estimators': [100, 150, 200],
    'rf__max_depth': [10, 15, 20],
    'rf__min_samples_split': [2, 5],
    'rf__min_samples_leaf': [1, 2],
    'rf__max_features': ['sqrt', 'log2'],
}

# Pipeline: random undersampling -> SMOTE oversampling -> Random Forest
pipeline = Pipeline([
    ('rus', RandomUnderSampler(random_state=42)),
    ('smote', SMOTE(random_state=42)),
    ('rf', RandomForestClassifier(random_state=42)),
])

# Score candidates by the F1 of the anomaly class (label 1)
f1_scorer = make_scorer(f1_score, pos_label=1)

# Exhaustive grid search with 3-fold cross-validation, using all available cores
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    cv=3,
    scoring=f1_scorer,
    n_jobs=-1,
    verbose=1,
)

# Fit the grid search to the preprocessed data (already transformed)
grid_search.fit(trainX_tr, trainy_tr)

# Print best parameters and score
print("Best parameters:", grid_search.best_params_)
print(f"Best cross-validation F1 score: {grid_search.best_score_:.3f}")

# Use the best model for prediction on the held-out test set
best_model = grid_search.best_estimator_
yhat = best_model.predict(testX_tr)

# Keep the predictions under an experiment-specific name as well, so they
# survive if the generic `yhat` is overwritten by re-running another cell
yhat_1_3 = yhat

score_1_3 = f1_score(testy_tr, yhat_1_3, pos_label=1)
print(f'Test F1 Score: {score_1_3:.3f}')

# Confusion matrix with the anomaly class first:
# rows = true [1, 0], columns = predicted [1, 0]
cm_1_3 = confusion_matrix(testy_tr, yhat_1_3, labels=[1, 0])
disp = ConfusionMatrixDisplay(confusion_matrix=cm_1_3, display_labels=['Anomaly (1)', 'Normal (0)'])
disp.plot()
plt.show()

In [None]:
# Read the per-feature importances from the Random Forest step of the tuned pipeline
feature_importances = best_model.named_steps['rf'].feature_importances_
feature_names = trainX_tr.columns

# One row per feature: its name and its importance
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances,
})

# Rank by importance (highest first) and keep the 20 strongest features
ranked = importance_df.sort_values('Importance', ascending=False)
top_features = ranked.head(20)
print("Top 20 features selected by the model:")
display(top_features)

# Horizontal bar chart of the 10 strongest features
top_ten = top_features.head(10)
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_ten['Feature'], top_ten['Importance'])
ax.set_xlabel('Importance')
ax.set_ylabel('Feature')
ax.set_title('Top 10 Most Important Features')
fig.tight_layout()
plt.show()

### Experiment 1: Results

In [None]:
# Create a table with all results
import pandas as pd
# precision_score / recall_score are not used in this cell; the import is kept
# so any later cell that relies on these names keeps working.
from sklearn.metrics import precision_score, recall_score


def _precision_recall(cm):
    """Return (precision, recall) of the anomaly class from a 2x2 confusion matrix.

    The matrix must use the layout produced by
    ``confusion_matrix(y_true, y_pred, labels=[1, 0])``: rows are the true
    labels and columns the predicted labels, both ordered [1, 0], i.e.
    ``[[TP, FN], [FP, TN]]`` with label 1 as the positive class.

    A metric whose denominator is zero is reported as 0.0.
    """
    tp, fn = cm[0]
    fp, _tn = cm[1]
    predicted_positive = tp + fp
    actual_positive = tp + fn
    precision = tp / predicted_positive if predicted_positive else 0.0
    recall = tp / actual_positive if actual_positive else 0.0
    return float(precision), float(recall)


# (model name, model family, test F1 score, test confusion matrix) per experiment.
# Precision and recall are derived from the stored confusion matrices, so this
# cell only needs the score_* and cm_* variables of each experiment.
experiments = [
    ("OneClassSVM", "One Class Classification", score_1_1, cm_1_1),
    ("Isolation Forest", "One Class Classification", score_1_2, cm_1_2),
    ("Random Forest", "Binary Classification", score_1_3, cm_1_3),
]

# Collect all results
results = []
for model_name, model_family, f1, cm in experiments:
    precision, recall = _precision_recall(cm)
    results.append({
        "Model": model_name,
        "Class": model_family,
        "F1 Score": f1,
        "Precision": precision,
        "Recall": recall,
        "Confusion Matrix": cm,
    })

# Create a DataFrame for display (metrics formatted to 3 decimals; the
# confusion matrices stay in `results` but are left out of the table)
results_df = pd.DataFrame([
    {
        "Model": r["Model"],
        "Class": r["Class"],
        "F1 Score": f"{r['F1 Score']:.3f}",
        "Precision": f"{r['Precision']:.3f}",
        "Recall": f"{r['Recall']:.3f}",
    }
    for r in results
])

# Display the results table
print("Summary of Results:")
display(results_df)


In [None]:
# Render the summary table as LaTeX source; when run as the last expression of
# a notebook cell, the returned string is shown as the cell output
results_df.to_latex()