# DSS Thesis - Koen de Bonth

### Import packages

In [None]:
import os
import sys
from pathlib import Path

# The project root is the parent folder of the directory the notebook runs in
current_dir = os.getcwd()
root_dir = Path(current_dir).parent

# Put the project root on the import path so the local `utils` package resolves
sys.path += [str(root_dir)]
print(f"Added {root_dir} to Python path")

In [None]:
# Standard libraries
import numpy as np
import pandas as pd
import itertools
import h5py

# Data processing and visualization
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import signal, stats
import pywt
from tqdm import tqdm

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import lightgbm as lgb

# Custom utilities
from utils import data_loader_utils
from utils.feature_extraction import transform_data, prepare_train_test_data
from utils.model_validation import perform_cross_validation

# Import for confusion matrix visualization
from sklearn.metrics import ConfusionMatrixDisplay


### Loading and Preparing Data

In [None]:
# Machines, operations and quality labels that define the dataset folder layout
machines = ["M01", "M02", "M03"]

# Total list of operations: OP00 .. OP14
process_names = [f"OP{op_index:02d}" for op_index in range(15)]

# Variant without OP13:
# process_names = [f"OP{op_index:02d}" for op_index in range(15) if op_index != 13]

labels = ["good", "bad"]

path_to_dataset = os.path.join(root_dir, "data")

X_data = []
y_data = []

# Visit every <dataset>/<machine>/<process>/<label> folder.
# Iteration order: process varies slowest, label varies fastest.
for process_name in process_names:
    for machine in machines:
        for label in labels:
            data_path = os.path.join(path_to_dataset, machine, process_name, label)
            data_list, data_label = data_loader_utils.load_tool_research_data(data_path, label=label)
            X_data += data_list
            y_data += data_label

### Data Preprocessing

### Split data into training and testing sets

In [None]:
print("y_data type: ", type(y_data))
print("X_data type: ", type(X_data))


# The class name is taken as the last '_'-separated token of every label string
item_list = [raw_label.rsplit('_', 1)[-1] for raw_label in y_data]
good_count = sum(1 for class_name in item_list if class_name == 'good')
bad_count = sum(1 for class_name in item_list if class_name == 'bad')

print(f"Number of 'good' items: {good_count}")
print(f"Number of 'bad' items: {bad_count}")
print(f"Total items: {len(item_list)}")


In [None]:
# Stratify on the class name (last '_'-separated token of each label) so the
# train and test splits keep the same good/bad ratio.
stratify_labels = [item.split('_')[-1] for item in y_data]

# Fixed seed: every other stochastic step in this notebook (SMOTE, Random Forest,
# LightGBM) uses random_state=42. Without a seed here the hold-out split, and
# therefore every metric reported below, would change on each run.
X_train, X_test, y_train, y_test = train_test_split(
    X_data,
    y_data,
    stratify=stratify_labels,
    test_size=0.3,
    random_state=42,
)


### Feature Extraction

In [None]:
# Feature extraction for the training split (called with include_metadata=False)
X_train_features, y_train_labels = transform_data(
    X_train,
    y_train,
    include_metadata=False,
)

# Same call, applied separately to the held-out test split
X_test_features, y_test_labels = transform_data(
    X_test,
    y_test,
    include_metadata=False,
)

In [None]:
# Class balance of the training split
train_class_counts = pd.Series(y_train_labels).value_counts()
print("Training set class distribution:")
print(train_class_counts)
print(f"Total training samples: {len(y_train)}")
print("\n")

# Class balance of the testing split
test_class_counts = pd.Series(y_test_labels).value_counts()
print("Testing set class distribution:")
print(test_class_counts)
print(f"Total testing samples: {len(y_test)}")


### Synthetic oversampling

In [None]:
from imblearn.over_sampling import SMOTE

# Seeded SMOTE oversampler; it is fitted on the training features only,
# the test split is left as it is.
smote = SMOTE(random_state=42, sampling_strategy='auto')

X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train_features,
    y_train_labels,
)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

# Clean Seaborn look for the figure
sns.set_theme(style='whitegrid')

plt.figure(figsize=(14, 7))

# Fixed class order with matching colours: "good" -> green, "bad" -> red
order = ['good', 'bad']
colors = ['green', 'red']

# Per-class counts in that fixed order; a class that is missing counts as 0
before_counts = pd.Series(y_train_labels).value_counts().reindex(order).fillna(0)
after_counts = pd.Series(y_train_resampled).value_counts().reindex(order).fillna(0)

# One panel per stage: left = before SMOTE, right = after SMOTE
panels = (
    (1, before_counts, 'Class Distribution Before SMOTE'),
    (2, after_counts, 'Class Distribution After SMOTE'),
)
for panel_index, class_counts, panel_title in panels:
    plt.subplot(1, 2, panel_index)
    sns.barplot(x=class_counts.index, y=class_counts.values, hue=class_counts.index, palette=colors, legend=False)
    plt.title(panel_title, fontsize=16, fontweight='bold')
    plt.xlabel('Class', fontsize=14)
    plt.ylabel('Count', fontsize=14)
    # Write the absolute count on top of every bar
    for bar_position, bar_height in enumerate(class_counts.values):
        plt.text(bar_position, bar_height, str(int(bar_height)), ha='center', va='bottom', fontsize=12)

# One shared legend centred below both panels
handles = [plt.Rectangle((0, 0), 1, 1, color=class_color) for class_color in colors]
plt.figlegend(handles, order, title="Classes", loc="lower center", ncol=len(order),
              bbox_to_anchor=(0.5, -0.05), fontsize=12, title_fontsize=14)

# Leave the bottom 5% of the figure free for the legend
plt.tight_layout(rect=[0, 0.05, 1, 1])
plt.show()
# plt.savefig('../export/class_distribution_before_after_smote.png', dpi=300)


### Model Fit

In [None]:
# Train Random Forest model on the SMOTE-resampled training set
RF = RandomForestClassifier(n_estimators=100, random_state=42)
RF.fit(X_train_resampled, y_train_resampled)

# Evaluate the model on the test features (these were not resampled)
y_pred = RF.predict(X_test_features)
print(classification_report(y_test_labels, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test_labels, y_pred))

# Feature importance, one row per feature column, most important first
feature_importances = pd.DataFrame(
    RF.feature_importances_,
    index=X_train_resampled.columns,
    columns=['importance']
).sort_values('importance', ascending=False)

# The heading and the number of printed rows come from one value, so they
# cannot drift apart (the heading previously said "Top 20" while 5 rows were shown).
top_n_features = 5
print(f"Top {top_n_features} most important features:")
print(feature_importances.head(top_n_features))

In [None]:
# Visualize Confusion Matrix
plt.figure(figsize=(10, 8))
cm = confusion_matrix(y_test_labels, y_pred)

# Row-normalise so that every true-label row sums to 1; this drives the colours
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = cm.astype('float') / row_totals

# Heatmap coloured by the normalised values, annotated with the raw counts
class_names = np.unique(y_test_labels)
sns.heatmap(
    cm_normalized,
    annot=cm,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)

plt.title('Random Forest Confusion Matrix - Model 1', fontsize=16, fontweight='bold')
plt.ylabel('True Label', fontsize=14)
plt.xlabel('Predicted Label', fontsize=14)
plt.tight_layout()
plt.show()
# plt.savefig('../export/random_forest_confusion_matrix.png', dpi=300)


In [None]:
# Create a function to generate LaTeX table from classification report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def classification_report_to_latex(y_true, y_pred, model_name='Model'):
    """
    Build a per-class metrics table and render it as a LaTeX table.

    Each class found in ``y_true`` is scored one-vs-rest (precision, recall,
    F1 and support). The overall accuracy and the model name are repeated on
    every row.

    Parameters:
    -----------
    y_true : array-like
        True labels
    y_pred : array-like
        Predicted labels
    model_name : str, default='Model'
        Name of the model to display in the table

    Returns:
    --------
    tuple of (str, pandas.DataFrame)
        The LaTeX formatted table of evaluation metrics, followed by the
        DataFrame it was rendered from (one row per class).
    """
    # Coerce to arrays so that `== cls` below is an element-wise comparison;
    # on a plain Python list it would collapse to a single bool.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    # Overall accuracy across all classes
    accuracy = accuracy_score(y_true, y_pred)

    # Per-class metrics, each class treated as the positive label in turn
    class_metrics = []
    for cls in np.unique(y_true):
        y_true_binary = (y_true == cls)
        y_pred_binary = (y_pred == cls)

        class_metrics.append({
            'Class': cls,
            'Precision': precision_score(y_true_binary, y_pred_binary),
            'Recall': recall_score(y_true_binary, y_pred_binary),
            'F1 Score': f1_score(y_true_binary, y_pred_binary),
            # Number of true samples belonging to this class
            'Support': np.sum(y_true_binary)
        })

    # Create a DataFrame for the metrics
    metrics_df = pd.DataFrame(class_metrics)

    # Add model name and accuracy as the two leading columns
    metrics_df.insert(0, 'Model', model_name)
    metrics_df.insert(1, 'Accuracy', accuracy)

    # Generate LaTeX table with floats rendered to four decimals
    latex_table = metrics_df.to_latex(index=False, float_format=lambda x: f"{x:.4f}")

    return latex_table, metrics_df

# Build the LaTeX table and its backing DataFrame for the Random Forest predictions
latex_table, metrics_df = classification_report_to_latex(
    y_test_labels,
    y_pred,
    'Random Forest',
)
print("LaTeX Table of Evaluation Metrics (Per Class):")
print(latex_table)

# Show the same metrics as a styled table, four decimals for every float column
print("\nEvaluation Metrics Summary (Per Class):")
four_decimals = '{:.4f}'
float_columns = ('Accuracy', 'Precision', 'Recall', 'F1 Score')
display(metrics_df.style.format({column: four_decimals for column in float_columns}))


In [None]:
# LightGBM Model

# Create and train LightGBM model on the SMOTE-resampled training set
lgb_model = lgb.LGBMClassifier(
    random_state=42,
    force_col_wise=True
)
lgb_model.fit(X_train_resampled, y_train_resampled)

# Evaluate the LightGBM model on the test features (these were not resampled)
y_pred_lgb = lgb_model.predict(X_test_features)
print("\nLightGBM Results:")
print(classification_report(y_test_labels, y_pred_lgb))
print("LightGBM Confusion Matrix:")
print(confusion_matrix(y_test_labels, y_pred_lgb))

# Feature importance for LightGBM, one row per feature column, most important first
lgb_feature_importances = pd.DataFrame(
    lgb_model.feature_importances_,
    index=X_train_resampled.columns,
    columns=['importance']
).sort_values('importance', ascending=False)

# The heading and the number of printed rows come from one value, so they
# cannot drift apart (the heading previously said "Top 20" while 5 rows were shown).
top_n_lgb_features = 5
print(f"\nTop {top_n_lgb_features} most important features (LightGBM):")
print(lgb_feature_importances.head(top_n_lgb_features))

In [None]:
# Metric columns reported for every cross-validated model
CV_METRIC_COLUMNS = ['accuracy', 'bad_precision', 'bad_recall', 'bad_f1',
                     'good_precision', 'good_recall', 'good_f1']


def _report_cv_results(results, model_name):
    """
    Print per-fold, mean and standard-deviation metrics of one CV run.

    Parameters:
    -----------
    results : pandas.DataFrame
        Table returned by `perform_cross_validation`; it is indexed with a
        'fold' column plus the columns in `CV_METRIC_COLUMNS`.
    model_name : str
        Name shown in the heading of the printed report

    Returns:
    --------
    tuple of (pandas.Series, pandas.Series)
        Mean and standard deviation of the metric columns across folds.
    """
    print(f"\n{model_name} Cross-Validation Results:")
    print("\nMetrics per fold:")
    print(results[['fold'] + CV_METRIC_COLUMNS])

    # Calculate and print mean/std metrics
    mean_values = results[CV_METRIC_COLUMNS].mean()
    std_values = results[CV_METRIC_COLUMNS].std()
    print("\nMean metrics:")
    print(mean_values)
    print("\nStandard deviation:")
    print(std_values)
    return mean_values, std_values


# Initialize models with regularization parameters
rf_model = RandomForestClassifier(
    n_estimators=100,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42
)

# NOTE: this rebinds `lgb_model`, replacing the LightGBM model fitted in the
# earlier hold-out evaluation cell.
lgb_model = lgb.LGBMClassifier(
    n_estimators=100,
    max_depth=10,
    min_child_samples=5,
    random_state=42,
    force_col_wise=True
)

# Perform cross-validation for Random Forest.
# The raw samples and labels are passed here (not the extracted features);
# NOTE(review): presumably feature extraction happens inside the helper - confirm.
print("Evaluating Random Forest model...")
rf_results = perform_cross_validation(rf_model, X_data, y_data, n_splits=5)
mean_metrics, std_metrics = _report_cv_results(rf_results, "Random Forest")


# Repeat for LightGBM, reported in exactly the same format.
# NOTE(review): assumes lgb_results has the same columns as rf_results
# (same helper produces both) - confirm.
print("\nEvaluating LightGBM model...")
lgb_results = perform_cross_validation(lgb_model, X_data, y_data, n_splits=5)
lgb_mean_metrics, lgb_std_metrics = _report_cv_results(lgb_results, "LightGBM")

In [None]:
# # Calculate and print mean/std metrics
# lgb_mean_metrics = lgb_results[['accuracy', 'bad_precision', 'bad_recall', 'bad_f1', 
#                         'good_precision', 'good_recall', 'good_f1']].mean()
# lgb_std_metrics = lgb_results[['accuracy', 'bad_precision', 'bad_recall', 'bad_f1', 
#                         'good_precision', 'good_recall', 'good_f1']].std()
# print("\nMean metrics:")
# print(lgb_mean_metrics)
# print("\nStandard deviation:")
# print(lgb_std_metrics)