# DSS Thesis - Koen de Bonth

### Import packages

In [None]:
import os
import sys
from pathlib import Path

# Working directory the notebook was started from
current_dir = os.getcwd()

# The project root is taken to be one level above that directory
root_dir = Path(current_dir).parent

# Make the project root importable so the `utils` package can be resolved
sys.path.append(os.fspath(root_dir))
print(f"Added {root_dir} to Python path")

In [2]:
from utils import data_loader_utils
from utils.feature_extraction import transform_data, prepare_train_test_data
import itertools 
import h5py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pywt
import numpy as np
from scipy import signal,stats
from tqdm import tqdm
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

### Loading and Preparing Data

In [None]:
# Machines whose recordings are loaded
machines = ["M01", "M02", "M03"]

# Full list of operations, kept for reference:
# process_names = ["OP00","OP01","OP02","OP03","OP04","OP05","OP06","OP07","OP08","OP09","OP10","OP11","OP12","OP13","OP14"]

# Operations actually used: OP00..OP14 with OP07 left out
process_names = [f"OP{number:02d}" for number in range(15) if number != 7]

labels = ["good", "bad"]

path_to_dataset = os.path.join(root_dir, "data")

X_data = []
y_data = []

# Visit every <dataset>/<machine>/<process>/<label> folder.
# Labels vary fastest and processes slowest.
for process_name in process_names:
    for machine in machines:
        for label in labels:
            data_path = os.path.join(path_to_dataset, machine, process_name, label)
            data_list, data_label = data_loader_utils.load_tool_research_data(data_path, label=label)
            X_data += data_list
            y_data += data_label

# Keep only the text after the last underscore of every label string
y_data_label = [item.rsplit('_', 1)[-1] for item in y_data]

In [19]:
# Split data into training and testing sets.
# random_state is fixed, like the other seeded steps in this notebook
# (SMOTE, the feature-level split, the random forest), so that re-running
# the cell yields the same partition.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=42)

### Feature Extraction

In [None]:
def extract_wavelet_features(signal, wavelet='coif8', max_level=3):
    """
    Perform wavelet packet decomposition and extract statistical features.

    The signal is decomposed with ``pywt.WaveletPacket`` (symmetric boundary
    mode). For every node at ``max_level`` (in 'natural' order) seven
    statistics of the node's coefficients are computed: mean, max, min,
    standard deviation, kurtosis, skewness and Shannon entropy.

    Parameters:
    -----------
    signal : ndarray
        Input signal (1D array). Inside this function the name hides the
        ``signal`` module imported from scipy; the parameter name is kept
        so existing keyword callers continue to work.
    wavelet : str
        Wavelet to use (default: 'coif8')
    max_level : int
        Maximum decomposition level

    Returns:
    --------
    dict: Dictionary mapping ``"<statistic>_<node path>"`` to its value,
        e.g. ``"mean_aad"``.
    """
    # Small constant guarding against division by zero and log2(0)
    eps = 1e-10

    # Create wavelet packet
    wp = pywt.WaveletPacket(data=signal, wavelet=wavelet, mode='symmetric', maxlevel=max_level)

    features = {}

    # Walk the nodes at the maximum level and read their coefficients directly
    for level_node in wp.get_level(max_level, 'natural'):
        node = level_node.path
        coeffs = level_node.data

        # Extract statistical features
        features[f"mean_{node}"] = np.mean(coeffs)
        features[f"max_{node}"] = np.max(coeffs)
        features[f"min_{node}"] = np.min(coeffs)
        features[f"std_{node}"] = np.std(coeffs)
        features[f"kurtosis_{node}"] = stats.kurtosis(coeffs)
        features[f"skewness_{node}"] = stats.skew(coeffs)

        # Shannon entropy of the normalized coefficient magnitudes.
        # eps is added once to the total, not once per element, so the
        # weights still sum to ~1 even when the coefficients are tiny.
        magnitudes = np.abs(coeffs)
        coeffs_norm = magnitudes / (np.sum(magnitudes) + eps)
        entropy = -np.sum(coeffs_norm * np.log2(coeffs_norm + eps))
        features[f"entropy_{node}"] = entropy

    return features


# List to store all features (one dict per sample)
all_features = []

# Process each sample in X_data together with its label string
for sample, sample_label in zip(tqdm(X_data, desc="Extracting features"), y_data):
    sample_features = {}

    # Process each axis (channel); samples are indexed as sample[:, axis]
    for axis in range(sample.shape[1]):
        # Get signal for this axis.
        # Named axis_signal so the module-level `signal` imported from scipy
        # is not overwritten by this loop.
        axis_signal = sample[:, axis]

        # Apply wavelet packet transform and extract features
        wp_features = extract_wavelet_features(axis_signal, wavelet='coif8', max_level=3)

        # Add axis identifier to feature names
        sample_features.update(
            {f"axis{axis}_{key}": value for key, value in wp_features.items()}
        )

    # Add label and metadata parsed from the underscore-separated label string.
    # NOTE(review): assumes field 0 is the machine and field 1 the process;
    # confirm against the label format produced by the data loader.
    split_label = sample_label.split("_")
    sample_features['label'] = split_label[-1]
    sample_features['machine'] = split_label[0]
    sample_features['process'] = split_label[1]

    # Add to collection
    all_features.append(sample_features)

# Convert to DataFrame
features_df = pd.DataFrame(all_features)

# display(features_df)


In [None]:
# Turn the raw training split into a feature matrix and label vector with the
# project helper from utils.feature_extraction.
# NOTE(review): the exact output format and the effect of include_metadata are
# defined in that module; confirm there.
X_features, y_labels = transform_data(X_train, y_train, include_metadata=False)

### Synthetic oversampling

In [None]:
from imblearn.over_sampling import SMOTE

# SMOTE oversampler with a fixed seed so the resampling is repeatable
smote = SMOTE(random_state=42)

# Resample the training features and labels produced by transform_data.
# NOTE(review): the model-fit cell in this notebook trains on features_df,
# not on these resampled arrays; confirm that this is intended.
X_train_resampled, y_train_resampled = smote.fit_resample(X_features, y_labels)

### Model Fit

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Separate features and target.
# After dropping 'label' and 'machine' the frame still contains the
# string-valued 'process' column, which the random forest cannot consume.
# One-hot encode the remaining non-numeric columns; numeric columns pass
# through unchanged.
X = pd.get_dummies(features_df.drop(['label', 'machine'], axis=1))
y = features_df['label']

# Split data into training and testing sets (seeded for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Train Random Forest model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

# Evaluate the model on the held-out split
y_pred = model.predict(X_test)
print(classification_report(y_test, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# Feature importance, sorted from most to least important
feature_importances = pd.DataFrame(
    model.feature_importances_,
    index=X.columns,
    columns=['importance']
).sort_values('importance', ascending=False)

# One constant drives both the headline and the number of rows shown,
# so the message cannot disagree with the output.
top_n = 20
print(f"Top {top_n} most important features:")
print(feature_importances.head(top_n))

In [21]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(y_test, y_pred, class_names=('good', 'bad')):
    """
    Compute, print and plot a confusion matrix.

    Parameters:
    - y_test: array-like, true labels.
    - y_pred: array-like, predicted labels.
    - class_names: sequence of class names used as tick labels (optional,
      default ('good', 'bad')). When every label observed in y_test / y_pred
      is one of these names, they also fix the row/column order of the
      matrix, so each tick label sits next to its own class. Otherwise the
      matrix keeps scikit-learn's default (sorted) label order and the names
      are used as display labels only. If None, numeric labels
      '0'..'n-1' are used.
    """
    labels = None
    if class_names is not None:
        class_names = list(class_names)
        observed = (set(np.asarray(y_test).ravel().tolist())
                    | set(np.asarray(y_pred).ravel().tolist()))
        # Without an explicit order scikit-learn sorts the labels, which puts
        # 'bad' before 'good' and would mislabel the axes for the defaults.
        if observed and observed <= set(class_names):
            labels = class_names

    # Compute the confusion matrix
    cm = confusion_matrix(y_test, y_pred, labels=labels)
    print("Confusion Matrix:")
    print(cm)

    # Fall back to numeric tick labels when no class names were given
    if class_names is None:
        class_names = [str(i) for i in range(cm.shape[0])]

    # Create the plot
    fig, ax = plt.subplots(figsize=(5, 5))
    im = ax.imshow(cm, interpolation='nearest', cmap=plt.cm.Blues)
    ax.figure.colorbar(im, ax=ax)

    # Set the ticks and labels
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=class_names, yticklabels=class_names,
           title='Confusion Matrix',
           ylabel='Werkelijke label',
           xlabel='Voorspelde label')

    # Rotate the x tick labels for readability
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right",
             rotation_mode="anchor")

    # Write the count into every cell; use white text on dark cells
    fmt = 'd'
    thresh = cm.max() / 2.
    for i, j in np.ndindex(*cm.shape):
        ax.text(j, i, format(cm[i, j], fmt),
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black")

    fig.tight_layout()
    plt.show()


In [None]:
# Print and plot the confusion matrix for the current y_test / y_pred,
# using the function's default class names.
plot_confusion_matrix(y_test, y_pred)