# DSS Thesis - Koen de Bonth

### Import packages

In [None]:
import os
import sys
from pathlib import Path

# The project root is taken to be the parent of the directory this notebook
# is executed from.
current_dir = os.getcwd()
root_dir = Path(current_dir).parent

# Put the project root on the import path so the local ``utils`` package
# can be imported by the cells that follow.
sys.path += [str(root_dir)]
print(f"Added {root_dir} to Python path")

In [None]:
# Standard libraries
import numpy as np
import pandas as pd
import itertools
import h5py

# Data processing and visualization
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import signal, stats
import pywt
from tqdm import tqdm

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix
from imblearn.over_sampling import SMOTE
import lightgbm as lgb

# Custom utilities
from utils import data_loader_utils
from utils.feature_extraction import transform_data, prepare_train_test_data

### Loading and Preparing Data

In [None]:
machines = ["M01", "M02", "M03"]

# Operations OP00 .. OP14, leaving out OP07.
process_names = [f"OP{op_number:02d}" for op_number in range(15) if op_number != 7]

labels = ["good", "bad"]

path_to_dataset = os.path.join(root_dir, "data")

X_data = []
y_data = []

# Visit every <machine>/<operation>/<label> folder. The nesting reproduces the
# order of itertools.product(process_names, machines, labels): labels vary
# fastest, then machines, then operations.
for process_name in process_names:
    for machine in machines:
        for label in labels:
            data_path = os.path.join(path_to_dataset, machine, process_name, label)
            data_list, data_label = data_loader_utils.load_tool_research_data(data_path, label=label)
            X_data.extend(data_list)
            y_data.extend(data_label)

# Keep only the text after the last underscore of each label string.
y_data_label = [item.rsplit('_', 1)[-1] for item in y_data]

### Data Augmentation and Preprocessing

In [None]:
def augment_time_series_with_labels(X_data, y_data, augmentation_factor=2, random_state=None):
    """Augment time-series samples with noisy, shifted and scaled copies.

    Every original sample is kept (the same object, not a copy). For each of
    the ``augmentation_factor - 1`` rounds, three derived samples are appended
    right after it, in this order:

    1. additive Gaussian noise whose standard deviation is drawn from
       ``[0, 0.05)``,
    2. a circular shift by 5 to 19 positions along axis 0 (``np.roll``),
    3. a rescaling by a single factor drawn from ``[0.9, 1.1)``.

    The output therefore holds ``1 + 3 * (augmentation_factor - 1)`` entries
    per input sample (4 with the default), not ``augmentation_factor``.

    Args:
        X_data: Sized sequence of NumPy arrays (each must expose ``.shape``
            and support ``+`` / ``*``). Axis 0 is the one that gets shifted;
            presumably this is the time axis.
        y_data: Labels aligned with ``X_data``. Iteration stops at the
            shorter of the two sequences.
        augmentation_factor: Number of augmentation rounds plus one. Values
            of 1 or less return the originals only.
        random_state: ``None`` (default) draws from NumPy's global random
            state exactly as before, so ``np.random.seed`` still applies.
            An int seed or an ``np.random.RandomState`` instance makes the
            augmentation reproducible without touching the global state.

    Returns:
        A ``(augmented_data, augmented_labels)`` tuple of equally long lists.
        Each derived sample carries the label of the sample it came from.
    """
    # Resolve the random source once. RandomState offers the same
    # rand/normal/randint API as the np.random module, so the drawing code
    # below is identical for all three cases.
    if random_state is None:
        rng = np.random
    elif isinstance(random_state, np.random.RandomState):
        rng = random_state
    else:
        rng = np.random.RandomState(random_state)

    augmented_data = []
    augmented_labels = []

    for data, label in tqdm(zip(X_data, y_data), desc="Augmenting data", total=len(X_data)):
        # Original data
        augmented_data.append(data)
        augmented_labels.append(label)

        # The order of the random draws below is kept unchanged so that
        # callers who seed the global RNG get the same output as before.
        for _ in range(augmentation_factor - 1):
            # Add noise
            noise_level = 0.05 * rng.rand()
            noise = noise_level * rng.normal(0, 1, size=data.shape)
            augmented_data.append(data + noise)
            augmented_labels.append(label)  # Same label as original

            # Time shifting (circular: values rolled off the end re-enter at the start)
            shift = rng.randint(5, 20)
            shifted_data = np.roll(data, shift, axis=0)
            augmented_data.append(shifted_data)
            augmented_labels.append(label)  # Same label as original

            # Scaling
            scale_factor = 0.9 + 0.2 * rng.rand()  # Scale between 0.9 and 1.1
            augmented_data.append(data * scale_factor)
            augmented_labels.append(label)  # Same label as original

    return augmented_data, augmented_labels

In [None]:
# Build the augmented data set (originals plus noisy, shifted and scaled copies).
# NOTE(review): in the visible part of this notebook the train/test split that
# follows still uses X_data / y_data, so these results are never consumed —
# confirm whether that is intended. If they are used, augment only the
# training split so copies of a test sample cannot leak into training.
augmented_data, augmented_labels = augment_time_series_with_labels(X_data,y_data)

### Split data into training and testing sets

In [None]:
# Hold out 30% of the samples for testing. The seed is pinned so the split
# (and every metric computed from it) is reproducible, matching the
# random_state=42 already used for SMOTE and both classifiers.
# NOTE(review): the split is not stratified; consider stratify=y_data_label
# if the good/bad ratio should be identical in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.3, random_state=42)

### Feature Extraction

In [None]:
# Convert the raw training samples into features and labels using the project
# helper ``transform_data``, called with include_metadata=False.
# NOTE(review): the return types are defined in utils.feature_extraction; later
# cells call ``.columns`` and ``.value_counts()`` on the results, so they are
# presumably pandas objects — confirm.
X_train_features, y_train_labels = transform_data(X_train, y_train, include_metadata=False)

In [None]:
# Apply the same feature extraction to the held-out test samples, with the
# same include_metadata=False setting as for the training split.
X_test_features, y_test_labels = transform_data(X_test, y_test, include_metadata=False)

In [None]:
# Display how many test samples fall into each label value.
y_test_labels.value_counts()

### Synthetic oversampling

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training features/labels with SMOTE (seed fixed at 42).
# Only the training split is resampled; the test features are passed to the
# models unchanged.
smote = SMOTE(random_state=42)

X_train_resampled, y_train_resampled = smote.fit_resample(X_train_features, y_train_labels)

### Model Fit

In [None]:


# Train a Random Forest (100 trees, fixed seed) on the SMOTE-resampled training set
RF = RandomForestClassifier(n_estimators=100, random_state=42)
RF.fit(X_train_resampled, y_train_resampled)

# Evaluate on the untouched (not resampled) test features
y_pred = RF.predict(X_test_features)
print(classification_report(y_test_labels, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test_labels, y_pred))

# Feature importance: one row per feature column, highest importance first
feature_importances = pd.DataFrame(
    RF.feature_importances_,
    index=X_train_resampled.columns,
    columns=['importance']
).sort_values('importance', ascending=False)

# Print as many rows as the heading announces (previously only 5 were shown).
# head() simply returns fewer rows when there are fewer than 20 features.
print("Top 20 most important features:")
print(feature_importances.head(20))

In [None]:
# LightGBM Model

# Create and train LightGBM model on the SMOTE-resampled training set
lgb_model = lgb.LGBMClassifier(
    n_estimators=100,
    learning_rate=0.1,
    max_depth=5,
    random_state=42
)
lgb_model.fit(X_train_resampled, y_train_resampled)

# Evaluate the LightGBM model on the untouched (not resampled) test features
y_pred_lgb = lgb_model.predict(X_test_features)
print("\nLightGBM Results:")
print(classification_report(y_test_labels, y_pred_lgb))
print("LightGBM Confusion Matrix:")
print(confusion_matrix(y_test_labels, y_pred_lgb))

# Feature importance for LightGBM: one row per feature column, highest first.
# NOTE(review): no importance_type is passed, so LightGBM's default applies
# (split counts per its documentation); the values are then not on the same
# scale as the Random Forest importances — confirm before comparing the two.
lgb_feature_importances = pd.DataFrame(
    lgb_model.feature_importances_,
    index=X_train_resampled.columns,
    columns=['importance']
).sort_values('importance', ascending=False)

# Print as many rows as the heading announces (previously only 5 were shown).
# head() simply returns fewer rows when there are fewer than 20 features.
print("\nTop 20 most important features (LightGBM):")
print(lgb_feature_importances.head(20))