# DSS Thesis - Koen de Bonth

### Import packages

In [None]:
import os
import sys
from pathlib import Path

# Resolve the project root (the parent of the current working directory)
# so that Python can find the utils module.
current_dir = os.getcwd()
root_dir = Path(current_dir).parent

# Register the root only once: re-running this cell must not pile up
# duplicate entries in sys.path.
if str(root_dir) not in sys.path:
    sys.path.append(str(root_dir))
    print(f"Added {root_dir} to Python path")

In [None]:
from utils import data_loader_utils
from utils.feature_extraction import transform_data, prepare_train_test_data
import itertools 
import h5py
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pywt
import numpy as np
from scipy import signal,stats
from tqdm import tqdm
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split

### Loading and Preparing Data

In [None]:
# Machines, process steps and labels; their names are joined (in the order
# machine / process / label) to form each data folder path below.
machines = ["M01", "M02", "M03"]

# Operations OP00..OP14, leaving out OP07.
process_names = [f"OP{op:02d}" for op in range(15) if op != 7]

labels = ["good", "bad"]

path_to_dataset = os.path.join(root_dir, "data")

X_data = []
y_data = []

# Visit every process/machine/label folder (process outermost, label
# innermost) and accumulate the loaded samples together with their labels.
for process_name in process_names:
    for machine in machines:
        for label in labels:
            data_path = os.path.join(path_to_dataset, machine, process_name, label)
            data_list, data_label = data_loader_utils.load_tool_research_data(
                data_path, label=label
            )
            X_data += data_list
            y_data += data_label

# Keep only the text after the last underscore of each raw label.
y_data_label = [item.rsplit("_", 1)[-1] for item in y_data]

In [None]:
# Split data into training (70%) and testing (30%) sets.
# A fixed seed makes the split, and every metric computed from it,
# reproducible across runs; 42 matches the random_state already used for
# SMOTE and the random forest later in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    X_data, y_data, test_size=0.3, random_state=42
)

### Feature Extraction

In [None]:
# Extract features and labels from the training split (metadata excluded).
X_train_features, y_train_labels = transform_data(
    X_train,
    y_train,
    include_metadata=False,
)

In [None]:
# Extract features and labels from the test split (metadata excluded).
X_test_features, y_test_labels = transform_data(
    X_test,
    y_test,
    include_metadata=False,
)

In [None]:
# Display how many test samples carry each label value.
# Kept as a bare trailing expression so the notebook renders the result.
y_test_labels.value_counts()

### Synthetic oversampling

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the training features with SMOTE (seeded for repeatability);
# only the training split is passed in, the test features are not resampled.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train_features,
    y_train_labels,
)

### Model Fit

In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix

# Train Random Forest model on the SMOTE-resampled training set
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train_resampled, y_train_resampled)

# Evaluate the model on the (non-resampled) test features
y_pred = model.predict(X_test_features)
print(classification_report(y_test_labels, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test_labels, y_pred))

# Feature importance: one row per feature column, highest importance first
feature_importances = pd.DataFrame(
    model.feature_importances_,
    index=X_train_resampled.columns,
    columns=['importance']
).sort_values('importance', ascending=False)

# A single value drives both the heading and the number of rows shown, so
# the two cannot disagree (the heading used to say 20 while 5 rows were
# printed). Capped at the number of features actually available.
top_n = min(20, len(feature_importances))
print(f"Top {top_n} most important features:")
print(feature_importances.head(top_n))