# Experiment 1

In [None]:
import os
import sys
from pathlib import Path

try:
    # Resolve the project root as the parent of the notebook's working directory.
    current_dir = os.getcwd()
    root_dir = Path(current_dir).parent

    # Make the project root importable so the local `utils` package resolves.
    # Guarded so that re-running this cell does not keep appending duplicates.
    if str(root_dir) not in sys.path:
        sys.path.append(str(root_dir))
    print(f"Added {root_dir} to Python path")

    # Numerical / data libraries
    import numpy as np
    import pandas as pd
    import itertools
    import h5py

    # Data processing and visualization
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import signal, stats
    import pywt
    from tqdm import tqdm

    # Machine learning
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import classification_report, confusion_matrix
    from imblearn.over_sampling import SMOTE, SMOTENC
    import lightgbm as lgb

    # Custom utilities (found via the sys.path entry added above)
    from utils import data_loader_utils
    from utils.feature_extraction import transform_data, prepare_train_test_data
    from utils.model_validation import perform_cross_validation

    print("Dependencies loaded successfully ✅")
except Exception as e:
    print(f"Error loading dependencies: {e}")
    # Every later cell needs these names; stop here with the real traceback
    # instead of letting downstream cells fail with unrelated NameErrors.
    raise


In [None]:
# Dataset layout on disk: <root>/data/<machine>/<process>/<label>/
machines = ["M01", "M02", "M03"]
process_names = [f"OP{idx:02d}" for idx in range(15)]  # "OP00" .. "OP14"
labels = ["good", "bad"]

path_to_dataset = os.path.join(root_dir, "data")

X_data = []
y_data = []

for process_name, machine, label in itertools.product(process_names, machines, labels):
    data_path = os.path.join(path_to_dataset, machine, process_name, label)
    try:
        # The loader returns two sequences (samples, label strings), which are
        # accumulated into the flat X_data / y_data lists below.
        data_list, data_label = data_loader_utils.load_tool_research_data(data_path, label=label)
    except Exception as e:
        # Report which directory failed, then abort: continuing would leave
        # X_data / y_data partially filled and silently skew every later cell.
        print(f"Error loading data from {data_path}: {e}")
        raise
    X_data.extend(data_list)
    y_data.extend(data_label)

print(f"Data loaded successfully 📊 - {len(X_data)} samples")


In [None]:
# Gather every sample in one frame, then break each label string into its six
# underscore-separated fields.
df = pd.DataFrame({'data': X_data, 'label': y_data})
df[['machine', 'month', 'year', 'process', 'sample_id', 'status']] = (
    df['label'].str.split('_', expand=True)
)

# --- Training partition: machine M01 -----------------------------------------
X_M01 = df.loc[df["machine"].eq("M01")]
# NOTE(review): X_M01_drop is computed but X_train / y_train below are built
# from the unfiltered X_M01 — confirm whether the filter was meant to apply.
X_M01_drop = X_M01.loc[
    ~X_M01["process"].isin(
        ["OP01", "OP02", "OP04", "OP05", "OP06", "OP07", "OP08", "OP10", "OP11", "OP12"]
    )
]
X_train = X_M01['data'].tolist()
y_train = X_M01["label"].tolist()

print("\nDataset size:", len(X_train), sep="\n")

class_distribution_train = X_M01['status'].value_counts()
print("\nClass distribution:", class_distribution_train, sep="\n")

# --- Test partition: machines M02 and M03 ------------------------------------
X_M02_M03 = df.loc[df["machine"].isin(["M02", "M03"])]
X_test = X_M02_M03['data'].tolist()
y_test = X_M02_M03["label"].tolist()

print("\nDataset size:", len(X_test), sep="\n")

class_distribution_test = X_M02_M03['status'].value_counts()
print("\nClass distribution:", class_distribution_test, sep="\n")

In [None]:
# Turn the raw samples and their label strings into model inputs using the
# project helper `transform_data` (imported from utils.feature_extraction).
# Train and test go through the same call with include_metadata=False.
# NOTE(review): the returned objects are used later via `.shape` and
# `.columns`, so they are presumably DataFrames — confirm against the helper.
X_train_features, y_train_labels = transform_data(X_train, y_train, include_metadata=False)

X_test_features, y_test_labels = transform_data(X_test, y_test, include_metadata=False)

In [None]:
# Sanity check: report the dimensions of both feature sets.
print(f"X_train_features shape: {X_train_features.shape}")
print(f"X_test_features shape: {X_test_features.shape}")

In [None]:
# Oversample the training set only; the test features are left untouched so
# the evaluation is done on the original class distribution.
smote = SMOTE(
    sampling_strategy='auto',
    random_state=42,
)

X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train_features,
    y_train_labels,
)

print(f"X_train_resampled shape: {X_train_resampled.shape}")
print(f"X_test_features shape: {X_test_features.shape}")

In [None]:
# Train Random Forest model on the resampled training data
RF = RandomForestClassifier(
    n_estimators=150,
    max_depth=15,
    max_features='log2',
    min_samples_leaf=1,
    min_samples_split=2,
    random_state=42,  # fixed seed so the run is repeatable
)

RF.fit(X_train_resampled, y_train_resampled)

# Evaluate the model on the held-out test features
y_pred = RF.predict(X_test_features)
print(classification_report(y_test_labels, y_pred))
print("Confusion Matrix:")
print(confusion_matrix(y_test_labels, y_pred))

# Feature importance, one row per training column, highest first
feature_importances = pd.DataFrame(
    RF.feature_importances_,
    index=X_train_resampled.columns,
    columns=['importance']
).sort_values('importance', ascending=False)

# One constant drives both the heading and the number of rows shown, so the
# two cannot drift apart (the heading used to say 20 while 5 rows were shown).
N_TOP_FEATURES = 5
print(f"Top {N_TOP_FEATURES} most important features:")
print(feature_importances.head(N_TOP_FEATURES))

In [None]:
# Create a function to generate LaTeX table from classification report
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def classification_report_to_latex(y_true, y_pred, model_name='Model'):
    """
    Build a per-class metrics table and render it as LaTeX.

    For every distinct value in ``y_true`` the problem is reduced to
    one-vs-rest and precision, recall, F1 and support are computed. The
    overall accuracy and the model name are repeated on every row.

    Parameters:
    -----------
    y_true : array-like
        True labels. Lists, NumPy arrays and pandas Series are all accepted;
        the input is converted with ``np.asarray`` before use.
    y_pred : array-like
        Predicted labels, aligned position-by-position with ``y_true``.
    model_name : str, default='Model'
        Name of the model to display in the table

    Returns:
    --------
    tuple of (str, pandas.DataFrame)
        The LaTeX source of the table, and the DataFrame it was rendered
        from (columns: Model, Accuracy, Class, Precision, Recall, F1 Score,
        Support).
    """
    # Coerce to arrays so that `== cls` is always element-wise. With a plain
    # Python list the comparison would collapse to a single False.
    y_true = np.asarray(y_true)
    y_pred = np.asarray(y_pred)

    accuracy = accuracy_score(y_true, y_pred)

    class_metrics = []
    # Only classes that occur in y_true get a row.
    for cls in np.unique(y_true):
        # One-vs-rest view of the problem for this class
        y_true_binary = (y_true == cls)
        y_pred_binary = (y_pred == cls)

        class_metrics.append({
            'Class': cls,
            'Precision': precision_score(y_true_binary, y_pred_binary),
            'Recall': recall_score(y_true_binary, y_pred_binary),
            'F1 Score': f1_score(y_true_binary, y_pred_binary),
            'Support': int(y_true_binary.sum()),
        })

    metrics_df = pd.DataFrame(class_metrics)

    # Model name and accuracy go first so each row is self-describing
    metrics_df.insert(0, 'Model', model_name)
    metrics_df.insert(1, 'Accuracy', accuracy)

    latex_table = metrics_df.to_latex(index=False, float_format=lambda x: f"{x:.4f}")

    return latex_table, metrics_df

# Build the per-class metrics for the Random Forest predictions
latex_table, metrics_df = classification_report_to_latex(
    y_test_labels,
    y_pred,
    model_name='Random Forest',
)
print("LaTeX Table of Evaluation Metrics (Per Class):", latex_table, sep="\n")

# Same numbers as a rich table, with all float columns shown to 4 decimals
print("\nEvaluation Metrics Summary (Per Class):")
display(
    metrics_df.style.format(
        dict.fromkeys(['Accuracy', 'Precision', 'Recall', 'F1 Score'], '{:.4f}')
    )
)

In [None]:
# Use one explicit label set for both the matrix and the tick labels.
# confusion_matrix defaults to the union of true and predicted labels, so
# ticks taken from the true labels alone could disagree with the matrix.
class_labels = np.unique(
    np.concatenate([np.asarray(y_test_labels), np.asarray(y_pred)])
)
cm = confusion_matrix(y_test_labels, y_pred, labels=class_labels)

# Row-normalise for the colour scale; a class with no true samples keeps a
# row of zeros instead of producing NaN from a division by zero.
row_totals = cm.sum(axis=1, keepdims=True)
cm_normalized = np.divide(
    cm,
    row_totals,
    out=np.zeros(cm.shape, dtype=float),
    where=row_totals != 0,
)

fig, ax = plt.subplots(figsize=(10, 8))

# Colour encodes the per-row fraction; the annotations show the raw counts
sns.heatmap(cm_normalized, annot=cm, fmt='d', cmap='Blues',
            xticklabels=class_labels,
            yticklabels=class_labels,
            ax=ax)

ax.set_title('Random Forest Confusion Matrix - Model 2', fontsize=16, fontweight='bold')
ax.set_ylabel('True Label', fontsize=14)
ax.set_xlabel('Predicted Label', fontsize=14)
fig.tight_layout()
plt.show()