# Experiment 1: Fine-tuning experiment

In [None]:
# Import dependencies
import os
import sys
from pathlib import Path

try:
    # Get the current working directory
    current_dir = os.getcwd()

    # Set the root directory to the parent of the current directory
    root_dir = Path(current_dir).parent

    # Add the root directory to sys.path so Python can find the utils module
    sys.path.append(str(root_dir))
    print(f"Added {root_dir} to Python path")

    # Standard libraries
    import numpy as np
    import pandas as pd
    import itertools
    import h5py

    # Data processing and visualization
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import signal, stats
    import pywt
    from tqdm import tqdm

    # Machine learning
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import classification_report, confusion_matrix
    from imblearn.over_sampling import SMOTE, SMOTENC
    import lightgbm as lgb
    

    # Custom utilities
    from utils import data_loader_utils
    from utils.feature_extraction import transform_data
    from utils.model_validation import perform_cross_validation

    from imblearn.over_sampling import SMOTE

    print("Dependencies loaded successfully ✅")
except Exception as e:
    print(f"Error loading dependencies: {e}")


### Create functions

In [None]:
def load_data(exclude_processes=None):
    """
    Collect the samples of every machine / process / label combination.

    Walks over all processes (OP00-OP14), machines (M01-M03) and labels
    (good/bad) and reads each ``<root_dir>/data/<machine>/<process>/<label>``
    folder through ``data_loader_utils.load_tool_research_data``.

    Args:
        exclude_processes (list, optional): Process names to leave out.
            When None or empty, every process is loaded.

    Returns:
        tuple: (X_data, y_data) lists holding the loaded samples and labels.
            If loading fails part-way, the error is printed and whatever
            was collected up to that point is returned.
    """
    machine_ids = ["M01", "M02", "M03"]
    quality_labels = ["good", "bad"]
    selected_processes = [f"OP{idx:02d}" for idx in range(15)]

    # Drop the processes the caller asked to skip
    if exclude_processes:
        selected_processes = [
            name for name in selected_processes if name not in exclude_processes
        ]

    dataset_root = os.path.join(root_dir, "data")

    X_data, y_data = [], []

    try:
        # One progress-bar tick per (process, machine, label) folder
        n_folders = len(selected_processes) * len(machine_ids) * len(quality_labels)

        with tqdm(total=n_folders, desc="Loading data") as progress:
            for process_name in selected_processes:
                for machine in machine_ids:
                    for label in quality_labels:
                        folder = os.path.join(dataset_root, machine, process_name, label)
                        samples, sample_labels = data_loader_utils.load_tool_research_data(
                            folder, label=label
                        )
                        X_data.extend(samples)
                        y_data.extend(sample_labels)
                        progress.update(1)
                        progress.set_postfix({"Samples": len(X_data)})

        print(f"Data loaded successfully ✅ - {len(X_data)} samples")
    except Exception as e:
        print(f"Error loading data: {e}")

    return X_data, y_data

In [None]:
def undersample_good_class_to_minimum(X_data, y_data, machines_to_reduce=('M01',)):
    """
    Undersample the 'good' class of the specified machine(s).

    For every machine in ``machines_to_reduce``, the number of 'good' samples
    per process is counted and each process of that machine is randomly
    sampled (fixed seed 42) down to the smallest of those counts. 'good'
    samples of all other machines and every 'bad' sample are kept untouched.

    Labels must consist of six underscore-separated fields:
    ``machine_month_year_process_sampleid_status``.

    Args:
        X_data (list): Samples, aligned with ``y_data``.
        y_data (list): Label strings in the format described above.
        machines_to_reduce (str or iterable of str): Machine name(s) whose
            'good' class is undersampled. Duplicates are ignored.

    Returns:
        tuple: (X_reduced, y_reduced) lists, ordered as: reduced 'good'
            samples, remaining 'good' samples, 'bad' samples.
    """
    import pandas as pd

    # allow passing a single machine name as a plain string
    if isinstance(machines_to_reduce, str):
        machines_to_reduce = [machines_to_reduce]
    # order-preserving de-duplication so a repeated machine is not sampled twice
    machines_to_reduce = list(dict.fromkeys(machines_to_reduce))

    # the label split below cannot produce six columns from an empty input
    if len(X_data) == 0:
        print("Original: 0 rows → Reduced: 0 rows")
        return [], []

    # build DataFrame and split out metadata
    df = pd.DataFrame({'data': X_data, 'label': y_data})
    df[['machine', 'month', 'year', 'process', 'sample_id', 'status']] = \
        df['label'].str.split('_', expand=True)

    good = df[df['status'] == 'good']
    bad = df[df['status'] == 'bad']

    # collect the pieces in a list and concatenate once at the end
    frames = []
    for machine in machines_to_reduce:
        machine_good = good[good['machine'] == machine]
        if machine_good.empty:
            continue
        per_process = machine_good['process'].value_counts()
        target = per_process.min()
        # sample each process down to this machine's smallest process count
        for proc in per_process.index:
            proc_samples = machine_good[machine_good['process'] == proc]
            frames.append(proc_samples.sample(n=target, random_state=42))

    # keep all good samples from other machines plus all bad samples
    frames.append(good[~good['machine'].isin(machines_to_reduce)])
    frames.append(bad)

    # skip empty pieces: concatenating empty frames is deprecated in pandas
    frames = [frame for frame in frames if not frame.empty]
    if frames:
        final_df = pd.concat(frames, axis=0).reset_index(drop=True)
    else:
        final_df = df.iloc[0:0]

    print(f"Original: {len(df)} rows → Reduced: {len(final_df)} rows")
    return final_df['data'].tolist(), final_df['label'].tolist()

In [None]:
# Load existing results or create a new DataFrame to store results.
# The path is the one store_results() and the export cell write to, so results
# saved in earlier sessions are actually picked up here.
import os

results_file = '../export/results_df.csv'
if os.path.exists(results_file):
    # Load existing results
    results_df = pd.read_csv(results_file)
    print(f"Loaded existing results with {len(results_df)} entries")
else:
    # Create a new, empty results table with the expected columns
    results_df = pd.DataFrame(columns=[
        'timestamp', 'M01_adoption', 'M02_adoption', 'M03_adoption',
        'accuracy', 'precision', 'recall', 'f1_score',
        'train_samples', 'test_samples',
        'good_train', 'bad_train', 'good_test', 'bad_test'
    ])
    print("Created new results DataFrame")

def store_results(result, M01, M02, M03, y_train_labels, y_test_labels):
    """
    Append one experiment run to the results table and save it as CSV.

    The table is (re)loaded from ``../export/results_df.csv`` when that file
    exists, otherwise an empty table is created. The new row is appended,
    the table is sorted by ``f1_score`` (best first) and written back to the
    same file. The module-level ``results_df`` is rebound to the new table.

    Args:
        result (dict): Model results; ``result['classification_report']`` must
            be the dict form of a classification report (keys ``'accuracy'``
            and ``'weighted avg'``).
        M01 (float): M01 adoption percentage
        M02 (float): M02 adoption percentage
        M03 (float): M03 adoption percentage
        y_train_labels (array-like): Training labels (0 = good, 1 = bad)
        y_test_labels (array-like): Test labels (0 = good, 1 = bad)

    Returns:
        pandas.DataFrame: The updated results table, sorted by ``f1_score``
            in descending order.
    """
    global results_df

    results_file = '../export/results_df.csv'

    if os.path.exists(results_file):
        # Load existing results
        results_df = pd.read_csv(results_file)
        print(f"Loaded existing results with {len(results_df)} entries")
    else:
        # Create a new DataFrame
        results_df = pd.DataFrame(columns=[
            'timestamp', 'M01_adoption', 'M02_adoption', 'M03_adoption',
            'accuracy', 'precision', 'recall', 'f1_score',
            'train_samples', 'test_samples',
            'good_train', 'bad_train', 'good_test', 'bad_test'
        ])
        print("Created new results DataFrame")

    # Get classification report metrics
    report = result['classification_report']

    # Convert to arrays first: comparing a plain list with 0 yields a single
    # False instead of an element-wise mask, which would count every class as 0
    y_train_labels = np.asarray(y_train_labels)
    y_test_labels = np.asarray(y_test_labels)

    # Calculate class distributions from the labels
    train_good = int(np.sum(y_train_labels == 0))
    train_bad = int(np.sum(y_train_labels == 1))
    test_good = int(np.sum(y_test_labels == 0))
    test_bad = int(np.sum(y_test_labels == 1))

    # Create new row with timestamp
    new_row = {
        'timestamp': pd.Timestamp.now(),
        'M01_adoption': M01,
        'M02_adoption': M02,
        'M03_adoption': M03,
        'accuracy': report['accuracy'],
        'precision': report['weighted avg']['precision'],
        'recall': report['weighted avg']['recall'],
        'f1_score': report['weighted avg']['f1-score'],
        'train_samples': len(y_train_labels),
        'test_samples': len(y_test_labels),
        'good_train': train_good,
        'bad_train': train_bad,
        'good_test': test_good,
        'bad_test': test_bad
    }
    new_row_df = pd.DataFrame([new_row])

    # Append to results DataFrame (concatenating with an empty frame is
    # deprecated in pandas, so an empty table is simply replaced)
    if results_df.empty:
        results_df = new_row_df
    else:
        results_df = pd.concat([results_df, new_row_df], ignore_index=True)

    # Sort by f1_score in descending order
    results_df = results_df.sort_values('f1_score', ascending=False)

    # Make sure the export folder exists before writing
    os.makedirs(os.path.dirname(results_file), exist_ok=True)
    results_df.to_csv(results_file, index=False)

    return results_df

In [None]:
def _split_by_machine_adoption(df, adoption):
    """Split each machine's rows into train/test parts by its adoption fraction."""
    train_parts, test_parts = [], []
    for machine, fraction in adoption.items():
        machine_rows = df[df['machine'] == machine]
        if machine_rows.empty:
            continue
        if fraction == 0:
            # machine not adopted: everything is held out for testing
            test_parts.append(machine_rows)
        elif fraction == 1:
            # machine fully adopted: everything is used for training
            train_parts.append(machine_rows)
        else:
            # partial adoption: stratify on status to keep the good/bad ratio
            train_rows, test_rows = train_test_split(
                machine_rows,
                train_size=fraction,
                stratify=machine_rows['status'],
                random_state=42
            )
            train_parts.append(train_rows)
            test_parts.append(test_rows)
    return train_parts, test_parts


def train_rf_with_adoption(X_data=None, y_data=None, M01=0, M02=0, M03=0, verbose=False):
    """
    Train a Random Forest model with machine-specific adoption.

    For each machine the given fraction of its samples goes into the training
    set and the rest into the test set (stratified on good/bad status). The
    training features are oversampled with SMOTE, a Random Forest is fitted
    and evaluated on the untouched test features, and the run is appended to
    the results table through ``store_results`` (using the labels from before
    oversampling).

    Labels must consist of six underscore-separated fields:
    ``machine_month_year_process_sampleid_status``.

    Args:
        X_data (list): Samples, aligned with ``y_data``. Required.
        y_data (list): Label strings in the format described above. Required.
        M01 (float): Fraction (0-1) of M01 data to include in training
        M02 (float): Fraction (0-1) of M02 data to include in training
        M03 (float): Fraction (0-1) of M03 data to include in training
        verbose (bool): Print split sizes, distributions and evaluation output.

    Returns:
        dict: Trained model, classification report, confusion matrix, feature
            importances, machine adoption fractions and status counts.

    Raises:
        ValueError: If the data is missing, a fraction lies outside [0, 1],
            or the chosen fractions leave the training or test set empty.
    """
    if X_data is None or y_data is None:
        raise ValueError("X_data and y_data must be provided (e.g. from load_data())")

    adoption = {'M01': M01, 'M02': M02, 'M03': M03}
    for machine, fraction in adoption.items():
        if not 0 <= fraction <= 1:
            raise ValueError(
                f"Adoption for {machine} must be between 0 and 1, got {fraction}"
            )

    # Create DataFrame and split the label into its metadata fields
    df = pd.DataFrame({'data': X_data, 'label': y_data})
    df[['machine', 'month', 'year', 'process', 'sample_id', 'status']] = df['label'].str.split('_', expand=True)

    # Process each machine separately
    train_parts, test_parts = _split_by_machine_adoption(df, adoption)

    # A model can neither be fitted nor evaluated on an empty set
    if not train_parts:
        raise ValueError("No training data selected: at least one machine with data needs adoption > 0")
    if not test_parts:
        raise ValueError("No test data left: at least one machine with data needs adoption < 1")

    # Combine all machine splits
    train_df = pd.concat(train_parts)
    test_df = pd.concat(test_parts)

    if verbose:
        print(f"Training set size: {len(train_df)} samples")
        print(f"Test set size: {len(test_df)} samples")
        print("\nMachine distribution in training set:")
        print(train_df['machine'].value_counts())
        print("\nStatus distribution in training set:")
        print(train_df['status'].value_counts())
        print("\nMachine distribution in test set:")
        print(test_df['machine'].value_counts())
        print("\nStatus distribution in test set:")
        print(test_df['status'].value_counts())

    # Status counts of both splits (before any oversampling)
    train_status_counts = train_df['status'].value_counts()
    test_status_counts = test_df['status'].value_counts()

    train_good = train_status_counts.get('good', 0)
    train_bad = train_status_counts.get('bad', 0)
    test_good = test_status_counts.get('good', 0)
    test_bad = test_status_counts.get('bad', 0)

    # Prepare data for training: only the raw samples and labels are needed
    X_train = train_df['data'].tolist()
    y_train = train_df['label'].tolist()
    X_test = test_df['data'].tolist()
    y_test = test_df['label'].tolist()

    # transform data to features and transform labels to 0 and 1
    X_train_features, y_train_labels = transform_data(X_train, y_train, include_metadata=False)
    X_test_features, y_test_labels = transform_data(X_test, y_test, include_metadata=False)

    # SMOTE oversampling on the training data only; the test set stays untouched
    smote = SMOTE(random_state=42)
    X_train_resampled, y_train_resampled = smote.fit_resample(X_train_features, y_train_labels)

    if verbose:
        print("X_train_resampled shape:", X_train_resampled.shape , "with smote")
        print("X_test_features shape:", X_test_features.shape , "without smote")

    # Train Random Forest classifier with optimized hyperparameters
    RF = RandomForestClassifier(max_features='log2',
                                n_estimators=150,
                                max_depth=15,
                                min_samples_leaf=1,
                                min_samples_split=2,
                                random_state=42)

    RF.fit(X_train_resampled, y_train_resampled)

    # Evaluate the model
    y_pred = RF.predict(X_test_features)
    conf_matrix = confusion_matrix(y_test_labels, y_pred)

    if verbose:
        print(classification_report(y_test_labels, y_pred))
        print("Confusion Matrix:")
        print(conf_matrix)

    # Feature importance
    # NOTE(review): assumes transform_data returns DataFrames (uses .columns) - confirm
    feature_importances = pd.DataFrame(
        RF.feature_importances_,
        index=X_train_resampled.columns,
        columns=['importance']
    ).sort_values('importance', ascending=False)

    if verbose:
        top_n = 5
        print(f"Top {top_n} most important features:")
        print(feature_importances.head(top_n))

    # Classification report as a dict so individual metrics can be stored
    report = classification_report(y_test_labels, y_pred, output_dict=True)

    # Create a dictionary with model and evaluation results
    result = {
        'model': RF,
        'classification_report': report,
        'confusion_matrix': conf_matrix,
        'feature_importances': feature_importances,
        'machine_adoption': adoption,
        'status_counts': {
            'train_good': train_good,
            'train_bad': train_bad,
            'test_good': test_good,
            'test_bad': test_bad
        }
    }

    # Store results in DataFrame
    store_results(result, M01, M02, M03, y_train_labels, y_test_labels)

    return result

### Run code

In [None]:
# Load the samples and labels of every process from all machines.
X_data, y_data = load_data()  # Load all processes
# X_data, y_data = load_data(exclude_processes=["OP00"])  # Exclude specific processes


In [None]:
# Undersample the 'good' class of M01 and M02: within each of these machines,
# every process is sampled down to that machine's smallest per-process 'good' count.
Xr, yr = undersample_good_class_to_minimum(X_data, y_data, machines_to_reduce=['M01','M02'])


In [None]:
# Sweep the share of M02 data used for training from 0% to 40% in 5% steps,
# always training on all M01 data and keeping M03 entirely in the test set.
for m02_share in (0.0, 0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4):
    train_rf_with_adoption(X_data=Xr, y_data=yr, M01=1.0, M02=m02_share, M03=0.0)

In [None]:
# Show all stored results, including their timestamps
print(results_df)

# Write the results table to the export folder
results_df.to_csv('../export/results_df.csv', index=False)

In [None]:
from EXP_improved import run_holdout_experiment_preloaded

In [None]:
# Run the hold-out experiment from EXP_improved with the same adoption setting
# (all of M01 for training, none of M02/M03). Note that the raw X_data/y_data
# are passed here, not the undersampled Xr/yr.
# NOTE(review): presumably machines_to_reduce makes the function undersample
# M01 and M02 itself - confirm in EXP_improved.
clf = run_holdout_experiment_preloaded(
    X_data=X_data,
    y_data=y_data,
    M01=1.0,
    M02=0.0,
    M03=0.0,
    machines_to_reduce=["M01","M02"],
    random_state=42,
    check_split=True
)