# Experiment 1: Fine-tuning experiment

In [None]:
# Import dependencies
#
# This cell puts the parent of the current working directory on sys.path (so
# the project-local `utils` package can be imported) and then imports every
# library used by the notebook.
import os
import sys
from pathlib import Path

try:
    # Get the current working directory
    current_dir = os.getcwd()

    # Set the root directory to the parent of the current directory
    # (root_dir is also read later by load_data() to locate the "data" folder)
    root_dir = Path(current_dir).parent

    # Add the root directory to sys.path so Python can find the utils module
    sys.path.append(str(root_dir))
    print(f"Added {root_dir} to Python path")

    # Numerical / data libraries
    import numpy as np
    import pandas as pd
    import itertools
    import h5py

    # Data processing and visualization
    import matplotlib.pyplot as plt
    import seaborn as sns
    from scipy import signal, stats
    import pywt
    from tqdm import tqdm

    # Machine learning
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.metrics import classification_report, confusion_matrix
    from imblearn.over_sampling import SMOTE, SMOTENC
    import lightgbm as lgb
    

    # Custom utilities (resolved through the sys.path entry added above)
    from utils import data_loader_utils
    from utils.feature_extraction import transform_data
    from utils.model_validation import perform_cross_validation

    # Duplicate of the SMOTE import in the machine-learning group; harmless.
    from imblearn.over_sampling import SMOTE

    print("Dependencies loaded successfully ✅")
except Exception as e:
    # NOTE: the error is only printed, not re-raised. Every import after the
    # failing one is skipped, so later cells may fail with a NameError on the
    # names that were never bound.
    print(f"Error loading dependencies: {e}")


### Create functions

In [None]:
def load_data(exclude_processes=None):
    """
    Collect samples and labels for every machine / process / label combination.

    Walks the "data" folder under the notebook's root directory, visiting
    <machine>/<process>/<label> for machines M01-M03, processes OP00-OP14 and
    the labels "good" and "bad".

    Args:
        exclude_processes (list, optional): Process names to skip while loading.

    Returns:
        tuple: (X_data, y_data) with the loaded samples and their labels. If an
        error occurs while loading, it is printed and whatever was collected
        up to that point is returned.
    """
    machine_ids = ["M01", "M02", "M03"]
    quality_labels = ["good", "bad"]
    selected_processes = [f"OP{index:02d}" for index in range(15)]

    # Drop the processes the caller asked to leave out, if any
    if exclude_processes:
        selected_processes = [
            name for name in selected_processes if name not in exclude_processes
        ]

    dataset_root = os.path.join(root_dir, "data")

    samples = []
    sample_labels = []

    try:
        # One progress-bar step per (process, machine, label) folder
        step_count = len(selected_processes) * len(machine_ids) * len(quality_labels)

        with tqdm(total=step_count, desc="Loading data") as progress:
            combinations = itertools.product(selected_processes, machine_ids, quality_labels)
            for process_name, machine, label in combinations:
                folder = os.path.join(dataset_root, machine, process_name, label)
                loaded, loaded_labels = data_loader_utils.load_tool_research_data(
                    folder, label=label
                )
                samples.extend(loaded)
                sample_labels.extend(loaded_labels)
                progress.update(1)
                progress.set_postfix({"Samples": len(samples)})

        print(f"Data loaded successfully ✅ - {len(samples)} samples")
    except Exception as e:
        print(f"Error loading data: {e}")

    return samples, sample_labels

In [None]:
def train_rf_with_adoption(exclude_process=None, M01=0, M02=0, M03=0):
    """
    Build a SMOTE-balanced training feature set with machine-specific adoption.

    Loads the data, splits each machine's samples into a training share and a
    test share (stratified by the sample status), extracts features with
    transform_data and oversamples the training set with SMOTE.

    Args:
        exclude_process (list, optional): Processes to exclude from loading.
            Defaults to None, which loads every process.
        M01 (float): Share (0-1) of M01 data to include in training. It is
            passed as train_size to train_test_split; 0 sends all M01 data to
            the test set.
        M02 (float): Share (0-1) of M02 data to include in training.
        M03 (float): Share (0-1) of M03 data to include in training.

    Returns:
        tuple: (X_train_features, y_train_labels) - the training features and
        labels after SMOTE oversampling.

    Raises:
        ValueError: If no data could be loaded, or if no machine contributes
            any training samples (e.g. every share is 0).
    """
    X_data, y_data = load_data(exclude_process)

    # Fail early with a clear message instead of an obscure pandas error when
    # the label column cannot be split into its six parts.
    if len(X_data) == 0:
        raise ValueError("No data loaded; cannot build train/test splits")

    # Create DataFrame; each label is split on '_' into its six metadata parts
    df = pd.DataFrame({'data': X_data, 'label': y_data})
    df[['machine', 'month', 'year', 'process', 'sample_id', 'status']] = df['label'].str.split('_', expand=True)

    train_dfs = []
    test_dfs = []

    # Process each machine separately
    for machine, percentage in [('M01', M01), ('M02', M02), ('M03', M03)]:
        machine_data = df[df['machine'] == machine]
        if machine_data.empty:
            continue

        if percentage > 0:
            # Stratify on status so both splits keep the machine's status mix
            train_samples, test_samples = train_test_split(
                machine_data,
                train_size=percentage,
                stratify=machine_data['status'],
                random_state=42
            )
            train_dfs.append(train_samples)
            test_dfs.append(test_samples)
        else:
            # A share of 0 means the machine is used for testing only
            test_dfs.append(machine_data)

    # Without training samples the feature extraction and SMOTE steps below
    # cannot run, so report that explicitly.
    if not train_dfs:
        raise ValueError(
            "No training data selected; set at least one of M01, M02, M03 "
            "above 0 for a machine that has data"
        )

    # Combine all machine splits. Every machine that contributed training
    # samples also contributed test samples, so test_dfs is non-empty here.
    train_df = pd.concat(train_dfs)
    test_df = pd.concat(test_dfs)

    print(f"Training set size: {len(train_df)} samples")
    print(f"Test set size: {len(test_df)} samples")
    print("\nMachine distribution in training set:")
    print(train_df['machine'].value_counts())
    print("\nStatus distribution in training set:")
    print(train_df['status'].value_counts())
    print("\nMachine distribution in test set:")
    print(test_df['machine'].value_counts())
    print("\nStatus distribution in test set:")
    print(test_df['status'].value_counts())

    # Only the raw data and the full label string are needed from here on
    X_train = train_df['data'].tolist()
    y_train = train_df['label'].tolist()
    X_test = test_df['data'].tolist()
    y_test = test_df['label'].tolist()

    # Transform data to features (per the original note, labels become 0 and 1)
    X_train_features, y_train_labels = transform_data(X_train, y_train, include_metadata=False)

    # NOTE(review): the test features are computed but neither used nor
    # returned yet; kept so the behaviour of transform_data on the test split
    # is unchanged until the evaluation step is added.
    X_test_features, y_test_labels = transform_data(X_test, y_test, include_metadata=False)

    # SMOTE oversampling on the training data only
    smote = SMOTE(random_state=42)
    X_train_features, y_train_labels = smote.fit_resample(X_train_features, y_train_labels)

    return X_train_features, y_train_labels

### Run code

In [None]:
# Example usage: load the raw samples and labels for inspection.
# NOTE: train_rf_with_adoption() calls load_data() itself, so running this
# cell is not required before the training cell.
X_data, y_data = load_data()  # Load all processes
# X_data, y_data = load_data(exclude_processes=["OP00"])  # Exclude specific processes

In [None]:
# Example: Use 50% of M01 data, 30% of M02 data, and 0% of M03 data in training
# NOTE(review): despite the variable names, the function returns the
# SMOTE-resampled training features and labels, not train/test DataFrames.
train_df, test_df = train_rf_with_adoption(M01=0.5, M02=0.3, M03=0)