In [5]:
import numpy as np
import pandas as pd

import numpy as np
import pandas as pd

# Global configuration
# Each entry describes one process variable. Entries carrying a 'dist' key are
# sampled independently; entries carrying a 'formula' key are derived from the
# sampled columns by evaluating the expression string.
PROCESS_CONFIG: dict[str, dict[str, object]] = {
    # Independently sampled variables
    'welding_temp': dict(mean=350, std=10, dist='normal'),
    'clamp_pressure': dict(mean=5.0, std=0.5, dist='normal'),
    'robot_torque': dict(mean=120, std=15, dist='normal'),
    'paint_viscosity': dict(mean=0.85, std=0.1, dist='lognormal'),
    'conveyor_speed': dict(mean=1.5, std=0.2, dist='normal'),
    'ambient_temp': dict(mean=25, std=2, dist='normal'),
    'humidity': dict(dist='uniform', low=30, high=60),
    'sensor_drift': dict(dist='exponential', scale=0.1),
    'cycle_time': dict(dist='chi2', df=4),
    'quality_score': dict(dist='beta', a=2, b=5),

    # Dependent variables (expressions over the columns above plus noise(std))
    'robot_current': dict(formula='robot_torque * 0.08 + noise(0.5)'),
    'weld_time': dict(formula='350 / welding_temp + noise(0.01)'),
    'spray_pressure': dict(formula='np.sin(paint_viscosity * 3) * 5 + 10 + noise(0.5)'),
}

def simulate_normal_data(
    n_samples: int = 1000,
    seed: int = 42,
    config: "dict[str, dict[str, object]] | None" = None,
) -> pd.DataFrame:
    """
    Simulate in-control automotive manufacturing data with dependencies and mixed distributions.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    seed : int
        Value passed to ``np.random.seed`` before any draw is made.
    config : dict, optional
        Variable catalogue in the same layout as ``PROCESS_CONFIG``: entries
        with a ``'dist'`` key are sampled, entries with a ``'formula'`` key are
        evaluated afterwards. Defaults to the module-level ``PROCESS_CONFIG``.

    Returns
    -------
    pd.DataFrame
        One column per sampled variable (in config order), then one per formula
        variable (in config order), then ``faultNumber`` set to 0.

    Raises
    ------
    ValueError
        If a non-formula entry has a missing or unsupported ``'dist'`` value.
    """
    if config is None:
        config = PROCESS_CONFIG

    # The legacy global seed is kept so that the draw sequence (and therefore
    # every number produced for a given seed) stays exactly as before.
    np.random.seed(seed)

    def noise(std: float) -> np.ndarray:
        """Zero-mean Gaussian noise with one value per generated row."""
        return np.random.normal(0, std, size=n_samples)

    def sample_lognormal(cfg):
        # 'mean' and 'std' describe the variable itself, so convert them to the
        # parameters of the underlying normal distribution first.
        mu = np.log(cfg['mean']**2 / np.sqrt(cfg['std']**2 + cfg['mean']**2))
        sigma = np.sqrt(np.log(1 + (cfg['std']**2 / cfg['mean']**2)))
        return np.random.lognormal(mean=mu, sigma=sigma, size=n_samples)

    samplers = {
        'normal': lambda cfg: np.random.normal(loc=cfg['mean'], scale=cfg['std'], size=n_samples),
        'lognormal': sample_lognormal,
        'uniform': lambda cfg: np.random.uniform(low=cfg['low'], high=cfg['high'], size=n_samples),
        'exponential': lambda cfg: np.random.exponential(scale=cfg['scale'], size=n_samples),
        'chi2': lambda cfg: np.random.chisquare(df=cfg['df'], size=n_samples),
        'beta': lambda cfg: np.random.beta(a=cfg['a'], b=cfg['b'], size=n_samples),
    }

    df: pd.DataFrame = pd.DataFrame(index=range(n_samples))

    # Pass 1: independently sampled variables, in config order.
    for var, cfg in config.items():
        if 'formula' in cfg:
            continue
        dist = cfg.get('dist')
        if dist not in samplers:
            # Previously an unknown distribution was skipped silently, leaving
            # the column out of the frame and breaking formulas that use it.
            raise ValueError(
                f"Unsupported distribution {dist!r} for variable {var!r}; "
                f"expected one of {sorted(samplers)}"
            )
        df[var] = samplers[dist](cfg)

    # Pass 2: derived variables. The column mapping is rebuilt on every
    # iteration so a formula can refer to a formula column created before it.
    for var, cfg in config.items():
        if 'formula' not in cfg:
            continue
        local_vars = df.to_dict(orient='series')
        # NOTE: eval executes the formula string as Python code. That is fine
        # for the in-file config; never pass formulas from untrusted sources.
        df[var] = eval(cfg['formula'], {'np': np, 'noise': noise}, local_vars)

    df['faultNumber'] = 0  # Label all rows as in-control

    return df

# Generate a single 1000-row in-control run (seed left at its default) and
# print per-column summary statistics as a quick sanity check.
df_normal = simulate_normal_data(n_samples=1000)
print(df_normal.describe())


       welding_temp  clamp_pressure  robot_torque  paint_viscosity  \
count   1000.000000     1000.000000   1000.000000      1000.000000   
mean     350.193321        5.035418    120.087513         0.848449   
std        9.792159        0.498727     14.751814         0.102434   
min      317.587327        3.529806     74.707318         0.598787   
25%      343.524097        4.696879    110.280006         0.774259   
50%      350.253006        5.031539    119.996239         0.844196   
75%      356.479439        5.364441    129.913730         0.912838   
max      388.527315        6.596554    178.893566         1.234712   

       conveyor_speed  ambient_temp     humidity  sensor_drift   cycle_time  \
count     1000.000000   1000.000000  1000.000000   1000.000000  1000.000000   
mean         1.490145     24.906525    44.575508      0.094197     4.149612   
std          0.198476      2.014778     8.573862      0.091544     2.885172   
min          0.864659     19.200972    30.007288     

In [6]:
df_normal.head()  # Preview the leading rows of the simulated in-control data

Unnamed: 0,welding_temp,clamp_pressure,robot_torque,paint_viscosity,conveyor_speed,ambient_temp,humidity,sensor_drift,cycle_time,quality_score,robot_current,weld_time,spray_pressure,faultNumber
0,354.967142,5.699678,109.872326,0.674982,1.327301,24.152481,40.093511,0.102235,5.353317,0.358583,8.713244,1.00751,15.44926,0
1,348.617357,5.462317,117.83222,0.763176,1.493759,24.093172,36.336248,0.209956,1.628155,0.203676,9.635242,0.993919,13.294496,0
2,356.476885,5.029815,108.113701,0.804219,1.503603,21.408714,39.395462,0.054151,4.465059,0.111206,8.911211,0.992666,14.123443,0
3,365.230299,4.676532,115.380577,1.053298,1.594526,24.33982,35.365232,0.24193,3.79111,0.324653,10.315672,0.951083,9.927539,0
4,347.658466,5.349112,91.59578,0.901099,1.226628,26.465658,57.057483,0.498551,1.73842,0.1341,7.559896,0.998026,12.404414,0


In [11]:
import numpy as np
import pandas as pd
from typing import Tuple

# Catalogue of process variables. An entry with a 'dist' key is sampled from
# that distribution; an entry with a 'formula' key is computed from the sampled
# columns by evaluating the expression string.
PROCESS_CONFIG: dict[str, dict[str, object]] = {
    # --- sampled variables ---
    'welding_temp': {
        'mean': 350, 'std': 10, 'dist': 'normal',
    },
    'clamp_pressure': {
        'mean': 5.0, 'std': 0.5, 'dist': 'normal',
    },
    'robot_torque': {
        'mean': 120, 'std': 15, 'dist': 'normal',
    },
    'paint_viscosity': {
        'mean': 0.85, 'std': 0.1, 'dist': 'lognormal',
    },
    'conveyor_speed': {
        'mean': 1.5, 'std': 0.2, 'dist': 'normal',
    },
    'ambient_temp': {
        'mean': 25, 'std': 2, 'dist': 'normal',
    },
    'humidity': {
        'dist': 'uniform', 'low': 30, 'high': 60,
    },
    'sensor_drift': {
        'dist': 'exponential', 'scale': 0.1,
    },
    'cycle_time': {
        'dist': 'chi2', 'df': 4,
    },
    'quality_score': {
        'dist': 'beta', 'a': 2, 'b': 5,
    },
    # --- derived variables ---
    'robot_current': {
        'formula': 'robot_torque * 0.08 + noise(0.5)',
    },
    'weld_time': {
        'formula': '350 / welding_temp + noise(0.01)',
    },
    'spray_pressure': {
        'formula': 'np.sin(paint_viscosity * 3) * 5 + 10 + noise(0.5)',
    },
}

def noise(std: float, size: int) -> np.ndarray:
    """Draw `size` zero-mean Gaussian values with standard deviation `std`.

    Uses the global NumPy random state, so results depend on the most recent
    ``np.random.seed`` call.
    """
    samples = np.random.normal(loc=0, scale=std, size=size)
    return samples

def simulate_normal_data(
    n_samples: int,
    seed: int = 42,
    config: "dict[str, dict[str, object]] | None" = None,
) -> pd.DataFrame:
    """Simulate one in-control run of the process variables.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    seed : int
        Value passed to ``np.random.seed`` before any draw is made.
    config : dict, optional
        Variable catalogue laid out like ``PROCESS_CONFIG`` ('dist' entries are
        sampled, 'formula' entries are evaluated afterwards). Defaults to the
        module-level ``PROCESS_CONFIG``.

    Returns
    -------
    pd.DataFrame
        Sampled columns in config order, then formula columns in config order,
        then ``faultNumber`` set to 0.

    Raises
    ------
    ValueError
        If a non-formula entry has a missing or unsupported ``'dist'`` value.
    """
    if config is None:
        config = PROCESS_CONFIG

    # Global legacy seeding is retained so the draw sequence for a given seed
    # is unchanged.
    np.random.seed(seed)
    df = pd.DataFrame(index=range(n_samples))

    # Pass 1: independently sampled variables.
    for var, cfg in config.items():
        if 'formula' in cfg:
            continue
        dist = cfg.get('dist')
        if dist == 'normal':
            df[var] = np.random.normal(cfg['mean'], cfg['std'], n_samples)
        elif dist == 'lognormal':
            # Convert the variable's own mean/std into the parameters of the
            # underlying normal distribution.
            mu = np.log(cfg['mean']**2 / np.sqrt(cfg['std']**2 + cfg['mean']**2))
            sigma = np.sqrt(np.log(1 + (cfg['std']**2 / cfg['mean']**2)))
            df[var] = np.random.lognormal(mu, sigma, n_samples)
        elif dist == 'uniform':
            df[var] = np.random.uniform(cfg['low'], cfg['high'], n_samples)
        elif dist == 'exponential':
            df[var] = np.random.exponential(cfg['scale'], n_samples)
        elif dist == 'chi2':
            df[var] = np.random.chisquare(cfg['df'], n_samples)
        elif dist == 'beta':
            df[var] = np.random.beta(cfg['a'], cfg['b'], n_samples)
        else:
            # Previously an unknown distribution fell through silently, so the
            # column was missing and dependent formulas failed obscurely.
            raise ValueError(
                f"Unsupported distribution {dist!r} for variable {var!r}"
            )

    # Pass 2: derived variables. The column mapping is rebuilt each iteration
    # so a formula may reference a formula column created before it.
    for var, cfg in config.items():
        if 'formula' not in cfg:
            continue
        context = df.to_dict(orient='series')
        # NOTE: eval executes the formula string as Python code. That is fine
        # for the in-file config; never pass formulas from untrusted sources.
        df[var] = eval(cfg['formula'], {'np': np, 'noise': lambda std: noise(std, n_samples)}, context)

    df['faultNumber'] = 0  # every generated row is labelled in-control
    return df

def inject_fault(df: pd.DataFrame, fault_number: int) -> pd.DataFrame:
    """Return a copy of `df` with the requested fault pattern applied.

    Most faults act on one target column, chosen as
    ``features[fault_number % len(features)]`` where ``features`` are the
    columns of `df` other than the bookkeeping columns ``faultNumber``,
    ``simulationRun`` and ``sample``. Selecting by name (rather than assuming
    ``faultNumber`` is the last column) keeps bookkeeping columns from being
    corrupted when the frame has already been reordered or tagged.

    Fault patterns:
        1-3    add N(10, 2) noise to the target column
        4-6    add a linear ramp from 0 to 20
        7-9    add zero-mean noise with 5x the column's standard deviation
        10-12  hold the column at its first value
        13-14  add a sinusoid of amplitude 5 (10 full periods over the run)
        15-16  add spikes of +20 with probability 0.05 per row
        17     add a 0-to-5 ramp to the first five feature columns
        18     add 0.5 * cos(humidity / 10) to quality_score
        19-20  mirror the column as (column max - value)
        0      no change (rows are only relabelled 0)

    Parameters
    ----------
    df : pd.DataFrame
        Source data; it is not modified.
    fault_number : int
        Fault identifier in the range 0..20.

    Returns
    -------
    pd.DataFrame
        Modified copy with ``faultNumber`` set to `fault_number`.

    Raises
    ------
    ValueError
        If `fault_number` is outside 0..20 (previously such rows were relabelled
        without any fault being injected), or if `df` has no feature columns.
    """
    if not 0 <= fault_number <= 20:
        raise ValueError(
            f"fault_number must be between 0 and 20, got {fault_number!r}"
        )

    bookkeeping = ('faultNumber', 'simulationRun', 'sample')
    features = [c for c in df.columns if c not in bookkeeping]
    if not features:
        raise ValueError("df has no feature columns to inject a fault into")

    df_fault = df.copy()
    # NOTE: the seed depends only on fault_number, so every call with the same
    # fault number draws the same random perturbation. This also resets the
    # global NumPy random state.
    np.random.seed(fault_number + 100)

    n_rows = len(df)
    col = features[fault_number % len(features)]

    if fault_number in {1, 2, 3}:
        df_fault[col] += np.random.normal(10, 2, size=n_rows)
    elif fault_number in {4, 5, 6}:
        drift = np.linspace(0, 20, n_rows)
        df_fault[col] += drift
    elif fault_number in {7, 8, 9}:
        df_fault[col] += np.random.normal(0, 5 * df[col].std(), size=n_rows)
    elif fault_number in {10, 11, 12}:
        # An empty frame has no first value to hold; leave it untouched.
        if n_rows:
            stuck_value = df[col].iloc[0]
            df_fault[col] = stuck_value
    elif fault_number in {13, 14}:
        df_fault[col] += 5 * np.sin(np.linspace(0, 20 * np.pi, n_rows))
    elif fault_number in {15, 16}:
        outliers = np.random.choice([0, 20], size=n_rows, p=[0.95, 0.05])
        df_fault[col] += outliers
    elif fault_number == 17:
        for c in features[:5]:
            df_fault[c] += np.linspace(0, 5, n_rows)
    elif fault_number == 18:
        df_fault['quality_score'] += 0.5 * np.cos(df_fault['humidity'] / 10)
    elif fault_number in {19, 20}:
        df_fault[col] = df_fault[col].max() - df_fault[col]

    df_fault['faultNumber'] = fault_number
    return df_fault

def reorder_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return `df` with the bookkeeping columns moved to the front.

    The columns ``faultNumber``, ``simulationRun`` and ``sample`` (in that
    order) lead the result, followed by all other columns in their original
    order. Bookkeeping columns that are absent from `df` are skipped; before
    this fix a missing one raised ``KeyError`` even though the removal step
    already treated them as optional.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to reorder; it is not modified.

    Returns
    -------
    pd.DataFrame
        New frame with the same columns in the reordered sequence.
    """
    leading = ['faultNumber', 'simulationRun', 'sample']
    present = [c for c in leading if c in df.columns]
    remaining = [c for c in df.columns if c not in leading]
    return df[present + remaining]

def generate_dataset(
    n_normal_simulations: int = 5,
    n_faulty_simulations_per_fault: int = 5,
    n_samples_per_simulation: int = 200
) -> Tuple[pd.DataFrame, pd.DataFrame]:
    """Build the fault-free and faulty datasets as two stacked DataFrames.

    Fault-free runs use run ids ``1..n_normal_simulations``; each run id is
    also the seed of its simulation. Faulty runs continue the run-id sequence
    and cover fault numbers 1..20, with `n_faulty_simulations_per_fault` runs
    per fault. Every run is tagged with ``simulationRun`` and a 1-based
    ``sample`` counter, then passed through ``reorder_columns``.

    Parameters
    ----------
    n_normal_simulations : int
        Number of fault-free runs.
    n_faulty_simulations_per_fault : int
        Number of runs generated for each fault number.
    n_samples_per_simulation : int
        Number of rows per run.

    Returns
    -------
    Tuple[pd.DataFrame, pd.DataFrame]
        ``(fault_free, faulty)``. A side with zero runs is returned as an empty
        DataFrame instead of raising from ``pd.concat``.
    """
    def _tag_run(run: pd.DataFrame, run_id: int) -> pd.DataFrame:
        # Attach run id and 1-based sample counter, then move them up front.
        run['simulationRun'] = run_id
        run['sample'] = np.arange(1, n_samples_per_simulation + 1)
        return reorder_columns(run)

    def _stack(runs: list) -> pd.DataFrame:
        # pd.concat raises ValueError on an empty list, so guard that case.
        if not runs:
            return pd.DataFrame()
        return pd.concat(runs, ignore_index=True)

    normal_runs = [
        _tag_run(simulate_normal_data(n_samples_per_simulation, seed=run_id), run_id)
        for run_id in range(1, n_normal_simulations + 1)
    ]

    faulty_runs = []
    run_id = n_normal_simulations + 1  # continue numbering after the normal runs
    for fault_id in range(1, 21):  # fault numbers 1..20
        for _ in range(n_faulty_simulations_per_fault):
            run = simulate_normal_data(n_samples_per_simulation, seed=run_id)
            run = inject_fault(run, fault_number=fault_id)
            faulty_runs.append(_tag_run(run, run_id))
            run_id += 1

    df_ff_training = _stack(normal_runs)
    df_f_training = _stack(faulty_runs)
    return df_ff_training, df_f_training

if __name__ == '__main__':
    # Build both datasets with the default run counts and sample size:
    # df_ff_training holds the fault-free runs, df_f_training the faulty runs.
    df_ff_training, df_f_training = generate_dataset()
    print(df_ff_training.columns[:10])  # Check order of first columns
    # Show the three bookkeeping columns for the first few rows.
    print(df_ff_training[['faultNumber', 'simulationRun', 'sample']].head())


Index(['faultNumber', 'simulationRun', 'sample', 'welding_temp',
       'clamp_pressure', 'robot_torque', 'paint_viscosity', 'conveyor_speed',
       'ambient_temp', 'humidity'],
      dtype='object')
   faultNumber  simulationRun  sample
0            0              1       1
1            0              1       2
2            0              1       3
3            0              1       4
4            0              1       5


In [13]:
df_f_training.head()  # Preview the leading rows of the faulty training data

Unnamed: 0,faultNumber,simulationRun,sample,welding_temp,clamp_pressure,robot_torque,paint_viscosity,conveyor_speed,ambient_temp,humidity,sensor_drift,cycle_time,quality_score,robot_current,weld_time,spray_pressure
0,1,6,1,346.882163,20.161566,120.079132,0.738092,1.473141,22.162602,38.009544,0.209386,1.215002,0.158443,9.947462,1.000058,13.182958
1,1,6,2,357.290039,16.78252,119.457177,0.770183,1.604219,24.678117,41.895839,0.059237,6.852859,0.264833,10.184109,0.977341,13.546232
2,1,6,3,352.178208,16.082412,140.807782,0.935254,1.322614,25.427862,31.043206,0.041497,2.134372,0.284078,10.91625,0.992462,12.162262
3,1,6,4,341.009082,16.794704,103.901618,0.92843,1.345022,27.563385,44.483501,0.031777,11.115552,0.319943,8.289525,1.021662,11.418193
4,1,6,5,325.132193,16.538405,102.790097,0.781213,1.25111,20.234156,36.866923,0.00258,9.405903,0.532255,7.683254,1.072635,13.602256
