# Custom code for generating response functions & datasets:
- Currently, response functions are multi-dimensional sigmoids meaning all input-output relationships will be monotonic. Eventually, might be nice to support non-monotonic relationships as well, so that certain input features can have an "optimum" with worse performance on either side of the optimum.
- Also note: this currently only works for generating non-formulations datasets. Eventually, want to support formulations as well.

In [118]:
import numpy as np
import pandas as pd
from typing import List, Tuple, Optional

## These functions are doing most of the work:

### Convert ingredient recipe data tables from "Wide" to "Compact" format:

In [119]:
def wide_to_compact_format(df):
    """
    Convert formulation data from wide format to compact format.
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame in wide format where:
        - Each row is a formulation
        - Each column is an ingredient with its weight percentage
    
    Returns:
    pandas.DataFrame: Transformed DataFrame in compact format with columns:
        - component-1_identifier, component-1_amount, component-2_identifier, component-2_amount, etc.
    """
    # Create an empty list to store the transformed rows
    compact_rows = []
    
    # Iterate through each formulation (row)
    for idx, row in df.iterrows():
        # Get non-zero ingredients and their percentages
        ingredients = row[row > 0]
        
        # Create a new row with alternating ingredient names and percentages
        new_row = {}
        for i, (ingredient_name, percentage) in enumerate(ingredients.items(), 1):
            new_row[f'component-{0+i}_identifier'] = ingredient_name
            new_row[f'component-{0+i}_amount'] = percentage
            
        compact_rows.append(new_row)
    
    # Convert to DataFrame
    result_df = pd.DataFrame(compact_rows)
    
    return result_df

### Convert ingredient recipe data tables from "Compact" to "Wide" format:

In [47]:
def compact_to_wide_format(df):
    """
    Convert formulation data from compact format to wide format.
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame in compact format where:
        - Each row is a formulation
        - Columns alternate between ingredient names and weight percentages
    
    Returns:
    pandas.DataFrame: Transformed DataFrame in wide format where:
        - Each row is a formulation
        - Each column is an ingredient with its weight percentage
    """
    # Create a list to store the transformed rows
    wide_rows = []
    
    # Get all unique ingredients across all formulations
    ingredient_columns = [col for col in df.columns if 'Name' in col]
    all_ingredients = set()
    for col in ingredient_columns:
        all_ingredients.update(df[col].dropna().unique())
    
    # Process each formulation
    for idx, row in df.iterrows():
        # Create a dictionary with all ingredients initialized to 0
        formulation = {ingredient: 0 for ingredient in all_ingredients}
        
        # Fill in the actual values
        for i in range(1, len(df.columns) // 2 + 1):
            name_col = f'component-{0+i}_identifier'
            weight_col = f'component-{0+i}_amount'
            
            if name_col in df.columns and pd.notna(row[name_col]):
                ingredient_name = row[name_col]
                formulation[ingredient_name] = row[weight_col]
        
        wide_rows.append(formulation)
    
    # Convert to DataFrame
    result_df = pd.DataFrame(wide_rows)
    
    # Sort columns alphabetically for consistency
    result_df = result_df.reindex(sorted(result_df.columns), axis=1)
    
    return result_df

### Constrained Simplex Sampling

#### TODO: make this a little smarter; currently this is very bad at sampling from small constraint ranges

In [48]:
def sample_from_constrained_simplex(
    n_dimensions: int,
    constraints: Optional[List[Tuple[float, float]]] = None,
    max_attempts: int = 1000
):
    """
    Generate a random point from an N-dimensional simplex with optional element-wise constraints.
    
    Parameters:
        n_dimensions (int): Number of dimensions for the simplex
        constraints (List[Tuple[float, float]], optional): List of (min, max) constraints for each dimension.
            Use None for unconstrained dimensions. Example: [(0.2, 0.4), None, (0, 0.5)]
        max_attempts (int): Maximum number of attempts to find a valid solution
        
    Returns:
        numpy.ndarray: Array of N numbers between 0 and 1 that sum to 1 and satisfy constraints
        
    Raises:
        ValueError: If constraints are impossible to satisfy or if max_attempts is reached
    """

    if n_dimensions==0:
        sample = np.array([])
        return sample

    # Initialize constraints if not provided
    if constraints is None:
        constraints = [None] * n_dimensions
    elif len(constraints) != n_dimensions:
        raise ValueError("Length of constraints must match n_dimensions")
    
    # Validate constraints
    total_min = sum(c[0] for c in constraints if c is not None)
    if total_min > 1:
        raise ValueError("Sum of minimum constraints exceeds 1")
    
    for attempt in range(max_attempts):
        try:
            # Generate initial random sample
            sample = np.random.random(n_dimensions)
            sample = sample / np.sum(sample)  # Normalize to sum to 1
            
            # Apply constraints iteratively
            for _ in range(n_dimensions * 2):  # Allow multiple passes for adjustment
                modified = False
                
                # Adjust values to meet constraints
                for i, constraint in enumerate(constraints):
                    if constraint is not None:
                        min_val, max_val = constraint
                        if sample[i] < min_val:
                            deficit = min_val - sample[i]
                            # Take deficit proportionally from unconstrained elements
                            free_indices = [j for j, c in enumerate(constraints) 
                                         if c is None or (j != i and sample[j] > c[0])]
                            if not free_indices:
                                raise ValueError("Cannot satisfy minimum constraint")
                            weights = np.array([sample[j] for j in free_indices])
                            weights = weights / weights.sum()
                            for j, w in zip(free_indices, weights):
                                sample[j] -= deficit * w
                            sample[i] = min_val
                            modified = True
                        elif sample[i] > max_val:
                            excess = sample[i] - max_val
                            # Distribute excess proportionally to unconstrained elements
                            free_indices = [j for j, c in enumerate(constraints) 
                                         if c is None or (j != i and sample[j] < c[1])]
                            if not free_indices:
                                raise ValueError("Cannot satisfy maximum constraint")
                            sample[free_indices] += excess / len(free_indices)
                            sample[i] = max_val
                            modified = True
                
                # Normalize to sum to 1
                sample = sample / np.sum(sample)
                
                # Check if all constraints are satisfied
                constraints_satisfied = all(
                    c is None or (c[0] <= v <= c[1])
                    for c, v in zip(constraints, sample)
                )
                
                if constraints_satisfied and abs(sum(sample) - 1.0) < 1e-10:
                    return sample
                
                if not modified:
                    break
                    
        except ValueError:
            continue
            
    raise ValueError(f"Could not find valid solution after {max_attempts} attempts")

### TODO: allow user to add noise to the response functions (make use of the `noise` argument which currently does nothing)

In [97]:
### D-dimensional sigmoid function with the given set of D coefficients:
def sigmoid(input_row, coefs):
    value = 1 / (1 + np.exp(-1 * np.matmul(input_row, coefs)))
    return value


def build_sythetic_demo_dataset(inputs=5, outputs=1, num_rows=10, noise=0, coefs=None, output_format="compact"):

    ### TODO: allow user to add noise to the response functions (using the `noise` argument)
    
    if isinstance(inputs, int):
        num_inputs = inputs
    else:
        general_inputs = inputs["general"]
        formulation_inputs = inputs["formulation"]
        num_general_inputs = len(general_inputs)
        num_formulation_inputs = len(formulation_inputs)
        all_inputs = list(general_inputs) + list(formulation_inputs)
        num_inputs = len(all_inputs)
        if inputs["formulation"]:
            formulation_constraints = [(formulation_inputs[input_]["min"], formulation_inputs[input_]["max"]) for input_ in formulation_inputs]


    if isinstance(outputs, int):
        num_outputs = outputs
    else:
        num_outputs = len(outputs)  


    # Randomly set coefficients for the response function, if not set by the user   
    if coefs==None:
        coefs = np.array([[np.random.uniform(-1, 1) for i in range(num_inputs)] for k in range(num_outputs)])


    # Create pandas DataFrame for the response function coefficients & name the columns
    coefs_df = pd.DataFrame(coefs)
    if isinstance(inputs, int):
        coefs_df = coefs_df.rename(columns={i: f"x_{i+1}" for i in range(len(coefs_df.T))})
        coefs_df = coefs_df.rename(index={k: f"y_{k+1}" for k in range(len(coefs_df))})
    else:
        coefs_df = coefs_df.rename(columns={i: list(all_inputs)[i] for i in range(len(coefs_df.T))})
        coefs_df = coefs_df.rename(index={k: list(outputs)[k] for k in range(len(coefs_df))})


    
    # Generate input values
    if isinstance(inputs, int):
        num_inputs = inputs
        X = np.array([[np.random.uniform(-2, 2) for i in range(num_inputs)] for j in range(num_rows)])
    else:
        X_general = np.array([[np.random.uniform(-2, 2) for i in range(num_general_inputs)] for j in range(num_rows)])
        if inputs["formulation"]:
            X_formulation = np.array([sample_from_constrained_simplex(n_dimensions=num_formulation_inputs, constraints=formulation_constraints) for j in range(num_rows)])
            X = np.concatenate((X_general, X_formulation), axis=1)
        else:
            X = X_general


    # Generate output values
    y = list()
    for k in range(num_outputs):
        y.append(list())
        for row in X:
            y[k].append(sigmoid(row, coefs[k]))

    y = np.array(y)

    
    # Create pandas DataFrame for the generated data & name the columns
    data_df = pd.DataFrame()

    for i in range(num_inputs):
        if isinstance(inputs, int):
            data_df[f"x_{i+1}"] = X[:, i]
        else:
            data_df[all_inputs[i]] = X[:, i]
    
    for k in range(num_outputs):
        if isinstance(outputs, int):
            data_df[f"y_{k+1}"] = y[k]
        else:
            data_df[list(outputs)[k]] = y[k]


    ### TODO: clean this section up
    #################################
    if isinstance(inputs, int):
        pass
    else:
        df_scaled = data_df.copy()

        for col in df_scaled.columns:
            if col in general_inputs:
                scaled_col = (df_scaled[col].to_numpy() + 2) / 4
            else:
                scaled_col = df_scaled[col]
            df_scaled[col] = scaled_col

        all_columns = dict()
        # all_columns.update(all_inputs)
        all_columns.update(general_inputs)
        all_columns.update(formulation_inputs)
        all_columns.update(outputs)

        for col in all_columns:
            if col in general_inputs or col in outputs:
                df_scaled[col] = df_scaled[col] * (all_columns[col]["max"] - all_columns[col]["min"]) + all_columns[col]["min"]

        column_renaming = {col: f'{col}-{all_columns[col]["units"]}' for col in general_inputs or col in outputs}
        df_scaled = df_scaled.rename(column_renaming, axis=1)
        coefs_df = coefs_df.rename(column_renaming, axis=0)
        coefs_df = coefs_df.rename(column_renaming, axis=1)

        data_df = df_scaled

        if output_format == "compact":
            formulation_column_headers = list(formulation_inputs.keys())
            formulation_df = data_df[formulation_column_headers] * 100
            formulation_df = wide_to_compact_format(formulation_df)
            data_df = data_df.drop(labels=formulation_column_headers, axis=1)
            data_df = pd.concat([data_df, formulation_df], axis=1)
        elif output_format == "wide":
            pass
        else:
            raise ValueError("argument `output_format` must be either 'compact' or 'wide'.")


    #################################
    
    return data_df, coefs_df

## Examples

### Example 1: generate arbitrary # of rows & columns, with no column names

In [98]:
data_df, coefs_df = build_sythetic_demo_dataset(inputs=9, outputs=4, num_rows=10)
data_df

Unnamed: 0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,y_1,y_2,y_3,y_4
0,0.765494,-0.369574,-0.986663,-1.093495,-1.270351,0.277561,-0.23031,-1.664897,-0.305443,0.354449,0.351779,0.980312,0.575495
1,1.072555,-1.846399,1.736851,0.782146,-1.781357,-1.747569,-0.461164,0.884466,1.223301,0.075037,0.47717,0.493077,0.729484
2,-1.113011,1.926073,-0.433582,-0.921556,0.051124,-0.93693,-1.097582,1.781055,1.909386,0.242376,0.690969,0.418274,0.397814
3,-1.360461,-0.50426,1.784987,0.428155,1.102322,-1.955303,-0.211978,0.674478,-0.137608,0.491216,0.212764,0.005229,0.498984
4,1.757835,0.261978,0.761351,1.749493,-1.003905,0.347835,0.868058,-1.870991,0.195429,0.705538,0.775295,0.673197,0.85326
5,-1.615234,1.859117,-1.901508,-1.786323,-1.064398,0.14995,0.900889,1.880116,0.284649,0.019877,0.615169,0.894491,0.021475
6,-0.83406,-1.1595,-1.139827,-1.302328,0.526911,-0.863734,0.309815,-0.290296,-0.475188,0.31918,0.091033,0.647986,0.279166
7,1.152777,-0.608273,1.66929,1.131314,1.586422,-0.118666,-0.997858,1.952837,0.541904,0.791038,0.768011,0.035531,0.833245
8,1.523456,1.006333,1.646714,-1.677205,0.639087,-1.485597,-0.614983,1.937117,-0.499467,0.03022,0.684527,0.067719,0.35273
9,-1.01198,-0.666873,1.451049,-1.122773,0.772853,1.149452,-1.932826,-1.787666,-0.833564,0.957452,0.110882,0.463119,0.864636


In [99]:
coefs_df

Unnamed: 0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9
y_1,-0.238667,-0.062006,-0.217714,0.896426,0.928577,0.553291,-0.843308,-0.721824,0.144933
y_2,0.501477,0.716428,-0.243719,0.728638,-0.304649,0.125098,-0.388056,0.46686,-0.301628
y_3,0.457239,-0.451116,-0.746492,-0.870909,-0.888746,0.916803,0.102573,-0.35693,0.824551
y_4,0.404247,-0.153513,0.262188,0.256328,0.412037,0.084666,-0.554929,-0.630477,0.656245


### Example 2: create a laser welding dataset with named columns

#### Assign "reasonable" ranges and desired units for each input & output column:

In [100]:
inputs = {
    "general": {
        "Laser Power": {"min": 100, "max": 1000, "units": "W"},
        "Pulse Duration": {"min": 0.1, "max": 10, "units": "ms"},
        "Welding Speed": {"min": 1, "max": 200, "units": "mm/s"},
        "Beam Diameter": {"min": 0.1, "max": 3, "units": "mm"},
        "Focal Position": {"min": -2, "max": 5, "units": "mm"},
        # "Shielding Gas Type": {"min": , "max": , "units": "n/a"},  # leave out categorical inputs for now
        "Flow Rate": {"min": 5, "max": 25, "units": "L/min"},
        "Heat Input": {"min": 10, "max": 500, "units": "J/mm"},
        "Ambient Temperature": {"min": 20, "max": 30, "units": "degC"},
        "Cooling Rate": {"min": 10, "max": 1000, "units": "degC/s"},
    },
    "formulation": {
        # "Carbon": {"min": 0.0, "max": 0.0008, "units": "%"},
        # "Manganese": {"min": 0.00, "max": 0.02, "units": "%"},
        # "Molybdenum": {"min": 0.01, "max": 0.05, "units": "%"},
        "Nickel": {"min": 0.05, "max": 0.50, "units": "%"},
        "Chromium": {"min": 0.10, "max": 0.40, "units": "%"},
        "Iron": {"min": 0.0, "max": 1.0, "units": "%"},
    },
}

outputs = {
    "Hardness": {"min": 200, "max": 800, "units": "HV"},
    "Fatigue Life": {"min": 10000, "max": 100000, "units": "numCycles"},
    "Wear Rate": {"min": 0.01, "max": 1.0, "units": "mg/m"},
    "Cutting Efficiency": {"min": 0.1, "max": 5, "units": "m/s"},
}

In [101]:
data_df, coefs_df = build_sythetic_demo_dataset(inputs=inputs, outputs=outputs, num_rows=15)
data_df

Unnamed: 0,Laser Power_W,Pulse Duration_ms,Welding Speed_mm/s,Beam Diameter_mm,Focal Position_mm,Flow Rate_L/min,Heat Input_J/mm,Ambient Temperature_degC,Cooling Rate_degC/s,Hardness,Fatigue Life,Wear Rate,Cutting Efficiency,component-1_identifier,component-1_amount,component-2_identifier,component-2_amount,component-3_identifier,component-3_amount
0,657.575122,1.151157,146.262355,0.15583,2.788576,6.129449,372.324325,22.082862,85.14496,631.301775,42146.420979,0.98877,1.554181,Nickel,39.31293,Chromium,40.0,Iron,20.68707
1,661.934298,6.492606,99.840703,1.730438,-1.368801,19.769217,439.659133,22.323202,971.948973,524.786372,91804.377557,0.423124,3.625762,Nickel,50.0,Chromium,12.379953,Iron,37.620047
2,971.039748,2.158018,16.229303,1.548285,0.527911,10.673586,474.453233,25.400855,672.77775,702.036639,33864.807104,0.893086,2.036269,Nickel,20.273311,Chromium,40.0,Iron,39.726689
3,677.337059,3.219265,158.098206,1.343191,0.335292,9.602302,194.59303,25.213809,527.418503,701.882322,70231.211852,0.471639,1.905093,Nickel,44.452173,Chromium,32.622152,Iron,22.925675
4,512.743099,8.854313,122.960011,2.103318,0.266086,9.327981,437.786943,28.217305,196.621813,395.59164,49370.964227,0.292087,4.602882,Nickel,39.839663,Chromium,32.975112,Iron,27.185225
5,521.282969,8.991222,2.091363,0.11434,-1.427585,5.301806,142.165159,25.192209,334.2356,430.231697,27561.179951,0.794086,0.616361,Nickel,24.840647,Chromium,36.650105,Iron,38.509247
6,270.727656,2.338608,192.359062,0.564671,1.231236,22.553831,180.364907,23.99439,282.41095,428.919131,91165.01828,0.335118,2.554446,Nickel,24.620873,Chromium,27.878304,Iron,47.500822
7,113.479813,3.315786,180.476581,0.559536,0.928877,13.131041,445.697199,21.059223,100.487756,548.443753,67023.598384,0.908601,3.049962,Nickel,13.464278,Chromium,34.428486,Iron,52.107236
8,198.546887,4.918065,143.788414,2.4772,-0.020826,5.538321,20.577703,25.305337,735.701547,778.410626,50930.301625,0.122549,1.95132,Nickel,47.958234,Chromium,10.0,Iron,42.041766
9,852.1457,0.582679,186.295189,2.66562,-0.324981,13.162578,236.320282,22.037341,619.639402,789.48284,39932.577991,0.158896,3.247074,Nickel,36.966974,Chromium,40.0,Iron,23.033026


In [87]:
coefs_df

Unnamed: 0,Laser Power_W,Pulse Duration_ms,Welding Speed_mm/s,Beam Diameter_mm,Focal Position_mm,Flow Rate_L/min,Heat Input_J/mm,Ambient Temperature_degC,Cooling Rate_degC/s,Nickel,Chromium,Iron
Hardness,0.415287,-0.502809,0.934319,0.510989,0.009964,0.462646,-0.753302,-0.740969,-0.434216,0.281781,-0.532926,0.968417
Fatigue Life,0.335154,0.193579,0.819928,0.413711,0.185451,-0.424339,0.807163,0.230036,0.208932,-0.86849,0.533784,0.010794
Wear Rate,0.291783,-0.625827,0.051901,0.499616,0.107761,0.206914,-0.383288,-0.050822,0.296147,0.06825,0.685964,-0.968839
Cutting Efficiency,-0.96434,-0.930754,-0.598236,0.067788,-0.940965,-0.722055,-0.667422,0.32203,0.994286,-0.702094,-0.405502,-0.355583


## [Optional] Save result to Excel or CSV file: 

### Convert ingredient recipe data tables from "Wide" to "Compact" format:

In [60]:
# data_df.to_excel("Demo Datasets/Laser Welding (Synthetic)/laser_welding.xlsx", index=False)
# data_df.to_csv("Demo Datasets/Laser Welding (Synthetic)/laser_welding.csv", index=False)

# data_df.to_csv("./laser_welding_with_formulation.csv", index=False)

# Done!