# Custom code for generating response functions & datasets:
- Currently, response functions are multi-dimensional sigmoids meaning all input-output relationships will be monotonic. Eventually, might be nice to support non-monotonic relationships as well, so that certain input features can have an "optimum" with worse performance on either side of the optimum.
- Also note: this currently only works for generating non-formulations datasets. Eventually, want to support formulations as well.

In [6]:
import numpy as np
import pandas as pd
from typing import List, Tuple, Optional

## These functions are doing most of the work:

### Convert ingredient recipe data tables from "Wide" to "Compact" format:

In [7]:
def wide_to_compact_format(df):
    """
    Convert formulation data from wide format to compact format.
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame in wide format where:
        - Each row is a formulation
        - Each column is an ingredient with its weight percentage
    
    Returns:
    pandas.DataFrame: Transformed DataFrame in compact format with columns:
        - component-1_identifier, component-1_amount, component-2_identifier, component-2_amount, etc.
    """
    # Create an empty list to store the transformed rows
    compact_rows = []
    
    # Iterate through each formulation (row)
    for idx, row in df.iterrows():
        # Get non-zero ingredients and their percentages
        ingredients = row[row > 0]
        
        # Create a new row with alternating ingredient names and percentages
        new_row = {}
        for i, (ingredient_name, percentage) in enumerate(ingredients.items(), 1):
            new_row[f'component-{0+i}_identifier'] = ingredient_name
            new_row[f'component-{0+i}_amount'] = percentage
            
        compact_rows.append(new_row)
    
    # Convert to DataFrame
    result_df = pd.DataFrame(compact_rows)
    
    return result_df

### Convert ingredient recipe data tables from "Compact" to "Wide" format:

In [8]:
def compact_to_wide_format(df):
    """
    Convert formulation data from compact format to wide format.
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame in compact format where:
        - Each row is a formulation
        - Columns alternate between ingredient names and weight percentages
    
    Returns:
    pandas.DataFrame: Transformed DataFrame in wide format where:
        - Each row is a formulation
        - Each column is an ingredient with its weight percentage
    """
    # Create a list to store the transformed rows
    wide_rows = []
    
    # Get all unique ingredients across all formulations
    ingredient_columns = [col for col in df.columns if 'Name' in col]
    all_ingredients = set()
    for col in ingredient_columns:
        all_ingredients.update(df[col].dropna().unique())
    
    # Process each formulation
    for idx, row in df.iterrows():
        # Create a dictionary with all ingredients initialized to 0
        formulation = {ingredient: 0 for ingredient in all_ingredients}
        
        # Fill in the actual values
        for i in range(1, len(df.columns) // 2 + 1):
            name_col = f'component-{0+i}_identifier'
            weight_col = f'component-{0+i}_amount'
            
            if name_col in df.columns and pd.notna(row[name_col]):
                ingredient_name = row[name_col]
                formulation[ingredient_name] = row[weight_col]
        
        wide_rows.append(formulation)
    
    # Convert to DataFrame
    result_df = pd.DataFrame(wide_rows)
    
    # Sort columns alphabetically for consistency
    result_df = result_df.reindex(sorted(result_df.columns), axis=1)
    
    return result_df

### Constrained Simplex Sampling

#### TODO: make this a little smarter; currently this is very bad at sampling from small constraint ranges

In [9]:
def sample_from_constrained_simplex(
    n_dimensions: int,
    constraints: Optional[List[Tuple[float, float]]] = None,
    max_attempts: int = 1000
):
    """
    Generate a random point from an N-dimensional simplex with optional element-wise constraints.
    
    Parameters:
        n_dimensions (int): Number of dimensions for the simplex
        constraints (List[Tuple[float, float]], optional): List of (min, max) constraints for each dimension.
            Use None for unconstrained dimensions. Example: [(0.2, 0.4), None, (0, 0.5)]
        max_attempts (int): Maximum number of attempts to find a valid solution
        
    Returns:
        numpy.ndarray: Array of N numbers between 0 and 1 that sum to 1 and satisfy constraints
        
    Raises:
        ValueError: If constraints are impossible to satisfy or if max_attempts is reached
    """

    if n_dimensions==0:
        sample = np.array([])
        return sample

    # Initialize constraints if not provided
    if constraints is None:
        constraints = [None] * n_dimensions
    elif len(constraints) != n_dimensions:
        raise ValueError("Length of constraints must match n_dimensions")
    
    # Validate constraints
    total_min = sum(c[0] for c in constraints if c is not None)
    if total_min > 1:
        raise ValueError("Sum of minimum constraints exceeds 1")
    
    for attempt in range(max_attempts):
        try:
            # Generate initial random sample
            sample = np.random.random(n_dimensions)
            sample = sample / np.sum(sample)  # Normalize to sum to 1
            
            # Apply constraints iteratively
            for _ in range(n_dimensions * 2):  # Allow multiple passes for adjustment
                modified = False
                
                # Adjust values to meet constraints
                for i, constraint in enumerate(constraints):
                    if constraint is not None:
                        min_val, max_val = constraint
                        if sample[i] < min_val:
                            deficit = min_val - sample[i]
                            # Take deficit proportionally from unconstrained elements
                            free_indices = [j for j, c in enumerate(constraints) 
                                         if c is None or (j != i and sample[j] > c[0])]
                            if not free_indices:
                                raise ValueError("Cannot satisfy minimum constraint")
                            weights = np.array([sample[j] for j in free_indices])
                            weights = weights / weights.sum()
                            for j, w in zip(free_indices, weights):
                                sample[j] -= deficit * w
                            sample[i] = min_val
                            modified = True
                        elif sample[i] > max_val:
                            excess = sample[i] - max_val
                            # Distribute excess proportionally to unconstrained elements
                            free_indices = [j for j, c in enumerate(constraints) 
                                         if c is None or (j != i and sample[j] < c[1])]
                            if not free_indices:
                                raise ValueError("Cannot satisfy maximum constraint")
                            sample[free_indices] += excess / len(free_indices)
                            sample[i] = max_val
                            modified = True
                
                # Normalize to sum to 1
                sample = sample / np.sum(sample)
                
                # Check if all constraints are satisfied
                constraints_satisfied = all(
                    c is None or (c[0] <= v <= c[1])
                    for c, v in zip(constraints, sample)
                )
                
                if constraints_satisfied and abs(sum(sample) - 1.0) < 1e-10:
                    return sample
                
                if not modified:
                    break
                    
        except ValueError:
            continue
            
    raise ValueError(f"Could not find valid solution after {max_attempts} attempts")

### TODO: allow user to add noise to the response functions (make use of the `noise` argument which currently does nothing)

In [10]:
### D-dimensional sigmoid function with the given set of D coefficients:
def sigmoid(input_row, coefs):
    value = 1 / (1 + np.exp(-1 * np.matmul(input_row, coefs)))
    return value


def build_sythetic_demo_dataset(inputs=5, outputs=1, num_rows=10, noise=0, coefs=None, output_format="compact"):

    ### TODO: allow user to add noise to the response functions (using the `noise` argument)
    
    if isinstance(inputs, int):
        num_inputs = inputs
    else:
        general_inputs = inputs["general"]
        formulation_inputs = inputs["formulation"]
        num_general_inputs = len(general_inputs)
        num_formulation_inputs = len(formulation_inputs)
        all_inputs = list(general_inputs) + list(formulation_inputs)
        num_inputs = len(all_inputs)
        if inputs["formulation"]:
            formulation_constraints = [(formulation_inputs[input_]["min"], formulation_inputs[input_]["max"]) for input_ in formulation_inputs]


    if isinstance(outputs, int):
        num_outputs = outputs
    else:
        num_outputs = len(outputs)  


    # Randomly set coefficients for the response function, if not set by the user   
    if coefs==None:
        coefs = np.array([[np.random.uniform(-1, 1) for i in range(num_inputs)] for k in range(num_outputs)])


    # Create pandas DataFrame for the response function coefficients & name the columns
    coefs_df = pd.DataFrame(coefs)
    if isinstance(inputs, int):
        coefs_df = coefs_df.rename(columns={i: f"x_{i+1}" for i in range(len(coefs_df.T))})
        coefs_df = coefs_df.rename(index={k: f"y_{k+1}" for k in range(len(coefs_df))})
    else:
        coefs_df = coefs_df.rename(columns={i: list(all_inputs)[i] for i in range(len(coefs_df.T))})
        coefs_df = coefs_df.rename(index={k: list(outputs)[k] for k in range(len(coefs_df))})

    
    # Generate input values
    if isinstance(inputs, int):
        num_inputs = inputs
        X = np.array([[np.random.uniform(-2, 2) for i in range(num_inputs)] for j in range(num_rows)])
    else:
        X_general = np.array([[np.random.uniform(-2, 2) for i in range(num_general_inputs)] for j in range(num_rows)])
        if inputs["formulation"]:
            X_formulation = np.array([sample_from_constrained_simplex(n_dimensions=num_formulation_inputs, constraints=formulation_constraints) for j in range(num_rows)])
            X = np.concatenate((X_general, X_formulation), axis=1)
        else:
            X = X_general


    # Generate output values
    y = list()
    for k in range(num_outputs):
        y.append(list())
        for row in X:
            y[k].append(sigmoid(row, coefs[k]))

    y = np.array(y)

    
    # Create pandas DataFrame for the generated data & name the columns
    data_df = pd.DataFrame()

    for i in range(num_inputs):
        if isinstance(inputs, int):
            data_df[f"x_{i+1}"] = X[:, i]
        else:
            data_df[all_inputs[i]] = X[:, i]
    
    for k in range(num_outputs):
        if isinstance(outputs, int):
            data_df[f"y_{k+1}"] = y[k]
        else:
            data_df[list(outputs)[k]] = y[k]


    ### TODO: clean this section up
    #################################
    if isinstance(inputs, int):
        pass
    else:
        df_scaled = data_df.copy()

        for col in df_scaled.columns:
            if col in general_inputs:
                scaled_col = (df_scaled[col].to_numpy() + 2) / 4
            else:
                scaled_col = df_scaled[col]
            df_scaled[col] = scaled_col

        all_columns = dict()
        # all_columns.update(all_inputs)
        all_columns.update(general_inputs)
        all_columns.update(formulation_inputs)
        all_columns.update(outputs)

        for col in all_columns:
            if col in general_inputs or col in outputs:
                df_scaled[col] = df_scaled[col] * (all_columns[col]["max"] - all_columns[col]["min"]) + all_columns[col]["min"]

        column_renaming = {col: f'{col}-{all_columns[col]["units"]}' for col in general_inputs or col in outputs}
        df_scaled = df_scaled.rename(column_renaming, axis=1)
        coefs_df = coefs_df.rename(column_renaming, axis=0)
        coefs_df = coefs_df.rename(column_renaming, axis=1)

        data_df = df_scaled

        if output_format == "compact":
            formulation_column_headers = list(formulation_inputs.keys())
            formulation_df = data_df[formulation_column_headers] * 100
            formulation_df = wide_to_compact_format(formulation_df)
            data_df = data_df.drop(labels=formulation_column_headers, axis=1)
            data_df = pd.concat([data_df, formulation_df], axis=1)
        elif output_format == "wide":
            pass
        else:
            raise ValueError("argument `output_format` must be either 'compact' or 'wide'.")
    #################################
    
    return data_df, coefs_df

## Examples

### Example 1: generate arbitrary # of rows & columns, with no column names

In [11]:
data_df, coefs_df = build_sythetic_demo_dataset(inputs=9, outputs=4, num_rows=10)
data_df

Unnamed: 0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,y_1,y_2,y_3,y_4
0,-1.57647,-0.414938,0.736815,0.576554,-1.57264,-0.327055,0.746466,1.215863,0.022918,0.990177,0.906285,0.344374,0.674261
1,-0.056419,-1.676106,-0.113009,-1.882063,-1.767548,0.794273,-0.156433,1.642772,1.555692,0.863576,0.531129,0.02446,0.87271
2,1.661459,0.343669,-0.087095,-0.269568,1.081128,-0.936237,-0.874002,1.091845,1.498568,0.343863,0.429754,0.026912,0.769079
3,-0.779343,-0.21389,1.612317,-1.203847,1.86979,0.408796,1.761511,1.127846,-1.306539,0.268464,0.665339,0.290294,0.287837
4,1.777105,-1.697093,1.290074,-1.527507,-0.607218,-0.9654,1.884138,-1.320587,1.949156,0.134835,0.62394,0.076517,0.369767
5,-0.215556,1.756279,-1.445565,-1.539579,-0.692488,1.183189,-0.94934,0.192995,-1.532091,0.062112,0.443031,0.682863,0.867476
6,1.32594,0.379207,-1.55161,-0.15144,1.764516,-0.268872,-1.72416,-0.836855,-1.957808,0.035645,0.054768,0.909892,0.738372
7,0.020882,1.018365,-0.310497,-0.354056,0.064289,-1.047336,1.569125,0.196023,1.086825,0.127408,0.702912,0.223447,0.140236
8,1.485061,0.098379,1.131291,0.840602,1.256186,-0.593138,-0.492896,0.386669,-0.613646,0.719237,0.768733,0.383183,0.890997
9,0.061813,1.820074,0.475303,-1.870663,1.110947,-0.087825,-1.084126,0.30562,-0.771604,0.08045,0.350234,0.05682,0.81816


In [12]:
coefs_df

Unnamed: 0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9
y_1,-0.52064,-0.666121,0.92754,0.957131,-0.949871,-0.519663,-0.680916,0.929009,-0.18847
y_2,0.599259,0.595478,0.76864,0.470921,-0.881737,0.067004,0.620027,0.659932,-0.301549
y_3,0.012116,-0.206737,-0.796563,0.987135,-0.028692,0.617148,0.562035,-0.768303,-0.935139
y_4,0.997019,0.054262,0.604781,-0.211323,-0.980429,-0.23575,-0.78974,0.811888,-0.809953


### Example 2: create a laser welding dataset with named columns

#### Assign "reasonable" ranges and desired units for each input & output column:

In [13]:
inputs = {
    "general": {
        "Laser Power": {"min": 100, "max": 1000, "units": "W"},
        "Pulse Duration": {"min": 0.1, "max": 10, "units": "ms"},
        "Welding Speed": {"min": 1, "max": 200, "units": "mm/s"},
        "Beam Diameter": {"min": 0.1, "max": 3, "units": "mm"},
        "Focal Position": {"min": -2, "max": 5, "units": "mm"},
        # "Shielding Gas Type": {"min": , "max": , "units": "n/a"},  # leave out categorical inputs for now
        "Flow Rate": {"min": 5, "max": 25, "units": "L/min"},
        "Heat Input": {"min": 10, "max": 500, "units": "J/mm"},
        "Ambient Temperature": {"min": 20, "max": 30, "units": "degC"},
        "Cooling Rate": {"min": 10, "max": 1000, "units": "degC/s"},
    },
    "formulation": {
        "Carbon": {"min": 0.0, "max": 0.0008, "units": "%"},
        "Manganese": {"min": 0.00, "max": 0.02, "units": "%"},
        "Molybdenum": {"min": 0.01, "max": 0.05, "units": "%"},
        "Nickel": {"min": 0.05, "max": 0.70, "units": "%"},
        "Chromium": {"min": 0.10, "max": 0.40, "units": "%"},
        "Iron": {"min": 0.0, "max": 1.0, "units": "%"},
        "Gold": {"min": 0.0, "max": 1.0, "units": "%"}
    },
}

outputs = {
    "Hardness": {"min": 200, "max": 800, "units": "HV"},
    "Fatigue Life": {"min": 10000, "max": 100000, "units": "numCycles"},
    "Wear Rate": {"min": 0.01, "max": 1.0, "units": "mg/m"},
    "Cutting Efficiency": {"min": 0.1, "max": 5, "units": "m/s"},
}

In [14]:
data_df, coefs_df = build_sythetic_demo_dataset(inputs=inputs, outputs=outputs, num_rows=30)
data_df

Unnamed: 0,Laser Power-W,Pulse Duration-ms,Welding Speed-mm/s,Beam Diameter-mm,Focal Position-mm,Flow Rate-L/min,Heat Input-J/mm,Ambient Temperature-degC,Cooling Rate-degC/s,Hardness,...,component-3_identifier,component-3_amount,component-4_identifier,component-4_amount,component-5_identifier,component-5_amount,component-6_identifier,component-6_amount,component-7_identifier,component-7_amount
0,900.501841,4.980385,2.564547,0.779615,-1.411224,24.050326,362.000721,29.18774,561.979038,207.869908,...,Molybdenum,5.0,Nickel,29.371987,Chromium,20.180051,Iron,30.246532,Gold,13.12143
1,410.22817,2.803031,53.401761,2.121122,0.70505,23.471194,195.661573,26.37123,142.294974,240.973735,...,Molybdenum,5.0,Nickel,21.75408,Chromium,27.373908,Iron,27.551723,Gold,16.240288
2,418.59198,8.942184,117.162873,1.867388,2.996369,11.645559,230.646377,23.281483,826.489299,686.106402,...,Molybdenum,5.0,Nickel,31.177412,Chromium,26.092661,Iron,25.248067,Gold,10.401859
3,634.783941,6.37409,48.420474,0.410473,4.306437,17.824438,47.033598,28.694397,90.189872,224.073967,...,Molybdenum,5.0,Nickel,36.915098,Chromium,40.0,Iron,7.834512,Gold,8.170389
4,119.184221,7.465102,187.992109,1.49552,0.437164,6.781652,402.133467,28.246615,573.120774,747.409868,...,Molybdenum,5.0,Nickel,10.558911,Chromium,22.753076,Iron,33.698427,Gold,25.909587
5,915.244637,7.818669,6.94976,2.862523,2.645079,15.927449,213.236911,20.634689,884.059362,251.586912,...,Molybdenum,5.0,Nickel,26.889515,Chromium,27.429001,Iron,23.81852,Gold,14.782964
6,432.546399,5.04161,180.395236,1.159416,1.067408,16.885458,330.588899,22.47333,283.237354,701.971466,...,Molybdenum,5.0,Nickel,30.010982,Chromium,20.740261,Iron,15.026375,Gold,27.142382
7,637.365487,2.2859,104.537847,1.385002,2.701831,16.495587,367.596906,29.106878,52.873431,212.532976,...,Molybdenum,5.0,Nickel,23.117356,Chromium,26.518987,Iron,15.334812,Gold,27.948845
8,296.956758,4.725901,46.217422,2.081679,2.465243,8.479097,429.602165,23.150752,154.834785,310.53672,...,Molybdenum,5.0,Nickel,12.531901,Chromium,27.184339,Iron,20.143786,Gold,33.059974
9,366.003426,0.966284,141.781415,1.448283,-1.896414,16.478815,419.371659,26.903738,986.426206,752.224034,...,Molybdenum,5.0,Nickel,26.910363,Chromium,17.606633,Iron,22.758546,Gold,25.644458


In [124]:
coefs_df

Unnamed: 0,Laser Power_W,Pulse Duration_ms,Welding Speed_mm/s,Beam Diameter_mm,Focal Position_mm,Flow Rate_L/min,Heat Input_J/mm,Ambient Temperature_degC,Cooling Rate_degC/s,Nickel,Chromium,Iron
Hardness,-0.691944,-0.767034,-0.621157,0.7684,-0.552964,0.906269,0.752854,0.480597,-0.414088,0.955222,-0.103719,-0.85142
Fatigue Life,-0.226403,-0.912723,-0.136404,-0.272832,-0.711493,0.31712,0.365346,0.750055,0.948254,0.386321,-0.717836,-0.58558
Wear Rate,0.201569,-0.323022,-0.0318,-0.68424,-0.69869,0.965247,0.405764,-0.176801,0.739038,-0.568402,-0.301719,-0.054176
Cutting Efficiency,0.137146,-0.868588,-0.143433,0.725122,0.370706,0.208146,0.846428,0.85189,0.528561,-0.794475,-0.478047,0.974847


## [Optional] Save result to Excel or CSV file: 

### Convert ingredient recipe data tables from "Wide" to "Compact" format:

In [60]:
# data_df.to_excel("Demo Datasets/Laser Welding (Synthetic)/laser_welding.xlsx", index=False)
# data_df.to_csv("Demo Datasets/Laser Welding (Synthetic)/laser_welding.csv", index=False)

data_df.to_csv("./laser_welding_with_formulation.csv", index=False)

# Done!