# Custom code for generating response functions & datasets:
- Currently, response functions are multi-dimensional sigmoids meaning all input-output relationships will be monotonic. Eventually, might be nice to support non-monotonic relationships as well, so that certain input features can have an "optimum" with worse performance on either side of the optimum.
- Also note: this currently only works for generating non-formulations datasets. Eventually, want to support formulations as well.

In [1]:
import numpy as np
import pandas as pd
from typing import List, Tuple, Optional

## These functions are doing most of the work:

### Constrained Simplex Sampling

#### TODO: make this a little smarter; currently this is very bad at sampling from small constraint ranges

In [2]:
def sample_from_constrained_simplex(
    n_dimensions: int,
    constraints: Optional[List[Tuple[float, float]]] = None,
    max_attempts: int = 1000
):
    """
    Generate a random point from an N-dimensional simplex with optional element-wise constraints.
    
    Parameters:
        n_dimensions (int): Number of dimensions for the simplex
        constraints (List[Tuple[float, float]], optional): List of (min, max) constraints for each dimension.
            Use None for unconstrained dimensions. Example: [(0.2, 0.4), None, (0, 0.5)]
        max_attempts (int): Maximum number of attempts to find a valid solution
        
    Returns:
        numpy.ndarray: Array of N numbers between 0 and 1 that sum to 1 and satisfy constraints
        
    Raises:
        ValueError: If constraints are impossible to satisfy or if max_attempts is reached
    """
    # Initialize constraints if not provided
    if constraints is None:
        constraints = [None] * n_dimensions
    elif len(constraints) != n_dimensions:
        raise ValueError("Length of constraints must match n_dimensions")
    
    # Validate constraints
    total_min = sum(c[0] for c in constraints if c is not None)
    if total_min > 1:
        raise ValueError("Sum of minimum constraints exceeds 1")
    
    for attempt in range(max_attempts):
        try:
            # Generate initial random sample
            sample = np.random.random(n_dimensions)
            sample = sample / np.sum(sample)  # Normalize to sum to 1
            
            # Apply constraints iteratively
            for _ in range(n_dimensions * 2):  # Allow multiple passes for adjustment
                modified = False
                
                # Adjust values to meet constraints
                for i, constraint in enumerate(constraints):
                    if constraint is not None:
                        min_val, max_val = constraint
                        if sample[i] < min_val:
                            deficit = min_val - sample[i]
                            # Take deficit proportionally from unconstrained elements
                            free_indices = [j for j, c in enumerate(constraints) 
                                         if c is None or (j != i and sample[j] > c[0])]
                            if not free_indices:
                                raise ValueError("Cannot satisfy minimum constraint")
                            weights = np.array([sample[j] for j in free_indices])
                            weights = weights / weights.sum()
                            for j, w in zip(free_indices, weights):
                                sample[j] -= deficit * w
                            sample[i] = min_val
                            modified = True
                        elif sample[i] > max_val:
                            excess = sample[i] - max_val
                            # Distribute excess proportionally to unconstrained elements
                            free_indices = [j for j, c in enumerate(constraints) 
                                         if c is None or (j != i and sample[j] < c[1])]
                            if not free_indices:
                                raise ValueError("Cannot satisfy maximum constraint")
                            sample[free_indices] += excess / len(free_indices)
                            sample[i] = max_val
                            modified = True
                
                # Normalize to sum to 1
                sample = sample / np.sum(sample)
                
                # Check if all constraints are satisfied
                constraints_satisfied = all(
                    c is None or (c[0] <= v <= c[1])
                    for c, v in zip(constraints, sample)
                )
                
                if constraints_satisfied and abs(sum(sample) - 1.0) < 1e-10:
                    return sample
                
                if not modified:
                    break
                    
        except ValueError:
            continue
            
    raise ValueError(f"Could not find valid solution after {max_attempts} attempts")

### TODO: allow user to add noise to the response functions (make use of the `noise` argument which currently does nothing)

In [78]:
### D-dimensional sigmoid function with the given set of D coefficients:
def sigmoid(input_row, coefs):
    value = 1 / (1 + np.exp(-1 * np.matmul(input_row, coefs)))
    return value


def build_sythetic_demo_dataset(inputs=5, outputs=1, num_rows=10, noise=0, coefs=None):

    ### TODO: allow user to add noise to the response functions (using the `noise` argument)
    
    if isinstance(inputs, int):
        num_inputs = inputs
    else:
        general_inputs = inputs["general"]
        formulation_inputs = inputs["formulation"]
        all_inputs = list(general_inputs) + list(formulation_inputs)
        if inputs["formulation"]:
            num_general_inputs = len(general_inputs)
            num_formulation_inputs = len(formulation_inputs)
            formulation_constraints = [(formulation_inputs[input_]["min"], formulation_inputs[input_]["max"]) for input_ in formulation_inputs]



            num_inputs = len(all_inputs)
            
            
            # num_inputs = num_general_inputs + num_formulation_inputs
        else:
            num_inputs = len(inputs)

    if isinstance(outputs, int):
        num_outputs = outputs
    else:
        num_outputs = len(outputs)

    


    # Allow user to set their own coefficients   
    if coefs==None:
        coefs = np.array([[np.random.uniform(-1, 1) for i in range(num_inputs)] for k in range(num_outputs)])

    
    # # Generate input values
    # X = np.array([[np.random.uniform(-2, 2) for i in range(num_inputs)] for j in range(num_rows)])


    


    

    if isinstance(inputs, int):
        num_inputs = inputs
        X = np.array([[np.random.uniform(-2, 2) for i in range(num_inputs)] for j in range(num_rows)])
    else:
        if inputs["formulation"]:
            X_general = np.array([[np.random.uniform(-2, 2) for i in range(num_general_inputs)] for j in range(num_rows)])
            X_formulation = np.array([sample_from_constrained_simplex(n_dimensions=num_formulation_inputs, constraints=formulation_constraints) for j in range(num_rows)])


            X = np.concatenate((X_general, X_formulation), axis=1)




    # Generate output values
    y = list()
    for k in range(num_outputs):
        y.append(list())
        for row in X:
            y[k].append(sigmoid(row, coefs[k]))

    y = np.array(y)

    


    # Create pandas DataFrame for the generated data & name the columns
    data_df = pd.DataFrame()

    for i in range(num_inputs):
        if isinstance(inputs, int):
            data_df[f"x_{i+1}"] = X[:, i]
        else:
            data_df[all_inputs[i]] = X[:, i]
    
    for k in range(num_outputs):
        if isinstance(outputs, int):
            data_df[f"y_{k+1}"] = y[k]
        else:
            data_df[list(outputs)[k]] = y[k]










    #################################
    if isinstance(inputs, int):
        pass
    else:
        df = data_df.copy()
        df_scaled = df.copy()

        for col in df.columns:
            if col in general_inputs:
                scaled_col = (df[col].to_numpy() + 2) / 4
            else:
                scaled_col = df[col]
            df_scaled[col] = scaled_col

        all_columns = dict()
        # all_columns.update(all_inputs)
        all_columns.update(general_inputs)
        all_columns.update(formulation_inputs)
        all_columns.update(outputs)

        for col in all_columns:
            if col in general_inputs or col in outputs:
                df_scaled[col] = df_scaled[col] * (all_columns[col]["max"] - all_columns[col]["min"]) + all_columns[col]["min"]

        column_renaming = {col: f'{col}_{all_columns[col]["units"]}' for col in all_columns}
        df_scaled = df_scaled.rename(column_renaming, axis=1)

        data_df = df_scaled

    #################################


    coefs_df = pd.DataFrame(coefs)
    coefs_df = coefs_df.rename(index={k: f"y_{k+1}" for k in range(len(coefs_df))})


    ### TODO: fix this part!!!!!

    # coefs_df = coefs_df.rename(columns={i: list(inputs)[i] for i in range(len(coefs_df.T))})
    # coefs_df = coefs_df.rename(index={k: list(outputs)[k] for k in range(len(coefs_df))})


    
    return data_df, coefs_df

In [79]:
inputs = {
    "general": {
        "Laser Power": {"min": 100, "max": 1000, "units": "W"},
        "Pulse Duration": {"min": 0.1, "max": 10, "units": "ms"},
        "Welding Speed": {"min": 1, "max": 200, "units": "mm/s"},
        "Beam Diameter": {"min": 0.1, "max": 3, "units": "mm"},
        "Focal Position": {"min": -2, "max": 5, "units": "mm"},
        # "Shielding Gas Type": {"min": , "max": , "units": "n/a"},  # leave out categorical inputs for now
        "Flow Rate": {"min": 5, "max": 25, "units": "L/min"},
        "Heat Input": {"min": 10, "max": 500, "units": "J/mm"},
        "Ambient Temperature": {"min": 20, "max": 30, "units": "degC"},
        "Cooling Rate": {"min": 10, "max": 1000, "units": "degC/s"},
    },
    "formulation": {
        # "Carbon": {"min": 0.0, "max": 0.0008, "units": "%"},
        # "Manganese": {"min": 0.00, "max": 0.02, "units": "%"},
        # "Molybdenum": {"min": 0.01, "max": 0.05, "units": "%"},
        "Nickel": {"min": 0.05, "max": 0.50, "units": "%"},
        "Chromium": {"min": 0.10, "max": 0.40, "units": "%"},
        "Iron": {"min": 0.0, "max": 1.0, "units": "%"},
    },
}

In [80]:
data_df, coefs_df = build_sythetic_demo_dataset(inputs=inputs, outputs=outputs, num_rows=15)
data_df

Unnamed: 0,Laser Power_W,Pulse Duration_ms,Welding Speed_mm/s,Beam Diameter_mm,Focal Position_mm,Flow Rate_L/min,Heat Input_J/mm,Ambient Temperature_degC,Cooling Rate_degC/s,Nickel_%,Chromium_%,Iron_%,Hardness_HV,Fatigue Life_numCycles,Wear Rate_mg/m,Cutting Efficiency_m/s
0,393.783057,7.912059,32.798278,2.881101,-0.07957,17.536414,142.240747,21.659698,980.243609,0.05,0.309568,0.640432,573.355521,92689.521788,0.575022,3.043504
1,315.553914,5.397093,50.314319,2.062133,0.03132,5.742504,189.179562,27.809803,626.776481,0.477362,0.1,0.422638,676.38605,83273.483028,0.928936,2.873891
2,944.321905,3.402032,59.055336,1.210716,3.570482,10.6208,199.316612,20.365313,28.706858,0.5,0.117331,0.382669,242.650652,40341.757451,0.061031,0.254268
3,680.956285,6.131942,156.851929,2.289099,1.454984,22.39408,37.687445,25.347977,511.584863,0.099541,0.4,0.500459,553.694013,88050.901138,0.394142,3.051583
4,791.928631,3.439613,74.560228,0.732635,0.455246,17.720747,199.07438,21.453032,823.452484,0.42157,0.1,0.47843,222.474635,31822.003689,0.679529,0.521276
5,820.33662,0.652422,99.466875,0.811882,-1.633304,17.020383,156.320899,27.548373,855.180321,0.367245,0.318019,0.314737,241.96353,20850.717072,0.908188,0.916565
6,376.887333,4.947339,40.845696,0.41291,0.91501,10.712991,344.995588,20.622723,687.967922,0.05,0.318604,0.631396,338.114596,38731.092544,0.735276,1.852631
7,571.701066,8.603604,33.154647,2.866511,0.11847,5.394363,13.063442,23.196546,667.819098,0.075418,0.371872,0.55271,566.692171,90204.947347,0.913059,2.640752
8,517.861455,4.022545,157.565725,0.245201,-0.209202,16.785354,353.540179,28.589616,819.440405,0.5,0.144353,0.355647,433.914036,25261.496493,0.966429,3.295453
9,331.513918,6.480786,18.901431,0.77943,4.925661,8.552179,228.210409,23.077938,707.250585,0.412478,0.4,0.187522,279.082756,83048.071283,0.628184,0.603341


In [81]:
coefs_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
y_1,-0.840725,0.112844,0.673807,0.877136,-0.130844,-0.25955,0.430705,0.263414,-0.313929,-0.717687,-0.331936,0.928871
y_2,-0.84877,0.582293,-0.492281,0.233068,0.326916,0.505562,-0.86567,0.264199,-0.324999,0.437788,-0.729764,0.762236
y_3,0.026102,0.927654,0.256619,-0.96028,-0.750958,-0.769074,-0.580751,0.618808,0.734888,-0.185762,-0.915296,0.356914
y_4,-0.67365,0.772374,0.575866,0.138604,-0.775582,0.083376,0.287859,0.161126,-0.334669,-0.721802,0.677421,-0.337793


In [None]:
inputs["general"]

{'Laser Power': {'min': 100, 'max': 1000, 'units': 'W'},
 'Pulse Duration': {'min': 0.1, 'max': 10, 'units': 'ms'},
 'Welding Speed': {'min': 1, 'max': 200, 'units': 'mm/s'},
 'Beam Diameter': {'min': 0.1, 'max': 3, 'units': 'mm'},
 'Focal Position': {'min': -2, 'max': 5, 'units': 'mm'},
 'Flow Rate': {'min': 5, 'max': 25, 'units': 'L/min'},
 'Heat Input': {'min': 10, 'max': 500, 'units': 'J/mm'},
 'Ambient Temperature': {'min': 20, 'max': 30, 'units': 'degC'},
 'Cooling Rate': {'min': 10, 'max': 1000, 'units': 'degC/s'}}

In [None]:
general_inputs = inputs["general"]
formulation_inputs = inputs["formulation"]
all_inputs = list(general_inputs) + list(formulation_inputs)
all_inputs

['Laser Power',
 'Pulse Duration',
 'Welding Speed',
 'Beam Diameter',
 'Focal Position',
 'Flow Rate',
 'Heat Input',
 'Ambient Temperature',
 'Cooling Rate',
 'Ingredient A',
 'Ingredient B',
 'Ingredient C']

In [44]:
formulation_inputs

{'Ingredient A': {'min': 0.0, 'max': 1.0, 'units': ''},
 'Ingredient B': {'min': 0.1, 'max': 0.4, 'units': ''},
 'Ingredient C': {'min': 0.0, 'max': 0.8, 'units': ''}}

In [45]:
all_inputs

['Laser Power',
 'Pulse Duration',
 'Welding Speed',
 'Beam Diameter',
 'Focal Position',
 'Flow Rate',
 'Heat Input',
 'Ambient Temperature',
 'Cooling Rate',
 'Ingredient A',
 'Ingredient B',
 'Ingredient C']

In [46]:
outputs

{'Hardness': {'min': 200, 'max': 800, 'units': 'HV'},
 'Fatigue Life': {'min': 10000, 'max': 100000, 'units': 'numCycles'},
 'Wear Rate': {'min': 0.01, 'max': 1.0, 'units': 'mg/m'},
 'Cutting Efficiency': {'min': 0.1, 'max': 5, 'units': 'm/s'}}

In [47]:
all_columns = dict()
# all_columns.update(all_inputs)
all_columns.update(general_inputs)
all_columns.update(formulation_inputs)
all_columns.update(outputs)
all_columns

{'Laser Power': {'min': 100, 'max': 1000, 'units': 'W'},
 'Pulse Duration': {'min': 0.1, 'max': 10, 'units': 'ms'},
 'Welding Speed': {'min': 1, 'max': 200, 'units': 'mm/s'},
 'Beam Diameter': {'min': 0.1, 'max': 3, 'units': 'mm'},
 'Focal Position': {'min': -2, 'max': 5, 'units': 'mm'},
 'Flow Rate': {'min': 5, 'max': 25, 'units': 'L/min'},
 'Heat Input': {'min': 10, 'max': 500, 'units': 'J/mm'},
 'Ambient Temperature': {'min': 20, 'max': 30, 'units': 'degC'},
 'Cooling Rate': {'min': 10, 'max': 1000, 'units': 'degC/s'},
 'Ingredient A': {'min': 0.0, 'max': 1.0, 'units': ''},
 'Ingredient B': {'min': 0.1, 'max': 0.4, 'units': ''},
 'Ingredient C': {'min': 0.0, 'max': 0.8, 'units': ''},
 'Hardness': {'min': 200, 'max': 800, 'units': 'HV'},
 'Fatigue Life': {'min': 10000, 'max': 100000, 'units': 'numCycles'},
 'Wear Rate': {'min': 0.01, 'max': 1.0, 'units': 'mg/m'},
 'Cutting Efficiency': {'min': 0.1, 'max': 5, 'units': 'm/s'}}

In [49]:
df = data_df.copy()
df_scaled = df.copy()

for col in df.columns:
    if col in all_inputs:
        scaled_col = (df[col].to_numpy() + 2) / 4
        df_scaled[col] = scaled_col

all_columns = dict()
# all_columns.update(all_inputs)
all_columns.update(general_inputs)
all_columns.update(formulation_inputs)
all_columns.update(outputs)

for col in all_columns:
    df_scaled[col] = df_scaled[col] * (all_columns[col]["max"] - all_columns[col]["min"]) + all_columns[col]["min"]

column_renaming = {col: f'{col}_{all_columns[col]["units"]}' for col in all_columns}
df_scaled = df_scaled.rename(column_renaming, axis=1)


In [None]:
df_scaled

Unnamed: 0,Laser Power_W,Pulse Duration_ms,Welding Speed_mm/s,Beam Diameter_mm,Focal Position_mm,Flow Rate_L/min,Heat Input_J/mm,Ambient Temperature_degC,Cooling Rate_degC/s,Ingredient A_,Ingredient B_,Ingredient C_,Hardness_HV,Fatigue Life_numCycles,Wear Rate_mg/m,Cutting Efficiency_m/s
0,633.958885,4.964168,63.580999,2.478202,-1.895447,10.52382,67.805993,27.031938,646.850846,0.527585,0.28,0.497932,649.793773,95380.675269,0.130369,0.336141
1,820.859857,1.034379,28.602759,2.92859,4.194364,6.961314,215.162166,25.303679,776.819145,0.644929,0.259832,0.457838,609.68582,96407.034088,0.769682,0.605211
2,954.297856,2.528917,130.212721,0.775313,-0.251681,23.949307,429.523283,25.111062,311.947832,0.591781,0.2575,0.506575,255.622208,15601.251377,0.492134,2.493374
3,439.294962,2.034724,198.746624,1.069689,4.634388,22.228217,306.180736,21.742078,760.083755,0.592775,0.27232,0.466261,230.277989,12731.156882,0.710367,3.937931
4,148.005124,4.254411,95.78824,1.574499,2.69703,10.542156,387.319637,23.911197,364.599853,0.556198,0.272536,0.494945,765.217037,67903.44901,0.564698,4.529215
5,146.175208,7.165516,62.455403,1.362271,-1.444318,20.59265,12.330787,20.783228,934.716793,0.618684,0.258692,0.481873,373.270666,38857.939779,0.586393,2.020642
6,463.779838,3.035144,187.31203,2.124072,0.598453,22.105393,464.447723,26.466061,258.124583,0.581287,0.26998,0.48169,642.662891,13010.746854,0.049029,4.337706
7,741.051326,0.106939,115.524192,2.591992,3.593972,22.812355,337.0816,22.404843,139.499485,0.560579,0.28,0.471536,532.379062,39902.437905,0.176249,4.032498
8,639.40341,9.706728,52.881128,2.345833,2.128071,9.286701,277.10487,21.782507,171.843921,0.512654,0.261204,0.56,579.192351,90587.40968,0.638709,2.029366
9,296.964222,4.475521,41.440239,0.264117,0.430656,18.109802,45.617008,23.247486,720.349671,0.517664,0.28,0.505868,280.316896,87459.189103,0.881313,2.197637


In [51]:
inputs = {
    "general": {
        "Laser Power": {"min": 100, "max": 1000, "units": "W"},
        "Pulse Duration": {"min": 0.1, "max": 10, "units": "ms"},
        "Welding Speed": {"min": 1, "max": 200, "units": "mm/s"},
        "Beam Diameter": {"min": 0.1, "max": 3, "units": "mm"},
        "Focal Position": {"min": -2, "max": 5, "units": "mm"},
        # "Shielding Gas Type": {"min": , "max": , "units": "n/a"},  # leave out categorical inputs for now
        "Flow Rate": {"min": 5, "max": 25, "units": "L/min"},
        "Heat Input": {"min": 10, "max": 500, "units": "J/mm"},
        "Ambient Temperature": {"min": 20, "max": 30, "units": "degC"},
        "Cooling Rate": {"min": 10, "max": 1000, "units": "degC/s"},
    },
    "formulation": {
        "Ingredient A": {"min": 0.0, "max": 1.0, "units": "wt%"},
        "Ingredient B": {"min": 0.1, "max": 0.4, "units": "wt%"},
        "Ingredient C": {"min": 0.0, "max": 0.8, "units": "wt%"},
    },
}

In [5]:
list(inputs["general"]) + list(inputs["formulation"])

['Laser Power',
 'Pulse Duration',
 'Welding Speed',
 'Beam Diameter',
 'Focal Position',
 'Flow Rate',
 'Heat Input',
 'Ambient Temperature',
 'Cooling Rate',
 'Ingredient A',
 'Ingredient B',
 'Ingredient C']

In [6]:
blah = {'general': {'Laser Power': {'min': 100, 'max': 1000, 'units': 'W'},
  'Pulse Duration': {'min': 0.1, 'max': 10, 'units': 'ms'},
  'Welding Speed': {'min': 1, 'max': 200, 'units': 'mm/s'},
  'Beam Diameter': {'min': 0.1, 'max': 3, 'units': 'mm'},
  'Focal Position': {'min': -2, 'max': 5, 'units': 'mm'},
  'Flow Rate': {'min': 5, 'max': 25, 'units': 'L/min'},
  'Heat Input': {'min': 10, 'max': 500, 'units': 'J/mm'},
  'Ambient Temperature': {'min': 20, 'max': 30, 'units': 'degC'},
  'Cooling Rate': {'min': 10, 'max': 1000, 'units': 'degC/s'}},
 'formulation': {}}

In [7]:
blah["formulation"]

{}

In [8]:
if blah["formulation"]:
    print(True)
else:
    print(False)

False


In [9]:
blah["formulation"] == True

False

### Convert ingredient recipe data tables from "Wide" to "Compact" format:

In [10]:
def wide_to_compact_format(df):
    """
    Convert formulation data from wide format to compact format.
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame in wide format where:
        - Each row is a formulation
        - Each column is an ingredient with its weight percentage
    
    Returns:
    pandas.DataFrame: Transformed DataFrame in compact format with columns:
        - Ingredient A Name, Ingredient A weight %, Ingredient B Name, etc.
    """
    # Create an empty list to store the transformed rows
    compact_rows = []
    
    # Iterate through each formulation (row)
    for idx, row in df.iterrows():
        # Get non-zero ingredients and their percentages
        ingredients = row[row > 0]
        
        # Create a new row with alternating ingredient names and percentages
        new_row = {}
        for i, (ingredient_name, percentage) in enumerate(ingredients.items(), 1):
            new_row[f'Ingredient {chr(64+i)} Name'] = ingredient_name
            new_row[f'Ingredient {chr(64+i)} weight %'] = percentage
            
        compact_rows.append(new_row)
    
    # Convert to DataFrame
    result_df = pd.DataFrame(compact_rows)
    
    return result_df

### Convert ingredient recipe data tables from "Compact" to "Wide" format:

In [11]:
def compact_to_wide_format(df):
    """
    Convert formulation data from compact format to wide format.
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame in compact format where:
        - Each row is a formulation
        - Columns alternate between ingredient names and weight percentages
    
    Returns:
    pandas.DataFrame: Transformed DataFrame in wide format where:
        - Each row is a formulation
        - Each column is an ingredient with its weight percentage
    """
    # Create a list to store the transformed rows
    wide_rows = []
    
    # Get all unique ingredients across all formulations
    ingredient_columns = [col for col in df.columns if 'Name' in col]
    all_ingredients = set()
    for col in ingredient_columns:
        all_ingredients.update(df[col].dropna().unique())
    
    # Process each formulation
    for idx, row in df.iterrows():
        # Create a dictionary with all ingredients initialized to 0
        formulation = {ingredient: 0 for ingredient in all_ingredients}
        
        # Fill in the actual values
        for i in range(1, len(df.columns) // 2 + 1):
            name_col = f'Ingredient {chr(64+i)} Name'
            weight_col = f'Ingredient {chr(64+i)} weight %'
            
            if name_col in df.columns and pd.notna(row[name_col]):
                ingredient_name = row[name_col]
                formulation[ingredient_name] = row[weight_col]
        
        wide_rows.append(formulation)
    
    # Convert to DataFrame
    result_df = pd.DataFrame(wide_rows)
    
    # Sort columns alphabetically for consistency
    result_df = result_df.reindex(sorted(result_df.columns), axis=1)
    
    return result_df

## Examples

### Example 1: generate arbitrary # of rows & columns, with no column names

In [12]:
data_df, coefs = build_sythetic_demo_dataset(inputs=9, outputs=4, num_rows=10)
data_df

Unnamed: 0,x_1,x_2,x_3,x_4,x_5,x_6,x_7,x_8,x_9,y_1,y_2,y_3,y_4
0,-1.078959,-0.765576,-0.946805,0.684228,0.999892,0.287192,0.401062,1.154241,-0.858691,0.819915,0.721451,0.127229,0.202473
1,-0.893746,0.823875,-0.672399,1.786943,0.531398,0.409663,-1.171543,-0.347709,0.316208,0.865681,0.388834,0.74391,0.452834
2,-0.421491,1.289478,-0.201938,0.630835,-0.573471,-1.093919,-0.23157,-1.985848,-1.912077,0.044657,0.325428,0.902195,0.939219
3,1.411519,-0.004534,1.271952,0.063847,-0.235791,-0.851126,1.319437,-0.309943,1.527643,0.533128,0.624086,0.200242,0.218606
4,-1.329918,-1.922816,1.700723,-1.91238,0.313006,1.142252,1.986775,-0.941112,0.8445,0.596186,0.526424,0.191028,0.104799
5,-0.06339,-0.496608,1.976053,-0.27027,1.033185,0.258134,-1.412838,-0.458769,-1.600501,0.054653,0.635837,0.941958,0.902339
6,0.834055,-0.988322,0.436456,1.704422,-1.375021,-0.58278,0.119911,-1.808992,-0.501758,0.017772,0.822226,0.887508,0.868505
7,1.210841,-1.260695,-0.442372,-1.051503,0.009289,0.823266,-0.556457,-0.68627,0.635853,0.15584,0.440663,0.52708,0.81519
8,-0.866017,-0.094621,0.799304,-1.62848,0.744895,-1.937959,-1.171138,0.466011,-1.099324,0.064178,0.06202,0.434534,0.487569
9,-1.027243,-1.719741,-1.294606,-1.208314,-0.91225,-0.61855,0.783688,-1.497485,0.88809,0.128384,0.084939,0.120804,0.247939


In [13]:
coefs_df = pd.DataFrame(coefs)
coefs_df = coefs_df.rename(index={k: f"y_{k+1}" for k in range(len(coefs_df))})

### CHECK:
coefs_df

Unnamed: 0,0,1,2,3,4,5,6,7,8
y_1,-0.56854,0.887367,-0.221145,0.30641,0.583274,0.76172,0.608523,0.798043,0.9369
y_2,0.560793,-0.262248,0.359972,0.710049,-0.142425,0.720504,0.574679,0.379644,-0.556388
y_3,0.029304,0.339013,0.694947,0.206163,-0.906313,0.664971,-0.980488,-0.396903,-0.522873
y_4,0.797466,0.16781,-0.079379,-0.174992,-0.263954,0.403192,-0.586933,-0.659122,-0.939674


### Example 2: create a laser welding dataset with named columns

#### Assign "reasonable" ranges and desired units for each input & output column:

In [15]:
inputs = {
    "general": {
        "Laser Power": {"min": 100, "max": 1000, "units": "W"},
        "Pulse Duration": {"min": 0.1, "max": 10, "units": "ms"},
        "Welding Speed": {"min": 1, "max": 200, "units": "mm/s"},
        "Beam Diameter": {"min": 0.1, "max": 3, "units": "mm"},
        "Focal Position": {"min": -2, "max": 5, "units": "mm"},
        # "Shielding Gas Type": {"min": , "max": , "units": "n/a"},  # leave out categorical inputs for now
        "Flow Rate": {"min": 5, "max": 25, "units": "L/min"},
        "Heat Input": {"min": 10, "max": 500, "units": "J/mm"},
        "Ambient Temperature": {"min": 20, "max": 30, "units": "degC"},
        "Cooling Rate": {"min": 10, "max": 1000, "units": "degC/s"},
    },
    "formulation": {
        "Ingredient A": {"min": 0.0, "max": 1.0, "units": ""},
        "Ingredient B": {"min": 0.1, "max": 0.4, "units": ""},
        "Ingredient C": {"min": 0.0, "max": 0.8, "units": ""},
    },
}

In [47]:
inputs = {
    "Laser Power": {"min": 100, "max": 1000, "units": "W"},
    "Pulse Duration": {"min": 0.1, "max": 10, "units": "ms"},
    "Welding Speed": {"min": 1, "max": 200, "units": "mm/s"},
    "Beam Diameter": {"min": 0.1, "max": 3, "units": "mm"},
    "Focal Position": {"min": -2, "max": 5, "units": "mm"},
    # "Shielding Gas Type": {"min": , "max": , "units": "n/a"},  # leave out categorical inputs for now
    "Flow Rate": {"min": 5, "max": 25, "units": "L/min"},
    "Heat Input": {"min": 10, "max": 500, "units": "J/mm"},
    "Ambient Temperature": {"min": 20, "max": 30, "units": "degC"},
    "Cooling Rate": {"min": 10, "max": 1000, "units": "degC/s"},
}

In [17]:
outputs = {
    "Hardness": {"min": 200, "max": 800, "units": "HV"},
    "Fatigue Life": {"min": 10000, "max": 100000, "units": "numCycles"},
    "Wear Rate": {"min": 0.01, "max": 1.0, "units": "mg/m"},
    "Cutting Efficiency": {"min": 0.1, "max": 5, "units": "m/s"},
}

In [20]:
data_df, coefs = build_sythetic_demo_dataset(inputs=inputs, outputs=outputs, num_rows=15)
data_df

KeyError: 'general'

In [50]:
coefs_df = pd.DataFrame(coefs)

coefs_df = coefs_df.rename(columns={i: list(inputs)[i] for i in range(len(coefs_df.T))})
coefs_df = coefs_df.rename(index={k: list(outputs)[k] for k in range(len(coefs_df))})

### CHECK:
coefs_df

Unnamed: 0,Laser Power,Pulse Duration,Welding Speed,Beam Diameter,Focal Position,Flow Rate,Heat Input,Ambient Temperature,Cooling Rate
Hardness,-0.319518,0.599791,-0.636732,0.241493,0.676383,0.296027,0.060651,0.303258,0.677174
Fatigue Life,-0.769912,0.494003,-0.438935,0.72376,-0.884682,0.624275,0.151938,0.461919,0.475755
Wear Rate,0.819337,-0.998366,0.614139,0.585828,-0.991893,-0.95695,-0.870642,-0.519288,0.329193
Cutting Efficiency,0.177907,-0.841708,-0.040518,0.087411,-0.531787,0.163551,-0.369035,0.277397,-0.52778


## Re-scale each column to its desired range:

In [51]:
df = data_df.copy()

In [52]:
df_scaled = df.copy()

for col in df.columns:
    if col in inputs:
        scaled_col = (df[col].to_numpy() + 2) / 4
        df_scaled[col] = scaled_col

all_columns = dict()
all_columns.update(inputs)
all_columns.update(outputs)

for col in all_columns:
    df_scaled[col] = df_scaled[col] * (all_columns[col]["max"] - all_columns[col]["min"]) + all_columns[col]["min"]

column_renaming = {col: f'{col}_{all_columns[col]["units"]}' for col in all_columns}
df_scaled = df_scaled.rename(column_renaming, axis=1)

df_scaled

Unnamed: 0,Laser Power_W,Pulse Duration_ms,Welding Speed_mm/s,Beam Diameter_mm,Focal Position_mm,Flow Rate_L/min,Heat Input_J/mm,Ambient Temperature_degC,Cooling Rate_degC/s,Hardness_HV,Fatigue Life_numCycles,Wear Rate_mg/m,Cutting Efficiency_m/s
0,675.169248,9.01487,194.725745,0.361881,4.820019,23.955917,492.975937,20.651571,633.221503,630.973157,16977.9382,0.015686,0.223146
1,633.809902,8.033143,165.14992,0.188173,0.024385,11.780529,448.094799,28.182597,192.859217,284.350649,33335.182797,0.109059,2.084184
2,307.38604,8.788141,4.28974,1.537877,0.63529,21.069197,176.761413,24.722186,340.599776,727.248492,96278.910936,0.030323,2.10246
3,630.731162,1.072842,80.222353,0.377696,0.112127,20.792957,236.677038,25.287003,327.134806,280.557756,35132.722458,0.557438,4.580774
4,814.263493,3.503627,121.83787,2.661374,3.198382,12.461677,474.473972,26.30145,687.531461,590.018496,41815.585495,0.667828,1.776601
5,776.236392,4.290547,177.221653,0.473248,0.153206,22.984269,424.020016,22.877,640.846786,293.00097,35622.240899,0.471112,2.404223
6,908.380731,8.905667,21.877605,0.905748,0.599235,23.257647,217.17385,26.955308,966.280671,766.829831,91657.812462,0.097379,1.278008
7,301.372482,6.499591,164.176605,0.736085,-0.564073,7.916201,34.230035,22.934995,803.039701,345.036056,51459.277742,0.972581,1.787661
8,491.590738,6.714082,43.025483,2.866484,1.519545,23.29764,76.697447,27.074973,540.328092,750.969027,97628.534864,0.231226,3.208338
9,503.49801,0.656239,198.230879,2.410767,-0.345134,13.26293,358.937943,27.339634,451.950917,241.92049,65049.434772,0.97336,4.438738


## [Optional] Save result to Excel or CSV file: 

In [53]:
# df_scaled.to_excel("Demo Datasets/Laser Welding (Synthetic)/laser_welding.xlsx", index=False)
# df_scaled.to_csv("Demo Datasets/Laser Welding (Synthetic)/laser_welding.csv", index=False)

# Done!

# SCRATCH

## TODO: Simplex Sampling

In [33]:
def sample_from_simplex(n_dimensions):
    """
    Generate a random point from an N-dimensional simplex.
    
    The generated point will have N elements between 0 and 1 that sum to 1.
    This implementation uses the stick-breaking construction method.
    
    Parameters:
        n_dimensions (int): Number of dimensions for the simplex
        
    Returns:
        numpy.ndarray: Array of N numbers between 0 and 1 that sum to 1
    """
    # Generate N-1 random numbers between 0 and 1
    breaks = np.random.random(n_dimensions - 1)
    breaks.sort()  # Sort to implement stick-breaking construction
    
    # Add boundary points
    points = np.zeros(n_dimensions + 1)
    points[0] = 0
    points[1:-1] = breaks
    points[-1] = 1
    
    # Take differences to get segment lengths
    samples = np.diff(points)
    
    # Verify results
    assert np.all(samples >= 0)  # All values should be non-negative
    assert np.abs(np.sum(samples) - 1.0) < 1e-10  # Sum should be 1
    
    return samples

# Example usage
def demo_simplex_sampling():
    # Generate and test a few samples
    for dims in [2, 3, 4]:
        sample = sample_from_simplex(dims)
        print(f"\n{dims}-dimensional sample:")
        print(f"Values: {sample}")
        print(f"Sum: {np.sum(sample)}")

### TEST
demo_simplex_sampling()


2-dimensional sample:
Values: [0.03327879 0.96672121]
Sum: 1.0

3-dimensional sample:
Values: [0.47091755 0.12065754 0.40842491]
Sum: 1.0

4-dimensional sample:
Values: [5.66179505e-01 3.80292408e-01 5.30664502e-02 4.61637201e-04]
Sum: 1.0


In [34]:
sample_from_simplex(5)

array([0.18213976, 0.09231954, 0.18936957, 0.42032379, 0.11584735])

### TODO: Constrained Simplex Sampling

In [37]:
# Example usage and testing function
def demo_constrained_sampling():
    # Test case 1: Constraint on first element
    constraints = [(0.2, 0.4), None, None]
    sample = sample_from_constrained_simplex(3, constraints)
    print("\nTest case 1 - First element constrained to [0.2, 0.4]:")
    print(f"Values: {sample}")
    print(f"Sum: {np.sum(sample)}")
    print(f"First element in range: {0.2 <= sample[0] <= 0.4}")
    
    # Test case 2: Multiple constraints
    constraints = [(0.2, 0.4), (0, 0.3), None]
    sample = sample_from_constrained_simplex(3, constraints)
    print("\nTest case 2 - Multiple constraints:")
    print(f"Values: {sample}")
    print(f"Sum: {np.sum(sample)}")
    print(f"Constraints satisfied: {all([0.2 <= sample[0] <= 0.4, 0 <= sample[1] <= 0.3])}")


### TEST
demo_constrained_sampling()


Test case 1 - First element constrained to [0.2, 0.4]:
Values: [0.2        0.61495749 0.18504251]
Sum: 1.0
First element in range: True

Test case 2 - Multiple constraints:
Values: [0.4 0.3 0.3]
Sum: 1.0
Constraints satisfied: True


In [38]:
constraints = [(0.2, 0.4), None, None]
sample_from_constrained_simplex(n_dimensions=3, constraints=constraints)

array([0.2       , 0.02632755, 0.77367245])

In [39]:
constraints = [(0.2, 0.4), None, None, (0.01, 0.05), None]
sample = sample_from_constrained_simplex(n_dimensions=5, constraints=constraints)
print(sample.sum())
sample.tolist()

1.0


[0.38133766697190435,
 0.06600347187209563,
 0.17541699885677559,
 0.05,
 0.3272418622992245]

In [70]:
num_general_inputs = 2
num_rows = 5
X_general = np.array([[np.random.uniform(-2, 2) for i in range(num_general_inputs)] for j in range(num_rows)])
X_general


array([[-1.99452679, -0.52007474],
       [ 0.51790224, -0.31570184],
       [ 0.35842497,  0.79310645],
       [-1.28617266, -1.98701367],
       [-1.3882569 ,  0.19349122]])

In [71]:
formulation_inputs = inputs["formulation"]

In [72]:
formulation_inputs

{'Ingredient A': {'min': 0.0, 'max': 1.0, 'units': ''},
 'Ingredient B': {'min': 0.1, 'max': 0.4, 'units': ''},
 'Ingredient C': {'min': 0.0, 'max': 0.8, 'units': ''}}

In [73]:
X_general = np.array([[np.random.uniform(-2, 2) for i in range(num_general_inputs)] for j in range(num_rows)])
num_formulation_inputs = len(formulation_inputs)
constraints = [(formulation_inputs[input_]["min"], formulation_inputs[input_]["max"]) for input_ in formulation_inputs]
X_formulation = np.array([sample_from_constrained_simplex(n_dimensions=num_formulation_inputs, constraints=constraints) for j in range(num_rows)])
X_formulation

array([[0.51504906, 0.4       , 0.08495094],
       [0.33562935, 0.14884548, 0.51552516],
       [0.22366617, 0.38886426, 0.38746956],
       [0.45700698, 0.1       , 0.44299302],
       [0.39394389, 0.4       , 0.20605611]])

In [76]:
X = np.concatenate((X_general, X_formulation), axis=1)
X

array([[-1.27492642,  0.7552982 ,  0.51504906,  0.4       ,  0.08495094],
       [ 0.09609186, -0.82047921,  0.33562935,  0.14884548,  0.51552516],
       [ 1.2848995 , -0.51434444,  0.22366617,  0.38886426,  0.38746956],
       [-0.36759091, -1.09603109,  0.45700698,  0.1       ,  0.44299302],
       [-0.03063146,  0.89287068,  0.39394389,  0.4       ,  0.20605611]])

## Convert ingredient recipe data tables from "Wide" to "Compact" format:

In [59]:
# Example usage

# Create sample data in wide format
wide_data = {
    'Sugar': [10, 0, 15, 0],
    'Salt': [2, 1, 0, 0],
    'Flour': [83, 85, 73, 73],
    'Baking Powder': [0, 2, 5, 5],
    'Vanilla': [0, 7, 0, 0],
    'Brown Sugar': [0, 0, 0, 15],
    'Milk Chocolate Chips': [5, 0, 0, 0],
    'Dark Chocolate Chips': [0, 0, 5, 0],
    'White Chocolate Chips': [0, 0, 0, 5],
}

wide_df = pd.DataFrame(wide_data)

In [60]:
print("Original wide format:")
wide_df

Original wide format:


Unnamed: 0,Sugar,Salt,Flour,Baking Powder,Vanilla,Brown Sugar,Milk Chocolate Chips,Dark Chocolate Chips,White Chocolate Chips
0,10,2,83,0,0,0,5,0,0
1,0,1,85,2,7,0,0,0,0
2,15,0,73,5,0,0,0,5,0
3,0,0,73,5,0,15,0,0,5


In [61]:
print("\nTransformed compact format:")
wide_to_compact_format(wide_df)


Transformed compact format:


Unnamed: 0,Ingredient A Name,Ingredient A weight %,Ingredient B Name,Ingredient B weight %,Ingredient C Name,Ingredient C weight %,Ingredient D Name,Ingredient D weight %
0,Sugar,10,Salt,2,Flour,83,Milk Chocolate Chips,5
1,Salt,1,Flour,85,Baking Powder,2,Vanilla,7
2,Sugar,15,Flour,73,Baking Powder,5,Dark Chocolate Chips,5
3,Flour,73,Baking Powder,5,Brown Sugar,15,White Chocolate Chips,5


## Convert ingredient recipe data tables from "Compact" to "Wide" format:

In [62]:
def compact_to_wide_format(df):
    """
    Convert formulation data from compact format to wide format.
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame in compact format where:
        - Each row is a formulation
        - Columns alternate between ingredient names and weight percentages
    
    Returns:
    pandas.DataFrame: Transformed DataFrame in wide format where:
        - Each row is a formulation
        - Each column is an ingredient with its weight percentage
    """
    # Create a list to store the transformed rows
    wide_rows = []
    
    # Get all unique ingredients across all formulations
    ingredient_columns = [col for col in df.columns if 'Name' in col]
    all_ingredients = set()
    for col in ingredient_columns:
        all_ingredients.update(df[col].dropna().unique())
    
    # Process each formulation
    for idx, row in df.iterrows():
        # Create a dictionary with all ingredients initialized to 0
        formulation = {ingredient: 0 for ingredient in all_ingredients}
        
        # Fill in the actual values
        for i in range(1, len(df.columns) // 2 + 1):
            name_col = f'Ingredient {chr(64+i)} Name'
            weight_col = f'Ingredient {chr(64+i)} weight %'
            
            if name_col in df.columns and pd.notna(row[name_col]):
                ingredient_name = row[name_col]
                formulation[ingredient_name] = row[weight_col]
        
        wide_rows.append(formulation)
    
    # Convert to DataFrame
    result_df = pd.DataFrame(wide_rows)
    
    # Sort columns alphabetically for consistency
    result_df = result_df.reindex(sorted(result_df.columns), axis=1)
    
    return result_df

In [63]:
wide_to_compact_format(wide_df)

Unnamed: 0,Ingredient A Name,Ingredient A weight %,Ingredient B Name,Ingredient B weight %,Ingredient C Name,Ingredient C weight %,Ingredient D Name,Ingredient D weight %
0,Sugar,10,Salt,2,Flour,83,Milk Chocolate Chips,5
1,Salt,1,Flour,85,Baking Powder,2,Vanilla,7
2,Sugar,15,Flour,73,Baking Powder,5,Dark Chocolate Chips,5
3,Flour,73,Baking Powder,5,Brown Sugar,15,White Chocolate Chips,5


In [64]:
compact_to_wide_format(wide_to_compact_format(wide_df))

Unnamed: 0,Baking Powder,Brown Sugar,Dark Chocolate Chips,Flour,Milk Chocolate Chips,Salt,Sugar,Vanilla,White Chocolate Chips
0,0,0,0,83,5,2,10,0,0
1,2,0,0,85,0,1,0,7,0
2,5,0,5,73,0,0,15,0,0
3,5,15,0,73,0,0,0,0,5


In [65]:
wide_df[sorted(wide_df.columns)]

Unnamed: 0,Baking Powder,Brown Sugar,Dark Chocolate Chips,Flour,Milk Chocolate Chips,Salt,Sugar,Vanilla,White Chocolate Chips
0,0,0,0,83,5,2,10,0,0
1,2,0,0,85,0,1,0,7,0
2,5,0,5,73,0,0,15,0,0
3,5,15,0,73,0,0,0,0,5
