In [None]:
import pandas as pd
import random
import os
import time
import numpy as np

In [None]:
'To-Do: Update the data generation function.'
'1. Generate features for each alternative!'


def gen_data2(
    num_people,
    num_choices,
    num_alternatives,
    numerical_specs,
    category_specs,
    availability_probs=None,
    seed=None,
):
    """Generate a synthetic choice dataset with one row per (person, choice).

    Each row holds the person and choice identifiers, a 0/1 availability flag
    for every alternative, one random value per numerical feature, one random
    level per categorical feature, and the chosen alternative, which is drawn
    uniformly from the alternatives available in that row.

    Parameters
    ----------
    num_people : int
        Number of people; ``P.ID`` runs from 1 to ``num_people``.
    num_choices : int
        Number of choice situations per person; ``C.ID`` runs from 1 to
        ``num_choices``.
    num_alternatives : int
        Number of alternatives, numbered 1..``num_alternatives``.
    numerical_specs : list of dict
        One dict per numerical feature with keys ``'min'``, ``'max'`` and
        ``'mean'`` and an optional ``'Name'``. Values are drawn from a
        triangular distribution in which ``'mean'`` is used as the *mode*
        (peak), so the expected value of the feature is
        ``(min + mean + max) / 3`` rather than ``'mean'`` itself. Values are
        rounded to two decimals. Specs without ``'Name'`` are called
        ``Feature<position>``.
    category_specs : list of dict or None
        One dict per categorical feature with a ``'level'`` sequence of
        labels and an optional ``'Name'``. Any other key (e.g. ``'type'``) is
        ignored. Specs without ``'Name'`` are called ``Category<position>``.
        ``None`` is treated as "no categorical features".
    availability_probs : (flag, probs) or None
        ``flag == 1`` makes every alternative available in every row.
        Otherwise alternative ``k`` is available with probability
        ``probs[k - 1]``, and one random alternative is forced available when
        none came up. ``probs=None`` means 0.8 for every alternative.
        Passing ``None`` for the whole argument is the same as ``(0, None)``.
        The argument is never modified, so tuples are accepted.
    seed : int or None
        Seed for the generators used by this call. The same seed reproduces
        the same data; ``None`` draws fresh entropy on every call.

    Returns
    -------
    pandas.DataFrame
        Columns ``P.ID``, ``C.ID``, ``Avail.<k>`` for each alternative, the
        numerical features, the categorical features and ``Choice``.

    Raises
    ------
    ValueError
        If ``numerical_specs`` is None or ``probs`` does not contain exactly
        ``num_alternatives`` entries.
    """
    if numerical_specs is None:
        raise ValueError("You must provide 'numerical_specs' for this version.")
    if category_specs is None:
        category_specs = []

    # Local generators: the global `random` / NumPy state is left untouched,
    # and two calls within the same second no longer share a clock-based seed.
    py_rng = random.Random(seed)
    np_rng = np.random.default_rng(seed)

    alternatives = list(range(1, num_alternatives + 1))

    # Unpack the availability settings without writing into the caller's object.
    if availability_probs is None:
        always_available = False
        probs = None
    else:
        always_available = availability_probs[0] == 1
        probs = availability_probs[1]
    if probs is None:
        probs = [0.8] * num_alternatives
    elif len(probs) != num_alternatives:
        raise ValueError("Length of availability_probs must match num_alternatives")

    # Fall back to a positional name for each spec that has no 'Name', so a
    # mix of named and unnamed specs works.
    numeric_features = [
        spec.get('Name', f"Feature{pos}")
        for pos, spec in enumerate(numerical_specs, start=1)
    ]
    # name -> levels; enumerate keeps the positions of equal specs distinct.
    categorical_features = {
        spec.get('Name', f"Category{pos}"): spec['level']
        for pos, spec in enumerate(category_specs, start=1)
    }

    avail_columns = [f"Avail.{alt}" for alt in alternatives]
    records = []

    for person_id in range(1, num_people + 1):
        for choice_id in range(1, num_choices + 1):
            if always_available:
                flags = [1] * num_alternatives
            else:
                flags = [int(py_rng.random() < p) for p in probs]
                # At least one alternative must be available.
                if not any(flags):
                    flags[py_rng.randrange(num_alternatives)] = 1

            available_alts = [alt for alt, flag in zip(alternatives, flags) if flag]
            chosen_alt = py_rng.choice(available_alts)

            entry = {'P.ID': person_id, 'C.ID': choice_id}
            entry.update(zip(avail_columns, flags))

            # triangular(left, mode, right): the spec's 'mean' is the peak.
            for name, spec in zip(numeric_features, numerical_specs):
                value = np_rng.triangular(spec['min'], spec['mean'], spec['max'])
                entry[name] = round(float(value), 2)

            for cat_feature, options in categorical_features.items():
                entry[cat_feature] = py_rng.choice(options)

            entry['Choice'] = chosen_alt
            records.append(entry)

    return pd.DataFrame(records)

In [137]:
# Numerical feature specs without names, written as (min, max, mean) triples.
feature_specs = [
    dict(min=low, max=high, mean=centre)
    for low, high, centre in [(20, 120, 80), (10, 90, 40), (30, 120, 60)]
]

# Numerical feature specs that carry an explicit column name.
numerical_specs = [
    dict(Name=label, min=low, max=high, mean=centre)
    for label, low, high, centre in [
        ("TravelCost", 20, 120, 80),
        ("TravelTime", 10, 90, 40),
        ("ServiceCost", 5, 20, 10),
    ]
]

# Categorical feature specs.
# 'type' codes:
#   1 = ordinal (e.g., High, Medium, Low)
#   2 = nominal (e.g., Car, Bus, Train)
#   3 = binary  (e.g., Yes/No)
# 'level' is the tuple of labels; ordinal labels are listed in ascending order.
category_specs = [
    dict(Name="ServiceLevel", type=1, level=("1", "2", "3")),
    dict(Name="ServiceType", type=2, level=("Wi-fi", "AC", "Free Snacks")),
    dict(Name="Subscription", type=3, level=("Yes", "No")),
]

# The same categorical specs with the 'Name' key stripped.
category_specs2 = [
    {key: value for key, value in spec.items() if key != "Name"}
    for spec in category_specs
]

# 5 people x 2 choices over 3 alternatives, using the unnamed categorical specs.
dataset3 = gen_data2(
    num_people=5,
    num_choices=2,
    num_alternatives=3,
    numerical_specs=numerical_specs,
    category_specs=category_specs2,
    availability_probs=(1, [0.9, 0.6, 0.3]),
)
dataset3

Unnamed: 0,P.ID,C.ID,Avail.1,Avail.2,Avail.3,TravelCost,TravelTime,ServiceCost,Category1,Category2,Category3,Choice
0,1,1,1,1,1,86.2,38.52,12.09,1,Wi-fi,No,1
1,1,2,1,1,1,53.06,22.67,15.45,3,Wi-fi,No,3
2,2,1,1,1,1,67.69,29.07,6.67,3,Wi-fi,No,3
3,2,2,1,1,1,90.08,48.44,10.88,1,Free Snacks,Yes,2
4,3,1,1,1,1,113.7,56.2,11.51,2,Free Snacks,Yes,2
5,3,2,1,1,1,96.95,62.58,12.03,2,Free Snacks,Yes,2
6,4,1,1,1,1,80.52,28.34,10.17,3,Free Snacks,No,3
7,4,2,1,1,1,82.74,59.14,9.53,2,Wi-fi,Yes,3
8,5,1,1,1,1,93.95,62.4,13.22,1,AC,No,2
9,5,2,1,1,1,77.35,46.42,9.45,1,Wi-fi,Yes,1
