In [1]:
import pandas as pd
import random
import os
import time
import numpy as np

In [None]:
import pandas as pd
import random
import os
import time

def get_next_filename(base_name="generated_data", extension=".xlsx"):
    """Return the first unused numbered file name.

    Candidates are ``{base_name}1{extension}``, ``{base_name}2{extension}``,
    ... and the first one for which ``os.path.exists`` is false is returned.
    Nothing is created on disk.

    Args:
        base_name: Path prefix placed before the running number.
        extension: Suffix (including the dot) placed after the number.

    Returns:
        The first candidate name that does not exist yet.
    """
    counter = 1
    candidate = f"{base_name}{counter}{extension}"
    while os.path.exists(candidate):
        counter += 1
        candidate = f"{base_name}{counter}{extension}"
    return candidate

def generate_biogeme_ready_data(
    num_people=10,
    num_choices=3,
    num_numeric_features=3,
    num_categorical_features=2,
    availability_probs=None,
    seed=None
):
    """Generate a synthetic wide-format choice dataset, one row per person.

    Each row contains:

    * ``C.ID`` and ``P.ID`` -- both equal to the 1-based row number;
    * ``Avail.<k>`` -- 0/1 availability flag for alternative ``k``; at least
      one alternative is always available (one is forced on at random when
      every draw came out 0);
    * ``Feature<j>`` -- uniform draw from [1.0, 100.0], rounded to 2 decimals;
    * ``Category<j>`` -- integer code drawn uniformly from 1, 2, 3;
    * ``Choice`` -- alternative drawn uniformly from the available ones.

    Args:
        num_people: Number of rows to generate.
        num_choices: Number of alternatives (numbered from 1). Must be >= 1.
        num_numeric_features: Number of ``Feature<j>`` columns.
        num_categorical_features: Number of ``Category<j>`` columns.
        availability_probs: Per-alternative probability of being available;
            defaults to 0.8 for every alternative.
        seed: Seed for the random stream. The same seed always yields the
            same frame. ``None`` seeds from fresh system entropy, so two
            unseeded calls differ even when made within the same second.

    Returns:
        pandas.DataFrame with the columns described above (an empty frame
        when ``num_people`` is 0).

    Raises:
        ValueError: If ``num_choices`` is below 1 or ``availability_probs``
            does not have exactly ``num_choices`` entries.
    """
    if num_choices < 1:
        # Without this guard the failure surfaces later as an IndexError
        # from choosing out of an empty list of alternatives.
        raise ValueError("num_choices must be at least 1")

    if availability_probs is None:
        availability_probs = [0.8] * num_choices
    elif len(availability_probs) != num_choices:
        raise ValueError("Length of availability_probs must match num_choices")

    # Private generator: identical stream to random.seed(seed) for an explicit
    # seed, but the process-wide `random` state is left untouched.
    rng = random.Random(seed)

    alternatives = list(range(1, num_choices + 1))  # [1, 2, 3]
    numeric_features = [f"Feature{i+1}" for i in range(num_numeric_features)]
    categorical_feature_names = [f"Category{i+1}" for i in range(num_categorical_features)]
    # Only the integer codes are written to the frame; the labels document them.
    category_encoding = {1: 'High', 2: 'Medium', 3: 'Low'}
    category_codes = list(category_encoding)

    data = []
    for person_id in range(1, num_people + 1):
        availability = {
            f"Avail.{alt}": int(rng.random() < availability_probs[idx])
            for idx, alt in enumerate(alternatives)
        }

        # Guarantee a non-empty choice set.
        if not any(availability.values()):
            forced_alt = rng.choice(alternatives)
            availability[f"Avail.{forced_alt}"] = 1

        available_alts = [alt for alt in alternatives if availability[f"Avail.{alt}"] == 1]
        chosen_alt = rng.choice(available_alts)

        entry = {
            'C.ID': person_id,
            'P.ID': person_id,
        }
        entry.update(availability)

        for feature in numeric_features:
            entry[feature] = round(rng.uniform(1.0, 100.0), 2)

        for cat_feature in categorical_feature_names:
            entry[cat_feature] = rng.choice(category_codes)

        entry['Choice'] = chosen_alt
        data.append(entry)

    # Saving is left to the caller, e.g. df.to_csv(<path>, index=False).
    return pd.DataFrame(data)


In [17]:
# Fixed seed so this cell produces the same frame on every re-run of the notebook.
dataset1 = generate_biogeme_ready_data(
    num_people=20,
    num_choices=3,
    num_numeric_features=3,
    num_categorical_features=1,
    availability_probs=[0.9, 0.6, 0.3],
    seed=42,
)

In [19]:
# Bare last expression: the notebook renders the frame as this cell's output.
dataset1

Unnamed: 0,C.ID,P.ID,Avail.1,Avail.2,Avail.3,Feature1,Feature2,Feature3,Category1,Choice
0,1,1,1,1,0,4.82,92.19,65.39,2,2
1,2,2,1,1,0,81.19,91.25,49.44,1,1
2,3,3,0,1,0,95.91,31.35,63.33,1,2
3,4,4,1,0,0,4.52,29.83,55.19,2,1
4,5,5,0,0,1,96.88,27.94,72.59,2,3
5,6,6,1,0,0,75.82,95.11,98.05,1,1
6,7,7,0,1,0,9.94,9.85,21.5,1,2
7,8,8,1,1,1,97.79,21.87,77.99,1,3
8,9,9,1,1,1,25.23,75.0,15.12,3,3
9,10,10,1,0,0,97.02,3.31,42.21,1,1


In [13]:
def get_next_filename(base_name="generated_data", extension=".csv"):
    """Find the lowest-numbered ``{base_name}{k}{extension}`` that is not on disk.

    Numbering starts at 1 and increases by one until ``os.path.exists``
    reports that the candidate is missing. The file itself is not created.

    Args:
        base_name: Path prefix placed before the running number.
        extension: Suffix (including the dot) placed after the number.

    Returns:
        The first candidate name that does not exist yet.
    """
    suffix = 0
    while True:
        suffix += 1
        path = f"{base_name}{suffix}{extension}"
        if not os.path.exists(path):
            return path


def generate_biogeme_ready_data_range(
    num_people=10,
    num_choices=3,
    feature_ranges=None,  # List of dicts: [{'min': 0, 'max': 100, 'mean': 50}, ...]
    num_categorical_features=2,
    availability_probs=None,
    seed=None,
):
    """Generate a synthetic wide-format choice dataset with bounded features.

    Same row layout as the uniform generator (``C.ID``, ``P.ID``,
    ``Avail.<k>``, ``Feature<j>``, ``Category<j>``, ``Choice``), except that
    each numeric feature is drawn from a triangular distribution described by
    the matching entry of ``feature_ranges``.

    Note: the ``'mean'`` value of a range is passed to the triangular
    distribution as its *mode* (peak). The expected value of the generated
    feature is therefore ``(min + mean + max) / 3``, not ``mean`` itself.

    Args:
        num_people: Number of rows to generate.
        num_choices: Number of alternatives (numbered from 1). Must be >= 1.
        feature_ranges: One dict per numeric feature with the keys ``'min'``,
            ``'max'`` and ``'mean'``; required.
        num_categorical_features: Number of ``Category<j>`` columns, each an
            integer code drawn uniformly from 1, 2, 3.
        availability_probs: Per-alternative probability of being available;
            defaults to 0.8 for every alternative.
        seed: ``None`` (fresh system entropy on every call) or a non-negative
            int. The same seed always yields the same frame.

    Returns:
        pandas.DataFrame with one row per person (an empty frame when
        ``num_people`` is 0).

    Raises:
        ValueError: If ``feature_ranges`` is missing, a range violates
            ``min <= mean <= max`` with ``min < max``, ``num_choices`` is
            below 1, or ``availability_probs`` has the wrong length.
    """
    if feature_ranges is None:
        raise ValueError("You must provide 'feature_ranges' for this version.")
    if num_choices < 1:
        raise ValueError("num_choices must be at least 1")

    # Check the bounds once, up front, so a bad range is reported with its
    # position instead of failing inside the sampler on the first row.
    for pos, f_range in enumerate(feature_ranges):
        low, peak, high = f_range['min'], f_range['mean'], f_range['max']
        if not (low <= peak <= high) or low == high:
            raise ValueError(
                f"feature_ranges[{pos}] must satisfy min <= mean <= max with min < max"
            )

    if availability_probs is None:
        availability_probs = [0.8] * num_choices
    elif len(availability_probs) != num_choices:
        raise ValueError("Length of availability_probs must match num_choices")

    # Private generators: the global `random` / `np.random` state is not
    # reseeded, and unseeded calls no longer share a clock-second seed.
    py_rng = random.Random(seed)
    np_rng = np.random.default_rng(seed)

    alternatives = list(range(1, num_choices + 1))
    numeric_features = [f"Feature{i+1}" for i in range(len(feature_ranges))]
    categorical_feature_names = [f"Category{i+1}" for i in range(num_categorical_features)]
    # Only the integer codes are written to the frame; the labels document them.
    category_encoding = {1: 'High', 2: 'Medium', 3: 'Low'}
    category_codes = list(category_encoding)

    data = []
    for person_id in range(1, num_people + 1):
        availability = {
            f"Avail.{alt}": int(py_rng.random() < availability_probs[idx])
            for idx, alt in enumerate(alternatives)
        }

        # Guarantee a non-empty choice set.
        if not any(availability.values()):
            forced_alt = py_rng.choice(alternatives)
            availability[f"Avail.{forced_alt}"] = 1

        available_alts = [alt for alt in alternatives if availability[f"Avail.{alt}"] == 1]
        chosen_alt = py_rng.choice(available_alts)

        entry = {
            'C.ID': person_id,
            'P.ID': person_id,
        }
        entry.update(availability)

        # Independent triangular draw per feature: (left, mode, right).
        for name, f_range in zip(numeric_features, feature_ranges):
            value = np_rng.triangular(f_range['min'], f_range['mean'], f_range['max'])
            entry[name] = round(float(value), 2)

        for cat_feature in categorical_feature_names:
            entry[cat_feature] = py_rng.choice(category_codes)

        entry['Choice'] = chosen_alt
        data.append(entry)

    # Saving is left to the caller, e.g. df.to_csv(<path>, index=False).
    return pd.DataFrame(data)


In [None]:
# One (min, mean, max) triple per numeric feature. The generator hands the
# 'mean' value to the triangular sampler as its mode argument.
feature_ranges = [
    {'min': low, 'max': high, 'mean': centre}
    for low, centre, high in [(20, 80, 120), (10, 40, 90), (30, 60, 120)]
]

dataset2 = generate_biogeme_ready_data_range(
    num_people=10,
    num_choices=3,
    feature_ranges=feature_ranges,
    num_categorical_features=3,
    availability_probs=[0.9, 0.6, 0.3],
)


'To-Do: Update the data generation function'
'1. Add names for features'
'2. Support categorical features with a specific number of categories (?)'
'3. Combine feature ranges for categorical and numeric features, OR build separate functions for each (i.e., one for numeric, one for categorical)'

'2. Support categorical features with a specific number of categories (?)'

In [26]:
# Bare last expression: the notebook renders the frame as this cell's output.
dataset2

Unnamed: 0,C.ID,P.ID,Avail.1,Avail.2,Avail.3,Feature1,Feature2,Feature3,Category1,Category2,Category3,Choice
0,1,1,1,0,0,58.84,37.0,48.86,3,2,3,1
1,2,2,1,0,0,42.06,64.07,85.32,3,3,2,1
2,3,3,1,1,0,58.48,58.24,75.97,3,1,2,2
3,4,4,1,0,0,109.13,58.69,94.37,1,2,2,1
4,5,5,1,1,0,96.39,44.98,90.76,3,2,2,2
5,6,6,1,0,0,87.57,46.85,99.44,3,2,3,1
6,7,7,1,1,1,86.14,46.04,84.92,1,1,2,2
7,8,8,1,1,1,82.21,51.52,85.25,2,2,1,2
8,9,9,1,0,0,56.1,35.55,73.66,3,1,3,1
9,10,10,0,0,1,62.78,69.19,75.05,2,3,1,3
