In [9]:
# Load modules
import numpy as np
import pandas as pd
from gen_data_functions import gen_rum
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss

In [10]:
# Load the Swissmetro attribute design and prepare it in one pass:
#   1. read the tab-separated raw file;
#   2. keep only rows with a non-zero CHOICE and with all three alternatives
#      (train, Swissmetro, car) flagged as available;
#   3. renumber the rows from 0;
#   4. build the cost variables and divide every cost / travel time by 100.
# Train and Swissmetro costs are set to zero whenever GA is non-zero
# (presumably GA marks season-ticket holders -- confirm against the data docs).
data = (
    pd.read_csv('data/swissmetro.dat', sep='\t')
    .loc[lambda d: d['CHOICE'].ne(0)
                   & d[['TRAIN_AV', 'SM_AV', 'CAR_AV']].eq(1).all(axis=1)]
    .reset_index(drop=True)
    .assign(
        # New columns are appended in this order: SM_COST, TRAIN_COST, CAR_COST
        SM_COST=lambda d: d['SM_CO'] * d['GA'].eq(0) / 100,
        TRAIN_COST=lambda d: d['TRAIN_CO'] * d['GA'].eq(0) / 100,
        # Travel times are re-scaled in place (same column names)
        TRAIN_TT=lambda d: d['TRAIN_TT'] / 100,
        SM_TT=lambda d: d['SM_TT'] / 100,
        CAR_TT=lambda d: d['CAR_TT'] / 100,
        # Car cost has no GA adjustment
        CAR_COST=lambda d: d['CAR_CO'] / 100,
    )
)

In [11]:
# Problem dimensions
N = data.shape[0]  # observations remaining after cleaning
J = 3              # alternatives (one v_/p_ column per alternative downstream)
K = 2              # presumably attributes per alternative (cost, travel time) -- not used in the cells shown here

# Attribute columns, grouped by alternative: (COST, TT) for TRAIN, then SM, then CAR.
# Downstream cells slice X in consecutive pairs, so this ordering matters.
Xvars = [mode + '_' + attr for mode in ('TRAIN', 'SM', 'CAR') for attr in ('COST', 'TT')]
X = data.loc[:, Xvars].to_numpy()

# Seeded generator shared by the data-generating cells below; the same seed
# is also passed to train_test_split as random_state.
seed = 12345
rng = np.random.default_rng(seed)

In [12]:
# RUM process 1: utilities linear in cost and travel time.

# Define parameters: one coefficient per attribute, in the column order of
# each alternative's slice of X (cost first, then travel time)
beta = [-2,-3]

# Define utility functions (same coefficients for every alternative)
V1 = X[:,:2] @ beta    # TRAIN_COST, TRAIN_TT
V2 = X[:,2:4] @ beta   # SM_COST, SM_TT
V3 = X[:,4:] @ beta    # CAR_COST, CAR_TT

V = np.c_[V1,V2,V3]

# Generate RUM choices
# gen_rum is imported from gen_data_functions; it presumably returns the
# simulated choices, the utilities and the choice probabilities -- confirm there.
# NOTE(review): `rng` is the generator shared by the whole notebook, so the
# draws depend on how much of its stream earlier cells consumed. Use
# Restart & Run All to regenerate the exported files reproducibly.
Y, V, p = gen_rum(V,rng)

# Column names for the per-alternative utilities and probabilities
v_cols = ['v_' + str(j+1) for j in range(J)]
p_cols = ['p_' + str(j+1) for j in range(J)]

# Assemble attributes, choices, utilities and probabilities in one table
to_export = pd.DataFrame(np.c_[X,Y,V,p], columns=Xvars + ['CHOICE'] + v_cols + p_cols)

# Separate in train/test (stratified on the chosen alternative)
data_train, data_test = train_test_split(to_export,test_size=0.2,stratify=to_export['CHOICE'],random_state=seed)

# Compute metrics
# The label set is fixed from the full sample and passed explicitly: otherwise
# log_loss infers the classes from each split's y_true and fails when a split
# happens to miss one alternative. As before, the p_ columns are assumed to be
# in the sorted order of the CHOICE codes.
choice_labels = np.sort(to_export['CHOICE'].unique())
ll_full, ll_train, ll_test = (
    -log_loss(split['CHOICE'], split[p_cols], normalize=False, labels=choice_labels)
    for split in (to_export, data_train, data_test)
)

# Rho-squared on the test split, relative to an equal-shares model (p = 1/J)
r2_test = 1 - ll_test/(len(data_test)*np.log(1/(J)))

metrics = pd.Series(
    np.r_[ll_full,ll_train,ll_test,r2_test],
    index=['Log-lik (full)','Log-lik (train)','Log-lik (test)','Rho-sq (test)'],
    name='Values')

# Print the metrics
print('RUM process 1:')
print(f'Log-lik (full) = {round(ll_full,4)} / Log-lik (test) = {round(ll_test,4)} / Log-lik (train) = {round(ll_train,4)} / Rho-sq (test): {round(r2_test,4)}')

# Export the train/test splits and the metrics to CSV files
outputfile = 'data/data_rum_1'
data_train.to_csv(outputfile + '_train.csv',index=False)
data_test.to_csv(outputfile + '_test.csv',index=False)
metrics.to_csv(outputfile + '_metrics.csv',index=True)

print('Train data exported to ' + outputfile + '_train.csv')
print('Test data exported to ' + outputfile + '_test.csv')
print('Metrics exported to ' + outputfile + '_metrics.csv')

RUM process 1:
Log-lik (full) = -5807.5657 / Log-lik (test) = -1186.2038 / Log-lik (train) = -4621.3619 / Rho-sq (test): 0.4028
Train data exported to data/data_rum_1_train.csv
Test data exported to data/data_rum_1_test.csv
Metrics exported to data/data_rum_1_metrics.csv


In [15]:
# RUM process 4: utilities linear in the log of cost and travel time.

# Define parameters
delta = 0.         # NOTE(review): not used anywhere in this cell; kept in case other cells read it
beta = [-3.,-5.]   # coefficients on log(cost + 0.1) and log(travel time + 0.1)

# Define utility functions
# The 0.1 offset keeps the logarithm finite when an attribute equals zero
# (train / Swissmetro costs are set to zero for rows with GA != 0).
V1 = np.log(X[:,:2]+0.1) @ beta    # TRAIN_COST, TRAIN_TT
V2 = np.log(X[:,2:4]+0.1) @ beta   # SM_COST, SM_TT
V3 = np.log(X[:,4:]+0.1) @ beta    # CAR_COST, CAR_TT


# Stack the utilities, one column per alternative
V = np.c_[V1,V2,V3]

# Generate RUM choices
# gen_rum is imported from gen_data_functions; it presumably returns the
# simulated choices, the utilities and the choice probabilities -- confirm there.
# NOTE(review): `rng` is the generator shared by the whole notebook, so these
# draws depend on every cell that consumed it earlier. Use Restart & Run All
# to regenerate the exported files reproducibly.
Y, V, p = gen_rum(V,rng)

# Column names for the per-alternative utilities and probabilities
v_cols = ['v_' + str(j+1) for j in range(J)]
p_cols = ['p_' + str(j+1) for j in range(J)]

# Assemble attributes, choices, utilities and probabilities in one table
to_export = pd.DataFrame(np.c_[X,Y,V,p], columns=Xvars + ['CHOICE'] + v_cols + p_cols)

# Separate in train/test (stratified on the chosen alternative)
data_train, data_test = train_test_split(to_export,test_size=0.2,stratify=to_export['CHOICE'],random_state=seed)

# Compute metrics
# The label set is fixed from the full sample and passed explicitly: otherwise
# log_loss infers the classes from each split's y_true and fails when a split
# happens to miss one alternative. As before, the p_ columns are assumed to be
# in the sorted order of the CHOICE codes.
choice_labels = np.sort(to_export['CHOICE'].unique())
ll_full, ll_train, ll_test = (
    -log_loss(split['CHOICE'], split[p_cols], normalize=False, labels=choice_labels)
    for split in (to_export, data_train, data_test)
)

# Rho-squared on the test split, relative to an equal-shares model (p = 1/J)
r2_test = 1 - ll_test/(len(data_test)*np.log(1/(J)))

metrics = pd.Series(
    np.r_[ll_full,ll_train,ll_test,r2_test],
    index=['Log-lik (full)','Log-lik (train)','Log-lik (test)','Rho-sq (test)'],
    name='Values')

# Print the metrics
print('RUM process 4:')
print(f'Log-lik (full) = {round(ll_full,4)} / Log-lik (test) = {round(ll_test,4)} / Log-lik (train) = {round(ll_train,4)} / Rho-sq (test): {round(r2_test,4)}')

# Export the train/test splits and the metrics to CSV files
outputfile = 'data/data_rum_4'
data_train.to_csv(outputfile + '_train.csv',index=False)
data_test.to_csv(outputfile + '_test.csv',index=False)
metrics.to_csv(outputfile + '_metrics.csv',index=True)

print('Train data exported to ' + outputfile + '_train.csv')
print('Test data exported to ' + outputfile + '_test.csv')
print('Metrics exported to ' + outputfile + '_metrics.csv')

RUM process 4:
Log-lik (full) = -4425.5792 / Log-lik (test) = -911.3263 / Log-lik (train) = -3514.2528 / Rho-sq (test): 0.5412
Train data exported to data/data_rum_4_train.csv
Test data exported to data/data_rum_4_test.csv
Metrics exported to data/data_rum_4_metrics.csv
