In [4]:
import pandas as pd
import os
import glob

In [33]:
# Create CSV files of the tables collected from Access
# This is where to store them

path = './Access_Tables/*.pyobj'
files = glob.glob(path)

for f in files:
    df = pd.read_pickle(f)
    df.to_csv(f[:-6] + ".csv")
    print(f[:-6] + ".csv")

./Access_Tables/R_ASGS_2016.csv
./Access_Tables/WGT_DEMOG.csv
./Access_Tables/R_MAINMODE.csv
./Access_Tables/4_QTS_STOPS.csv
./Access_Tables/R_GROUP_MODE.csv
./Access_Tables/_Readme.csv
./Access_Tables/5_QTS_TRIPS.csv
./Access_Tables/RP_AGE_GROUP.csv
./Access_Tables/R_MAINACT.csv
./Access_Tables/R_LGA.csv
./Access_Tables/1_QTS_HOUSEHOLDS.csv
./Access_Tables/R_GROUP_MAINACT.csv
./Access_Tables/R_TRAVELWHYNOT.csv
./Access_Tables/3_QTS_VEHICLES.csv
./Access_Tables/R_ANZSCO.csv
./Access_Tables/R_OVERALL_PURPOSE.csv
./Access_Tables/R_REGION.csv
./Access_Tables/WGT_HH.csv
./Access_Tables/2_QTS_PERSONS.csv
./Access_Tables/R_TIME.csv
./Access_Tables/R_Distance_bins.csv


##### Modelling Notes:

Approaches
* Aggregate approach - "...model aggregate share of all or a segment of decision makers choosing each alternative as a function of the characteristics of the alternatives and socio-demographic attributes of the group." (Koppelman & Bhat, 2006. p. 9)


* Disaggregate approach - "... model individual choice resonses as a function of the characteristics of the alternagives available to and socio-demographic attributes of each individual." (Koppelman & Bhat, 2006. p9)


* Number of alternatives should be around 3 - 7 to reduce complexity

Contexts

* Urban Travel Model Choice Modelling vs Intercity Mode Choice Models
    * Purpose can differ
    * Modes can differ

* Trip Purposes (contexts)
    * home-based work (from home to work)
    * home-based shop/other (from home to shop/other)

    alternatives:
    * non-home-based trips
    
    
Choice sets

* Universal (all choices) ->
* Feasible (subset of universal that are feasible) ->
* Consideration (subset of feasible that are considered) <- **used in choice modelling**

Decision Rule

* Utility maximization rule:
    * Attribute vector of each alternative can be reduced to scalar utility value for that alternative (Koppelman & Bhat, 2006, p. 20)
    * An individual selects the alternative with the highest utility rate
    
Utility rule: _alternative, ‘i’, is chosen among a set of alternatives, if and only if the utility of alternative, ‘i’, is greater than or equal to the utility of all alternatives 2 , ‘j’, in the choice set, C._

Utility values
* Portion of the utility observed by the analyst
* Difference between the unknown utility used by the individual and the utility estimated by the analyst

U_it = V_it + E_it (epsilon)
where U_it = true utility of alternative 'i' to decision maker 't'
V_it = observable portion
E = the difference


Variables to consider
* Traveller related variables
    * Income
    * Automobile ownership
    * Number of workers
    * Sex
    * Age
    * Derived variables e.g. num autos / num workers
* Trip related variables
    * Trip purpose
    * Employment density at workplace
    * Population density at home location
    * Dummy variable indicating if workplace is in CBD
    * Time of day of travel
    * Origin
    * Destination
    * Travel party size
* Mode related variables
    * Travel time
    * In-vehicle travel time
    * Out of vehicle travel time
    * Walk time
    * Wait time
    * Number of transfers
    * Transit headway
    * Travel cost
    * Service frequency for carrier modes
* Interaction variables
    * Travel cost divided by household income
    * Travel time or cost interacted with sex / age
    * Out of vehicle time divided by total trip distance


In [61]:
# Import libraries
import pandas as pd
import numpy as np
import pickle

### Basic model with few variables

Variables chosen based on Koppelman and Bhat (2006)
* Income
* Automobile ownership
* Sex
* Age group
* Travel time
* Travel cost

In [95]:
# Import files
filepath = './Access_Tables/'
# Persons file
filename = '2_QTS_PERSONS.pyobj'
df_persons = pd.read_pickle(filepath + filename)
# Households file
filename = '1_QTS_HOUSEHOLDS.pyobj'
df_households = pd.read_pickle(filepath + filename)
# Trips file
filename = '5_QTS_TRIPS.pyobj'
df_trips = pd.read_pickle(filepath + filename)



print(df_persons.info())
print(df_households.info())
print(df_trips.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20202 entries, 0 to 20201
Data columns (total 43 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   HHID              20202 non-null  int16         
 1   PERSID            20202 non-null  object        
 2   AGEGROUP          17724 non-null  float64       
 3   SEX               17762 non-null  object        
 4   RELATIONSHIP      10917 non-null  object        
 5   CARLICENCE        20202 non-null  bool          
 6   CARLICTYPE        11830 non-null  object        
 7   MBLICENCE         20202 non-null  bool          
 8   MBLICTYPE         984 non-null    object        
 9   OTHERLICENCE      20202 non-null  bool          
 10  WORKSTATUS        14008 non-null  object        
 11  ANZSCO_1-digit    8094 non-null   Int64         
 12  ANZSCO_3-digit    8094 non-null   Int64         
 13  INDUSTRY          8854 non-null   object        
 14  STUDYING          4359

In [63]:
df_personhhs = df_persons.merge(right = df_households, how = 'left', on = 'HHID')

In [97]:
df_personhhstrips = df_personhhs.merge(right = df_trips, how = 'left', on = 'PERSID')

In [98]:
df_personhhstrips.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48717 entries, 0 to 48716
Data columns (total 84 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   HHID_x            48717 non-null  int16         
 1   PERSID            48717 non-null  object        
 2   AGEGROUP          46236 non-null  float64       
 3   SEX               46277 non-null  object        
 4   RELATIONSHIP      26142 non-null  object        
 5   CARLICENCE        48717 non-null  bool          
 6   CARLICTYPE        32895 non-null  object        
 7   MBLICENCE         48717 non-null  bool          
 8   MBLICTYPE         2826 non-null   object        
 9   OTHERLICENCE      48717 non-null  bool          
 10  WORKSTATUS        37414 non-null  object        
 11  ANZSCO_1-digit    21809 non-null  Int64         
 12  ANZSCO_3-digit    21809 non-null  Int64         
 13  INDUSTRY          23837 non-null  object        
 14  STUDYING          1040

In [99]:
cols_to_include=['AGEGROUP',
                 'SEX',
                 'CARLICENCE', 'MBLICENCE', 'OTHERLICENCE', 'ANYSTOPS', 'ASSISTAGE', 'ASSISTLTHC',
                 'ASSISTSTHC', 'ASSISTDISABILITY', 'ASSISTENGLISH', 'ASSISTOTHER', 'ASSISTANY','RIDESHAREENT',
                 'RIDESHAREHC','RIDESHAREED', 'RIDESHARESHOP', 'RIDESHAREWORK', 'RIDESHAREOTHER', 'TAXIENT',  'TAXIHC',
                 'TAXIED', 'TAXISHOP', 'TAXIWORK', 'TAXIOTHER',
                 'HHSIZE','BIKES',
                 'DWELLTYPE',
                 'SURVEYWEEK', 'STRATA_LGA',
                 'TRAVDATE', 'TRAVMONTH', 'TRAVYEAR', 'TRAVDOW', 'HOME_SA1_2016', 'MAINMODE']    

In [106]:
cols_to_include=['AGEGROUP',
                 #'SEX',
                 #'CARLICENCE', 'MBLICENCE', 'OTHERLICENCE', 'ANYSTOPS',
                 'HHSIZE','BIKES',
                 #'DWELLTYPE',
                 'SURVEYWEEK', 'STRATA_LGA',
                 'TRAVDATE', 'TRAVMONTH', 'TRAVYEAR', 'TRAVDOW', 'HOME_SA1_2016', 'MAINMODE', 'OVERALL_PURPOSE'] 

In [107]:
df_merged = df_personhhstrips[cols_to_include]

In [108]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 48717 entries, 0 to 48716
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   AGEGROUP         46236 non-null  float64
 1   HHSIZE           48717 non-null  float64
 2   BIKES            48521 non-null  float64
 3   SURVEYWEEK       48717 non-null  int64  
 4   STRATA_LGA       48717 non-null  int64  
 5   TRAVDATE         48717 non-null  int64  
 6   TRAVMONTH        48717 non-null  int64  
 7   TRAVYEAR         48717 non-null  int64  
 8   TRAVDOW          48717 non-null  int64  
 9   HOME_SA1_2016    48717 non-null  int64  
 10  MAINMODE         40470 non-null  object 
 11  OVERALL_PURPOSE  40470 non-null  object 
dtypes: float64(3), int64(7), object(2)
memory usage: 4.8+ MB


In [109]:
df_merged = df_merged.dropna()

In [110]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40465 entries, 0 to 48716
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   AGEGROUP         40465 non-null  float64
 1   HHSIZE           40465 non-null  float64
 2   BIKES            40465 non-null  float64
 3   SURVEYWEEK       40465 non-null  int64  
 4   STRATA_LGA       40465 non-null  int64  
 5   TRAVDATE         40465 non-null  int64  
 6   TRAVMONTH        40465 non-null  int64  
 7   TRAVYEAR         40465 non-null  int64  
 8   TRAVDOW          40465 non-null  int64  
 9   HOME_SA1_2016    40465 non-null  int64  
 10  MAINMODE         40465 non-null  object 
 11  OVERALL_PURPOSE  40465 non-null  object 
dtypes: float64(3), int64(7), object(2)
memory usage: 4.0+ MB


In [111]:
import pandas as pd
import biogeme . database as db
import biogeme . biogeme as bio

In [112]:
database = db.Database("BrisbanePT", df_merged)

biogemeError: Column MAINMODE in the database does contain object
Column OVERALL_PURPOSE in the database does contain object

In [92]:
df_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 17724 entries, 0 to 20201
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   AGEGROUP       17724 non-null  float64
 1   HHSIZE         17724 non-null  float64
 2   BIKES          17724 non-null  float64
 3   SURVEYWEEK     17724 non-null  int64  
 4   STRATA_LGA     17724 non-null  int64  
 5   TRAVDATE       17724 non-null  int64  
 6   TRAVMONTH      17724 non-null  int64  
 7   TRAVYEAR       17724 non-null  int64  
 8   TRAVDOW        17724 non-null  int64  
 9   HOME_SA1_2016  17724 non-null  int64  
dtypes: float64(3), int64(7)
memory usage: 1.5 MB


In [94]:
from biogeme.expressions import Beta, DefineVariable
globals().update(database.variables)