In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import linearmodels as lm
import matplotlib
import math
import statsmodels.formula.api as smf
import statsmodels.api as sm

from linearmodels.panel import PanelOLS
from linearmodels.panel import RandomEffects
from linearmodels.panel import FirstDifferenceOLS
from linearmodels.panel import compare
from matplotlib import pyplot as plt

from pandas.api.types import is_numeric_dtype

pd.set_option('display.max_columns', 500)

%matplotlib inline

In [2]:
df=pd.read_csv('final_estimation_file_apta_added.csv',
                    encoding='utf-8')
df.drop(columns=['Unnamed: 0'],inplace=True)

In [3]:
# read in and join the maintenance and restructure data
mr=pd.read_csv('maintenance_restructure.csv',
                    encoding='utf-8')
mr = mr[['CBSA', 'Mode', 'Year', 'MAINTENANCE', 'MAINTENANCE_NYC', 'MAINTENANCE_WMATA', 'RESTRUCTURE']]

df = df.merge(mr, on=['CBSA', 'Mode', 'Year'], how='left')

In [4]:
# read in mechanical and other failures

failures=pd.read_csv('maintenance_final.csv',encoding='utf-8')

failures.drop(columns=['Unnamed: 0'],inplace=True)

df=df.merge(failures, on=['CBSA','MNAME','Mode','Year'],how='left')

In [5]:
list(df)

['CBSA',
 'MNAME',
 'Mode',
 'Year',
 'UPT_ADJUSTED',
 'VRM_ADJUSTED',
 'FARE_ADJUSTED',
 'min_year',
 'max_year',
 'FARE_per_UPT',
 'PROBLEM_FLAG',
 'NOTE',
 'PROBLEM_FLAG2',
 'Tot_Pop',
 'Tot_Instate_Pop',
 'Tot_Outstate_Pop',
 'Tot_NonUSA_POP',
 'Total_Median_Income_Individual',
 'Native_Instate_Med_Inc_Indiv',
 'Native_Outstate_Med_Inc_Indiv',
 'Native_Outcountry_Med_Inc_Indiv',
 'Total_Pop_Poverty',
 'Pop_Below100_Poverty',
 'Pop_Below150_Poverty',
 'Pop_Above150_Poverty',
 'Age_under18',
 'Age_18to64',
 'Age_over64',
 'Total_HH',
 'HH_0Veh',
 'HH_1Veh',
 'HH_2Veh',
 'HH_3Veh',
 'HH_4+Veh',
 'PCT_HH_NO_VEH',
 'HH_MED_INC',
 'HH_MEAN_INC',
 'INC_U35',
 'INC_35_100',
 'INC_100P',
 'TOT_UNEMP_MSA',
 'TOT_EMP_MSA',
 'TOT_LABOR_MSA',
 'UNEMP_RATE_PCT',
 'EMP_RATE_PCT',
 'GasPrice',
 'Area_acre',
 'POP_CENSUSTRACT',
 'AVG_SPEED',
 '2018_Dollar_Multiplier',
 'TNC_ARRIVAL',
 'Link_x',
 'TNC_FLAG',
 'YEARS_SINCE_TNC',
 'FARE_ADJ_2018',
 'TOTAL_MED_INC_INDIV_2018',
 'NATIVE_INSTATE_MED_INC_

In [6]:
df=df.rename(columns={'UPT_ADJUSTED':'UPT_ADJ','VRM_ADJUSTED':'VRM_ADJ','FARE_ADJUSTED':'FARE_ADJ','AVG_FARE_2018':'FARE_per_UPT_2018',
                     'Mecha_Failures':'Mechanical_Failures'})

In [7]:
df['Mechanical_Failures']=df['Mechanical_Failures'].fillna(0)
df['Total_Failures']=df['Total_Failures'].fillna(0)
df['MDBF_Mechanical']=np.where(df['Mechanical_Failures']>0,df['VRM_ADJ']/df['Mechanical_Failures'],df['VRM_ADJ'])
df['MDBF_Total']=np.where(df['Total_Failures']>0,df['VRM_ADJ']/df['Total_Failures'],df['VRM_ADJ'])

In [8]:
# read in and join the bike share updates
bsu=pd.read_csv('bike_share_update.csv',
                    encoding='utf-8')
bsu = bsu[['CBSA', 'Mode', 'Year', 'PBS_Flag_Update']]

df = df.merge(bsu, on=['CBSA', 'Mode', 'Year'], how='left')

#df['PBS_Flag_Missing'] = df['PBS_Flag'].apply(lambda x : x.isnan())
df['PBS_Flag'] = np.where(df['PBS_Flag'].isnull(), df['PBS_Flag_Update'], df['PBS_Flag'])

In [9]:
# 4 apta clusers
df['CLUSTER_APTA4'] = np.floor(df['CLUSTER_APTA']/10)

In [10]:
# exclude if ridership is zero--missing data
df = df[df['UPT_ADJ']>0]

In [11]:
# fill in missing data as needed

# min fare is zero
df['FARE_per_UPT_2018'] = df['FARE_per_UPT_2018'].apply(lambda x : max(x,0))

# max fare is $20
df['FARE_per_UPT_2018'] = df['FARE_per_UPT_2018'].apply(lambda x : min(x,20))

# zero vehicle HH can't go negative
df['HH_0Veh'] = df['HH_0Veh'].apply(lambda x : max(x,0))

In [12]:
df=df.drop_duplicates(['CBSA','Mode','Year'])

In [13]:
# exclude problematic data
old_len = len(df)
df = df[df['VRM_ADJ']>0]
new_len = len(df)
print('Excluding ' + str(old_len-new_len) + ' records with missing VRM') 

# exclude if problem flag 2 is set
old_len = len(df)
df = df[df['PROBLEM_FLAG2']!=1]
new_len = len(df)
print('Excluding ' + str(old_len-new_len) + ' records with problem flag set to 1') 




Excluding 2 records with missing VRM
Excluding 36 records with problem flag set to 1


In [14]:
# exclude discontinuous data
df = df.sort_values(by=['CBSA', 'Mode', 'Year'])

excluded_records = 100
while excluded_records > 0: 
    old_len = len(df)
    df['YearDiff'] = df['Year'].shift(-1) - df['Year']
    df = df[(df['YearDiff']==1) | (df['Year']==2018)]
    excluded_records = old_len - len(df)
    print('Excluding ' + str(excluded_records) + ' records with discontinuities in year.') 




Excluding 1 records with discontinuities in year.
Excluding 1 records with discontinuities in year.
Excluding 0 records with discontinuities in year.


In [15]:
df['BUS_FLAG'] = np.where(df['Mode']=='Bus', 1, 0)
df['RAIL_FLAG'] = np.where(df['Mode']=='Rail', 1, 0)

In [16]:
# include population + employment
df['POP_EMP'] = df['Tot_Pop'] + df['TOT_EMP_MSA']
df['HH_EMP'] = df['Total_HH'] + df['TOT_EMP_MSA']

# percent of population in transit supportive density
df['TSD_POP_PCT'] = df['POP_CENSUSTRACT'] / df['Tot_Pop']

# percent of pop born outside USA
df['Tot_NonUSA_POP_pct'] = df['Tot_NonUSA_POP'] / df['Tot_Pop'] * 100

In [17]:
# time effects
df['YEARS_SINCE_2002']  = df['Year'] - 2002

df['YEARS_2002_2010']  = df['Year'].apply(lambda x : min(x-2002, 8))
df['YEARS_AFTER_2010'] = df['Year'].apply(lambda x : max(x-2010, 0))

df['YEARS_2002_2014']  = df['Year'].apply(lambda x : min(x-2002, 12))
df['YEARS_AFTER_2014'] = df['Year'].apply(lambda x : max(x-2014, 0))


df['YEARS_SINCE_2002_BUS']  = df['YEARS_SINCE_2002'] * df['BUS_FLAG']
df['YEARS_SINCE_2002_RAIL'] = df['YEARS_SINCE_2002'] * df['RAIL_FLAG']

df['YEARS_2002_2010_BUS']   = df['YEARS_2002_2010'] * df['BUS_FLAG']
df['YEARS_AFTER_2010_BUS']  = df['YEARS_AFTER_2010']* df['BUS_FLAG']
df['YEARS_2002_2010_RAIL']  = df['YEARS_2002_2010'] * df['RAIL_FLAG']
df['YEARS_AFTER_2010_RAIL'] = df['YEARS_AFTER_2010']* df['RAIL_FLAG']

df['YEARS_2002_2014_BUS']   = df['YEARS_2002_2014'] * df['BUS_FLAG']
df['YEARS_AFTER_2014_BUS']  = df['YEARS_AFTER_2014']* df['BUS_FLAG']
df['YEARS_2002_2014_RAIL']  = df['YEARS_2002_2014'] * df['RAIL_FLAG']
df['YEARS_AFTER_2014_RAIL'] = df['YEARS_AFTER_2014']* df['RAIL_FLAG']

In [18]:
# more time effects

df['YEARS_AFTER_2012'] = df['Year'].apply(lambda x : max(x-2012, 0))

df['TNC_YEARS_AFTER_2012'] = df['YEARS_AFTER_2012'] * df['TNC_FLAG']

df['TNC_YEARS_AFTER_2012_BUS'] = df['TNC_YEARS_AFTER_2012'] * df['BUS_FLAG']
df['TNC_YEARS_AFTER_2012_RAIL'] = df['TNC_YEARS_AFTER_2012'] * df['RAIL_FLAG']

In [19]:
# test rail ramp up period
df['YEARS_SINCE_RAIL_START'] = (df['Year'] - df['min_year']) * df['RAIL_FLAG']
df['YEARS_SINCE_RAIL_START'] = np.where(df['min_year']==2002, 0, df['YEARS_SINCE_RAIL_START'])

df['YEARS_SINCE_RAIL_START_1'] = df['YEARS_SINCE_RAIL_START'].apply(lambda x : min(x, 1))
df['YEARS_SINCE_RAIL_START_2'] = df['YEARS_SINCE_RAIL_START'].apply(lambda x : min(x, 2))
df['YEARS_SINCE_RAIL_START_3'] = df['YEARS_SINCE_RAIL_START'].apply(lambda x : min(x, 3))



In [20]:
# calculate bus vs rail stuff

# various bus/rail specifics

df['TNC_FLAG_BUS'] = df['TNC_FLAG'] * df['BUS_FLAG']
df['TNC_FLAG_RAIL'] = df['TNC_FLAG'] * df['RAIL_FLAG']

df['YEARS_SINCE_TNC_BUS'] = df['YEARS_SINCE_TNC'] * df['BUS_FLAG']
df['YEARS_SINCE_TNC_RAIL'] = df['YEARS_SINCE_TNC'] * df['RAIL_FLAG']

df['dockless_flag'] = df['dockCt'].apply(lambda x : min(x,1))
df['scooter_flag']  = df['scooterCt'].apply(lambda x : min(x,1))

df['BIKE_SHARE'] = df['PBS_Flag'] + df['dockless_flag']
df['BIKE_SHARE'] = df['BIKE_SHARE'].apply(lambda x : min(x,1))


df['PBS_Flag_BUS']   = df['PBS_Flag']   * df['BUS_FLAG']
df['dockCt_BUS']     = df['dockCt']     * df['BUS_FLAG']
df['docklessCt_BUS'] = df['docklessCt'] * df['BUS_FLAG']
df['scooterCt_BUS']  = df['scooterCt']  * df['BUS_FLAG']
df['dockless_flag_BUS'] = df['dockless_flag'] * df['BUS_FLAG']
df['scooter_flag_BUS']  = df['scooter_flag']  * df['BUS_FLAG']
df['BIKE_SHARE_BUS']  = df['BIKE_SHARE']  * df['BUS_FLAG']


df['PBS_Flag_RAIL']   = df['PBS_Flag']   * df['RAIL_FLAG']
df['dockCt_RAIL']     = df['dockCt']     * df['RAIL_FLAG']
df['docklessCt_RAIL'] = df['docklessCt'] * df['RAIL_FLAG']
df['scooterCt_RAIL']  = df['scooterCt']  * df['RAIL_FLAG']
df['dockless_flag_RAIL'] = df['dockless_flag'] * df['RAIL_FLAG']
df['scooter_flag_RAIL']  = df['scooter_flag']  * df['RAIL_FLAG']
df['BIKE_SHARE_RAIL']  = df['BIKE_SHARE']  * df['RAIL_FLAG']

In [21]:
# convert JTW to numeric columns
df['JTW_DA_PCT']        = df['JTW_DA_PCT'].replace('#VALUE!', np.nan)
df['JTW_CARPOOLED_PCT'] = df['JTW_CARPOOLED_PCT'].replace('#VALUE!', np.nan)
df['JTW_TRANSIT_PCT']   = df['JTW_TRANSIT_PCT'].replace('#VALUE!', np.nan)
df['JTW_WALK_PCT']      = df['JTW_WALK_PCT'].replace('#VALUE!', np.nan)
df['JTW_BICYCLE_PCT']   = df['JTW_BICYCLE_PCT'].replace('#VALUE!', np.nan)
df['JTW_OTHER_PCT']     = df['JTW_OTHER_PCT'].replace('#VALUE!', np.nan)
df['JTW_HOME_PCT']      = df['JTW_HOME_PCT'].replace('#VALUE!', np.nan)


df['JTW_DA_PCT']        = df['JTW_DA_PCT'].replace('N', np.nan)
df['JTW_CARPOOLED_PCT'] = df['JTW_CARPOOLED_PCT'].replace('N', np.nan)
df['JTW_TRANSIT_PCT']   = df['JTW_TRANSIT_PCT'].replace('N', np.nan)
df['JTW_WALK_PCT']      = df['JTW_WALK_PCT'].replace('N', np.nan)
df['JTW_BICYCLE_PCT']   = df['JTW_BICYCLE_PCT'].replace('N', np.nan)
df['JTW_OTHER_PCT']     = df['JTW_OTHER_PCT'].replace('N', np.nan)
df['JTW_HOME_PCT']      = df['JTW_HOME_PCT'].replace('N', np.nan)



In [22]:
# convert JTW to numeric columns
df['JTW_DA_PCT']        = df['JTW_DA_PCT'].astype(float)
df['JTW_CARPOOLED_PCT'] = df['JTW_CARPOOLED_PCT'].astype(float)
df['JTW_TRANSIT_PCT']   = df['JTW_TRANSIT_PCT'].astype(float)
df['JTW_WALK_PCT']      = df['JTW_WALK_PCT'].astype(float)
df['JTW_BICYCLE_PCT']   = df['JTW_BICYCLE_PCT'].astype(float)
df['JTW_OTHER_PCT']     = df['JTW_OTHER_PCT'].astype(float)
df['JTW_HOME_PCT']      = df['JTW_HOME_PCT'].astype(float)

In [23]:
# immigrant population percentage
df['Tot_NonUSA_POP_pct'] = df['Tot_NonUSA_POP'] / df['Tot_Pop'] * 100

In [24]:
# bike share plus scooters for rail
df['BIKE_SCOOTER_RAIL'] = df['BIKE_SHARE_RAIL'] + df['scooter_flag_RAIL']

# different spec of bike and scooter
df['BIKE_SCOOTER_SCOOTER_RAIL'] = df['BIKE_SHARE_RAIL'] + df['scooter_flag_RAIL'] + df['scooter_flag_RAIL']

# different spec of bike and scooter
df['BIKE_SCOOTER_SCOOTER_RAIL'] = df['BIKE_SHARE_RAIL'] + df['scooter_flag_RAIL'] + df['scooter_flag_RAIL']

# merge bike and scooter for bus
df['BIKE_SCOOTER_SCOOTER_BUS'] = df['BIKE_SHARE_BUS'] + df['scooter_flag_BUS'] + df['scooter_flag_BUS']

# merge bike and scooter for bus
df['BIKE_SCOOTER_BUS'] = df['BIKE_SHARE_BUS'] + df['scooter_flag_BUS'] 
df['BIKE_SCOOTER_RAIL'] = df['BIKE_SHARE_RAIL'] + df['scooter_flag_RAIL'] 

# scooter or bike
df['BIKE_OR_SCOOTER_BUS']  = df['BIKE_SHARE_BUS'] + df['scooter_flag_BUS'] 
df['BIKE_OR_SCOOTER_BUS']  = df['BIKE_OR_SCOOTER_BUS'].apply(lambda x : min(x, 1))

df['BIKE_OR_SCOOTER_RAIL'] = df['BIKE_SHARE_RAIL'] + df['scooter_flag_RAIL'] 
df['BIKE_OR_SCOOTER_RAIL'] = df['BIKE_OR_SCOOTER_RAIL'].apply(lambda x : min(x, 1))

# calculate average vehicles
df['AVG_VEHS'] = (df['HH_1Veh'] + 2*df['HH_2Veh'] + 3*df['HH_3Veh'] + 4*df['HH_4+Veh']) / (df['HH_0Veh'] + df['HH_1Veh'] + df['HH_2Veh'] + df['HH_3Veh'] + df['HH_4+Veh'])
df['AVG_VEHS_log'] = np.log(1+df['AVG_VEHS'])

In [25]:
# walk bike other commuting
df['JTW_WALK_BIKE_OTHER_PCT'] = df['JTW_WALK_PCT'] + df['JTW_BICYCLE_PCT'] + df['JTW_OTHER_PCT']

# walk bike commuting
df['JTW_WALK_BIKE_PCT'] = df['JTW_WALK_PCT'] + df['JTW_BICYCLE_PCT']

In [26]:
df['VRM_ADJ_BUS'] = df['VRM_ADJ'] * df['BUS_FLAG']
df['VRM_ADJ_RAIL'] = df['VRM_ADJ'] * df['RAIL_FLAG']


#maintenance effect on competing mode
df['MDBF_Mechanical_RAIL']  = df['MDBF_Mechanical'] * df['RAIL_FLAG']
df['MDBF_Mechanical_BUS'] = df['MDBF_Mechanical']  * df['BUS_FLAG']

df['MDBF_Total_RAIL']  = df['MDBF_Total'] * df['RAIL_FLAG']
df['MDBF_Total_BUS'] = df['MDBF_Total']  * df['BUS_FLAG']

In [27]:
# segment TNCs by cluster
df['HI_OPEX']  = np.where(df['CLUSTER_APTA4']==1, 1, 0)
df['MID_OPEX'] = np.where(df['CLUSTER_APTA4']==2, 1, 0)
df['LOW_OPEX'] = np.where(df['CLUSTER_APTA4']==3, 1, 0)
df['NEW_YORK'] = np.where(df['CLUSTER_APTA4']==10, 1, 0)

df['YEARS_SINCE_TNC_BUS_HI']  = df['YEARS_SINCE_TNC_BUS'] * df['HI_OPEX']
df['YEARS_SINCE_TNC_BUS_MID'] = df['YEARS_SINCE_TNC_BUS'] * df['MID_OPEX']
df['YEARS_SINCE_TNC_BUS_LOW'] = df['YEARS_SINCE_TNC_BUS'] * df['LOW_OPEX']
df['YEARS_SINCE_TNC_BUS_NY']  = df['YEARS_SINCE_TNC_BUS'] * df['NEW_YORK']

df['YEARS_SINCE_TNC_RAIL_HI']  = df['YEARS_SINCE_TNC_RAIL'] * df['HI_OPEX']
df['YEARS_SINCE_TNC_RAIL_MID'] = df['YEARS_SINCE_TNC_RAIL'] * df['MID_OPEX']
df['YEARS_SINCE_TNC_RAIL_LOW'] = df['YEARS_SINCE_TNC_RAIL'] * df['LOW_OPEX']
df['YEARS_SINCE_TNC_RAIL_NY']  = df['YEARS_SINCE_TNC_RAIL'] * df['NEW_YORK']

df['YEARS_SINCE_TNC_BUS_HINY']  = df['YEARS_SINCE_TNC_BUS_HI'] + df['YEARS_SINCE_TNC_BUS_NY']
df['YEARS_SINCE_TNC_RAIL_HINY']  = df['YEARS_SINCE_TNC_RAIL_HI'] + df['YEARS_SINCE_TNC_RAIL_NY']

df['YEARS_SINCE_TNC_BUS_MIDLOW']  = df['YEARS_SINCE_TNC_BUS_MID'] + df['YEARS_SINCE_TNC_BUS_LOW']
df['YEARS_SINCE_TNC_RAIL_MIDLOW']  = df['YEARS_SINCE_TNC_RAIL_MID'] + df['YEARS_SINCE_TNC_RAIL_LOW']

In [28]:
# bikes and scooters by segment

df['BIKE_SHARE_HI']  = df['BIKE_SHARE'] * df['HI_OPEX']
df['BIKE_SHARE_MID'] = df['BIKE_SHARE'] * df['MID_OPEX']
df['BIKE_SHARE_LOW'] = df['BIKE_SHARE'] * df['LOW_OPEX']
df['BIKE_SHARE_NY']  = df['BIKE_SHARE'] * df['NEW_YORK']

df['scooter_flag_HI']  = df['scooter_flag'] * df['HI_OPEX']
df['scooter_flag_MID'] = df['scooter_flag'] * df['MID_OPEX']
df['scooter_flag_LOW'] = df['scooter_flag'] * df['LOW_OPEX']
df['scooter_flag_NY']  = df['scooter_flag'] * df['NEW_YORK']


df['BIKE_SHARE_HINY']  = df['BIKE_SHARE_HI'] + df['BIKE_SHARE_NY']
df['BIKE_SHARE_MIDLOW']  = df['BIKE_SHARE_MID'] + df['BIKE_SHARE_LOW']

df['scooter_flag_HINY']  = df['scooter_flag_HI'] + df['scooter_flag_NY']
df['scooter_flag_MIDLOW']  = df['scooter_flag_MID'] + df['scooter_flag_LOW']

In [29]:
# set the indices
df['ID'] = df['MNAME'] + '-' + df['Mode']
df=df.set_index(['ID','Year'])

In [30]:
# keep only the numeric columns -- the estimation will give an error otherwise
df = df.select_dtypes(include=[np.number])

In [31]:
# create a log of all fields
for col in df.columns:
    df[col+'_log'] = np.log(df[col]+1)

  This is separate from the ipykernel package so we can avoid doing imports until
  This is separate from the ipykernel package so we can avoid doing imports until


In [32]:
# Test TNC ramp up

df['YEARS_SINCE_TNC_BUS_2'] = df['YEARS_SINCE_TNC_BUS'].apply(lambda x : min(x, 2))
df['YEARS_SINCE_TNC_BUS_3'] = df['YEARS_SINCE_TNC_BUS'].apply(lambda x : min(x, 3))
df['YEARS_SINCE_TNC_BUS_4'] = df['YEARS_SINCE_TNC_BUS'].apply(lambda x : min(x, 4))
df['YEARS_SINCE_TNC_BUS_5'] = df['YEARS_SINCE_TNC_BUS'].apply(lambda x : min(x, 5))

df['YEARS_SINCE_TNC_RAIL_2'] = df['YEARS_SINCE_TNC_RAIL'].apply(lambda x : min(x, 2))
df['YEARS_SINCE_TNC_RAIL_3'] = df['YEARS_SINCE_TNC_RAIL'].apply(lambda x : min(x, 3))
df['YEARS_SINCE_TNC_RAIL_4'] = df['YEARS_SINCE_TNC_RAIL'].apply(lambda x : min(x, 4))
df['YEARS_SINCE_TNC_RAIL_5'] = df['YEARS_SINCE_TNC_RAIL'].apply(lambda x : min(x, 5))

In [33]:
# count the first year of TNC presence
df['YEARS_SINCE_TNC_BUS2'] = df['YEARS_SINCE_TNC_BUS'] + df['TNC_FLAG_BUS']
df['YEARS_SINCE_TNC_RAIL2'] = df['YEARS_SINCE_TNC_RAIL'] + df['TNC_FLAG_RAIL']


In [34]:
## start in first year TNCs show up

df['YEARS_SINCE_TNC_BUS2_NY']  = df['YEARS_SINCE_TNC_BUS2'] * df['NEW_YORK']
df['YEARS_SINCE_TNC_BUS2_HI']  = df['YEARS_SINCE_TNC_BUS2'] * df['HI_OPEX']
df['YEARS_SINCE_TNC_BUS2_MID'] = df['YEARS_SINCE_TNC_BUS2'] * df['MID_OPEX']
df['YEARS_SINCE_TNC_BUS2_LOW'] = df['YEARS_SINCE_TNC_BUS2'] * df['LOW_OPEX']

df['YEARS_SINCE_TNC_BUS2_HINY']   = df['YEARS_SINCE_TNC_BUS2_NY'] + df['YEARS_SINCE_TNC_BUS2_HI']
df['YEARS_SINCE_TNC_BUS2_MIDLOW'] = df['YEARS_SINCE_TNC_BUS2_MID'] + df['YEARS_SINCE_TNC_BUS2_LOW']

df['YEARS_SINCE_TNC_RAIL2_NY']  = df['YEARS_SINCE_TNC_RAIL2'] * df['NEW_YORK']
df['YEARS_SINCE_TNC_RAIL2_HI']  = df['YEARS_SINCE_TNC_RAIL2'] * df['HI_OPEX']
df['YEARS_SINCE_TNC_RAIL2_MID'] = df['YEARS_SINCE_TNC_RAIL2'] * df['MID_OPEX']
df['YEARS_SINCE_TNC_RAIL2_LOW'] = df['YEARS_SINCE_TNC_RAIL2'] * df['LOW_OPEX']

df['YEARS_SINCE_TNC_RAIL2_HINY']   = df['YEARS_SINCE_TNC_RAIL2_NY'] + df['YEARS_SINCE_TNC_RAIL2_HI']
df['YEARS_SINCE_TNC_RAIL2_MIDLOW'] = df['YEARS_SINCE_TNC_RAIL2_MID'] + df['YEARS_SINCE_TNC_RAIL2_LOW']

In [35]:
## BIKE SHARE SEGMENTATION

df['BIKE_SHARE_NY']  = df['BIKE_SHARE'] * df['NEW_YORK']
df['BIKE_SHARE_HI']  = df['BIKE_SHARE'] * df['HI_OPEX']
df['BIKE_SHARE_MID'] = df['BIKE_SHARE'] * df['MID_OPEX']
df['BIKE_SHARE_LOW'] = df['BIKE_SHARE'] * df['LOW_OPEX']

df['BIKE_SHARE_HINY']   = df['BIKE_SHARE_NY'] + df['BIKE_SHARE_HI']
df['BIKE_SHARE_MIDLOW'] = df['BIKE_SHARE_MID'] + df['BIKE_SHARE_LOW']

df['scooter_flag_NY']  = df['scooter_flag'] * df['NEW_YORK']
df['scooter_flag_HI']  = df['scooter_flag'] * df['HI_OPEX']
df['scooter_flag_MID'] = df['scooter_flag'] * df['MID_OPEX']
df['scooter_flag_LOW'] = df['scooter_flag'] * df['LOW_OPEX']

df['scooter_flag_HINY']   = df['scooter_flag_NY'] + df['scooter_flag_HI']
df['scooter_flag_MIDLOW'] = df['scooter_flag_MID'] + df['scooter_flag_LOW']



In [36]:
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + WEIGHTED_POP_DENSITY \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS2_HINY \
                    + YEARS_SINCE_TNC_BUS2_MIDLOW \
                    + YEARS_SINCE_TNC_RAIL2_HINY \
                    + YEARS_SINCE_TNC_RAIL2_MIDLOW \
                    + BIKE_SHARE \
                    + MDBF_Total_log \
                    + scooter_flag \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6283
Estimator:                   PanelOLS   R-squared (Between):              0.9426
No. Observations:                4145   R-squared (Within):               0.6283
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9433
Time:                        11:38:35   Log-likelihood                    1309.1
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      434.67
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(15,3858)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             434.67
                            

Inputs contain missing values. Dropping rows with missing observations.


In [37]:
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS2_HINY \
                    + YEARS_SINCE_TNC_BUS2_MIDLOW \
                    + YEARS_SINCE_TNC_RAIL2_HINY \
                    + YEARS_SINCE_TNC_RAIL2_MIDLOW \
                    + BIKE_SHARE \
                    + MDBF_Mechanical_log \
                    + scooter_flag \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6249
Estimator:                   PanelOLS   R-squared (Between):              0.9325
No. Observations:                4155   R-squared (Within):               0.6249
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9329
Time:                        11:38:43   Log-likelihood                    1278.6
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      460.36
Entities:                         273   P-value                           0.0000
Avg Obs:                       15.220   Distribution:                 F(14,3868)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             460.36
                            

In [38]:
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + WEIGHTED_POP_DENSITY_log \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS2_HINY \
                    + YEARS_SINCE_TNC_BUS2_MIDLOW \
                    + YEARS_SINCE_TNC_RAIL2_HINY \
                    + YEARS_SINCE_TNC_RAIL2_MIDLOW \
                    + BIKE_SHARE \
                    + MDBF_Total_log \
                    + scooter_flag \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6301
Estimator:                   PanelOLS   R-squared (Between):              0.9705
No. Observations:                4145   R-squared (Within):               0.6301
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9710
Time:                        11:38:44   Log-likelihood                    1319.5
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      438.16
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(15,3858)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             438.16
                            

In [39]:
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + WEIGHTED_POP_DENSITY_log \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS2_HINY \
                    + YEARS_SINCE_TNC_BUS2_MIDLOW \
                    + YEARS_SINCE_TNC_RAIL2_HINY \
                    + YEARS_SINCE_TNC_RAIL2_MIDLOW \
                    + BIKE_SHARE \
                    + scooter_flag \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6300
Estimator:                   PanelOLS   R-squared (Between):              0.9708
No. Observations:                4145   R-squared (Within):               0.6300
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9712
Time:                        11:38:44   Log-likelihood                    1318.6
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      469.26
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(14,3859)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             469.26
                            

In [40]:
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + WEIGHTED_POP_DENSITY_log \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6305
Estimator:                   PanelOLS   R-squared (Between):              0.9641
No. Observations:                4145   R-squared (Within):               0.6305
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9647
Time:                        11:38:45   Log-likelihood                    1321.7
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      549.04
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(12,3861)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             549.04
                            

In [41]:
df['YEARS_SINCE_TNC_BUS_SQRD']=df['YEARS_SINCE_TNC_BUS']*df['YEARS_SINCE_TNC_BUS']
df['YEARS_SINCE_TNC_RAIL_SQRD']=df['YEARS_SINCE_TNC_RAIL']*df['YEARS_SINCE_TNC_RAIL']

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + WEIGHTED_POP_DENSITY_log \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS_SQRD \
                    + YEARS_SINCE_TNC_RAIL_SQRD \
                    + BIKE_SHARE \
                    + scooter_flag \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6301
Estimator:                   PanelOLS   R-squared (Between):              0.9748
No. Observations:                4145   R-squared (Within):               0.6301
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9753
Time:                        11:38:45   Log-likelihood                    1319.4
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      548.08
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(12,3861)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             548.08
                            

In [42]:
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + WEIGHTED_POP_DENSITY_log \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + YEARS_SINCE_TNC_BUS_SQRD \
                    + YEARS_SINCE_TNC_RAIL_SQRD \
                    + BIKE_SHARE \
                    + scooter_flag \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6309
Estimator:                   PanelOLS   R-squared (Between):              0.9646
No. Observations:                4145   R-squared (Within):               0.6309
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9652
Time:                        11:38:46   Log-likelihood                    1323.7
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      471.08
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(14,3859)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             471.08
                            

In [43]:
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + WEIGHTED_POP_DENSITY_log \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + YEARS_SINCE_TNC_BUS_SQRD \
                    + YEARS_SINCE_TNC_RAIL_SQRD \
                    + BIKE_SHARE \
                    + scooter_flag \
                    + EntityEffects \
                    + MDBF_Total_log \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6310
Estimator:                   PanelOLS   R-squared (Between):              0.9645
No. Observations:                4145   R-squared (Within):               0.6310
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9651
Time:                        11:38:47   Log-likelihood                    1324.6
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      439.87
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(15,3858)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             439.87
                            

In [44]:
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + WEIGHTED_POP_DENSITY_log \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + YEARS_SINCE_TNC_BUS_SQRD \
                    + YEARS_SINCE_TNC_RAIL_SQRD \
                    + BIKE_SHARE \
                    + scooter_flag \
                    + EntityEffects \
                    + MDBF_Total \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6309
Estimator:                   PanelOLS   R-squared (Between):              0.9652
No. Observations:                4145   R-squared (Within):               0.6309
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9658
Time:                        11:38:48   Log-likelihood                    1324.0
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      439.66
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(15,3858)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             439.66
                            

In [45]:
def alt_mode(df):
    if (df['BIKE_SHARE']==1) or (df['scooter_flag']==1) or (df['TNC_FLAG']==1):
        x=1
    else:
        x=0
    return x
    
df['ALT_MODE']=df.apply(alt_mode,axis=1)
df['FAILURE_ALT_MODE_INTERACTION']=df['ALT_MODE']*df['MDBF_Total']
df['FAILURE_ALT_MODE_INTERACTION_log'] = np.log(df['FAILURE_ALT_MODE_INTERACTION']+1)

In [46]:
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + WEIGHTED_POP_DENSITY_log \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + YEARS_SINCE_TNC_BUS_SQRD \
                    + YEARS_SINCE_TNC_RAIL_SQRD \
                    + BIKE_SHARE \
                    + scooter_flag \
                    + EntityEffects \
                    + FAILURE_ALT_MODE_INTERACTION_log  \
                    ',data=df)
res=mod.fit()
print(res)


                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6309
Estimator:                   PanelOLS   R-squared (Between):              0.9667
No. Observations:                4145   R-squared (Within):               0.6309
Date:                Tue, Mar 31 2020   R-squared (Overall):              0.9673
Time:                        22:59:31   Log-likelihood                    1323.9
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      439.64
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(15,3858)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             439.64
                            

In [46]:
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + WEIGHTED_POP_DENSITY_log \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + YEARS_SINCE_TNC_BUS_SQRD \
                    + YEARS_SINCE_TNC_RAIL_SQRD \
                    + BIKE_SHARE \
                    + scooter_flag \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6309
Estimator:                   PanelOLS   R-squared (Between):              0.9646
No. Observations:                4145   R-squared (Within):               0.6309
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9652
Time:                        11:38:54   Log-likelihood                    1323.7
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      471.08
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(14,3859)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             471.08
                            

In [47]:
## Create FACs for this model as well

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + WEIGHTED_POP_DENSITY_log \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + YEARS_SINCE_TNC_BUS_SQRD \
                    + BIKE_SHARE \
                    + scooter_flag \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6306
Estimator:                   PanelOLS   R-squared (Between):              0.9636
No. Observations:                4145   R-squared (Within):               0.6306
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9642
Time:                        11:39:03   Log-likelihood                    1322.3
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      506.90
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(13,3860)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             506.90
                            

In [48]:
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + WEIGHTED_POP_DENSITY_log \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag \
                    + EntityEffects \
                    + FAILURE_ALT_MODE_INTERACTION_log  \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6306
Estimator:                   PanelOLS   R-squared (Between):              0.9660
No. Observations:                4145   R-squared (Within):               0.6306
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9666
Time:                        11:39:08   Log-likelihood                    1322.0
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      506.78
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(13,3860)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             506.78
                            

In [49]:
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + WEIGHTED_POP_DENSITY_log \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + EntityEffects \
                    + FAILURE_ALT_MODE_INTERACTION_log  \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6300
Estimator:                   PanelOLS   R-squared (Between):              0.9744
No. Observations:                4145   R-squared (Within):               0.6300
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9749
Time:                        11:39:12   Log-likelihood                    1319.1
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      547.94
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(12,3861)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             547.94
                            

In [50]:
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + WEIGHTED_POP_DENSITY_log \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + EntityEffects \
                    + MDBF_Total_log  \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6301
Estimator:                   PanelOLS   R-squared (Between):              0.9723
No. Observations:                4145   R-squared (Within):               0.6301
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9727
Time:                        11:39:15   Log-likelihood                    1319.4
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      548.06
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(12,3861)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             548.06
                            

In [51]:
### preferred model
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + WEIGHTED_POP_DENSITY_log \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + EntityEffects \
                    + scooter_flag  \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6305
Estimator:                   PanelOLS   R-squared (Between):              0.9641
No. Observations:                4145   R-squared (Within):               0.6305
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9647
Time:                        11:39:16   Log-likelihood                    1321.7
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      549.04
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(12,3861)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             549.04
                            

In [52]:
## Create FACs for this model as well

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + WEIGHTED_POP_DENSITY_log \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + YEARS_SINCE_TNC_BUS_SQRD \
                    + BIKE_SHARE \
                    + scooter_flag \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6306
Estimator:                   PanelOLS   R-squared (Between):              0.9636
No. Observations:                4145   R-squared (Within):               0.6306
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9642
Time:                        11:39:17   Log-likelihood                    1322.3
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      506.90
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(13,3860)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             506.90
                            

In [53]:
##################################################################3333

In [54]:
### preferred model -- from before
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + WEIGHTED_POP_DENSITY_log \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + EntityEffects \
                    + scooter_flag  \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6305
Estimator:                   PanelOLS   R-squared (Between):              0.9641
No. Observations:                4145   R-squared (Within):               0.6305
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9647
Time:                        11:40:19   Log-likelihood                    1321.7
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      549.04
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(12,3861)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             549.04
                            

In [55]:
# without weighted population density

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + EntityEffects \
                    + scooter_flag  \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6249
Estimator:                   PanelOLS   R-squared (Between):              0.9274
No. Observations:                4155   R-squared (Within):               0.6249
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9277
Time:                        11:40:57   Log-likelihood                    1278.2
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      586.20
Entities:                         273   P-value                           0.0000
Avg Obs:                       15.220   Distribution:                 F(11,3871)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             586.20
                            

In [56]:
# with maintenance

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Total \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6249
Estimator:                   PanelOLS   R-squared (Between):              0.9263
No. Observations:                4155   R-squared (Within):               0.6249
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9266
Time:                        11:42:30   Log-likelihood                    1278.5
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      537.33
Entities:                         273   P-value                           0.0000
Avg Obs:                       15.220   Distribution:                 F(12,3870)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             537.33
                            

In [57]:
# with maintenance

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Total_log \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6250
Estimator:                   PanelOLS   R-squared (Between):              0.9275
No. Observations:                4155   R-squared (Within):               0.6250
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9279
Time:                        11:42:50   Log-likelihood                    1278.9
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      537.48
Entities:                         273   P-value                           0.0000
Avg Obs:                       15.220   Distribution:                 F(12,3870)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             537.48
                            

In [58]:
# split bus and rail VRM

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_BUS_log \
                    + VRM_ADJ_RAIL_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Total \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6451
Estimator:                   PanelOLS   R-squared (Between):              0.9259
No. Observations:                4155   R-squared (Within):               0.6451
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9284
Time:                        11:43:58   Log-likelihood                    1393.7
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      541.09
Entities:                         273   P-value                           0.0000
Avg Obs:                       15.220   Distribution:                 F(13,3869)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             541.09
                            

In [59]:
# add square on TNC term

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_BUS_log \
                    + VRM_ADJ_RAIL_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_BUS_SQRD \
                    + YEARS_SINCE_TNC_RAIL \
                    + YEARS_SINCE_TNC_RAIL_SQRD \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Total \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6452
Estimator:                   PanelOLS   R-squared (Between):              0.9260
No. Observations:                4155   R-squared (Within):               0.6452
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9285
Time:                        11:45:57   Log-likelihood                    1393.7
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      468.71
Entities:                         273   P-value                           0.0000
Avg Obs:                       15.220   Distribution:                 F(15,3867)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             468.71
                            

In [60]:
# simplify

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_BUS_SQRD \
                    + YEARS_SINCE_TNC_RAIL \
                    + YEARS_SINCE_TNC_RAIL_SQRD \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Total \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6252
Estimator:                   PanelOLS   R-squared (Between):              0.9249
No. Observations:                4155   R-squared (Within):               0.6252
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9253
Time:                        11:46:40   Log-likelihood                    1280.3
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      460.95
Entities:                         273   P-value                           0.0000
Avg Obs:                       15.220   Distribution:                 F(14,3868)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             460.95
                            

In [61]:
# simplify

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_BUS_SQRD \
                    + YEARS_SINCE_TNC_RAIL \
                    + YEARS_SINCE_TNC_RAIL_SQRD \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6252
Estimator:                   PanelOLS   R-squared (Between):              0.9260
No. Observations:                4155   R-squared (Within):               0.6252
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9264
Time:                        11:47:23   Log-likelihood                    1280.0
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      496.43
Entities:                         273   P-value                           0.0000
Avg Obs:                       15.220   Distribution:                 F(13,3869)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             496.43
                            

In [62]:
# simplify

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_BUS_SQRD \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6249
Estimator:                   PanelOLS   R-squared (Between):              0.9264
No. Observations:                4155   R-squared (Within):               0.6249
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9267
Time:                        11:47:51   Log-likelihood                    1278.3
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      537.25
Entities:                         273   P-value                           0.0000
Avg Obs:                       15.220   Distribution:                 F(12,3870)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             537.25
                            

In [68]:
# simplify

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Total \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6249
Estimator:                   PanelOLS   R-squared (Between):              0.9263
No. Observations:                4155   R-squared (Within):               0.6249
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9266
Time:                        11:51:48   Log-likelihood                    1278.5
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      537.33
Entities:                         273   P-value                           0.0000
Avg Obs:                       15.220   Distribution:                 F(12,3870)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             537.33
                            

In [69]:
# splity TNCs by city type

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS2_HINY \
                    + YEARS_SINCE_TNC_BUS2_MIDLOW \
                    + YEARS_SINCE_TNC_RAIL2_HINY \
                    + YEARS_SINCE_TNC_RAIL2_MIDLOW \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Total \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6250
Estimator:                   PanelOLS   R-squared (Between):              0.9314
No. Observations:                4155   R-squared (Within):               0.6250
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9317
Time:                        11:52:48   Log-likelihood                    1279.1
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      460.54
Entities:                         273   P-value                           0.0000
Avg Obs:                       15.220   Distribution:                 F(14,3868)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             460.54
                            

In [70]:
# preferred simple model

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Total \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6249
Estimator:                   PanelOLS   R-squared (Between):              0.9263
No. Observations:                4155   R-squared (Within):               0.6249
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9266
Time:                        11:55:23   Log-likelihood                    1278.5
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      537.33
Entities:                         273   P-value                           0.0000
Avg Obs:                       15.220   Distribution:                 F(12,3870)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             537.33
                            

In [75]:
# bus only model
bus_df = df[df['BUS_FLAG']==1]
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Mechanical \
                    + EntityEffects \
                    ',data=bus_df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6732
Estimator:                   PanelOLS   R-squared (Between):              0.9813
No. Observations:                3533   R-squared (Within):               0.6732
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9812
Time:                        12:47:02   Log-likelihood                    1434.1
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      616.54
Entities:                         229   P-value                           0.0000
Avg Obs:                       15.428   Distribution:                 F(11,3293)
Min Obs:                       4.0000                                           
Max Obs:                       17.000   F-statistic (robust):             616.54
                            

In [76]:
# rail only model
rail_df = df[df['RAIL_FLAG']==1]
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Mechanical \
                    + EntityEffects \
                    ',data=rail_df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.5449
Estimator:                   PanelOLS   R-squared (Between):              0.6684
No. Observations:                 622   R-squared (Within):               0.5449
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.6707
Time:                        12:47:13   Log-likelihood                    43.944
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      61.729
Entities:                         261   P-value                           0.0000
Avg Obs:                       2.3831   Distribution:                  F(11,567)
Min Obs:                       0.0000                                           
Max Obs:                       17.000   F-statistic (robust):             61.729
                            

In [74]:
# rail only model
rail_df = df[df['RAIL_FLAG']==1]
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC \
                    + BIKE_SHARE \
                    + EntityEffects \
                    ',data=rail_df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.5409
Estimator:                   PanelOLS   R-squared (Between):              0.5961
No. Observations:                 622   R-squared (Within):               0.5409
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.6004
Time:                        11:58:44   Log-likelihood                    41.156
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      74.473
Entities:                         261   P-value                           0.0000
Avg Obs:                       2.3831   Distribution:                   F(9,569)
Min Obs:                       0.0000                                           
Max Obs:                       17.000   F-statistic (robust):             74.473
                            

In [81]:
# high op-exp model

df2 = df[df['HI_OPEX']==1]
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Mechanical \
                    + EntityEffects \
                    ',data=df2)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.8659
Estimator:                   PanelOLS   R-squared (Between):             -0.0030
No. Observations:                 628   R-squared (Within):               0.8659
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.0013
Time:                        12:51:33   Log-likelihood                    317.27
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      310.90
Entities:                         261   P-value                           0.0000
Avg Obs:                       2.4061   Distribution:                  F(12,578)
Min Obs:                       0.0000                                           
Max Obs:                       17.000   F-statistic (robust):             310.90
                            

In [82]:
# mid op-exp model

df2 = df[df['MID_OPEX']==1]
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Mechanical \
                    + EntityEffects \
                    ',data=df2)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.5903
Estimator:                   PanelOLS   R-squared (Between):              0.8446
No. Observations:                1308   R-squared (Within):               0.5903
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.8437
Time:                        12:51:42   Log-likelihood                    516.30
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      145.43
Entities:                          85   P-value                           0.0000
Avg Obs:                       15.388   Distribution:                 F(12,1211)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             145.43
                            

In [83]:
# low op-exp model

df2 = df[df['LOW_OPEX']==1]
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Mechanical \
                    + EntityEffects \
                    ',data=df2)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6074
Estimator:                   PanelOLS   R-squared (Between):              0.9806
No. Observations:                2185   R-squared (Within):               0.6074
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9803
Time:                        12:52:54   Log-likelihood                    740.74
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      261.03
Entities:                         148   P-value                           0.0000
Avg Obs:                       14.764   Distribution:                 F(12,2025)
Min Obs:                       4.0000                                           
Max Obs:                       17.000   F-statistic (robust):             261.03
                            

In [84]:
# new york model

df2 = df[df['NEW_YORK']==1]
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Mechanical \
                    + EntityEffects \
                    ',data=df2)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.8270
Estimator:                   PanelOLS   R-squared (Between):             -8.0805
No. Observations:                  34   R-squared (Within):               0.8270
Date:                Wed, Apr 08 2020   R-squared (Overall):             -8.0802
Time:                        12:53:57   Log-likelihood                    55.566
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      7.9676
Entities:                         168   P-value                           0.0000
Avg Obs:                       0.2024   Distribution:                   F(12,20)
Min Obs:                       0.0000                                           
Max Obs:                       17.000   F-statistic (robust):             7.9676
                            

In [85]:
# main model, without NY

df2 = df[df['NEW_YORK']==0]
mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Mechanical \
                    + EntityEffects \
                    ',data=df2)
res=mod.fit()
print(res)


                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6255
Estimator:                   PanelOLS   R-squared (Between):              0.9296
No. Observations:                4121   R-squared (Within):               0.6255
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9299
Time:                        12:54:56   Log-likelihood                    1257.0
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      534.25
Entities:                         271   P-value                           0.0000
Avg Obs:                       15.207   Distribution:                 F(12,3838)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             534.25
                            

In [90]:
# main model, different TNC spec

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS_HINY \
                    + YEARS_SINCE_TNC_BUS_MIDLOW \
                    + YEARS_SINCE_TNC_RAIL_HINY \
                    + YEARS_SINCE_TNC_RAIL_MIDLOW \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Total \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)

                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6259
Estimator:                   PanelOLS   R-squared (Between):              0.9325
No. Observations:                4155   R-squared (Within):               0.6259
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9328
Time:                        12:57:43   Log-likelihood                    1283.9
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      462.25
Entities:                         273   P-value                           0.0000
Avg Obs:                       15.220   Distribution:                 F(14,3868)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             462.25
                            

In [94]:
# without weighted population density

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Mechanical \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)


                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6250
Estimator:                   PanelOLS   R-squared (Between):              0.9257
No. Observations:                4155   R-squared (Within):               0.6250
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9260
Time:                        13:00:19   Log-likelihood                    1278.7
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      537.40
Entities:                         273   P-value                           0.0000
Avg Obs:                       15.220   Distribution:                 F(12,3870)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             537.40
                            

In [93]:
# with weighted population density

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + WEIGHTED_POP_DENSITY \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Mechanical \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)


                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6291
Estimator:                   PanelOLS   R-squared (Between):              0.9418
No. Observations:                4145   R-squared (Within):               0.6291
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9425
Time:                        13:00:08   Log-likelihood                    1313.7
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      503.58
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(13,3860)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             503.58
                            

In [95]:
# with weighted population density, logged

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + WEIGHTED_POP_DENSITY_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Mechanical \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)


                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6306
Estimator:                   PanelOLS   R-squared (Between):              0.9651
No. Observations:                4145   R-squared (Within):               0.6306
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9657
Time:                        13:01:03   Log-likelihood                    1322.2
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      506.87
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(13,3860)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             506.87
                            

In [96]:
## it's better without the log


mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + WEIGHTED_POP_DENSITY \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Mechanical \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)


                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6291
Estimator:                   PanelOLS   R-squared (Between):              0.9418
No. Observations:                4145   R-squared (Within):               0.6291
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9425
Time:                        13:01:56   Log-likelihood                    1313.7
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      503.58
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(13,3860)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             503.58
                            

In [98]:
## MBDF Total


mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + WEIGHTED_POP_DENSITY \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Total \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)


                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6290
Estimator:                   PanelOLS   R-squared (Between):              0.9424
No. Observations:                4145   R-squared (Within):               0.6290
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9431
Time:                        13:03:38   Log-likelihood                    1313.5
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      503.50
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(13,3860)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             503.50
                            

In [100]:
# weighted population density per capita
df['WEIGHTED_POP_DENSITY_PER_CAP'] = df['WEIGHTED_POP_DENSITY'] / df['Tot_Pop']

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + WEIGHTED_POP_DENSITY_PER_CAP \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Total \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)


                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6262
Estimator:                   PanelOLS   R-squared (Between):              0.9463
No. Observations:                4145   R-squared (Within):               0.6262
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9464
Time:                        13:05:44   Log-likelihood                    1297.7
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      497.41
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(13,3860)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             497.41
                            

In [101]:
# weighted population density per capita
df['WEIGHTED_POP_DENSITY_PER_CAP'] = df['WEIGHTED_POP_DENSITY'] / df['Tot_Pop']
df['WEIGHTED_POP_DENSITY_PER_CAP_log'] = np.log(1+df['WEIGHTED_POP_DENSITY_PER_CAP'])

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + WEIGHTED_POP_DENSITY_PER_CAP_log \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Total \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)


                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6262
Estimator:                   PanelOLS   R-squared (Between):              0.9461
No. Observations:                4145   R-squared (Within):               0.6262
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9463
Time:                        13:07:19   Log-likelihood                    1297.7
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      497.40
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(13,3860)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             497.40
                            

In [102]:
## This one remains our favorite

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + WEIGHTED_POP_DENSITY \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Total \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)


                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6290
Estimator:                   PanelOLS   R-squared (Between):              0.9424
No. Observations:                4145   R-squared (Within):               0.6290
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9431
Time:                        13:08:39   Log-likelihood                    1313.5
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      503.50
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(13,3860)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             503.50
                            

In [107]:
# split pop and emp

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + Tot_Pop_log \
                    + TOT_EMP_MSA_log \
                    + WEIGHTED_POP_DENSITY \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Total \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)


                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6292
Estimator:                   PanelOLS   R-squared (Between):              0.9489
No. Observations:                4145   R-squared (Within):               0.6292
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9494
Time:                        13:11:23   Log-likelihood                    1314.3
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      467.71
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(14,3859)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             467.71
                            

In [108]:
# emp per person
df['EMP_PER_POP'] = df['TOT_EMP_MSA'] / df['Tot_Pop']

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + Tot_Pop_log \
                    + EMP_PER_POP \
                    + WEIGHTED_POP_DENSITY \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Total \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)


                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6290
Estimator:                   PanelOLS   R-squared (Between):              0.9313
No. Observations:                4145   R-squared (Within):               0.6290
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9321
Time:                        13:12:22   Log-likelihood                    1313.5
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      467.42
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(14,3859)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             467.42
                            

In [111]:
## This one remains our favorite
df['WEIGHTED_POP_DENSITY_1000s'] = df['WEIGHTED_POP_DENSITY'] / 1000.

mod=PanelOLS.from_formula('UPT_ADJ_log \
                    ~ VRM_ADJ_log \
                    + FARE_per_UPT_2018_log \
                    + POP_EMP_log \
                    + WEIGHTED_POP_DENSITY_1000s \
                    + GAS_PRICE_2018_log \
                    + TOTAL_MED_INC_INDIV_2018_log \
                    + PCT_HH_NO_VEH \
                    + JTW_HOME_PCT \
                    + YEARS_SINCE_TNC_BUS \
                    + YEARS_SINCE_TNC_RAIL \
                    + BIKE_SHARE \
                    + scooter_flag  \
                    + MDBF_Total \
                    + EntityEffects \
                    ',data=df)
res=mod.fit()
print(res)


                          PanelOLS Estimation Summary                           
Dep. Variable:            UPT_ADJ_log   R-squared:                        0.6290
Estimator:                   PanelOLS   R-squared (Between):              0.9424
No. Observations:                4145   R-squared (Within):               0.6290
Date:                Wed, Apr 08 2020   R-squared (Overall):              0.9431
Time:                        13:14:33   Log-likelihood                    1313.5
Cov. Estimator:            Unadjusted                                           
                                        F-statistic:                      503.50
Entities:                         272   P-value                           0.0000
Avg Obs:                       15.239   Distribution:                 F(13,3860)
Min Obs:                       1.0000                                           
Max Obs:                       17.000   F-statistic (robust):             503.50
                            

# Calculate FAC

In [209]:
# keep the relevant fields in the data set
app_data = pd.concat([df[['RAIL_FLAG', 'CLUSTER_APTA']], 
                      res.model.dependent.dataframe, 
                      res.model.exog.dataframe, 
                      res.estimated_effects, 
                      res.resids], axis=1)

In [210]:
# calculate the fitted values
app_data['fitted'] = app_data['estimated_effects']
for var in res.params.keys(): 
    app_data['fitted'] = app_data['fitted'] + (res.params[var] * app_data[var])

# and check that we add up correctly
app_data['check_fitted'] = app_data['fitted'] + app_data['residual'] - app_data['UPT_ADJ_log']

In [211]:
# create the necessary columns

# dependents
app_data['UPT_ADJ']    = np.exp(app_data['UPT_ADJ_log']) - 1
app_data['fitted_exp'] = np.exp(app_data['fitted']) - 1

# linear versions of logged variables
for var in res.params.keys(): 
    if '_log' in var: 
        app_data[var.replace('_log', '')] = np.exp(app_data[var]) - 1              

# ratios and differences
for var in res.params.keys(): 
    if '_log' in var: 
        app_data[var.replace('_log', '_ratio')] = 1
    else: 
        app_data[var+'_diff'] = 0

# FAC multipliers
for var in res.params.keys(): 
    app_data[var+'_FAC_ratio'] = 1

app_data['UPT_ADJ_base'] = 0
app_data['UPT_ADJ_ratio'] = 1
app_data['fitted_exp_ratio'] = 1
app_data['UPT_ADJ_diff'] = 0
app_data['fitted_exp_diff'] = 0

# for tracking new systems
app_data['UPT_ADJ_first_year'] = 0
app_data['UPT_ADJ_new_reporter'] = 0



In [212]:
# get the ridership in the first year the system starts. This allows us to track new systems. 
# calculate the FAC relative to a specific base year
ids = app_data.index.get_level_values(0).unique()

for id in ids: 
    years = app_data.loc[id].index.get_level_values(0).sort_values()
    first_year = years[0]

    app_data.loc[(id,first_year),'UPT_ADJ_new_reporter'] = app_data.loc[(id,first_year),'UPT_ADJ']
    
    for year in years: 
        app_data.loc[(id,year),'UPT_ADJ_first_year']   = app_data.loc[(id,first_year),'UPT_ADJ']

In [213]:
# calculate the FAC relative to a specific base year
ids = app_data.index.get_level_values(0).unique()

for id in ids: 
    years = app_data.loc[id].index.get_level_values(0).sort_values()

    for year in years[1:]:       
        base_year = year-1
        
        for var in res.params.keys(): 
            # ratios and differences
            if '_log' in var: 
                out_var = var.replace('_log', '_ratio')
                app_data.loc[(id,year), out_var] = (app_data.loc[(id,year), var] 
                                                 / app_data.loc[(id,base_year), var])
            else: 
                out_var = var+'_diff'                    
                app_data.loc[(id,year), out_var] = (app_data.loc[(id,year), var] 
                                                 - app_data.loc[(id,base_year), var])

            # FAC multipliers
            app_data.loc[(id,year),var+'_FAC_ratio'] = np.exp(res.params[var] * (
                                                    app_data.loc[(id,year), var] 
                                                    - app_data.loc[(id,base_year), var]))

            # estimated effects (if time effects is zero, no change)
            app_data.loc[(id,year),'effects_FAC_ratio'] = np.exp(
                                                    app_data.loc[(id,year), 'estimated_effects'] 
                                                    - app_data.loc[(id,base_year), 'estimated_effects'])

            # residual
            app_data.loc[(id,year),'residual_FAC_ratio'] = np.exp(
                                                    app_data.loc[(id,year), 'residual'] 
                                                    - app_data.loc[(id,base_year), 'residual'])

        # observed and fitted changes            
        app_data.loc[(id,year),'UPT_ADJ_base'] = app_data.loc[(id,base_year),'UPT_ADJ']
        app_data.loc[(id,year),'UPT_ADJ_ratio'] = (app_data.loc[(id,year),'UPT_ADJ'] 
                                                 / app_data.loc[(id,base_year),'UPT_ADJ'])
        app_data.loc[(id,year),'fitted_exp_ratio'] = (app_data.loc[(id,year),'fitted_exp'] 
                                                 / app_data.loc[(id,base_year),'fitted_exp'])

        app_data.loc[(id,year),'UPT_ADJ_diff'] = (app_data.loc[(id,year),'UPT_ADJ'] 
                                                 - app_data.loc[(id,base_year),'UPT_ADJ'])
        app_data.loc[(id,year),'fitted_exp_diff'] = (app_data.loc[(id,year),'fitted_exp'] 
                                                 - app_data.loc[(id,base_year),'fitted_exp'])
        

  from ipykernel import kernelapp as app
  from ipykernel import kernelapp as app


In [214]:
# calculate the FAC
app_data['FAC_Sum'] = 0
for var in res.params.keys(): 
    app_data[var+'_FAC'] = (app_data[var+'_FAC_ratio'] - 1) * app_data['UPT_ADJ_base']
    app_data['FAC_Sum'] = app_data['FAC_Sum'] + app_data[var+'_FAC']



In [215]:

# total FAC is based on the fitted model (applied multiplicitively)
app_data['Known_FAC'] = (app_data['fitted_exp_ratio'] - 1) * app_data['UPT_ADJ_base']

# uknown change is the difference between the observed change and the known change
app_data['Unknown_FAC'] = app_data['UPT_ADJ_diff'] - app_data['Known_FAC']

# the change in ridership associated new systems
app_data['New_Reporter_FAC'] = app_data['UPT_ADJ_new_reporter'] 

# should be teh same as UPT_ADJ_DIFF
app_data['Total_Change'] = app_data['Known_FAC'] + app_data['Unknown_FAC'] + app_data['New_Reporter_FAC']

In [216]:
# calculate APTA - 4 groups
app_data['CLUSTER_APTA4'] = np.floor(app_data['CLUSTER_APTA']/10)

In [217]:
# reset the index so I get ID and year
app_data = app_data.reset_index()
app_data = app_data.rename(columns={'level_0' : 'ID', 'level_1' : 'Year'})
app_data.to_csv('FAC_Model2.csv')

In [218]:
# these are the fields we keep
out_fields = ['ID', 'Year', 'RAIL_FLAG', 
              'CLUSTER_APTA', 'CLUSTER_APTA4',    
              'UPT_ADJ_first_year', 
              'UPT_ADJ_base', 'UPT_ADJ', 'UPT_ADJ_diff', 
              'fitted_exp', 'fitted_exp_diff']

# exogenous variables 
for var in res.params.keys(): 
    out_var = var.replace('_log', '')
    out_fields = out_fields + [out_var]
    
# FAC
for var in res.params.keys(): 
    out_fields = out_fields + [var+'_FAC']
out_fields = out_fields + ['FAC_Sum', 'Known_FAC', 'Unknown_FAC', 'New_Reporter_FAC', 'Total_Change']
    
# keep those fields
summary_data = app_data.reset_index()[out_fields]

In [219]:
# take weighted average of exogenous variables
for var in res.params.keys(): 
    out_var = var.replace('_log', '')
    summary_data[out_var] = summary_data[out_var] * summary_data['UPT_ADJ_first_year']

# aggregate to bus/rail totals
summary_data_apta4 = summary_data.groupby(by=['CLUSTER_APTA4', 'RAIL_FLAG', 'Year']).agg('sum')

# divide for weighted averages
for var in res.params.keys(): 
    out_var = var.replace('_log', '')
    summary_data_apta4[out_var] = summary_data_apta4[out_var] / summary_data_apta4['UPT_ADJ_first_year']

In [220]:

summary_data_apta4.to_csv('FAC_totals_APTA4_CLUSTERS_Model2.csv')

