In [2]:
#Fit a linear regression model predicting the ADHERENCE using the ROUTE_ABBR and ROUTE_DIRECTION_NAME columns. 
#Measure the performance of the model using the R^2 and mean absolute error metrics. 
#Interpret the meaning of each metric.

In [11]:
#import packages
import pandas as pd
import numpy as np
import sklearn as sk
import matplotlib as plt

In [2]:
# Load the WeGo dataset; the path is relative to the notebook's working directory.
csv_path = '../data/wego_ml.csv'
wego = pd.read_csv(csv_path)

In [17]:
# One-hot encode ROUTE_DIRECTION_NAME. drop_first=True leaves out the first
# category's column, so the remaining column(s) are read relative to that baseline.
direction = wego['ROUTE_DIRECTION_NAME']
wego_dum = pd.get_dummies(direction, drop_first=True)

Unnamed: 0,TO DOWNTOWN
0,1
1,0
2,1
3,0
4,1


In [18]:
# Build the modelling frame: every original column except ROUTE_DIRECTION_NAME,
# with its dummy column(s) appended on the right.
#
# The dummies are rebuilt from `wego` here rather than read from the current
# contents of `wego_dum`. This cell overwrites `wego_dum`, so reading it as an
# input would make a second run concatenate the already-widened frame onto
# `wego` and duplicate every column (later selections such as
# wego_dum['ADHERENCE'] would then return two columns instead of one).
direction_dummies = pd.get_dummies(wego['ROUTE_DIRECTION_NAME'], drop_first=True)
wego_dum = (
    pd.concat([wego, direction_dummies], axis=1)
      .drop(columns='ROUTE_DIRECTION_NAME')
)

Unnamed: 0,ID,CALENDAR_ID,SERVICE_ABBR,ADHERENCE_ID,DATE,ROUTE_ABBR,BLOCK_ABBR,OPERATOR,TRIP_ID,OVERLOAD_ID,...,STOP_CANCELLED,PREV_SCHED_STOP_CANCELLED,IS_RELIEF,BLOCK_STOP_ORDER,DWELL_IN_MINS,NextDay_Scheduled,NextDay_Actual_Arrival,NextDay_Actual_Departure,STARTING_ADHERENCE,TO DOWNTOWN
0,120230801_345104,120230801,1,99457892,2023-08-01,22,2200,1040,345104,0,...,0,0.0,0,19,0.0,0,0,0,-2.133333,1
1,120230801_345105,120230801,1,99457895,2023-08-01,22,2200,1040,345105,0,...,0,0.0,0,51,0.0,0,0,0,-1.583333,0
2,120230801_345106,120230801,1,99457899,2023-08-01,22,2200,1040,345106,0,...,0,0.0,0,80,0.0,0,0,0,-1.716666,1
3,120230801_345107,120230801,1,99457902,2023-08-01,22,2200,1040,345107,0,...,0,0.0,0,112,0.0,0,0,0,-1.316666,0
4,120230801_345108,120230801,1,99457906,2023-08-01,22,2200,1040,345108,0,...,0,0.0,0,141,0.0,0,0,0,-1.516666,1


In [19]:
# Feature matrix and target for the first model (route + direction).
# NOTE(review): ROUTE_ABBR is passed in as a raw number, so the regression fits one
# slope across route numbers. It looks like a route identifier rather than a quantity -
# confirm, and consider one-hot encoding it (in all three models at once, so they stay comparable).
feature_cols = ['ROUTE_ABBR', 'TO DOWNTOWN']
target_col = 'ADHERENCE'
X = wego_dum[feature_cols].values
y = wego_dum[target_col].values

In [20]:
#import splitter
from sklearn.model_selection import train_test_split

In [21]:
# Hold out 25% of the rows for testing; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [22]:
#import regression model
from sklearn.linear_model import LinearRegression

In [32]:
# Train a linear regression on the training rows, then predict the held-out rows.
reg = LinearRegression().fit(X_train, y_train)  # fit() returns the fitted estimator
y_pred = reg.predict(X_test)

In [24]:
# R^2 on the test set: about 3.8% of the variance in adherence is explained by
# the route and direction.
reg.score(X=X_test, y=y_test)

0.03771256365928333

In [25]:
# import MAE
from sklearn.metrics import mean_absolute_error

In [26]:
# MAE on the test set: on average, the predicted adherence is off by about 3.6 minutes
# from the actual value (an average error size, not a guaranteed +/- bound).
mean_absolute_error(y_true=y_test, y_pred=y_pred)

3.6048465707695296

In [None]:
# Now, try using the ROUTE_ABBR, ROUTE_DIRECTION_NAME, and OPERATOR. Does this improve the model? 
# Warning: Your model may perform very poorly once you add the OPERATOR. 
#     If so, this is likely because some operators have very few observations.
#     One option to correct this is to assign an "Other" (or -999999) value to operators with few observations.

In [28]:
# Feature matrix and target for the second model (route + direction + operator).
# NOTE(review): OPERATOR (like ROUTE_ABBR) is passed in as a raw number, so the model fits
# a single slope across operator IDs. It looks like an identifier - confirm, and consider
# one-hot encoding it with rarely seen operators grouped into an "Other" level.
feature_cols_2 = ['ROUTE_ABBR', 'TO DOWNTOWN', 'OPERATOR']
XX = wego_dum[feature_cols_2].values
yy = wego_dum['ADHERENCE'].values

In [29]:
# Same 75/25 split and seed as the first model, applied to the wider feature matrix.
XX_train, XX_test, yy_train, yy_test = train_test_split(
    XX,
    yy,
    test_size=0.25,
    random_state=42,
)

In [33]:
# Train the second linear regression and predict the held-out rows.
reg2 = LinearRegression().fit(XX_train, yy_train)  # fit() returns the fitted estimator
yy_pred = reg2.predict(XX_test)

In [34]:
# R^2 on the test set: about 4.7% of the variance in adherence is explained by
# the route, direction, and operator.
reg2.score(X=XX_test, y=yy_test)

0.04696710713155117

In [35]:
# MAE on the test set: on average, the predicted adherence is off by about 3.6 minutes
# from the actual value (an average error size, not a guaranteed +/- bound).
mean_absolute_error(y_true=yy_test, y_pred=yy_pred)

3.5910729341830203

In [None]:
# Finally, the data you have been provided has a STARTING_ADHERENCE column, which contains the ADHERENCE
# at the beginning of the route. If you add this metric, does it improve the model? Is this of any practical use?
# a) Yes, it improves the model: R^2 increased (about 0.05 -> 0.35) and the MAE decreased
#   (about 3.6 -> 3.0 minutes). However, this is not practically useful, as we already know that
#   starting time has the largest impact on ending time.

In [36]:
# Feature matrix and target for the third model
# (route + direction + operator + adherence at the start of the route).
# NOTE(review): ROUTE_ABBR and OPERATOR are still passed in as raw numbers here - see
# whether they should be one-hot encoded before comparing the three models.
feature_cols_3 = ['ROUTE_ABBR', 'TO DOWNTOWN', 'OPERATOR', 'STARTING_ADHERENCE']
XXX = wego_dum[feature_cols_3].values
yyy = wego_dum['ADHERENCE'].values

In [38]:
# Same 75/25 split and seed as the earlier models, applied to the four-feature matrix.
XXX_train, XXX_test, yyy_train, yyy_test = train_test_split(
    XXX,
    yyy,
    test_size=0.25,
    random_state=42,
)

In [39]:
# Train the third linear regression and predict the held-out rows.
reg3 = LinearRegression().fit(XXX_train, yyy_train)  # fit() returns the fitted estimator
yyy_pred = reg3.predict(XXX_test)

In [40]:
# R^2 on the test set: about 35% of the variance in adherence is explained by
# the route, direction, operator, and starting adherence.
reg3.score(X=XXX_test, y=yyy_test)

0.35391543995027097

In [41]:
# MAE on the test set: on average, the predicted adherence is off by about 3.0 minutes
# from the actual value (an average error size, not a guaranteed +/- bound).
mean_absolute_error(y_true=yyy_test, y_pred=yyy_pred)

2.99836123850187