In [1]:
import pandas as pd, numpy as np, holidays, seaborn as sns, matplotlib.pyplot as plt, time, xgboost as xgb
from sqlalchemy import create_engine
from datetime import datetime
from matplotlib import pyplot

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, PolynomialFeatures, StandardScaler
from sklearn.feature_selection import SelectKBest, f_regression, chi2
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics

from config import config

<h2>Data Collection</h2>

In [2]:
#establish connection to db and retrieve data
#`config` is imported as a function and rebound here to the mapping it returns;
#the callable() guard lets this cell be re-run without trying to call that mapping
#(assumes the returned mapping is not itself callable -- TODO confirm against config.py)
if callable(config):
    config=config()
engine=create_engine("postgresql://"+config["user"]+":"+config["password"]+"@"+config["host"]+"/"+config["database"])
line='39A'
direction=1
#line/direction are notebook constants; if they ever come from user input,
#switch to bound query parameters instead of string concatenation
#NOTE: the select list repeats some column names (lt.* and trips.* timing columns),
#so the resulting dataframe has duplicate labels until they are renamed
#every fragment except the last must end in a space or comma, otherwise adjacent
#tokens fuse when the literals are concatenated (previously "suppressed=0AND")
sql=("SELECT lt.daystamp, lt.progr_number, lt.stoppoint_id,lt.arrival_time_p,lt.arrival_time_a,"
    "lt.departure_time_p,lt.departure_time_a,trips.route_id,"
     "trips.arrival_time_p,trips.departure_time_p,trips.departure_time_a,"
     "weather_main,temp,feels_like,temp_min,temp_max,pressure,humidity,wind_speed,wind_deg,clouds_all,weather_description "
     "FROM leavetimes AS lt, trips, weather "
     "WHERE trips.line_id='"+line+"' AND trips.direction="+str(direction)+" AND trips.suppressed=0 "
     "AND lt.daystamp = trips.daystamp AND lt.trip_id = trips.trip_id AND lt.suppressed=0 "
     "AND lt.weather_id = weather.daytime")
df = pd.read_sql(sql,engine)

In [3]:
#rename selected columns by their position in the query result
#(position -> new name; columns not listed keep the name the query gave them)
renamed_positions=(
    (0,"daystamp"),
    (2,"stop_id"),
    (3,"arr_p"),
    (4,"arr_a"),
    (5,"dep_p"),
    (6,"dep_a"),
    (8,"end_p"),
    (9,"start_p"),
    (10,"start_a"),
)
features=df.columns.tolist()
for position,short_name in renamed_positions:
    features[position]=short_name
df.columns=features

In [4]:
def daystamp_converter(time):
    """Derive calendar features from a Unix timestamp.

    Args:
        time: seconds since the epoch. It is converted with
            datetime.fromtimestamp, i.e. interpreted in the local timezone of
            the machine running the notebook. (The parameter name shadows the
            `time` module inside this function; it is kept for compatibility.)

    Returns:
        tuple (weekday, month, hour, holiday): weekday as returned by
        datetime.weekday() (0=Monday .. 6=Sunday), month 1-12, hour 0-23 and
        holiday 1 if the date is found in holidays.Ireland(), otherwise 0.
    """
    #build the holiday calendar once and reuse it on later calls: constructing
    #a fresh calendar for every converted value is pure overhead
    holidays_IE=getattr(daystamp_converter,"_holidays_IE",None)
    if holidays_IE is None:
        holidays_IE=daystamp_converter._holidays_IE=holidays.Ireland()
    date=datetime.fromtimestamp(time)
    holiday=1 if date in holidays_IE else 0
    return (date.weekday(),date.month,date.hour,holiday)

In [5]:
#derive calendar features and durations from the timing columns
#dt = daystamp + dep_p, passed to daystamp_converter as a timestamp
df["dt"]=df["daystamp"].values+df["dep_p"].values
weekdays,months,hours,holiday_flags=zip(*df["dt"].apply(daystamp_converter))
df["weekday"]=weekdays
df["month"]=months
df["hour"]=hours
df["holiday"]=holiday_flags
#durations relative to the trip start (presumably _p = planned, _a = actual -- verify against the schema)
df["dur_s"]=df["dep_p"].values-df["start_p"].values
df["dur_a"]=df["dep_a"].values-df["start_a"].values

<h2>Feature Cleaning</h2>

In [40]:
#bookkeeping dict; "rows" starts with the row count of the unfiltered dataframe
logs={"rows":{"start":len(df)}}

In [41]:
#keep only the rows of the most frequent route
route_frequencies=df["route_id"].value_counts()
#value_counts sorts by descending count, so the first label is the most common route
routes=route_frequencies.index[0]
df_clean=df.loc[df["route_id"]==routes]
rows_routes=len(df_clean)
#row-count change caused by this filter (zero or negative)
logs["rows"]["route_filter"]=rows_routes-logs["rows"]["start"]

In [42]:
#drop every row that has at least one missing value (dropna defaults: axis=0, how='any')
df_clean=df_clean.dropna()
rows_after_nan=len(df_clean)
logs["rows"]["nan_filter"]=rows_after_nan-rows_routes

In [43]:
#ratio between the row counts of the least and the most frequent stop_id
stop_counts=df_clean["stop_id"].value_counts()
fewest_visits,most_visits=min(stop_counts),max(stop_counts)
logs["scr"]=fewest_visits/most_visits

In [44]:
#group the feature names by the data type they are treated as
categorical=[
    "stop_id",
    "route_id",
    "weather_main",
    "weather_description",
    "weekday",
    "month",
    "hour",
    "holiday",
]
ints=[
    'daystamp',
    'progr_number',
    'arr_p',
    'arr_a',
    'dep_p',
    'dep_a',
    'dur_s',
    'dur_a',
    'pressure',
    'humidity',
    'wind_deg',
    'clouds_all',
]
floats=[
    'temp',
    'feels_like',
    'temp_min',
    'temp_max',
    'wind_speed',
]

In [45]:
#tidy the frame: drop route_id, cast the integer features, discard negative durations
df_clean=df_clean.drop(columns=["route_id"])
df_clean=df_clean.astype({column:'int64' for column in ints})
#keep only rows whose dur_a is non-negative
df_clean=df_clean.loc[df_clean["dur_a"]>=0]
rows_after_dur=len(df_clean)
logs["rows"]["negative_dur"]=rows_after_dur-rows_after_nan
#stop_id is dropped last, after the filters above
df_clean=df_clean.drop(columns=["stop_id"])

In [46]:
#create dataframe with the mean and standard deviation of dur_a per progr_number
#a single groupby replaces the former row-by-row DataFrame.append loop, which was
#quadratic and fails on pandas>=2.0 (DataFrame.append was removed there)
#sort=False keeps the groups in order of first appearance, like unique() did
dur_stats_df=(
    df_clean.groupby("progr_number",sort=False)["dur_a"]
    .agg(dur_mean="mean",dur_std="std")
    .reset_index()
)
dur_stats_df["progr_number"]=dur_stats_df["progr_number"].astype('int64')

#merge duration stats with df_clean and look for outliers (outside 3 SDs from mean)
#groups with a single row have a NaN std; the comparison is then False, so they are kept
df_clean=df_clean.merge(dur_stats_df,how='inner',on='progr_number')
df_clean["outlier"]=(df_clean["dur_a"]-df_clean["dur_mean"]).abs()>3*df_clean["dur_std"]

#only keep rows that aren't outliers
df_clean=df_clean[~df_clean["outlier"]]
rows_after_outliers=df_clean.shape[0]
logs["rows"]["outliers"]=rows_after_outliers-rows_after_dur