In [1]:
import folium
import pandas as pd
import numpy as np
import seaborn as sns
import statsmodels.formula.api as smf
from helpers import *

# this allows plots to appear directly in the notebook
%matplotlib inline

In [2]:
# Load the raw trip export.  The file has no header row, and missing values are
# written as the two-character token backslash-N, so that token is registered as
# NA while parsing.  The backslash is doubled so Python passes a literal
# backslash followed by N instead of treating it as the start of an escape.
train_data = pd.read_csv(
    "data/trip.csv",
    header=None,
    na_values='\\N',
)

# Column labels are assigned positionally, in the order the fields appear in the file.
train_data.columns = [
    "time", "time_formated", "id", "route_id", "vehicle_id",
    "vehicle_label", "delay", "lat", "lon", "general_weather",
    "temp", "temp_min", "temp_max", "visibility", "wind_speed",
]

# Parse the formatted timestamp column into pandas datetimes.
train_data["time_formated"] = pd.to_datetime(train_data["time_formated"])

In [None]:
# Express each delay in units of the standard deviation of the whole delay
# column (scaled only; the mean is not subtracted).
train_data['normalized_delay'] = train_data['delay'].div(train_data['delay'].std())

In [None]:
# Add a 'color' column by calling colorRange once per row (axis=1 hands each
# row to the function as a Series).
# NOTE(review): colorRange is not defined in the visible notebook; presumably it
# is pulled in by `from helpers import *` — confirm what it returns.
train_data['color'] = train_data.apply(colorRange, axis=1)

In [None]:
# Add a 'time_since_midnight' column by calling timeSinceMidnight once per row.
# NOTE(review): timeSinceMidnight is not defined in the visible notebook
# (presumably from `helpers`); the unit of its result is not shown here — confirm.
train_data['time_since_midnight'] = train_data.apply(timeSinceMidnight, axis=1)

In [None]:
# Add a 'time_group_since_midnight' column by calling groupTime once per row.
# NOTE(review): groupTime is not defined in the visible notebook (presumably
# from `helpers`).  Later cells look these values up against
# range(0, 86400, 900), so it presumably buckets times into 900-unit slots — confirm.
train_data['time_group_since_midnight'] = train_data.apply(groupTime, axis=1)

In [None]:
# Round the coordinates to 3 decimal places so nearby observations share the
# same (lat_round, lon_round) key when grouping later on.
# The rounding is done on the whole column at once instead of calling a Python
# lambda for every row, which is considerably faster on large frames; missing
# coordinates stay NaN.
train_data['lat_round'] = train_data['lat'].round(3)
train_data['lon_round'] = train_data['lon'].round(3)

In [None]:
# Persist the enriched frame (raw columns plus the derived ones added above).
# No `index=` argument is given, so the row index is written as the first column.
train_data.to_csv('data/train_data.csv')

In [None]:
# Plot every observation as a small circle whose popup shows the raw delay.
# NOTE(review): recent folium releases no longer bundle the 'Stamen Toner'
# tiles — confirm the installed version still accepts this tile name.
all_data_map = folium.Map(location=[41.8781,-87.6298], zoom_start=9, tiles='Stamen Toner')

# The source file contains NA markers; a marker cannot be placed at a NaN
# coordinate, so rows without a usable position are skipped.
points_with_coords = train_data.dropna(subset=['lat', 'lon'])

# Iterate only over the three columns that are needed instead of building a
# full row Series for every observation.
for point_lat, point_lon, point_delay in zip(points_with_coords['lat'],
                                             points_with_coords['lon'],
                                             points_with_coords['delay']):
    marker = folium.CircleMarker([point_lat, point_lon], radius=1, popup=str(point_delay))
    marker.add_to(all_data_map)
all_data_map.save('Maps/all_data_map.html')

In [None]:
# Plot every observation as a small circle coloured by its 'color' value, with
# the raw delay in the popup.
color_coded_map = folium.Map(location=[41.8781,-87.6298], zoom_start=9, tiles='Stamen Toner')

for name, row in train_data.iterrows():
    # The 'color' value is treated as an integer RGB code (the original passed
    # it to hex(), which only accepts integers).  It is rendered as a
    # zero-padded six-digit '#rrggbb' string: a bare hex() result would carry a
    # '0x' prefix and drop leading zeros, neither of which is a valid CSS colour.
    # NOTE(review): assumes colorRange yields values in 0..0xFFFFFF — confirm.
    marker_color = '#{:06x}'.format(int(row['color']))
    marker = folium.CircleMarker([row['lat'], row['lon']],
                                 opacity=0.5, radius=0.5,
                                 color=marker_color,
                                 fill_color=marker_color,
                                 # Popup text is stringified, matching the all-data map cell.
                                 popup=str(row['delay']))
    marker.add_to(color_coded_map)
color_coded_map.save('Maps/color_coded_map.html')

In [None]:
# Mean delay per exact (lat, lon) position, plus a colour derived from that mean.
loc_groups = train_data.groupby(['lat','lon'])
delay = loc_groups['delay']

# .mean() returns a Series indexed by (lat, lon).  Assigning to ['color'] on a
# Series does not create a column, so the means are first turned into a
# one-column DataFrame ('delay') that can hold the colour next to them.
delay_mean = delay.mean().to_frame(name='delay')

# NOTE(review): delaycolors is not defined in the visible notebook; presumably
# it comes from `helpers` and maps one mean delay to a colour — confirm.
delay_mean['color'] = delay_mean['delay'].apply(delaycolors)

In [None]:
# Distribution plot of the raw 'delay' column.
# NOTE(review): distplot is deprecated in newer seaborn releases — confirm the
# installed version before re-running.
sns.distplot(train_data['delay'])

In [None]:
# Distribution plot of the 'visibility' column.
# NOTE(review): distplot is deprecated in newer seaborn releases — confirm the
# installed version before re-running.
sns.distplot(train_data['visibility'])

In [None]:
# Joint distribution of delay (x axis) against temperature (y axis).
# NOTE(review): the `size` keyword was renamed to `height` in later seaborn
# releases — confirm the installed version still accepts it.
sns.jointplot(x="delay", y="temp", data=train_data, size=8)

In [None]:
# Scatter plot of delay (x axis) against temperature (y axis) with a fitted
# regression line.  Note the axes are the reverse of the OLS formula
# 'delay ~ temp' used in the next cell, where delay is the response.
sns.regplot(x="delay", y="temp", data=train_data)

In [None]:
# Ordinary least squares with delay as the response and temperature as the
# single predictor; the full regression table is printed.
mod = smf.ols('delay ~ temp', data=train_data)
res = mod.fit()
print(res.summary())

In [None]:
# Show the first 100 rows, including the derived columns added above.
train_data.head(100)

In [None]:
# Mean normalized delay for every (rounded latitude, rounded longitude, time
# group) combination.  The result is a Series with a three-level index.
delay_group = (
    train_data
    .groupby(['lat_round', 'lon_round', 'time_group_since_midnight'])['normalized_delay']
    .mean()
)
delay_group

In [None]:
# Re-shape the grouped means into
#   {time_group: {0: {'lat', 'lon', 'delay'}, 1: {...}, ...}}
# so that all positions belonging to one time group can be looked up together.
group_dict = {}

# Series.items() is used because Series.iteritems() was removed in pandas 2.0;
# items() yields the same (index, value) pairs.
for (lat, lon, time), group_delay in delay_group.items():
    # Fetch the bucket for this time group, creating it on first sight.
    bucket = group_dict.setdefault(time, {})
    # Entries are keyed by their insertion position within the bucket.
    bucket[len(bucket)] = {'lat': lat, 'lon': lon, 'delay': group_delay}

In [None]:
import json

# Read the stop list from disk; later cells iterate over it and read the
# 'stop_lat', 'stop_lon' and 'stop_name' keys of each entry.
with open('data/stops.json') as stops_file:
    stops = json.loads(stops_file.read())

In [None]:
# Write one HTML map per time slot.  Each map shows every stop plus the grouped
# delay observations that fall into that slot.
# Slots run from 0 up to (not including) 86400 in steps of 900.
# NOTE(review): presumably seconds since midnight in 15-minute steps — confirm
# against groupTime in `helpers`.
time_groups = list(range(0, 86400, 900))
line_groups = {}

# The standard deviation converts normalized delays back to raw delays for the
# popup text.  It does not change inside the loop, so it is computed once here
# instead of once per marker.
delay_std = train_data['delay'].std()

for time in time_groups:
    # Slots without any observation produce no map, so skip them before doing
    # the work of building a base map and adding every stop to it.
    if time not in group_dict:
        continue
    time_group = group_dict[time]

    time_map = folium.Map(location=[41.8781,-87.6298], zoom_start=9)

    # Base layer: one small marker per stop, labelled with the stop name.
    for stop in stops:
        marker = folium.CircleMarker([stop['stop_lat'],
                                      stop['stop_lon']],
                                     radius=1,
                                     popup=stop['stop_name'])
        marker.add_to(time_map)

    # Overlay: one marker per grouped observation in this slot.
    for index, delay_data in time_group.items():
        # The first two characters of the helper's result are dropped before
        # prefixing '#'.
        # NOTE(review): presumably colorRangeNum returns a '0x…' hex string — confirm.
        color = colorRangeNum(delay_data['delay'])
        marker = folium.CircleMarker([delay_data['lat'], delay_data['lon']],
                                     opacity=0.5,
                                     radius=0.5, color='#'+str(color)[2:],
                                     fill_color='#'+str(color)[2:],
                                     # Undo the normalization so the popup shows the raw delay.
                                     popup=str(int(delay_data['delay'] * delay_std)))
        marker.add_to(time_map)
    time_map.save('Maps/Time_Maps/Map_'+format_time(time)+'.html')

In [None]:
# Mean raw delay for every (time since midnight, route) pair; the result is a
# Series with a two-level index.
time_delay = (
    train_data
    .groupby(['time_since_midnight', 'route_id'])['delay']
    .mean()
)

In [None]:
# Display the grouped mean delays as the cell output.
time_delay

In [None]:
# Flatten the two-level grouped Series into
#   {row_number: {'time', 'route_id', 'delay'}}
# which is the shape the DataFrame constructor in the next cell consumes.
# Series.items() is used because Series.iteritems() was removed in pandas 2.0.
# Building the dict in a comprehension also keeps the loop variables local, so
# the earlier `delay` groupby object is no longer overwritten by a float.
time_delay_dict = {
    position: {'time' : time, 'route_id': route, 'delay' : delay}
    for position, ((time, route), delay) in enumerate(time_delay.items())
}

In [None]:
# Build a DataFrame from the flattened dict: with orient='index' each outer key
# becomes a row label and the inner keys ('time', 'route_id', 'delay') become columns.
time_delay_df = pd.DataFrame.from_dict(time_delay_dict, orient='index')

In [None]:
# Display the per-(time, route) mean-delay table as the cell output.
time_delay_df

In [None]:
# Ordinary least squares on the per-(time, route) means: mean delay as the
# response, time since midnight ('time' in time_delay_df) as the predictor.
mod = smf.ols('delay ~ time', data=time_delay_df)
res = mod.fit()
print(res.summary())

In [None]:
# Ordinary least squares on the row-level data: delay as the response, with
# 'time' and a categorical effect per route as predictors.
# NOTE(review): in train_data, 'time' is the first raw column of the CSV, not
# the derived 'time_since_midnight' used for time_delay_df — confirm this is
# the intended regressor.
mod = smf.ols('delay ~ time + C(route_id)', data=train_data)
res = mod.fit()
print(res.summary())