In [26]:
import pandas as pd
from fiona.crs import from_epsg
from geopandas import GeoDataFrame
from shapely.geometry import Point
from bokeh.tile_providers import CARTODBPOSITRON
import re



In [172]:
def process_time_ped(value):
    
    # First, direction 1

    # There are some early morning values e.g. 12:15-12:30 that should be 00:15-00:30
    value = re.sub(r'^12(:\d{2}-(\d{1,2}):\d{2}$)', r'00\1', value)
    value = re.sub(r'(\d{1,2}:\d{2}-)(12)(:\d{2})$', r'\g<1>00\3', value)

    # There are some special cases where am/pm are included. Fix these
    if value == '12:00-12:15 am':
        value = '00:00-00:15'
    elif value == '11:45-12:00 am':
        value = '23:45-00:00'
    value = re.sub(r'(.+) pm$', r'\1', value)

    # Afternoon values for direction 1 are in the format '1:00-1:15.1'
    # First, turn to 24 hour time
    # e.g. 1:00-1:15.1' will be '13:00-13:15'
    m = re.match(r'^(\d{1,2}):\d{2}-(\d{1,2}):\d{2}\.1$', value)
    if m:
        if int(m[1]) < 12:
            hour1 = str(int(m[1]) + 12)
        else:
            hour1 = m[1]
        if int(m[2]) < 12:
            hour2 = str(int(m[2]) + 12)
        else:
            hour2 = m[2]
        value = re.sub(
            r'^\d{1,2}:(\d{2})-\d{1,2}:(\d{2})\.1$', rf"{hour1}:\1-{hour2}:\2", value
        )
  
    # some values are in format '2:00-2:15' and these should be in form '02:00-02:15'
    if re.match(r'^\d{1,2}:\d{2}-\d{1,2}:\d{2}(\.1)?$', value):
        value = re.sub(
            r'(^\d{1}:\d{2}-\d{1,2}:\d{2}$)', r"0\1", value)
        value = re.sub(
            r'(^\d{1,2}:\d{2}-)(\d{1}:\d{2}$)', r'\g<1>0\2', value)
        
        # All the these should be a direction 1 value
        value = re.sub(
            r'(^\d{2}:\d{2}-\d{2}:\d{2}$)', r"\1 Direction 1", value)
        
    # Next, Direction 2
    # They are in the form 2-000 or 2-1200
    m = re.match(r'^2-(\d{1,2})(\d{2})$', value)
    if m:
        start_hour = int(m[1])
        end_hour = int(m[1])
        start_minute = int(m[2])
        
        end_minute = (start_minute + 15) % 60
        if end_minute == 0:
            end_minute = '00'
            
            if start_hour == 23:
                end_hour = 0
            else:
                end_hour += 1
            
        if start_minute == 0:
            start_minute = '00'
            
        if start_hour < 10:
            start_hour = '0' + str(start_hour)
        if end_hour < 10:
            end_hour = '0' + str(end_hour)
        
        value = str(start_hour) + ':' + str(start_minute) + '-' + str(end_hour) + ':' + str(end_minute) + ' Direction 2'
        print(value)
    # For the rest, don't modify column name
    return value

In [173]:
# Process ped data
ped_df = pd.read_csv('~/cambridge_data/access_dumps/ped_counts.csv')

ped_df.rename(
    columns={
        'Text153':'Location',
        'Text480':'Day of week',
        'E': 'x',
        'N': 'y',
        'Width                           ft': 'Width in ft',
    }, 
    inplace=True
)

ped_df.rename(columns={
   element: process_time_ped(element)
    for element in ped_df.columns.tolist()}, inplace=True)

# Segment ID is always null for this dataset, so drop it
ped_df = ped_df.drop(['Segment ID'], axis=1)

# Text159, Text163, Text326 and Text327 are duplicate information of 'Location 1' and 'Location 2', so drop
ped_df = ped_df.drop(['Text159', 'Text163', 'Text326', 'Text327'], axis=1)

# 15 minute interval seems to always be true, but sometimes is set to -1 and occasionally to NaN.
# Leave this field in, unmodified

# Annual Growth, Seasonal Factor appear to always be NaN
# Width, Type, Weather, Seasonal Factor almost always undefined. May not want to include, decide later

geometry = [Point(xy) for xy in zip(ped_df.x, ped_df.y)]

# Dataset uses Massachusetts Mainland Projections
crs = {'init': 'epsg:2249'}
ped_gdf = GeoDataFrame(ped_df, crs=crs, geometry=geometry)
ped_gdf['geometry'] = ped_gdf['geometry'].to_crs(epsg=3857)
ped_gdf.crs = from_epsg(3857)


pd.set_option('display.max_rows', None)
ped_gdf['geometry'].head()

ped_gdf['Type']
ped_gdf.iloc[1]


00:00-00:15 Direction 2
00:15-00:30 Direction 2
00:30-00:45 Direction 2
00:45-01:00 Direction 2
01:00-01:15 Direction 2
01:15-01:30 Direction 2
01:30-01:45 Direction 2
01:45-02:00 Direction 2
02:00-02:15 Direction 2
02:15-02:30 Direction 2
02:30-02:45 Direction 2
02:45-03:00 Direction 2
03:00-03:15 Direction 2
03:15-03:30 Direction 2
03:30-03:45 Direction 2
03:45-04:00 Direction 2
04:00-04:15 Direction 2
04:15-04:30 Direction 2
04:30-04:45 Direction 2
04:45-05:00 Direction 2
05:00-05:15 Direction 2
05:15-05:30 Direction 2
05:30-05:45 Direction 2
05:45-06:00 Direction 2
06:00-06:15 Direction 2
06:15-06:30 Direction 2
06:30-06:45 Direction 2
06:45-07:00 Direction 2
07:00-07:15 Direction 2
07:15-07:30 Direction 2
07:30-07:45 Direction 2
07:45-08:00 Direction 2
08:00-08:15 Direction 2
08:15-08:30 Direction 2
08:30-08:45 Direction 2
08:45-09:00 Direction 2
09:00-09:15 Direction 2
09:15-09:30 Direction 2
09:30-09:45 Direction 2
09:45-10:00 Direction 2
10:00-10:15 Direction 2
10:15-10:30 Dire

  return _prepare_from_string(" ".join(pjargs))


Location ID                                                         342
Location                                                 FAWCETT STREET
at                                                       CONCORD AVENUE
leg                                                               North
Count Type                                                  Pedestrians
Day of week                                                         Wed
Date                                                          05-Oct-16
15 min interval                                                      -1
Direction 1                                                        East
Direction 2                                                        West
Weather                                                             NaN
Collected By                                                        PDI
for                                                                 VHB
Source                                            55 Fawcett Str

In [None]:
# Mucking around with plotting points
#x = []
#y = []
# # Get x and y coords for plotting
#for point in geometry:
#    print(point.x)
#p = figure(x_range=(-7919969.67, -7911590.87), y_range=(5221575.24, 5214113.74), x_axis_type="mercator", y_axis_type="mercator")
#p.add_tile(CARTODBPOSITRON)
#show(p)
