In [3]:
import pandas as pd
import re
from datetime import timedelta
import xml.etree.ElementTree as et
import bokeh.plotting as bk
from bokeh.models.glyphs import Patches, Line, Circle
from bokeh.models import (
    GMapPlot, Range1d, ColumnDataSource, LinearAxis,
    HoverTool, PanTool, WheelZoomTool, BoxSelectTool, ResetTool, PreviewSaveTool,
    GMapOptions, DataRange1d, Circle, MultiLine, ResetTool, UndoTool, RedoTool,
    NumeralTickFormatter, PrintfTickFormatter, tools, BoxZoomTool)
from bokeh.resources import CDN
from bokeh.palettes import brewer
from bokeh.embed import components, autoload_static, autoload_server
from bokeh.io import output_file, show, output_notebook
import itertools
from collections import defaultdict, Counter
from math import log2, exp
from colorsys import hls_to_rgb

In [4]:
def xml_to_pandas(xml_file):
    tree = et.parse(xml_file)
    root = tree.getroot()

    l = []
    for station in root:
        d = {}
        for attrib in station:
            d[str(attrib.tag)] = str(attrib.text)
            #print("    " + str(attrib.tag) + " : " + str(attrib.text))
        l.append(d)

    df = pd.DataFrame.from_dict(l)
    return df

# read the data
data = pd.read_csv('data/2016-Q1-Trips-History-Data.csv',
                   parse_dates=['Start date', 'End date'], infer_datetime_format=True)
data['Duration'] = data['Duration (ms)'].apply(lambda x: timedelta(milliseconds=int(x)))

data['End station number'] = data['End station number'].astype(int)
data['Start station number'] = data['Start station number'].astype(int)

data = data.dropna()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [10]:
bike_stations = xml_to_pandas('data/bike_stations.xml')
bike_stations['terminalName'] = bike_stations['terminalName'].astype(int)
bike_stations['lat'] = bike_stations['lat'].astype(float)
bike_stations['long'] = bike_stations['long'].astype(float)
station_locations = bike_stations[['terminalName']]
station_locations['location'] = list(zip(bike_stations['lat'], bike_stations['long']))
bike_stations.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,id,installDate,installed,lastCommWithServer,lat,latestUpdateTime,locked,long,name,nbBikes,nbEmptyDocks,public,removalDate,temporary,terminalName
0,1,0,True,1465765388595,38.858662,1465759880305,False,-77.053199,Eads St & 15th St S,14,1,True,,False,31000
1,2,0,True,1465765336531,38.85725,1465762641762,False,-77.05332,18th & Eads St.,2,9,True,,False,31001
2,3,0,True,1465765438506,38.856425,1465764621010,False,-77.049232,20th & Crystal Dr,13,0,True,,False,31002
3,4,0,True,1465765332743,38.86017,1465765332270,False,-77.049593,15th & Crystal Dr,8,3,True,,False,31003
4,5,0,True,1465765440377,38.857866,1465762076746,False,-77.05949,Aurora Hills Community Ctr/18th & Hayes St,1,9,True,,False,31004


In [None]:
# merge location and usage info
data = data.merge(station_locations, how='left', left_on='Start station number', right_on='terminalName')
data.columns = [w if w != 'location' else 'start location' for w in data.columns]
data.drop('terminalName', axis=1, inplace=True)

data = data.merge(station_locations, left_on='End station number', right_on='terminalName')
data.columns = [w if w != 'location' else 'end location' for w in data.columns]
data.drop('terminalName', axis=1, inplace=True)
data = data[data['end location'].map(lambda x: isinstance(x,tuple))]
data = data[data['start location'].map(lambda x: isinstance(x,tuple))]

In [5]:
pairs = []

for index, row in data.iterrows():
    pair = tuple(sorted([row['start location'],row['end location']]))
    pairs.append(pair)

pair_dict = Counter(pairs)
most_common,ncm = pair_dict.most_common(1)[0]

lats = []
lons = []
size_scale = []
num = []
colors = []
for k,v in pair_dict.items():
    lats.append([k[0][0],k[1][0]])
    lons.append([k[0][1],k[1][1]])
    transformed =(v-1)/(ncm-1)
    size_scale.append(transformed)
    num.append(v)


In [12]:
source = ColumnDataSource(
    data=dict(
        lat=bike_stations['lat'].values,
        lon=bike_stations['long'].values,
        name=bike_stations['name'].values,
    )
)

line_source = ColumnDataSource(
    data=dict(
        lats=lats,
        lons=lons,
        line_alpha=[max(0.1,scale) for scale in size_scale],
        line_width=[scale*20 for scale in size_scale],
        num=num,
        colors=colors
    )
)

hover = HoverTool(
    tooltips=[
        ("index", "$index"),
        ("Name", "@name"),
    ]
)


map_options = GMapOptions(lat=38.889490, lng=-77.035180, map_type="terrain", zoom=13)

plot = GMapPlot(
    x_range=DataRange1d(), y_range=DataRange1d(), map_options=map_options, title="Washington, DC",
    plot_width=1280, plot_height=1280, responsive=True
)


lines = MultiLine(xs="lons", ys="lats", line_alpha="line_alpha",
                  line_width="line_width", line_color="red", line_cap="round")
circle = Circle(x="lon", y="lat", size=10, fill_color="blue", fill_alpha=0.8, line_color=None)
plot.add_glyph(source, circle)
plot.add_glyph(line_source, lines)

plot.add_tools(PanTool(), WheelZoomTool(), BoxZoomTool(), hover, ResetTool(), UndoTool(), RedoTool())
output_notebook()
output_file("gmap_plot.html")
show(plot)
