In [2]:
# script to explore Chicago L stop data for Datascope data challenge:

# data origin: https://data.cityofchicago.org/Transportation/CTA-Ridership-L-Station-Entries-Daily-Totals/5neh-572f

# station information origin:
# https://data.cityofchicago.org/Transportation/CTA-System-Information-List-of-L-Stops/8pix-ypme

# interactive map resource: https://blog.dominodatalab.com/creating-interactive-crime-maps-with-folium/

import folium
import os
import pandas as pd
print(folium.__version__)
import sys
sys.version

0.2.1


'2.7.12 |Anaconda 4.1.1 (x86_64)| (default, Jul  2 2016, 17:43:17) \n[GCC 4.2.1 (Based on Apple Inc. build 5658) (LLVM build 2336.11.00)]'

In [3]:
# data location (keep raw data off of github):

path_to_data = '/Users/kylefrankovich/Desktop/datascope_CTA_data/' \
               'CTA_-_Ridership_-__L__Station_Entries_-_Daily_Totals.csv'

path_to_station_info = '/Users/kylefrankovich/Desktop/datascope_CTA_data/' \
                       'CTA_-_System_Information_-_List_of__L__Stops.csv'


data = pd.read_csv(path_to_data)

station_data = pd.read_csv(path_to_station_info)

print 'size of CTA data:', len(data) # 809326 rows, matches data from CTA data site

print 'size of station data:', len(station_data) # 300 rows, matches data from CTA data site

# let's see what we got:

data.head()

station_data.head()

data.columns # list of column names

station_data.columns

print data.describe()

station_data[station_data['STATION_NAME'] == 'Wilson']['Location']

data.isnull().values.any() # looks like we have no missing data. dope.

station_names = data.stationname.unique()

len(station_names) # 147 station names

station_IDs = data.station_id.unique()

len(station_IDs) # 146 station IDs

map_IDs = station_data.MAP_ID.unique()

len(map_IDs) # 144 map IDs; NB: it appears that map IDs from this dataset
# matches with station IDs in the main dataset, although there is a mismatch in
# number (146 vs. 144)



size of CTA data: 809326
size of station data: 300
          station_id          rides
count  809326.000000  809326.000000
mean    40759.453916    3275.253554
std       444.387019    3144.853450
min     40010.000000       0.000000
25%     40370.000000    1112.000000
50%     40750.000000    2300.000000
75%     41150.000000    4329.000000
max     41690.000000   36323.000000


144

We need to manipulate our data. What we want is a filtered dataframe containing stop name and number of rides within our selected dates.

In [4]:
#  data mapipulation:

# we're interested in the past 5 years of Lollapalooza data:

lolla_dates = ['07/28/2016', '07/29/2016', '07/30/2016', '07/31/2016',
               '07/31/2015', '08/01/2015', '08/02/2015',
               '08/01/2014', '08/02/2014', '08/03/2014',
               '08/02/2013', '08/03/2013', '08/04/2013',
               '08/03/2012', '08/04/2012', '08/05/2012']

data_filtered_lolla = data[data['date'].isin(lolla_dates)] # data filtered by Lolla dates

print len(data_filtered_lolla)
#print data_filtered_lolla.head(10)

# collapse across dates by station_id:
#  (NB: here we're collapsing across year, might eventually want to look at year-by-year trends)

data_filtered_lolla = data_filtered_lolla.groupby('station_id').sum()
print data_filtered_lolla.head(10)
print data_filtered_lolla.describe()
print data_filtered_lolla.sort_values('rides')

print data_filtered_lolla.sort_values('rides').iloc[0]

# now need to add column for scaling factor of bubbles:
# example max: 1250; example min: 350


2295
             rides
station_id        
40010        23575
40020        55471
40030        23834
40040        69825
40050        52321
40060        68391
40070       145438
40080        81451
40090        32005
40100        55321
               rides
count     144.000000
mean    57864.909722
std     52576.730736
min      6093.000000
25%     22101.500000
50%     39174.500000
75%     70637.000000
max    265518.000000
             rides
station_id        
40600         6093
41680         9313
41690         9386
41140         9967
40840        10121
40400        10899
41250        11363
40940        12139
41670        12143
40520        12297
40300        13008
41040        13944
40690        13993
41360        14229
40740        14938
41270        14972
40980        15375
40780        15835
40150        15899
40130        16499
40270        16974
40610        17515
40700        17633
40440        17986
41050        18121
40970        18351
40210        18353
41070        18596
40870   

In [7]:
import numpy as np

chicago_coordinates = [41.8781, -87.6298]
m = folium.Map(location=chicago_coordinates, zoom_start=11)
# m.simple_marker(chicago_coordinates)
# folium.CircleMarker(location=chicago_coordinates, radius=1250,
#                     popup='Chicago Coordinates', color='#3186cc',
#                     fill_color='#3186cc').add_to(m)
#folium.Marker(chicago_coordinates, popup='Chicago Coordinates').add_to(m)
division_coordinates = [41.9033, -87.6665]
#folium.Marker(division_coordinates, popup='Division Station').add_to(m)
# folium.CircleMarker(location=division_coordinates, radius=350,
#                     popup='Division Station', color='#3186cc',
#                     fill_color='#3186cc').add_to(m)
wilson_coordinates = [41.964273, -87.657588]
# folium.RegularPolygonMarker(location=wilson_coordinates, popup='Wilson Station',
#                    fill_color='#3186cc', number_of_sides=6, radius=10).add_to(m)

# try adding each station:

for index, row in station_data.iterrows():
    current_ID = row['MAP_ID']
    current_name = row['STOP_NAME']
    string_coords = ''.join(
            c for c in row['Location'] if c not in '(){}<>')
    # coordinates = map(float, string_coords.split(',')) # map was throwing error, try:
    coordinates = [float(i) for i in string_coords.split(',')]
    folium.CircleMarker(location=coordinates, radius=150,
                    popup=current_name, color='#3186cc',
                    fill_color='#3186cc').add_to(m)


m

In [62]:
# alternative to get bubble map?
# http://kartograph.org/showcase/symbols/

# or, plotly:
# https://plot.ly/python/bubble-maps/

In [65]:
!pip install plotly

Collecting plotly
  Downloading plotly-1.12.11.tar.gz (800kB)
[K    100% |████████████████████████████████| 808kB 828kB/s 
Building wheels for collected packages: plotly
  Running setup.py bdist_wheel for plotly ... [?25l- \ done
[?25h  Stored in directory: /Users/kylefrankovich/Library/Caches/pip/wheels/96/2f/58/a27345a87a6d26f8ce5f5f6db147f05674a521813709a276c9
Successfully built plotly
Installing collected packages: plotly
Successfully installed plotly-1.12.11
[33mYou are using pip version 8.1.2, however version 9.0.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
