In [1]:
import pandas as pd
import numpy as np
from numpy import average
import matplotlib.pyplot as plt 
import matplotlib.mlab as mlab
import matplotlib.patches as mpatches
import seaborn as sb
import datetime as dt
import glob
import json
from collections import Counter

# called to counteract dark-mode Jupyter from swallowing axes &c.
# matplotlib 3.6 renamed the bundled seaborn styles to 'seaborn-v0_8-*' and the
# old names were later removed, so pick whichever whitegrid style this
# installation provides (falling back to the default style if neither exists).
plt.style.use(next(
    (style_name
     for style_name in ('seaborn-v0_8-whitegrid', 'seaborn-whitegrid')
     if style_name in plt.style.available),
    'default',
))

In [2]:
# Get station information from json file

# JSON is UTF-8 encoded; state the encoding explicitly so the file is decoded
# the same way regardless of the platform's default locale encoding.
with open('data/cbs_data/station_information_a.json', 'r', encoding='utf-8') as json_file:
    json_load = json.load(json_file)

# The station records are the list stored under data -> stations.
station_info = pd.DataFrame(json_load['data']['stations'])

In [3]:
# Preview the first rows of the station table as loaded from the JSON file
station_info.head()

Unnamed: 0,region_id,electric_bike_surcharge_waiver,legacy_id,station_id,rental_uris,external_id,eightd_station_services,has_kiosk,capacity,lon,name,rental_methods,lat,short_name,station_type,eightd_has_key_dispenser
0,41,False,1,1,"{'ios': 'https://dc.lft.to/lastmile_qr_scan', ...",082469cc-1f3f-11e7-bf6b-3863bb334450,[],True,15,-77.05323,Eads St & 15th St S,"[KEY, CREDITCARD]",38.858971,31000,classic,False
1,41,False,3,3,"{'ios': 'https://dc.lft.to/lastmile_qr_scan', ...",08246c35-1f3f-11e7-bf6b-3863bb334450,[],True,17,-77.049232,Crystal Dr & 20th St S,"[KEY, CREDITCARD]",38.856425,31002,classic,False
2,41,False,4,4,"{'ios': 'https://dc.lft.to/lastmile_qr_scan', ...",08246cd5-1f3f-11e7-bf6b-3863bb334450,[],True,16,-77.049417,Crystal Dr & 15th St S,"[KEY, CREDITCARD]",38.861056,31003,classic,False
3,41,False,5,5,"{'ios': 'https://dc.lft.to/lastmile_qr_scan', ...",08246d68-1f3f-11e7-bf6b-3863bb334450,[],True,12,-77.05949,Aurora Hills Cmty Ctr / 18th St & S Hayes St,"[KEY, CREDITCARD]",38.857866,31004,classic,False
4,41,False,6,6,"{'ios': 'https://dc.lft.to/lastmile_qr_scan', ...",08246df5-1f3f-11e7-bf6b-3863bb334450,[],True,19,-77.059936,Pentagon City Metro / 12th St & S Hayes St,"[KEY, CREDITCARD]",38.862303,31005,classic,False


In [4]:
# List every column of the station table
station_info.columns

Index(['region_id', 'electric_bike_surcharge_waiver', 'legacy_id',
       'station_id', 'rental_uris', 'external_id', 'eightd_station_services',
       'has_kiosk', 'capacity', 'lon', 'name', 'rental_methods', 'lat',
       'short_name', 'station_type', 'eightd_has_key_dispenser'],
      dtype='object')

In [5]:
# Remove the station-information columns that are not needed here
unused_columns = [
    'electric_bike_surcharge_waiver',
    'rental_uris',
    'external_id',
    'eightd_station_services',
    'rental_methods',
    'station_type',
    'eightd_has_key_dispenser',
]
station_info = station_info.drop(columns=unused_columns)


In [6]:
# Spell out the coordinate column names. The result is reassigned instead of
# using inplace=True, so the frame is not mutated behind the reader's back.
station_info = station_info.rename(columns={'lon': 'longitude', 'lat': 'latitude'})

In [7]:
# Preview the table again after dropping and renaming columns
station_info.head()

Unnamed: 0,region_id,legacy_id,station_id,has_kiosk,capacity,longitude,name,latitude,short_name
0,41,1,1,True,15,-77.05323,Eads St & 15th St S,38.858971,31000
1,41,3,3,True,17,-77.049232,Crystal Dr & 20th St S,38.856425,31002
2,41,4,4,True,16,-77.049417,Crystal Dr & 15th St S,38.861056,31003
3,41,5,5,True,12,-77.05949,Aurora Hills Cmty Ctr / 18th St & S Hayes St,38.857866,31004
4,41,6,6,True,19,-77.059936,Pentagon City Metro / 12th St & S Hayes St,38.862303,31005


In [14]:
# Flag stations whose longitude is missing. Comparing against the string 'NaN'
# can never match a real missing value (NaN is a float and compares unequal to
# everything), so use pandas' missing-value test instead.
station_info['longitude'].isna()

0      False
1      False
2      False
3      False
4      False
       ...  
676    False
677    False
678    False
679    False
680    False
Name: longitude, Length: 681, dtype: bool

In [8]:
# List the distinct region ids that occur in the station table
station_info.region_id.unique()

array(['41', '42', '40', '44', '43', '104', '133', '152'], dtype=object)

In [9]:
# Convert region_id to 64-bit integers so it can be matched against integer keys
station_info['region_id'] = station_info['region_id'].astype(np.int64)

In [10]:
# Regions are used only for bike stations.
# Region ids and their names, as defined by the Capital Bike Share
# Real-Time Data feed:
region_names = {
    40: 'Alexandria, VA',
    41: 'Arlington, VA',
    42: 'Washington, DC',
    43: 'Montgomery County, MD (North)',
    44: 'Montgomery County, MD (South)',
    48: 'Test & Operations',
    104: 'Fairfax, VA',
    128: '8D',
    133: 'Prince George\'s County',
    152: 'Falls Church, VA',
}

# Translate each station's region id into its name, stored as a categorical column
station_info['region_name'] = station_info.region_id.map(region_names).astype('category')


In [None]:
# Save the cleaned station table as CSV. With pandas' default settings the row
# index is written too, as an unnamed first column.
station_info.to_csv('data/mod_data/stationinfo_conv.csv') 

In [15]:
# Load the bike trip records; the cells below read its 'year',
# 'start_station_number' and 'start_station' columns.
bike_trips = pd.read_csv('data/mod_data/bike_trips_2011_2012.csv')


In [51]:
# Distinct start-station numbers that occur in the 2011 and in the 2012 trips
stations_2011 = bike_trips.loc[bike_trips['year'] == 2011, 'start_station_number'].unique()
stations_2012 = bike_trips.loc[bike_trips['year'] == 2012, 'start_station_number'].unique()

#np.savetxt('data/mod_data/stations_2011_conv.csv', stations_2011, delimiter="\t", fmt='%s')
#np.savetxt('data/mod_data/stations_2012_conv.csv', stations_2012, delimiter="\t", fmt='%s')


In [25]:
# Station numbers used in 2011 that no longer occur in 2012 (3 stations):
# the sorted values of stations_2011 that are absent from stations_2012.
lost_stations = np.setdiff1d(stations_2011, stations_2012)
lost_stations


array([31261, 31266, 31704], dtype=int64)

In [26]:
# Station numbers that first occur in 2012 (50 stations):
# the sorted values of stations_2012 that are absent from stations_2011.
new_stations = np.setdiff1d(stations_2012, stations_2011)
new_stations


array([31023, 31024, 31025, 31026, 31027, 31028, 31029, 31030, 31031,
       31032, 31033, 31034, 31035, 31036, 31037, 31038, 31039, 31040,
       31041, 31042, 31043, 31044, 31045, 31046, 31047, 31048, 31049,
       31050, 31051, 31234, 31240, 31243, 31247, 31248, 31249, 31250,
       31251, 31252, 31253, 31254, 31255, 31256, 31257, 31258, 31306,
       31507, 31508, 31612, 31628, 31803], dtype=int64)

In [53]:
# Distinct short names from the current station file, converted to integers
# so they can be compared with the station numbers of the trip data.
unique_short_names = station_info['short_name'].unique()
current_stations = unique_short_names.astype('int64')
current_stations

array([31000, 31002, 31003, 31004, 31005, 31006, 31007, 31009, 31010,
       31011, 31012, 31100, 31101, 31102, 31104, 31105, 31106, 31107,
       31108, 31201, 31202, 31203, 31204, 31205, 31400, 31401, 31502,
       31600, 31601, 31602, 31700, 31305, 31702, 31703, 31704, 31801,
       31802, 31206, 31500, 31111, 31207, 31209, 31110, 31109, 31013,
       31208, 31200, 31603, 31212, 31213, 31604, 31605, 31606, 31607,
       31214, 31300, 31503, 31608, 31301, 31302, 31402, 31804, 31805,
       31610, 31216, 31705, 31217, 31215, 31501, 31220, 31218, 31219,
       31211, 31613, 31221, 31303, 31611, 31620, 31222, 31223, 31112,
       31224, 31225, 31609, 31612, 31226, 31227, 31228, 31504, 31505,
       31615, 31616, 31617, 31618, 31619, 31701, 31622, 31229, 31230,
       31231, 31232, 31233, 31234, 31621, 31235, 31236, 31237, 31623,
       31624, 31266, 31304, 31238, 31052, 31240, 31262, 31260, 31014,
       31015, 31016, 31017, 31018, 31706, 31261, 31019, 31113, 31239,
       31241, 31020,

In [55]:
# Station numbers used in 2011 that are missing from the current station info file
lost_2011_stations = np.setdiff1d(stations_2011, current_stations)
lost_2011_stations

array([31001, 31008, 31103, 31614], dtype=int64)

In [56]:
# Station numbers used in 2012 that are missing from the current station info file
lost_2012_stations = np.setdiff1d(stations_2012, current_stations)
lost_2012_stations

array([31001, 31008, 31103, 31614], dtype=int64)

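Four stations (31001, 31008, 31103 and 31614) that were in use in both 2011 and 2012 no longer appear in the current station information file, i.e. they have since been retired.<br />
Because the coordinates come from that file, these four stations have no geodata and cannot be mapped.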

In [64]:
# Important: always match stations by station number, never by name -- some
# stations have been renamed!
# Example: station 31009 was called "27th & Crystal Dr" in 2011/12 and is
# called "Crystal Dr & 27th St S" in the current station file.
# NOTE: this cell rebinds stations_2012 and current_stations, which held station
# numbers in the cells above, to arrays of station *names*.

stations_2012 = bike_trips.loc[bike_trips['year'] == 2012, 'start_station'].unique()
current_stations = station_info['name'].unique()
# 2012 station names that have no identically named entry in the current file
np.setdiff1d(stations_2012, current_stations)

array(['11th & H St NE', '12th & Army Navy Dr', '15th & Crystal Dr',
       '15th & N Scott St', '18th & Eads St.', '19th & L St NW',
       '1st & N St  SE', '20th & Crystal Dr', '21st & M St NW',
       '23rd & Crystal Dr', '26th & S Clark St', '27th & Crystal Dr',
       'Aurora Hills Community Ctr/18th & Hayes St',
       'Ballston Metro / N Stuart & 9th St N', 'Braddock Rd Metro',
       'Central Library / N Quincy St & 10th St N',
       'Connecticut Ave & Newark St NW / Cleveland Park',
       'Crystal City Metro / 18th & Bell St', 'Eads & 22nd St S',
       'Eastern Market Metro / Pennsylvania Ave & 7th St SE',
       'Fairfax Dr & Kenmore St', 'Georgia Ave and Fairmont St NW',
       'Key Blvd & N Quinn St', 'Lee Hwy & N Adams St',
       'Lynn & 19th St North', 'N Randolph St & Fairfax Dr',
       'N Rhodes & 16th St N', 'N Veitch  & 20th St N',
       'N Veitch & Key Blvd', 'Pentagon City Metro / 12th & S Hayes St',
       'Rosslyn Metro / Wilson Blvd & Ft Myer Dr',
       '

<hr>