# Import usage stats

In [22]:
import pandas as pd
import glob
import re
import pickle as pkl

# One entry per item directly under ./usage_stats; each is treated below as a
# directory of CSV files. glob returns matches in arbitrary order, so sort them
# to make the processing (and printed output) order reproducible between runs.
dirs = sorted(glob.glob('./usage_stats/*'))

def get_numbers_from_filename(filename):
    """Return the first run of consecutive digits found in ``filename``.

    Parameters
    ----------
    filename : str
        Any string (here: a path); the whole string is searched, not just the
        final path component.

    Returns
    -------
    str
        The first match of ``\\d+`` as a string, e.g. ``'2016'``.

    Raises
    ------
    ValueError
        If ``filename`` contains no digits at all.
    """
    match = re.search(r'\d+', filename)
    if match is None:
        # re.search returns None on no match; fail with a message that names
        # the offending path instead of an AttributeError on None.group.
        raise ValueError('no digits found in filename: ' + repr(filename))
    return match.group(0)

# For every entry in `dirs`: load all of its CSV files into one frame, clean
# it, and pickle the result as 'usage_data<first 4 digits of the path>.pkl'.
# The frame shape is printed after each cleaning stage.
for dirname in dirs:
    # Sorted so the concatenation order (and hence which duplicate row is
    # kept and the resulting index) does not depend on filesystem order.
    fname_list = sorted(glob.glob(dirname + '/*.csv'))
    if not fname_list:
        # pd.concat raises ValueError when given nothing to concatenate, which
        # would abort the whole run for a stray file or an empty directory.
        print('Skipping ' + dirname + ': no CSV files found')
        continue

    # Generator: each CSV is read lazily as pd.concat consumes it.
    dfs = (pd.read_csv(fname, encoding="ISO-8859-1") for fname in fname_list)
    all_data = pd.concat(dfs, ignore_index=True)

    ## Clean data - NAs, drop additional useless columns

    print(all_data.shape)

    # Coerce the station ids to numbers BEFORE dropping NAs: errors='coerce'
    # turns unparseable ids into NaN, and those rows must be removed by the
    # dropna below rather than survive into the saved data.
    for id_col in ("StartStation Id", "EndStation Id"):
        all_data[id_col] = pd.to_numeric(all_data[id_col], errors='coerce')

    all_data = all_data.dropna(
        axis=0,
        subset=["StartStation Id", "EndStation Id", "Start Date", "End Date"],
    )

    print(all_data.shape)

    # Trips that start and end at the same station are kept; the filter that
    # would remove them is disabled:
    #all_data = all_data[all_data["StartStation Id"] != all_data["EndStation Id"]]

    # Keep only the columns used downstream (list indexer, not a tuple).
    all_data = all_data.loc[:, ['Start Date',
                                'StartStation Id',
                                'End Date',
                                'EndStation Id',
                                'Duration']]

    print(all_data.shape)

    ## Extra drop for duplicates

    all_data = all_data.drop_duplicates()
    print(all_data.shape)

    # Output name is built from the first four characters of the first digit
    # run in the directory path.
    fname_out = get_numbers_from_filename(dirname)
    fname_out = 'usage_data' + fname_out[0:4] + '.pkl'
    with open(fname_out, 'wb') as handle:
        pkl.dump(all_data, handle, protocol=pkl.HIGHEST_PROTOCOL)
    

(10216388, 15)
(9882294, 15)
(9882294, 5)
(9302479, 5)


  


(11481596, 12)
(10242351, 12)
(10242351, 5)
(9681802, 5)
(8042370, 9)
(8026318, 9)
(8026318, 5)
(7969749, 5)


# Getting the bike station locations

TfL publish a live "cycle hire updates" feed that lists information for each cycle hire station and is updated roughly once a minute. I don't use this live data - instead I only take the name, ID, lat/lon, and capacity of each bike station.

In [21]:
import requests
from xml.etree import ElementTree as ET
import pandas as pd

# Download the TfL cycle-hire XML feed and build a table with one row per
# station (name, id, lat, lon, capacity), saved both as CSV and as a pickle.
site = "https://tfl.gov.uk/tfl/syndication/feeds/cycle-hire/livecyclehireupdates.xml"

# A timeout stops the cell hanging forever on a stalled connection, and
# raise_for_status surfaces HTTP errors here instead of as a confusing XML
# parse failure on an error page.
response = requests.get(site, timeout=30)
response.raise_for_status()
root = ET.fromstring(response.content)

# Each child of the root is one station. Fields are read by position within
# the station element (0 = id, 1 = name, 3 = lat, 4 = lon, 12 = capacity), so
# this relies on the feed keeping its current element order.
id_list = [int(station[0].text) for station in root]
name_list = [station[1].text for station in root]
lat_list = [float(station[3].text) for station in root]
lon_list = [float(station[4].text) for station in root]
capacity_list = [int(station[12].text) for station in root]

all_locs = pd.DataFrame(
    list(zip(name_list, id_list, lat_list, lon_list, capacity_list)),
    columns=["name", "id", "lat", "lon", "capacity"],
)

all_locs.to_csv('bike_point_locations_saved.csv', header=True, index=False)

# `pkl` (pickle) is imported in the first code cell of this notebook.
fname_out = 'all_locs.pkl'
with open(fname_out, 'wb') as handle:
    pkl.dump(all_locs, handle, protocol=pkl.HIGHEST_PROTOCOL)

print(all_locs.shape)
all_locs.head()

(781, 5)


Unnamed: 0,name,id,lat,lon,capacity
0,"River Street , Clerkenwell",1,51.529163,-0.109971,19
1,"Phillimore Gardens, Kensington",2,51.499607,-0.197574,37
2,"Christopher Street, Liverpool Street",3,51.521284,-0.084606,32
3,"St. Chad's Street, King's Cross",4,51.530059,-0.120974,23
4,"Sedding Street, Sloane Square",5,51.49313,-0.156876,27
