In [4]:
import itertools
import pandas as pd

from scripts.preprocess.parse_json import parse_dir

In [5]:
def parse_station(element):
    """Parses a JSON bicycle station object to a dictionary"""

    obj = {
        'Id': element['id'],
        'Name': element['commonName'],
        'Latitude': element['lat'],
        'Longitude': element['lon'],
        'PlaceType': element['placeType']
    }

    for p in element['additionalProperties']:
        obj[p['key']] = p['value']

        if 'timestamp' not in obj:
            obj['Timestamp'] = p['modified']
        elif obj['Timestamp'] != p['modified']:
            raise ValueError('The properties\' timestamps for station %s do not match: %s != %s' % (
            obj['id'], obj['Timestamp'], p['modified']))

    return obj

def parse_cycles(json_obj):
    """Parses TfL's BikePoint JSON response"""

    return [parse_station(element) for element in json_obj]

In [31]:
records = parse_dir('/home/jfconavarrete/Documents/Work/Dissertation/spts-uoe/data/dev', parse_cycles)

dataset = pd.DataFrame(list(itertools.chain.from_iterable(records)))

In [32]:
# convert columns to their appropriate datatypes
dataset['InstallDate'] = pd.to_numeric(dataset['InstallDate'], errors='raise')
dataset['Installed'] = dataset['Installed'].astype('bool_')
dataset['Temporary'] = dataset['Temporary'].astype('bool_')
dataset['Locked'] = dataset['Locked'].astype('bool_')
dataset['NbBikes'] = dataset['NbBikes'].astype('uint16')
dataset['NbDocks'] = dataset['NbDocks'].astype('uint16')
dataset['NbEmptyDocks'] = dataset['NbEmptyDocks'].astype('uint16')

# convert string timestamp to datetime
dataset['Timestamp'] =  pd.to_datetime(dataset['Timestamp'], format='%Y-%m-%dT%H:%M:%S.%f')
dataset['InstallDate'] = pd.to_datetime(dataset['InstallDate'], unit='ms')

# 

In [33]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1524 entries, 0 to 1523
Data columns (total 15 columns):
Id              1524 non-null object
InstallDate     1350 non-null datetime64[ns]
Installed       1524 non-null bool
Latitude        1524 non-null float64
Locked          1524 non-null bool
Longitude       1524 non-null float64
Name            1524 non-null object
NbBikes         1524 non-null uint16
NbDocks         1524 non-null uint16
NbEmptyDocks    1524 non-null uint16
PlaceType       1524 non-null object
RemovalDate     1524 non-null object
Temporary       1524 non-null bool
TerminalName    1524 non-null object
Timestamp       1524 non-null datetime64[ns]
dtypes: bool(3), datetime64[ns](2), float64(2), object(5), uint16(3)
memory usage: 120.6+ KB


In [28]:
dataset.head()

Unnamed: 0,Id,InstallDate,Installed,Latitude,Locked,Longitude,Name,NbBikes,NbDocks,NbEmptyDocks,PlaceType,RemovalDate,Temporary,TerminalName,Timestamp
0,BikePoints_1,2010-07-12 15:08:00,True,51.529163,True,-0.10997,"River Street , Clerkenwell",12,19,6,BikePoint,,True,1023,2016-05-15 20:28:05.490
1,BikePoints_2,2010-07-08 10:43:00,True,51.499606,True,-0.197574,"Phillimore Gardens, Kensington",10,37,27,BikePoint,,True,1018,2016-05-15 20:13:03.720
2,BikePoints_3,2010-07-04 10:46:00,True,51.521283,True,-0.084605,"Christopher Street, Liverpool Street",19,32,13,BikePoint,,True,1012,2016-05-15 20:43:07.243
3,BikePoints_4,2010-07-04 10:58:00,True,51.530059,True,-0.120973,"St. Chad's Street, King's Cross",23,23,0,BikePoint,,True,1013,2016-05-15 17:37:43.557
4,BikePoints_5,2010-07-04 11:04:00,True,51.49313,True,-0.156876,"Sedding Street, Sloane Square",19,27,8,BikePoint,,True,3420,2016-05-15 21:08:10.827
