# Bike Redistribution Preprocessing

## Set Up

In [1166]:
%matplotlib inline

import logging
import itertools
import json
import os
import re
import pickle
import folium
import random
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from mpl_toolkits.basemap import Basemap
from datetime import datetime
from os import listdir
from os.path import isfile, join
from IPython.display import Image
from datetime import date

from src.data.parse_dataset import parse_dir, parse_json_files, get_file_list
from src.data.string_format import format_name, to_short_name
from src.data.visualization import lon_min_longitude, lon_min_latitude, lon_max_longitude, lon_max_latitude, lon_center_latitude, lon_center_longitude, create_london_map

logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [1167]:
# raw redistribution readings, one CSV per direction (bikes collected / bikes distributed)
collected = pd.read_csv('data/raw/redistribution/collected.csv', encoding='latin-1')
distributed = pd.read_csv('data/raw/redistribution/distributed.csv', encoding='latin-1')

# previously parsed stations dataset; the context manager closes the file handle
# as soon as loading finishes instead of leaving it to the garbage collector
# NOTE: unpickling can execute arbitrary code -- only load files produced by this project
with open('data/parsed/stations_dataset_final.p', 'rb') as stations_file:
    stations = pickle.load(stations_file)

## Technically Correct Data

In [1168]:
# Discard every row that contains a null: with so few columns per reading,
# a partially filled row cannot be used.
for frame in (collected, distributed):
    frame.dropna(inplace=True)

# store the bike counts as unsigned 16-bit integers
for frame in (collected, distributed):
    frame['NbBikes'] = frame['NbBikes'].astype('uint16')

# run every station name through the project's format_name helper
for frame in (distributed, collected):
    frame['Name'] = frame['Name'].apply(format_name)

# parse the day/month/year timestamps; errors='raise' aborts on any malformed value
for frame in (distributed, collected):
    frame['Timestamp'] = pd.to_datetime(frame['Timestamp'], format='%d/%m/%Y %H:%M:%S', errors='raise')

### Distributed Cycles

In [1169]:
distributed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31124 entries, 0 to 31123
Data columns (total 3 columns):
Timestamp    31124 non-null datetime64[ns]
Name         31124 non-null object
NbBikes      31124 non-null uint16
dtypes: datetime64[ns](1), object(1), uint16(1)
memory usage: 790.3+ KB


### Collected Cycles

In [1170]:
collected.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 31124 entries, 0 to 31123
Data columns (total 3 columns):
Timestamp    31124 non-null datetime64[ns]
Name         31124 non-null object
NbBikes      31124 non-null uint16
dtypes: datetime64[ns](1), object(1), uint16(1)
memory usage: 790.3+ KB


## Derive Data

In [1171]:
# add a ShortName column derived from the full station name; it is the
# fallback key used when assigning station ids further below
for frame in (distributed, collected):
    frame['ShortName'] = frame['Name'].map(to_short_name)

## Consistent Data

### Distributed Cycles

In [1172]:
distributed.describe()

Unnamed: 0,NbBikes
count,31124.0
mean,9.894487
std,5.502689
min,0.0
25%,6.0
50%,9.0
75%,15.0
max,36.0


#### Assign Station Id

In [1173]:
def drop_multinomial(idxs, merged_incorrectly, merged_correctly):
    """Choose one station for a group of duplicated rows and return the rest.

    The candidate stations are the ``Id`` values of the rows of
    ``merged_incorrectly`` labelled by ``idxs``.  One candidate is drawn at
    random, weighted by how often each candidate ``Id`` occurs in
    ``merged_correctly``.  If none of the candidates occurs there, they are
    weighted by how often they occur inside the group itself.

    Parameters
    ----------
    idxs : iterable of index labels
        Labels (into ``merged_incorrectly``) of the rows forming one group.
    merged_incorrectly : pandas.DataFrame
        Frame holding the duplicated rows; must have an ``Id`` column.
    merged_correctly : pandas.DataFrame
        Frame used as the frequency reference; must have an ``Id`` column.

    Returns
    -------
    numpy.ndarray
        Index labels of the group's rows whose ``Id`` differs from the
        chosen station (empty when ``idxs`` is empty).
    """
    # .loc needs a list-like indexer: callers pass a set of labels, which
    # recent pandas versions refuse to accept
    labels = list(idxs)
    if not labels:
        return merged_incorrectly.index[:0].values
    df = merged_incorrectly.loc[labels]

    # how often each candidate station occurs among the reference rows
    selector = merged_correctly['Id'].isin(df['Id'])
    counts = merged_correctly[selector]['Id'].value_counts()
    if counts.empty:
        # no candidate occurs in the reference frame, so there is nothing to
        # prefer one by: fall back to the frequencies inside the group
        counts = df['Id'].value_counts()

    # draw a single station from the multinomial distribution
    probs = counts / float(counts.sum())
    multinomial_dist = np.random.multinomial(1, probs.values)
    station_id = counts.index[np.argmax(multinomial_dist)]

    # every row of the group that belongs to another station is dropped
    to_drop_selector = df['Id'] != station_id
    return df[to_drop_selector].index.values

def drop_randomly(idxs, merged_incorrectly=None, merged_correctly=None):
    """Keep one uniformly chosen label from ``idxs`` and return the others.

    ``idxs`` is left untouched; the result is a new set.  The two frame
    parameters are unused and exist only so the function has the same
    signature as ``drop_multinomial``.

    Parameters
    ----------
    idxs : iterable of index labels
        Labels of the rows forming one group of duplicates.

    Returns
    -------
    set
        The labels to drop: every label except the one kept.  Empty when
        ``idxs`` holds zero or one label.
    """
    # materialise as a list: random.choice needs a sequence, and sampling
    # directly from a set is rejected by newer Python versions
    candidates = list(idxs)
    if not candidates:
        return set()
    kept = random.choice(candidates)
    remaining = set(candidates)
    remaining.discard(kept)
    return remaining

In [1174]:
def split_null(df):
    """Split ``df`` by whether its ``Id`` column is populated.

    Returns a ``(with_id, without_id)`` pair of independent copies:
    ``with_id`` holds the rows whose ``Id`` is not null, with all columns;
    ``without_id`` holds the rows whose ``Id`` is null, reduced to the
    ``Timestamp``, ``Name``, ``NbBikes`` and ``ShortName`` columns.
    """
    missing_id = df['Id'].isnull()
    with_id = df[~missing_id].copy()
    without_id = df[missing_id][['Timestamp', 'Name', 'NbBikes', 'ShortName']].copy()
    return with_id, without_id

def assign_station_id(df, drop_using):
    """Attach a station ``Id`` to the readings in ``df``.

    Readings are first matched against the module-level ``stations`` frame
    on the full ``Name``; those left without an ``Id`` are matched again on
    ``ShortName``.  A short-name match can pair one reading with several
    stations, so the resulting duplicate rows are grouped and ``drop_using``
    decides which rows of each group are discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Readings with ``Timestamp``, ``Name``, ``NbBikes`` and ``ShortName``
        columns.
    drop_using : callable
        Called as ``drop_using(idxs, merged_on_shortname, merged_on_name)``
        where ``idxs`` is the set of index labels of one duplicate group;
        must return an iterable of the labels to drop.

    Returns
    -------
    pandas.DataFrame
        The name-matched, short-name-matched and still unmatched readings
        concatenated, sorted by ``Timestamp`` and given a fresh index.
        Unmatched readings have a null ``Id``.
    """
    # merge using the full station name
    merged = pd.merge(df, stations[['Id', 'Name']], how='left', left_on='Name', right_on='Name')
    merged_on_name, remaining_null = split_null(merged)
    # single-argument call form prints identically on Python 2 and Python 3
    print('%d readings could not be merged with the full station name' % len(remaining_null))

    # merge the leftovers using the short name
    merged = pd.merge(remaining_null, stations[['Id', 'ShortName']], how='left', left_on='ShortName', right_on='ShortName')
    merged_on_shortname, remaining_null = split_null(merged)
    print('%d readings could not be merged with the short station name' % len(remaining_null))

    # drop duplicate entries from merging by short name
    # select every row that shares Name, Timestamp and NbBikes with another row
    selector = merged_on_shortname.duplicated(subset=['Name', 'Timestamp', 'NbBikes'], keep=False)
    # explicit copy so that adding the helper column never touches merged_on_shortname
    duplicates = merged_on_shortname[selector].copy()
    # add the index as a column so each group can report its row labels
    duplicates['Idx'] = duplicates.index
    # one set of row labels per (Timestamp, ShortName) group
    groups = duplicates.groupby(['Timestamp', 'ShortName'])['Idx'].aggregate(lambda x: set(x))
    # let the strategy select the labels to drop from each group
    to_drop = []
    for idxs in groups:
        to_drop.extend(drop_using(idxs, merged_on_shortname, merged_on_name))
    # drop selected indexes from dataframe
    merged_on_shortname.drop(to_drop, inplace=True)

    combined = pd.concat([merged_on_name, merged_on_shortname, remaining_null])
    return combined.sort_values(by=['Timestamp']).reset_index(drop=True)

In [1175]:
distributed = assign_station_id(distributed, drop_multinomial)

1777 readings could not be merged with the full station name
147 readings could not be merged with the short station name


In [1176]:
distributed[distributed['Id'].isnull()]['Name'].unique()

array([u'Embankment (Horse Guards), Westminster',
       u'High Holborn, Covent Garden', u'Altab Ali Park, Whitechapel',
       u'Kingsway, Covent Garden', u'Vaughan Way, Wapping',
       u'Sidney Street, Stepney', u'Killick Street, Kings Cross',
       u'Coborn Street, Mile End', u'South Quay West, Canary Wharf',
       u'Churchill Place, Canary Wharf', u'Penton Street Depot'], dtype=object)

**These stations do not exist in our stations dataset, so their readings will be removed.**

In [1177]:
distributed.dropna(inplace=True)

In [1178]:
distributed.isnull().sum()

Id           0
Name         0
NbBikes      0
ShortName    0
Timestamp    0
dtype: int64

In [1179]:
distributed.apply(lambda x:x.nunique())

Id             736
Name           770
NbBikes         26
ShortName      727
Timestamp    28276
dtype: int64

### Collected Cycles

In [1180]:
collected.describe()

Unnamed: 0,NbBikes
count,31124.0
mean,8.528403
std,5.071224
min,0.0
25%,5.0
50%,8.0
75%,11.0
max,36.0


#### Assign Station Id

In [1181]:
collected = assign_station_id(collected, drop_multinomial)

1721 readings could not be merged with the full station name
374 readings could not be merged with the short station name


In [1182]:
collected[collected['Id'].isnull()]['Name'].unique()

array([u'Altab Ali Park, Whitechapel',
       u'Embankment (Horse Guards), Westminster',
       u'High Holborn, Covent Garden', u'Killick Street, Kings Cross',
       u'Sidney Street, Stepney', u'Penton Street Depot',
       u'Churchill Place, Canary Wharf', u'South Quay West, Canary Wharf',
       u'Kingsway, Covent Garden', u'Vaughan Way, Wapping',
       u'Coborn Street, Mile End', u'Mechanical Workshop Clapham',
       u'Vereker Road, West Kensington'], dtype=object)

**These stations do not exist in our stations dataset, so their readings will be removed.**

In [1183]:
collected.dropna(inplace=True)

In [1184]:
collected.isnull().sum()

Id           0
Name         0
NbBikes      0
ShortName    0
Timestamp    0
dtype: int64

In [1185]:
collected.apply(lambda x:x.nunique())

Id             703
Name           706
NbBikes         23
ShortName      695
Timestamp    27259
dtype: int64

## Build Datasets


### Distributed

In [1186]:
distributed.head()

Unnamed: 0,Id,Name,NbBikes,ShortName,Timestamp
0,BikePoints_531,"Twig Folly Bridge, Mile End",14,Twig Folly Bridge,2015-01-01 08:45:00
1,BikePoints_722,"Finnis Street, Bethnal Green",14,Finnis Street,2015-01-01 09:37:00
2,BikePoints_379,"Turquoise Island, Notting Hill",10,Turquoise Island,2015-01-01 12:06:00
3,BikePoints_335,"Tavistock Street, Covent Garden",18,Tavistock Street,2015-01-01 12:29:00
5,BikePoints_147,"Portugal Street, Holborn",8,Portugal Street,2015-01-01 12:48:00


In [1187]:
distributed.describe()

Unnamed: 0,NbBikes
count,30977.0
mean,9.904994
std,5.49957
min,0.0
25%,6.0
50%,9.0
75%,15.0
max,36.0


In [1188]:
distributed.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30977 entries, 0 to 31123
Data columns (total 5 columns):
Id           30977 non-null object
Name         30977 non-null object
NbBikes      30977 non-null uint16
ShortName    30977 non-null object
Timestamp    30977 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(3), uint16(1)
memory usage: 12.4 MB


In [1189]:
# write through a context manager so the file is flushed and closed deterministically
with open("data/parsed/distributed_dataset_final.p", "wb") as output_file:
    pickle.dump(distributed, output_file)

### Collected

In [1190]:
collected.head()

Unnamed: 0,Id,Name,NbBikes,ShortName,Timestamp
0,BikePoints_263,"St. Mary Axe, Aldgate",5,St. Mary Axe,2015-01-01 07:29:00
1,BikePoints_46,"Nesham Street, Wapping",5,Nesham Street,2015-01-01 07:50:00
3,BikePoints_282,"Royal London Hospital, Whitechapel",4,Royal London Hospital,2015-01-01 08:12:00
4,BikePoints_461,"Aston Street, Stepney",0,Aston Street,2015-01-01 08:28:00
5,BikePoints_460,"Burdett Road, Mile End",10,Burdett Road,2015-01-01 09:03:00


In [1191]:
collected.describe()

Unnamed: 0,NbBikes
count,30750.0
mean,8.522244
std,5.061514
min,0.0
25%,5.0
50%,8.0
75%,11.0
max,36.0


In [1192]:
collected.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 30750 entries, 0 to 31123
Data columns (total 5 columns):
Id           30750 non-null object
Name         30750 non-null object
NbBikes      30750 non-null uint16
ShortName    30750 non-null object
Timestamp    30750 non-null datetime64[ns]
dtypes: datetime64[ns](1), object(3), uint16(1)
memory usage: 12.3 MB


In [1193]:
# write through a context manager so the file is flushed and closed deterministically
with open("data/parsed/collected_dataset_final.p", "wb") as output_file:
    pickle.dump(collected, output_file)