# Bike Redistribution Preprocessing

## Set Up

In [57]:
%matplotlib inline

import logging
import itertools
import json
import os
import re
import pickle
import folium
import random
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from mpl_toolkits.basemap import Basemap
from datetime import datetime
from os import listdir
from os.path import isfile, join
from IPython.display import Image
from datetime import date

from src.data.parse_dataset import parse_dir, parse_json_files, get_file_list
from src.data.string_format import format_name, to_short_name
from src.data.visualization import lon_min_longitude, lon_min_latitude, lon_max_longitude, lon_max_latitude, lon_center_latitude, lon_center_longitude, create_london_map

# Configure the root logger (getLogger() with no name) to emit INFO and above.
logger = logging.getLogger()
logger.setLevel(logging.INFO)

In [58]:
# Load the raw redistribution logs; both CSV files are decoded as latin-1.
collected = pd.read_csv('data/raw/redistribution/collected.csv', encoding='latin-1')
distributed = pd.read_csv('data/raw/redistribution/distributed.csv', encoding='latin-1')

# Load the previously parsed stations dataset. The context manager closes the
# file handle as soon as the pickle has been read instead of leaking it.
# NOTE(review): pickle.load can execute arbitrary code; only point this at
# pickles produced by this project.
with open('data/parsed/stations_dataset_final.p', 'rb') as stations_file:
    stations = pickle.load(stations_file)

## Technically Correct Data

In [59]:
# Rows with any missing value are discarded: with so few columns, a reading
# with a null field is not usable.
for frame in (collected, distributed):
    frame.dropna(inplace=True)

# Store bike counts as unsigned 16-bit integers.
for frame in (collected, distributed):
    frame['NbBikes'] = frame['NbBikes'].astype('uint16')

# Normalise station names with the project's format_name helper.
for frame in (distributed, collected):
    frame['Name'] = frame['Name'].apply(format_name)

# Parse the day-first timestamps (failing loudly on malformed values) and
# mark them as UTC.
for frame in (distributed, collected):
    parsed = pd.to_datetime(frame['Timestamp'], format='%d/%m/%Y %H:%M', errors='raise')
    frame['Timestamp'] = parsed.dt.tz_localize('UTC')

### Distributed Cycles

In [60]:
# Check dtypes and non-null counts of the cleaned distributed readings.
distributed.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 39692 entries, 0 to 39691
Data columns (total 3 columns):
Timestamp    39692 non-null datetime64[ns, UTC]
Name         39692 non-null object
NbBikes      39692 non-null uint16
dtypes: datetime64[ns, UTC](1), object(1), uint16(1)
memory usage: 1007.8+ KB


### Collected Cycles

In [61]:
# Check dtypes and non-null counts of the cleaned collected readings.
collected.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 43488 entries, 0 to 43487
Data columns (total 3 columns):
Timestamp    43488 non-null datetime64[ns, UTC]
Name         43488 non-null object
NbBikes      43488 non-null uint16
dtypes: datetime64[ns, UTC](1), object(1), uint16(1)
memory usage: 1.1+ MB


## Derive Data

In [62]:
# Derive a ShortName column from each reading's full station name using the
# project's to_short_name helper.
for frame in (distributed, collected):
    frame['ShortName'] = frame['Name'].map(to_short_name)

## Consistent Data

### Distributed Cycles

In [63]:
# Summary statistics of the distributed readings before station ids are assigned.
distributed.describe()

Unnamed: 0,NbBikes
count,39692.0
mean,9.956717
std,5.458785
min,0.0
25%,6.0
50%,9.0
75%,15.0
max,36.0


#### Assign Station Id

In [64]:
def drop_multinomial(idxs, merged_incorrectly, merged_correctly):
    """Pick which rows of an ambiguous station match should be dropped.

    One candidate station is drawn at random, weighted by how often each
    candidate ``Id`` occurs in ``merged_correctly``. Every row belonging to
    a different candidate is returned for dropping.

    Args:
        idxs: iterable (list, set, ...) of index labels into
            ``merged_incorrectly`` that all describe the same reading.
        merged_incorrectly: DataFrame with an ``Id`` column containing the
            candidate rows.
        merged_correctly: DataFrame with an ``Id`` column used as the
            frequency reference for weighting the candidates.

    Returns:
        numpy array of the index labels whose ``Id`` differs from the drawn
        station. Empty when ``idxs`` is empty.
    """
    # recent pandas releases reject sets as .loc indexers, so use a list
    labels = list(idxs)
    if not labels:
        return merged_incorrectly.index[:0].values

    # get the df of the given indexes
    df = merged_incorrectly.loc[labels]

    # get the counts of the candidate station ids in the reference dataset
    selector = merged_correctly['Id'].isin(df['Id'])
    counts = merged_correctly[selector]['Id'].value_counts()

    if counts.empty:
        # no candidate appears in the reference data, so there are no weights
        # to draw from: fall back to a uniform choice among the candidates
        candidates = df['Id'].unique()
        station_id = candidates[np.random.randint(len(candidates))]
    else:
        # choose one station with the multinomial distribution
        probs = counts / float(counts.sum())
        multinomial_dist = np.random.multinomial(1, probs.values)
        station_id = counts.index[np.argmax(multinomial_dist)]

    # drop the other ones
    to_drop_selector = df['Id'] != station_id
    return df[to_drop_selector].index.values

def drop_randomly(idxs, merged_incorrectly=None, merged_correctly=None):
    """Keep one uniformly chosen index and return the others for dropping.

    Args:
        idxs: iterable (list, set, ...) of index labels describing the same
            reading. It is not modified.
        merged_incorrectly: unused; accepted so the signature matches the
            other drop strategies.
        merged_correctly: unused; accepted so the signature matches the
            other drop strategies.

    Returns:
        list of the index labels to drop, i.e. every label except the one
        randomly kept. Empty when ``idxs`` has zero or one element.
    """
    # work on a copy: random.sample no longer accepts sets on Python 3.11+,
    # and the caller's collection should not be mutated
    candidates = list(idxs)
    if not candidates:
        return candidates

    # remove the survivor by position so duplicate labels are handled too
    candidates.pop(random.randrange(len(candidates)))
    return candidates

In [65]:
def split_null(df):
    """Partition ``df`` by whether the ``Id`` column is populated.

    Returns:
        A pair of independent copies: the rows with an ``Id`` (all columns),
        and the rows without one restricted to the ``Timestamp``, ``Name``,
        ``NbBikes`` and ``ShortName`` columns.
    """
    missing_id = df['Id'].isnull()
    reading_columns = ['Timestamp', 'Name', 'NbBikes', 'ShortName']

    matched = df[~missing_id].copy()
    unmatched = df[missing_id][reading_columns].copy()
    return matched, unmatched

def assign_station_id(df, drop_using):
    """Attach a station ``Id`` to the readings in ``df``.

    Readings are first matched against the module-level ``stations`` frame on
    the full station ``Name``. Readings that fail that match are then matched
    on ``ShortName``; because a short name can match several stations,
    ``drop_using`` decides which of the resulting duplicate rows to discard.

    Args:
        df: DataFrame with ``Timestamp``, ``Name``, ``NbBikes`` and
            ``ShortName`` columns.
        drop_using: callable ``(idxs, merged_on_shortname, merged_on_name)``
            returning the index labels to drop for one duplicated reading
            (e.g. ``drop_multinomial`` or ``drop_randomly``), or ``None`` to
            skip the short-name stage entirely.

    Returns:
        When ``drop_using`` is ``None``: only the readings matched on the
        full name, with the index left as produced by the merge (unmatched
        readings are not included). Otherwise: full-name matches,
        de-duplicated short-name matches and still-unmatched readings
        (``Id`` missing), sorted by ``Timestamp`` with a fresh index.
    """
    # merge using the full station name
    merged = pd.merge(df, stations[['Id', 'Name']], how='left', left_on='Name', right_on='Name')
    merged_on_name, remaining_null = split_null(merged)
    # print(...) with a single pre-formatted string behaves the same on Python 2 and 3
    print('%d readings could not be merged with the full station name' % len(remaining_null))

    if drop_using is None:
        return merged_on_name

    # merge using the short name
    merged = pd.merge(remaining_null, stations[['Id', 'ShortName']], how='left', left_on='ShortName', right_on='ShortName')
    merged_on_shortname, remaining_null = split_null(merged)
    print('%d readings could not be merged with the short station name' % len(remaining_null))

    # drop duplicate entries from merging by short name
    # NOTE(review): two identical readings already present in the input are
    # indistinguishable from merge duplicates here - confirm that is acceptable
    reading_key = ['Name', 'Timestamp', 'NbBikes']
    selector = merged_on_shortname.duplicated(subset=reading_key, keep=False)
    duplicates = merged_on_shortname[selector]

    # group on the same key used to detect the duplicates so that distinct
    # readings sharing a timestamp and short name are resolved independently
    to_drop = []
    for _, group in duplicates.groupby(reading_key):
        # hand over a list: sets are not valid .loc indexers in recent pandas
        # and cannot be sampled by random.sample on Python 3.11+
        to_drop.extend(drop_using(list(group.index), merged_on_shortname, merged_on_name))

    # drop selected indexes from dataframe
    merged_on_shortname.drop(to_drop, inplace=True)

    combined = pd.concat([merged_on_name, merged_on_shortname, remaining_null])
    return combined.sort_values(by=['Timestamp']).reset_index(drop=True)

In [66]:
# Only the full-name merge is used here (drop_using=None), so readings without
# a full-name match are discarded by assign_station_id. The alternative that
# also resolves short-name matches is kept for reference:
#distributed = assign_station_id(distributed, drop_multinomial)
distributed = assign_station_id(distributed, None)

1882 readings could not be merged with the full station name


In [67]:
# List the station names of readings that still have no station Id.
distributed[distributed['Id'].isnull()]['Name'].unique()

array([], dtype=object)

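**Stations listed above do not exist in our stations dataset, so their readings are removed. (With `drop_using=None`, `assign_station_id` has already discarded every reading without a full-name match, which is why the array is empty here.)**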

In [68]:
# Remove, in place, any reading that still has a missing value (such as an
# unassigned station Id).
distributed.dropna(inplace=True)

In [69]:
# Sanity check: count the remaining nulls per column.
distributed.isnull().sum()

Timestamp    0
Name         0
NbBikes      0
ShortName    0
Id           0
dtype: int64

In [70]:
# Count the distinct values in each column.
distributed.apply(lambda x:x.nunique())

Timestamp    34226
Name           716
NbBikes         23
ShortName      708
Id             716
dtype: int64

### Collected Cycles

In [71]:
# Summary statistics of the collected readings before station ids are assigned.
collected.describe()

Unnamed: 0,NbBikes
count,43488.0
mean,8.946215
std,5.180654
min,0.0
25%,6.0
50%,8.0
75%,12.0
max,36.0


#### Assign Station Id

In [72]:
# Only the full-name merge is used here (drop_using=None), so readings without
# a full-name match are discarded by assign_station_id. The alternative that
# also resolves short-name matches is kept for reference:
#collected = assign_station_id(collected, drop_multinomial)
collected = assign_station_id(collected, None)

1740 readings could not be merged with the full station name


In [73]:
# List the station names of readings that still have no station Id.
collected[collected['Id'].isnull()]['Name'].unique()

array([], dtype=object)

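**Stations listed above do not exist in our stations dataset, so their readings are removed. (With `drop_using=None`, `assign_station_id` has already discarded every reading without a full-name match, which is why the array is empty here.)**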

In [74]:
# Remove, in place, any reading that still has a missing value (such as an
# unassigned station Id).
collected.dropna(inplace=True)

In [75]:
# Sanity check: count the remaining nulls per column.
collected.isnull().sum()

Timestamp    0
Name         0
NbBikes      0
ShortName    0
Id           0
dtype: int64

In [76]:
# Count the distinct values in each column.
collected.apply(lambda x:x.nunique())

Timestamp    37199
Name           707
NbBikes         26
ShortName      699
Id             707
dtype: int64

## Build Datasets


### Distributed

In [77]:
# Preview the first rows of the final distributed dataset.
distributed.head()

Unnamed: 0,Timestamp,Name,NbBikes,ShortName,Id
0,2016-01-01 01:49:00+00:00,"St. Peters Terrace, Fulham",11,St. Peters Terrace,BikePoints_729
1,2016-01-01 01:58:00+00:00,"Aintree Street, Fulham",7,Aintree Street,BikePoints_616
2,2016-01-01 02:26:00+00:00,"Shoreditch Park, Hoxton",14,Shoreditch Park,BikePoints_253
3,2016-01-01 02:39:00+00:00,"Kingsway Southbound, Strand",10,Kingsway Southbound,BikePoints_594
4,2016-01-01 03:06:00+00:00,"Fore Street, Guildhall",6,Fore Street,BikePoints_509


In [78]:
# Summary statistics of the final distributed dataset.
distributed.describe()

Unnamed: 0,NbBikes
count,37810.0
mean,9.953161
std,5.458986
min,0.0
25%,6.0
50%,9.0
75%,15.0
max,36.0


In [79]:
# Report dtypes and the memory footprint (memory_usage='deep') before saving.
distributed.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37810 entries, 0 to 39691
Data columns (total 5 columns):
Timestamp    37810 non-null datetime64[ns, UTC]
Name         37810 non-null object
NbBikes      37810 non-null uint16
ShortName    37810 non-null object
Id           37810 non-null object
dtypes: datetime64[ns, UTC](1), object(3), uint16(1)
memory usage: 15.1 MB


In [80]:
# Persist the final distributed dataset. The context manager ensures the
# output file is flushed and closed once the dump completes.
with open("data/parsed/distributed_dataset_final.p", "wb") as output_file:
    pickle.dump(distributed, output_file)

### Collected

In [81]:
# Preview the first rows of the final collected dataset.
collected.head()

Unnamed: 0,Timestamp,Name,NbBikes,ShortName,Id
0,2016-01-01 01:35:00+00:00,"Putney Pier, Wandsworth",18,Putney Pier,BikePoints_302
1,2016-01-01 01:48:00+00:00,"Broadwick Street, Soho",8,Broadwick Street,BikePoints_260
3,2016-01-01 02:43:00+00:00,"Belgrave Road, Victoria",7,Belgrave Road,BikePoints_268
4,2016-01-01 02:45:00+00:00,"British Museum, Bloomsbury",0,British Museum,BikePoints_24
5,2016-01-01 02:48:00+00:00,"West Smithfield Rotunda, Farringdon",8,West Smithfield Rotunda,BikePoints_203


In [82]:
# Summary statistics of the final collected dataset.
collected.describe()

Unnamed: 0,NbBikes
count,41748.0
mean,8.978011
std,5.194398
min,0.0
25%,6.0
50%,8.0
75%,12.0
max,36.0


In [83]:
# Report dtypes and the memory footprint (memory_usage='deep') before saving.
collected.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
Int64Index: 41748 entries, 0 to 43487
Data columns (total 5 columns):
Timestamp    41748 non-null datetime64[ns, UTC]
Name         41748 non-null object
NbBikes      41748 non-null uint16
ShortName    41748 non-null object
Id           41748 non-null object
dtypes: datetime64[ns, UTC](1), object(3), uint16(1)
memory usage: 16.7 MB


In [84]:
# Persist the final collected dataset. The context manager ensures the
# output file is flushed and closed once the dump completes.
with open("data/parsed/collected_dataset_final.p", "wb") as output_file:
    pickle.dump(collected, output_file)