# Build Modelling Dataset

## Set Up

In [1]:
%matplotlib inline

import logging
import pickle
import pandas as pd
import numpy as np
import time

from datetime import datetime

logger = logging.getLogger()
logger.setLevel(logging.INFO)

## Resample Weather

In [2]:
weather = pickle.load(open('data/parsed/weather_dataset_utc.p', 'rb'))

weather.set_index(['Timestamp'], inplace=True)
weather = weather.resample('5min').mean().ffill()

## Preprocess Redistribution

In [3]:
distributed = pickle.load(open('data/parsed/distributed_dataset_final.p', 'rb')).sort_values(by=['Id', 'Timestamp']).set_index(['Id', 'Timestamp'])
distributed = distributed.query('NbBikes != 0').drop(['ShortName', 'Name'], axis=1)
distributed.columns = ['DistNbBikes']

In [4]:
collected = pickle.load(open('data/parsed/collected_dataset_final.p', 'rb')).sort_values(by=['Id', 'Timestamp']).set_index(['Id', 'Timestamp'])
collected = collected.query('NbBikes != 0').drop(['ShortName', 'Name'], axis=1)
collected.columns = ['CollNbBikes']

In [5]:
distributed.loc['BikePoints_374'].to_csv('dist.csv')

## Merge Readings

In [6]:
readings = pickle.load(open('data/parsed/readings_dataset_utc.p', 'rb'))

readings.sort_values(by=['Id', 'Timestamp'], inplace=True)
readings.set_index(['Id', 'Timestamp'], inplace=True)

In [7]:
dfs = []
for station_id in readings.index.get_level_values('Id').unique():
    # resample with a freq of 5 mins
    resampled_readings = readings.loc[station_id].resample('5min').mean().ffill()
    
    # merge  weather
    merged = resampled_readings.merge(weather, how='left', left_index=True, right_index=True)
    
    # merge distributed
    if station_id in distributed.index:
        dist_resampled = distributed.loc[station_id].resample('5min').sum()
        merged = merged.merge(dist_resampled, how='left', left_index=True, right_index=True)
        
    # merge collected
    if station_id in collected.index:
        coll_resampled = collected.loc[station_id].resample('5min').sum()
        merged = merged.merge(coll_resampled, how='left', left_index=True, right_index=True)
    
    # set the id
    merged['Id'] = station_id

    dfs.append(merged.reset_index())

readings_weather = pd.concat(dfs, ignore_index=True)

### Reduce Column Size

In [8]:
start_time = time.time()

readings_weather.NbBikes = readings_weather.NbBikes.astype('int16')
readings_weather.NbDocks = readings_weather.NbDocks.astype('int16')
readings_weather.NbEmptyDocks = readings_weather.NbEmptyDocks.astype('int16')
readings_weather.NbUnusableDocks = readings_weather.NbUnusableDocks.astype('int16')
readings_weather.DewPt = readings_weather.DewPt.astype('float16')
readings_weather.Humidity = readings_weather.Humidity.astype('float16')
readings_weather.Pressure = readings_weather.Pressure.astype('float16')
readings_weather.Temp = readings_weather.Temp.astype('float16')
readings_weather.Visibility = readings_weather.Visibility.astype('float16')
readings_weather.WindDirD = readings_weather.WindDirD.astype('float16')
readings_weather.WindSpeed = readings_weather.WindSpeed.astype('float16')

readings_weather.Fog = readings_weather.Fog.astype('int8')
readings_weather.Rain = readings_weather.Rain.astype('int8')

end_time = time.time()
print 'Modifying data types took %s' % (end_time - start_time)

Modifying data types took 0.429966926575


### Modify Timestamp

In [9]:
start_time = time.time()

readings_weather['Holiday'] = readings_weather.Timestamp.apply(lambda x: x.month == 5 and x.day == 30).astype('int8')
readings_weather['Weekday'] = readings_weather.Timestamp.apply(lambda x: x.dayofweek < 5).astype('int8')
readings_weather['Weekend'] = readings_weather.Timestamp.apply(lambda x: x.dayofweek > 4).astype('int8')
readings_weather['TimeOfYear'] = readings_weather.Timestamp.apply(lambda x: (x - datetime(2016,1,1)).total_seconds())
readings_weather['TimeOfDay'] = readings_weather.Timestamp.apply(lambda x: (x - x.replace(hour=0, minute=0, second=0, microsecond=0)).total_seconds())
readings_weather['WeekOfYear'] = readings_weather.Timestamp.apply(lambda x: x.isocalendar()[1])

end_time = time.time()
print 'Changing timestamp took %s' % (end_time - start_time)

Changing timestamp took 361.224951982


### Re-arrange

In [10]:
readings_weather.sort_values(by=['Id', 'Timestamp'], inplace=True)
readings_weather.set_index(['Id', 'Timestamp'], inplace=True)

### Autoregressive

In [11]:
def shift_cols(df, cols, periods, mode):
    for period in periods:    
        target_cols = []
        for col in cols:
            label = '%s%s%d' % (col, mode, period)
            target_cols.append(label)
            df[label] = df[col]
    
        station_ids = df.index.get_level_values('Id').unique()   
        for station_id in station_ids:   
            shifted = df.loc[station_id, target_cols].shift(periods=period)
            df.loc[station_id, target_cols] = shifted.values

In [12]:
start_time = time.time()
shift_cols(readings_weather, ['Temp', 'Humidity', 'Rain', 'Fog'], [1], 'TMinus')
shift_cols(readings_weather, ['Temp', 'Humidity'], [12], 'TMinus')
end_time = time.time()

print 'Adding previous weather columns took %s' % (end_time - start_time)

Adding previous weather columns took 155.414750099


In [13]:
start_time = time.time()
shift_cols(readings_weather, ['NbBikes'], [1, 2, 12, 18, 24], 'TMinus')
end_time = time.time()

print 'Adding autoregressive columns took %s' % (end_time - start_time)

Adding autoregressive columns took 380.563352108


## Remove

Delete stations which experienced periods of inactivity.

In [14]:
invalid_ids = ['BikePoints_109', 'BikePoints_112', 'BikePoints_120', 'BikePoints_129', 'BikePoints_133', 
               'BikePoints_153', 'BikePoints_184', 'BikePoints_192', 'BikePoints_218', 'BikePoints_226', 
               'BikePoints_237', 'BikePoints_260', 'BikePoints_277', 'BikePoints_3', 'BikePoints_31', 
               'BikePoints_311', 'BikePoints_317', 'BikePoints_323', 'BikePoints_368', 'BikePoints_383', 
               'BikePoints_386', 'BikePoints_404', 'BikePoints_460', 'BikePoints_476', 'BikePoints_478', 
               'BikePoints_494', 'BikePoints_497', 'BikePoints_543', 'BikePoints_556', 'BikePoints_583', 
               'BikePoints_643', 'BikePoints_646', 'BikePoints_672', 'BikePoints_742', 'BikePoints_787', 
               'BikePoints_790', 'BikePoints_791', 'BikePoints_793', 'BikePoints_796', 'BikePoints_798', 
               'BikePoints_799', 'BikePoints_802', 'BikePoints_803', 'BikePoints_805', 'BikePoints_807', 
               'BikePoints_809', 'BikePoints_810', 'BikePoints_811', 'BikePoints_814', 'BikePoints_817', 
               'BikePoints_818', 'BikePoints_86', 'BikePoints_9']

readings_weather.drop(invalid_ids, inplace=True)

## Save

In [17]:
readings_weather.info(memory_usage='deep', verbose=False)

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 8865854 entries, (BikePoints_1, 2016-05-15 13:35:00+00:00) to (BikePoints_99, 2016-06-26 23:25:00+00:00)
Columns: 36 entries, CollNbBikes to NbBikesTMinus24
dtypes: float16(11), float64(15), int16(4), int64(1), int8(5)
memory usage: 1.7 GB


In [None]:
pickle.dump(readings_weather, open("data/parsed/readings_model.p", "wb"))