In [1]:
import pandas as pd
import numpy as np

## Import and tidy up Weather data

In [2]:
# Load the raw weather observations. low_memory=False makes pandas read the
# file in a single pass rather than in chunks when inferring column dtypes.
wr = pd.read_csv(
    "../data/weather_history.csv",
    low_memory=False,
)

In [3]:
# Discard columns that contain no values at all, then the 'WW' column.
wr = wr.dropna(axis='columns', how='all').drop(columns='WW')

# Whatever the first remaining column is called, expose it as "Time".
first_label = wr.columns[0]
wr = wr.rename(columns={first_label: "Time"})

In [4]:
# Map the source file's short column codes to descriptive names.
readable_names = {
    'T': 'Temp',
    'U': 'Rel_Humidity',
    'DD': 'Wind_Dir',
    'Tn': 'Min_Temp',
    'Tx': 'Max_Temp',
    'Td': 'Dew_Temp',
    'RRR': 'Rain',
    'tR': 'Rain_Time',
    'Tg': 'Night_Soil_Temp',
}
# Columns that are renamed for clarity but not carried any further.
unused_columns = ['Dew_Temp', 'Night_Soil_Temp', 'Min_Temp', 'Max_Temp', 'Wind_Dir']

wr = wr.rename(columns=readable_names).drop(columns=unused_columns)

In [5]:
# Observations without rain carry the text 'No precipitation'; map it to 0 so
# the column can be converted to numbers. The result is assigned back to the
# column rather than calling replace(..., inplace=True) on the wr['Rain']
# selection: that chained in-place form is deprecated and, with pandas
# copy-on-write enabled, leaves wr itself unchanged.
wr['Rain'] = pd.to_numeric(wr['Rain'].replace('No precipitation', 0))

# Normalise each rainfall figure by its Rain_Time value (presumably the
# accumulation period in hours -- TODO confirm against the data source).
wr['Hourly_Rain'] = wr['Rain'] / wr['Rain_Time']
wr = wr.drop(['Rain','Rain_Time'], axis = 1) # unclear why Rain_Time is sometimes 6 and sometimes 12

# Parse the observation timestamps, reading ambiguous dates as day-first.
wr['Time'] = pd.to_datetime(wr['Time'], dayfirst=True)

In [6]:
# Split each observation timestamp into its hour of day and its calendar date.
observed_at = wr['Time'].dt
wr = wr.assign(Hour=observed_at.hour, Date=observed_at.date)

## Aggregate weather by day

In [7]:
# Average every numeric weather measurement per calendar day.
# numeric_only=True is passed explicitly: without it, recent pandas versions
# raise a TypeError if any text column is still present in wr, and would also
# average the 'Time' datetime column, which has no meaning as a daily value.
wrd = wr.groupby('Date').mean(numeric_only=True).reset_index()

# The mean hour of the day's observations is not a useful daily feature.
wrd = wrd.drop('Hour', axis=1)

# 'Date' already holds datetime.date objects (it is the groupby key built from
# wr['Time'].dt.date), so no further conversion is needed before saving or
# merging on it.
wrd.to_csv('daily-weather.csv')

## Import and tidy up Daily Cycle Hire data

In [8]:
# Load the daily cycle-hire counts.
dh = pd.read_csv(filepath_or_buffer="../data/daily-hires.csv")

In [9]:
# Parse the hire dates once (day-first) and derive every calendar feature
# from that single parsed series.
hire_stamp = pd.to_datetime(dh['Date'], dayfirst=True)

dh['Month'] = hire_stamp.dt.month
dh['Year'] = hire_stamp.dt.year
dh['Weekday'] = hire_stamp.dt.dayofweek
dh = dh.rename(columns={'Weekday': 'Day_of_Week'})

# Day_of_Week runs Monday=0 .. Sunday=6, so 5 and 6 are the weekend.
dh['Weekend'] = dh['Day_of_Week'].gt(4)
dh['Day_of_Year'] = hire_stamp.dt.dayofyear
dh['Hires'] = dh['Hires'].astype(int)

# Store plain datetime.date objects in 'Date', then order chronologically.
dh['Date'] = hire_stamp.dt.date
dh = dh.sort_values(by="Date")

In [10]:
# Short weekday labels, indexed by weekday number.
days = ["Mon", "Tues", "Wed", "Thu", "Fri", "Sat", "Sun"]


def day(number):
    """Return the short weekday label stored at position ``number`` in ``days``.

    Parameters
    ----------
    number : int
        Index into ``days`` (0 gives "Mon", 6 gives "Sun").

    Returns
    -------
    str
        The matching label.

    Raises
    ------
    IndexError
        If ``number`` falls outside the bounds of ``days``.
    """
    label = days[number]
    return label

# Translate each weekday number into its short text label.
dh['Day'] = dh['Day_of_Week'].map(day)

### Import Strikes data

In [11]:
# Load the tube-strike calendar and parse both date columns as day-first.
strikes = pd.read_csv('../data/tube-strikes.csv')
for strike_col in ('Strike_Evening', 'Strike_Daytime'):
    strikes[strike_col] = pd.to_datetime(strikes[strike_col], dayfirst=True)

### Import bank holidays

In [12]:
# Load the bank-holiday list and parse its dates as day-first.
bank = pd.read_csv('../data/bank-holidays.csv')
bank = bank.assign(Date=pd.to_datetime(bank['Date'], dayfirst=True))

### Import major London cycle accidents

In [13]:
# Load the list of cycling deaths and injuries (one row per incident).
cyc = pd.read_csv(filepath_or_buffer='../data/cycle-deaths-injuries.csv')

In [14]:
# Parse the incident dates explicitly so the lists below hold real timestamps.
# They are later matched against a datetime column with isin(); leaving them as
# raw strings makes that match depend on implicit string-to-date coercion,
# which reads ambiguous dates month-first.
# NOTE(review): day-first is assumed here to match the strikes and
# bank-holiday files -- confirm against cycle-deaths-injuries.csv.
incident_dates = pd.to_datetime(cyc['Date'], dayfirst=True)

# Dates on which a death / an injury was recorded.
death = list(incident_dates[cyc['Type'] == 'death'])
inj = list(incident_dates[cyc['Type'] == 'injury'])


## Import network size data

In [15]:
# Load the network-size history.
nw = pd.read_csv("../data/hire-bike-volumes.csv")

nw = nw.rename(columns={'Stations':'Dock_Points'})
# Keep plain datetime.date values in 'Date'.
nw['Date'] = pd.to_datetime(nw['Date'], dayfirst=True).dt.date

# Impute missing values using the last known or estimated value.
# DataFrame.ffill() is used because fillna(method='ffill') is deprecated.
nw = nw.ffill()

# Turn the yes/no style text flags into booleans.
nw = nw.replace({'Registration': {'Yes': True, 'No': False},
                 'Weekly_Fee': {'y': True, 'n': False}})

### Join Daily Cycle Hire, Aggregated Daily Weather, and Network Size data

In [16]:
# Start from the hire counts and attach the daily weather and network-size
# rows that share the same date; left joins keep every hire day.
daily = (
    dh
    .merge(wrd, on='Date', how='left')
    .merge(nw, on='Date', how='left')
)

# Switch 'Date' to pandas timestamps and restore chronological order.
daily = daily.assign(Date=pd.to_datetime(daily['Date'])).sort_values(by="Date")

In [17]:
# Network-size columns that should carry their last known value forward onto
# days left empty by the left join.
fill_cols = ['Bicycles','Dock_Points','Registration','Sponsor','Fee','Weekly_Fee','Mobike','Ofo','Obike','Urbo']

# DataFrame.ffill() is used because fillna(method='ffill') is deprecated.
daily[fill_cols] = daily[fill_cols].ffill()

### Add strikes and bank holidays

In [18]:
# Flag each day whose date appears in the matching strike column.
for strike_col in ('Strike_Evening', 'Strike_Daytime'):
    daily[strike_col] = daily['Date'].isin(strikes[strike_col])

# Flag bank holidays the same way.
daily['Bank_Hol'] = daily['Date'].isin(bank['Date'])

In [19]:
# True when the day is a bank holiday, a weekend day, or both.
is_bank_hol = daily['Bank_Hol']
is_weekend = daily['Weekend']
daily['Wkend_or_Hol'] = is_bank_hol | is_weekend

In [20]:
# Flag days with a recorded cycling death or injury, plus a combined flag
# that is True when either occurred.
day_stamp = daily['Date']
daily['Death'] = day_stamp.isin(death)
daily['Inj'] = day_stamp.isin(inj)
daily['KSI'] = daily['Death'] | daily['Inj']

### Import daylight hours

In [21]:
# Load the 2017 civil-twilight table.
light = pd.read_csv('../data/civil-twilight-2017.csv')

# Parse both clock times once; the parsed values are used for the
# duration as well as for the time-of-day columns.
dark_at = pd.to_datetime(light['Dark'], format='%H:%M')
light_at = pd.to_datetime(light['Light'], format='%H:%M')

# Keep only the time-of-day part in the original columns.
light['Dark'] = dark_at.dt.time
light['Light'] = light_at.dt.time

# Span between the two times, expressed in hours.
light['Hours_Light'] = (dark_at - light_at).dt.seconds / 3600

# Key the table by day of year so it can be joined to any year's dates.
light['Day_of_Year'] = pd.to_datetime(light['Date']).dt.dayofyear
light = light.drop(columns=['Date'])

In [22]:
# Attach the daylight columns to every row with the same day of year.
daily = pd.merge(daily, light, on='Day_of_Year', how='left')

In [23]:
# Carry each row's KSI flag down to the following row; the first row has no
# predecessor and is left missing.
# NOTE(review): this shifts by row, so it equals "the day before" only if
# daily holds exactly one row per consecutive date -- confirm.
daily = daily.assign(KSI_day_before=daily['KSI'].shift(periods=1))

In [24]:
# Save the combined daily table, leaving out the row index.
daily.to_csv(path_or_buf='daily.csv', index=False)