In [1]:
# Ignore warnings
import warnings
warnings.filterwarnings('ignore')

# Handle table-like data and matrices
import numpy as np
import pandas as pd
from pandas import DataFrame,Series
import csv
import pprint
from sklearn.cluster import KMeans,FeatureAgglomeration
from sklearn.ensemble import RandomForestRegressor
from sklearn import preprocessing

# Visualisation
import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns

# Others
import math
import datetime

# Configure visualisations
%matplotlib inline
mpl.style.use( 'ggplot' )
sns.set_style( 'white' )
pylab.rcParams[ 'figure.figsize' ] = 16, 8

In [13]:
train_volume = pd.read_csv(
    '../datasets/train/volume(table_6)_training.csv', parse_dates=True, index_col=[0])
test_volume = pd.read_csv(
    '../datasets/test/volume(table_6)_test1.csv', parse_dates=True, index_col=[0])

volume = pd.concat([
    train_volume,
    test_volume
])
volume['counter'] = 1
volume['route'] = 'T' + \
    volume.tollgate_id.map(str) + 'D' + volume.direction.map(str)
del volume['tollgate_id']
del volume['direction']

del train_volume
del test_volume

train_weather = pd.read_csv(
    '../datasets/train/weather_(table_7)_training_update.csv')
test_weather = pd.read_csv(
    '../datasets/test/weather_(table_7)_test1.csv')

weather = pd.concat([
    train_weather,
    test_weather
])
del train_weather
del test_weather

In [14]:
def get_precipitation_level(precipitation):
    if precipitation == 0.0:
        return 'Sunny'
    return 'Rain'


def parse_datetime(x):
    date, hour = x[0], x[1]
    datetime_str = str(date) + '-' + str(hour)
    return pd.to_datetime(datetime_str, format='%Y-%m-%d-%H')

holidays_index = pd.date_range('2016-9-30', '2016-10-9')

def cluster_rule(x):
    vehicle_model, has_etc = x[0], x[1]
    if has_etc == 0 and vehicle_model == 1:
        return 'Cluster1'
    if has_etc == 1 and vehicle_model == 1:
        return 'Cluster2'
    if has_etc == 0 and vehicle_model == 2:
        return 'Cluster3'
#     if has_etc == 1 and vehicle_model == 2:
    return 'Cluster4'
#     return 'Cluster5'


volume['cluster'] = volume.apply(cluster_rule, axis=1)

In [4]:
# vehicle_model_clus = {
#     0: 'Car',
#     1: 'Car',
#     2: 'Truck',
#     3: 'Truck',
#     4: 'Truck',
#     5: 'Truck',
#     6: 'Truck',
#     7: 'Truck',
# }

# volume.vehicle_model = volume.vehicle_model.map(vehicle_model_clus)
# volume.vehicle_type = volume.vehicle_type.fillna(-1)

In [15]:
volume_grouped = volume.groupby(
    ['route', 'cluster']).resample('20min').count()['counter']
volume_grouped = volume_grouped.unstack().fillna(0).astype(int).T
# volume_grouped['Cluster'] = volume_grouped.index.get_level_values(1)
# volume_grouped['Hour'] = volume_grouped.index.get_level_values(0).map(lambda dt: dt.hour)
# volume_grouped['Time'] = volume_grouped.index.get_level_values(0).map(lambda dt: dt.time)
# volume_grouped.index = volume_grouped.index.droplevel(1)
volume_grouped = volume_grouped.between_time('15:00', '18:40')
# volume_grouped = volume_grouped.between_time('6:00', '9:40')

volume_grouped.index = pd.MultiIndex.from_arrays(
    [volume_grouped.index.date, volume_grouped.index.time], names=['date', 'time'])
volume_grouped = volume_grouped.stack().stack().reset_index()
volume_grouped.columns = ['date', 'time', 'cluster', 'route', 'volumn']
volume_grouped = volume_grouped.set_index(
    ['date', 'route', 'cluster', 'time']).sort_index().unstack()
volume_grouped.columns = [str(x)[:-3]
                          for x in volume_grouped.columns.get_level_values(1)]

diff_volume = volume_grouped.T.iloc[:6].diff(1).dropna().T.astype(int)
diff_volume.columns = "diff-" + diff_volume.columns
volume_grouped = volume_grouped.join(diff_volume)
volume_grouped['1-hour-mean'] = volume_grouped.iloc[:, :3].mean(axis=1)
volume_grouped['2-hour-mean'] = volume_grouped.iloc[:, 3:6].mean(axis=1)
volume_grouped['cluster'] = volume_grouped.index.get_level_values(2)
volume_grouped['route'] = volume_grouped.index.get_level_values(1)

cols = volume_grouped.columns.tolist()
cols = cols[:6] + cols[12:] + cols[6:12]

volume_grouped = volume_grouped[cols]
volume_grouped.index = volume_grouped.index.droplevel([1, 2])

non_holidays_index = volume_grouped.index.difference(holidays_index)

non_holidays_volume_grouped = volume_grouped.loc[non_holidays_index]

In [16]:
train_X = non_holidays_volume_grouped.iloc[:-140, :-6]
train_y = non_holidays_volume_grouped.iloc[:-140, -6:]

test_X = non_holidays_volume_grouped.iloc[-140:, :-6]

cluster_le = preprocessing.LabelEncoder()
route_le = preprocessing.LabelEncoder()
train_X['cluster'] = cluster_le.fit(
    train_X['cluster']).transform(train_X['cluster'])
train_X['route'] = route_le.fit(train_X['route']).transform(train_X['route'])
test_X['cluster'] = cluster_le.transform(test_X['cluster'])
test_X['route'] = route_le.transform(test_X['route'])

In [17]:
fr = RandomForestRegressor(n_estimators=100,criterion='mae')
fr.fit(train_X,train_y)

RandomForestRegressor(bootstrap=True, criterion='mae', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [19]:
test_index = pd.MultiIndex.from_product([pd.date_range(
    '2016-10-18', '2016-10-24'), ['T1D0', 'T1D1', 'T2D0', 'T3D0', 'T3D1']])
test_col = pd.date_range('17:00','18:40',freq='20min').map(lambda dt:dt.time)
# test_col = pd.date_range('8:00','9:40',freq='20min').map(lambda dt:dt.time)

In [20]:
result = fr.predict(test_X).reshape((-1,4,6)).sum(axis=1)
predict_result = pd.DataFrame(result, index=test_index, columns=test_col).stack().to_frame(name='volume')
predict_result['tollgate_id'] = predict_result.index.get_level_values(1).map(lambda s:int(s[1]))
predict_result['direction'] = predict_result.index.get_level_values(1).map(lambda s:int(s[3]))
predict_result.index = predict_result.index.droplevel(1)
date = predict_result.index.get_level_values(0).map(lambda dt: (dt.date()))
time = predict_result.index.get_level_values(1)

dt_from = [datetime.datetime.combine(d, t) for d, t in zip(date, time)]
dt_to = [d + datetime.timedelta(minutes=20) for d in dt_from]
time_windows = ['[' + str(d_from) + ',' + str(d_to) + ')' for d_from,d_to in zip(dt_from,dt_to)]

predict_result['time_window'] = time_windows

In [30]:
train_X

Unnamed: 0_level_0,15:00,15:20,15:40,16:00,16:20,16:40,diff-15:20,diff-15:40,diff-16:00,diff-16:20,diff-16:40,1-hour-mean,2-hour-mean,cluster,route
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2016-09-19,30.0,46.0,28.0,28.0,33.0,34.0,16,-18,0,5,1,34.666667,31.666667,0,0
2016-09-19,14.0,6.0,10.0,10.0,9.0,18.0,-8,4,0,-1,9,10.000000,12.333333,1,0
2016-09-19,3.0,3.0,5.0,3.0,2.0,2.0,0,2,-2,-1,0,3.666667,2.333333,2,0
2016-09-19,1.0,2.0,0.0,1.0,2.0,1.0,1,-2,1,1,-1,1.000000,1.333333,3,0
2016-09-19,56.0,71.0,63.0,74.0,86.0,71.0,15,-8,11,12,-15,63.333333,77.000000,0,1
2016-09-19,17.0,17.0,19.0,25.0,23.0,33.0,0,2,6,-2,10,17.666667,27.000000,1,1
2016-09-19,2.0,11.0,6.0,7.0,6.0,6.0,9,-5,1,-1,0,6.333333,6.333333,2,1
2016-09-19,2.0,5.0,4.0,4.0,7.0,6.0,3,-1,0,3,-1,3.666667,5.666667,3,1
2016-09-19,43.0,43.0,48.0,50.0,62.0,67.0,0,5,2,12,5,44.666667,59.666667,0,2
2016-09-19,16.0,15.0,14.0,16.0,23.0,19.0,-1,-1,2,7,-4,15.000000,19.333333,1,2


In [12]:
morning_predict_result = predict_result

In [21]:
afternoon_predict_result= predict_result

In [25]:
result = pd.concat([morning_predict_result, afternoon_predict_result])

In [29]:
result[['tollgate_id', 'time_window', 'direction', 'volume']].to_csv('rf.csv',index=False)