# Building a model to predict delivery duration

In [85]:
import pandas as pd
from collections import Counter

In [78]:
# Read the raw historical data from disk and report its (rows, columns) shape.
csv_path = "historical_data.csv"
df = pd.read_csv(csv_path)
print("Data shape:", df.shape)

Data shape: (197428, 9)


In [79]:
# Preview the first five rows to sanity-check column names and value formats.
df.head(n=5)

Unnamed: 0,market_id,created_at,actual_delivery_time,store_id,subtotal,total_onshift_dashers,total_busy_dashers,total_outstanding_orders,estimated_store_to_consumer_driving_duration
0,1.0,2015-02-06 22:24:17,2015-02-06 23:27:16,1845,3441,33.0,14.0,21.0,861.0
1,2.0,2015-02-10 21:49:25,2015-02-10 22:56:29,5477,1900,1.0,2.0,2.0,690.0
2,3.0,2015-01-22 20:39:28,2015-01-22 21:09:09,5477,1900,1.0,0.0,0.0,690.0
3,3.0,2015-02-03 21:21:45,2015-02-03 22:13:00,5477,6900,1.0,1.0,2.0,289.0
4,3.0,2015-02-15 02:40:36,2015-02-15 03:20:26,5477,3900,6.0,6.0,9.0,650.0


### Discard rows with missing creation or delivery times

In [80]:
# Keep only the rows that have both a creation and a delivery timestamp.
# The explicit .copy() makes `df` an independent frame rather than a filtered
# slice, so the column assignments in later cells (market_id, driving duration)
# do not raise SettingWithCopyWarning.
has_both_timestamps = df.created_at.notna() & df.actual_delivery_time.notna()
df = df[has_both_timestamps].copy()
print("Valid rows:", df.shape[0])

Valid rows: 197421


### Missing market features

In [81]:
# Build one null mask over the three market feature columns and reuse it.
market_columns = ["total_onshift_dashers", "total_busy_dashers", "total_outstanding_orders"]
market_missing = df[market_columns].isnull()

# Rows where at least one market feature is missing.
no_any_market = df[market_missing.any(axis=1)]
print("Any market features is NaN:", no_any_market.shape[0])

# Rows where every market feature is missing.
no_all_market = df[market_missing.all(axis=1)]
print("All market features is NaN:", no_all_market.shape[0])

Any market features is NaN: 16262
All market features is NaN: 16262


### market_id

In [87]:
# Inspect market_id, then recode missing values as 0 and store the ids as integers.
print("NaN count:", len(df[df["market_id"].isnull()]))
print("Unique:", df["market_id"].unique())

print("Replace NaN by 0")
df["market_id"] = df["market_id"].fillna(0)

print("Transform to int")
df["market_id"] = df["market_id"].astype(int)
print("Unique:", df["market_id"].unique())

print("Distribution:")
# Last expression of the cell: displays the number of rows per market_id value.
Counter(df["market_id"])

NaN count: 0
Unique: [1 2 3 4 0 5 6]
Replace NaN by 0
Transform to int
Unique: [1 2 3 4 0 5 6]
Distribution:


Counter({1: 38037, 2: 55055, 3: 23296, 4: 47597, 0: 987, 5: 17999, 6: 14450})

### estimated_store_to_consumer_driving_duration

In [74]:
# Profile the estimated store-to-consumer driving duration column.
driving_duration = df["estimated_store_to_consumer_driving_duration"]

print("NaN count:", len(df[driving_duration.isnull()]))
print("Unique count:", len(driving_duration.unique()))

max_driving = driving_duration.max()
print("Max:", max_driving)

min_driving = driving_duration.min()
print("Min:", min_driving)
# Number of rows that sit exactly at the minimum value.
print("Min count:", len(df[driving_duration == min_driving]))

# Store the column explicitly as floating point.
df["estimated_store_to_consumer_driving_duration"] = driving_duration.astype(float)

NaN count: 526
Unique count: 1337
Max: 2088.0
Min: 0.0
Min count: 9


In [66]:
# Count the distinct driving-duration values (unique() keeps NaN as its own value).
len(df["estimated_store_to_consumer_driving_duration"].unique())

1337

In [48]:
# Report how many rows are missing each of the remaining feature columns.
for column_name in ("store_id", "subtotal", "estimated_store_to_consumer_driving_duration"):
    print(f"No {column_name}:", len(df[df[column_name].isnull()]))

No market_id: 987
No store_id: 0
No subtotal: 0
No estimated_store_to_consumer_driving_duration: 526


In [45]:
# Show the distinct market_id values currently present in the frame.
df["market_id"].unique()

array([ 1.,  2.,  3.,  4., nan,  5.,  6.])

In [None]:
# Cast the driving-duration estimate to floating point.
driving_col = "estimated_store_to_consumer_driving_duration"
df[driving_col] = df[driving_col].astype(float)