# https://www.kaggle.com/willieliao/beat-the-benchmark

##  Use the weighted average of train trips to estimate trip duration.

In [2]:
import json
import zipfile
import numpy as np
import pandas as pd

In [3]:
## `N_read`: controls the number of trips read for training
## `N_trips`: controls the number of closest trips used to calculate trip duration
## `P_keep`, `P_pad`: fraction of the closest train-set trips to keep, and how much to pad the travel time based on the polyline

In [4]:
# Tunable settings for the nearest-trip estimator.
# N_read  - number of rows of train.csv to load (passed as nrows)
# N_trips - number of closest training trips considered per test trip
N_read, N_trips = 80000, 100
# P_keep - fraction of those closest trips kept (the rest, with the largest
#          snapshot counts, are dropped)
# P_pad  - multiplier applied to a test trip's own log snapshot count
P_keep, P_pad = 0.95, 1.25

In [7]:
def get_dist(lonlat1, lonlat2):
    """Return the great-circle (haversine) distance between two points.

    Parameters
    ----------
    lonlat1, lonlat2 : sequence
        Index 0 is read as longitude and index 1 as latitude, both in
        degrees. Each component may be a scalar or a NumPy array (the
        arithmetic is done with NumPy ufuncs, so normal broadcasting applies).

    Returns
    -------
    float or numpy.ndarray
        Distance on a sphere of radius 6371 (kilometres for the Earth).
    """
    # Half of the angular differences, in radians (deg * pi/180 / 2).
    lon_diff = np.abs(lonlat1[0]-lonlat2[0])*np.pi/360.0
    lat_diff = np.abs(lonlat1[1]-lonlat2[1])*np.pi/360.0
    a = np.sin(lat_diff)**2 + np.cos(lonlat1[1]*np.pi/180.0) * np.cos(lonlat2[1]*np.pi/180.0) * np.sin(lon_diff)**2
    # Floating-point rounding can push `a` marginally outside [0, 1] for
    # (near-)antipodal points, which would turn sqrt(1-a) into NaN.
    a = np.clip(a, 0.0, 1.0)
    d = 2*6371*np.arctan2(np.sqrt(a), np.sqrt(1-a))
    return d

## test

In [21]:
# Load the test trips and reduce each polyline to the two features used later:
#   snapshots - natural log of the number of points in the polyline
#   lonlat    - the first point of the polyline
# The context managers close the archive and the member file as soon as the
# CSV has been parsed (previously both handles were left open).
with zipfile.ZipFile('all (2).zip') as zf, zf.open('test.csv') as fh:
    test = pd.read_csv(fh, usecols=['TRIP_ID', 'POLYLINE'])
# POLYLINE arrives as a JSON string; decode it into a list of points.
test['POLYLINE'] = test['POLYLINE'].apply(json.loads)
test['snapshots'] = test['POLYLINE'].apply(lambda x: np.log(len(x)))
# NOTE(review): x[0] raises IndexError on an empty polyline - assumes every
# test trip has at least one point; confirm against the data.
test['lonlat'] = test['POLYLINE'].apply(lambda x: x[0])
test.drop('POLYLINE', axis=1, inplace=True)

In [22]:
# Preview the first rows of the prepared test frame (notebook display only).
test.head()

Unnamed: 0,TRIP_ID,snapshots,lonlat
0,T1,2.397895,"[-8.585676, 41.148522]"
1,T2,3.688879,"[-8.610876, 41.14557]"
2,T3,3.688879,"[-8.585739, 41.148558]"
3,T4,2.079442,"[-8.613963, 41.141169]"
4,T5,0.693147,"[-8.619903, 41.148036]"


## train

In [17]:
# Load the first N_read training trips and derive the same features as for
# the test set: snapshots (log of polyline length) and lonlat (first point).
# The context managers close the archive and the member file once parsed.
with zipfile.ZipFile('all (2).zip') as zf, zf.open('train.csv') as fh:
    train = pd.read_csv(fh, usecols=['POLYLINE'], nrows=N_read)
# POLYLINE arrives as a JSON string; decode it into a list of points.
train['POLYLINE'] = train['POLYLINE'].apply(json.loads)
train['snapshots'] = train['POLYLINE'].apply(lambda x: np.log(len(x)))
# Keep only trips with log(points) > 3.0, i.e. at least 21 points. This also
# removes empty polylines before x[0] is taken below. The explicit copy keeps
# the following column assignment from operating on a slice of the original
# frame (pandas SettingWithCopyWarning).
train = train[train.snapshots>3.0].copy()
train['lonlat'] = train['POLYLINE'].apply(lambda x: x[0])
train.drop('POLYLINE', axis=1, inplace=True)

In [19]:
# Preview the first rows of the filtered training frame (notebook display only).
train.head()

Unnamed: 0,snapshots,lonlat
0,3.135494,"[-8.618643, 41.141412]"
2,4.174387,"[-8.612964, 41.140359]"
3,3.7612,"[-8.574678, 41.151951]"
4,3.367296,"[-8.645994, 41.18049]"
5,3.258097,"[-8.615502, 41.140674]"


## 

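Weighted average of trip duration.
Since we use 1/distance^2 as the weight, the distance is bounded below by 10 meters.
The 5% of trips with the longest duration are treated as outliers.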

In [23]:
# For every test trip, estimate its log snapshot count from the N_trips
# closest training trips (by distance between first points):
#   1. find the N_trips nearest training start points,
#   2. drop the neighbours with the largest snapshot values, keeping
#      int(N_trips*P_keep) of them,
#   3. take the inverse-square-distance weighted mean of the kept snapshots,
#   4. never go below P_pad times the test trip's own snapshot value.
#
# The training coordinates and snapshots do not change between iterations, so
# they are converted to arrays once; get_dist is built from NumPy ufuncs and
# therefore evaluates all training distances in a single call.
train_lonlat = np.array(train['lonlat'].tolist(), dtype=float).T  # row 0: lon, row 1: lat
train_snapshots = train['snapshots'].values
n_keep = int(N_trips*P_keep)

estimates = []
for ll, own_snapshots in zip(test['lonlat'], test['snapshots']):
    d = get_dist(train_lonlat, ll)
    # Positions of the N_trips smallest distances (unordered).
    i = np.argpartition(d, N_trips)[0:N_trips]
    # Floor the distance at 0.01 so 1/w**2 stays finite for coincident points.
    w = np.maximum(d[i], 0.01)
    s = train_snapshots[i]
    # Positions (within the neighbours) of the n_keep smallest snapshot values.
    j = np.argpartition(s, n_keep)[0:n_keep]
    estimates.append(np.maximum(P_pad*own_snapshots, np.average(s[j], weights=1/w[j]**2)))

# Assign the whole column at once as float: avoids writing floats into an
# int-initialised column and does not depend on test having a 0..n-1 index.
test['TRAVEL_TIME'] = np.array(estimates, dtype=float)

In [25]:
# Preview the test frame with the TRAVEL_TIME column added (notebook display only).
test.head()

Unnamed: 0,TRIP_ID,snapshots,lonlat,TRAVEL_TIME
0,T1,2.397895,"[-8.585676, 41.148522]",3.818554
1,T2,3.688879,"[-8.610876, 41.14557]",4.611099
2,T3,3.688879,"[-8.585739, 41.148558]",4.611099
3,T4,2.079442,"[-8.613963, 41.141169]",3.63898
4,T5,0.693147,"[-8.619903, 41.148036]",3.779418


In [26]:
# Blend the neighbour-based estimate with a padded floor, both in log space,
# then convert to the submitted TRAVEL_TIME and write the CSV.
# The floor is each trip's own snapshot value, but never less than the mean
# snapshot value over the whole test set.
snapshot_floor = np.maximum(test.snapshots.mean(), test['snapshots'])
blended_log = .5*test['TRAVEL_TIME']+.5*P_pad*snapshot_floor
# NOTE(review): the factor 15 is presumably the number of seconds between
# polyline points - confirm against the dataset description.
# astype(int) truncates the fractional part.
test['TRAVEL_TIME'] = (15*np.exp(blended_log)).astype(int)
submission_columns = ['TRIP_ID', 'TRAVEL_TIME']
test[submission_columns].to_csv('submission.csv', index=False)

In [27]:
# Display the final test frame with the integer TRAVEL_TIME (notebook display only).
test

Unnamed: 0,TRIP_ID,snapshots,lonlat,TRAVEL_TIME
0,T1,2.397895,"[-8.585676, 41.148522]",766
1,T2,3.688879,"[-8.610876, 41.14557]",1508
2,T3,3.688879,"[-8.585739, 41.148558]",1508
3,T4,2.079442,"[-8.613963, 41.141169]",700
4,T5,0.693147,"[-8.619903, 41.148036]",751
5,T6,4.919981,"[-8.630613, 41.178249]",7030
6,T7,3.178054,"[-8.585622, 41.148918]",827
7,T8,2.833213,"[-8.582922, 41.181057]",834
8,T9,3.761200,"[-8.606529, 41.14467]",1651
9,T10,4.369448,"[-8.585658, 41.148576]",3532
