In [3]:
import pandas as pd
import random
import numpy as np
import ast
from math import radians, cos, sin, asin, sqrt  
import time 
from sklearn.svm import SVR

In [4]:
def caltime(row):
    
    # calculate the time cost
    polylen = len(ast.literal_eval(row["POLYLINE"]))
    timecost = (polylen-1)*15
    return timecost

def snapshot(row):
    # get the polyline numbers
    snap = len(ast.literal_eval(row["POLYLINE"]))
    return snap

def initial_long(row):
    # get the initial longitude
    loc = ast.literal_eval(row["POLYLINE"])[0]
    longitude = loc[0]
    return longitude

def initial_lat(row):
    # get the initial latitude
    loc = ast.literal_eval(row["POLYLINE"])[0]
    latitude = loc[1]
    return latitude

def haversine(row): 
    """ 
    Calculate the great circle distance between two points  
    on the earth (specified in decimal degrees) 
    """   
    lon1 = row['initial_lon']
    lat1 = row['initial_lat']
    lon2 = ast.literal_eval(row["POLYLINE"])[-1][0]
    lat2 = ast.literal_eval(row["POLYLINE"])[-1][1]
    
    # change Decimal to radian   
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])  
  
    # haversine
    dlon = lon2 - lon1   
    dlat = lat2 - lat1   
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2  
    c = 2 * asin(sqrt(a))
    
    
    # The average radius of the earth, in kilometers    
    r = 6371
    diff = c * r * 1000  
    
    return diff/1000 

def get_date(row):
    tt = time.gmtime(row["TIMESTAMP"])
    time_data = [tt.tm_mon,tt.tm_mday,tt.tm_hour]
    
    return pd.Series(np.array(time_data, dtype=float))

In [5]:
filename = "D:/Program/dataset/Taxi Trip Time Prediction/train.csv"

#number of records in file (excludes header)
df_rows = sum(1 for line in open(filename)) - 1 

# Sample size - in this case ~10%
size = int(df_rows / 500)

#the 0-indexed header will not be included in the skip list
skip_idx = sorted(random.sample(range(1, df_rows), df_rows - size))

df = pd.read_csv(filename,skiprows=skip_idx)
df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1372664761620000421,B,,42.0,20000421,1372664761,A,False,"[[-8.611146,41.172039],[-8.611155,41.171589],[..."
1,1372668313620000452,A,43951.0,,20000452,1372668313,A,False,"[[-8.561898,41.21793],[-8.562474,41.219073],[-..."
2,1372663907620000015,A,38581.0,,20000015,1372663907,A,False,"[[-8.622639,41.157414],[-8.622657,41.157243],[..."
3,1372680072620000031,B,,52.0,20000031,1372680072,A,False,"[[-8.613216,41.154417],[-8.613207,41.154417],[..."
4,1372682121620000207,B,,36.0,20000207,1372682121,A,False,"[[-8.649387,41.154345],[-8.650008,41.154255],[..."


In [6]:
df.shape

(3421, 9)

In [7]:
df = df.drop(["CALL_TYPE","ORIGIN_CALL","ORIGIN_STAND","TRIP_ID","DAY_TYPE","MISSING_DATA"],axis=1)
# normalize the taxi id
df['TAXI_ID'] -= np.min(df['TAXI_ID']) 

df["timecost"] = df.apply(caltime,axis=1)
df["snapshots"] = df.apply(snapshot,axis=1)

# drop the empty polyline
empty_poly = df.loc[df['POLYLINE'] == '[]'].index.tolist()
df = df.drop(empty_poly)

# drop the snapshot less than the 4
snap_short = df.loc[df['snapshots']<4].index.tolist()
df = df.drop(snap_short)

df['initial_lon'] = df.apply(initial_long,axis=1)
df['initial_lat'] = df.apply(initial_lat,axis=1)
df['diff'] = df.apply(haversine,axis=1)

ds = df.apply(get_date,axis=1)
ds.columns = ["mon","day","hour"]
df = df.join(ds)

# let the y_label to the last column
cols = df.columns.tolist()
y_label = cols[3]
del cols[3]
cols.append(y_label)
df = df[cols]

df = df.drop(["TIMESTAMP"],axis=1)
df = df.drop(["POLYLINE"],axis=1)
df = df.drop(["TAXI_ID"],axis=1)
df.head()

Unnamed: 0,snapshots,initial_lon,initial_lat,diff,mon,day,hour,timecost
0,15,-8.611146,41.172039,0.911422,7.0,1.0,7.0,210
1,92,-8.561898,41.21793,9.262417,7.0,1.0,8.0,1365
2,98,-8.622639,41.157414,7.468086,7.0,1.0,7.0,1455
3,30,-8.613216,41.154417,1.73362,7.0,1.0,12.0,435
4,45,-8.649387,41.154345,3.605933,7.0,1.0,12.0,660


In [8]:
X_train,Y_train = df[df.columns[:-1]],df[df.columns[-1]]
X_train = X_train.drop(["mon","day"],axis=1)

X_train.head()

Unnamed: 0,snapshots,initial_lon,initial_lat,diff,hour
0,15,-8.611146,41.172039,0.911422,7.0
1,92,-8.561898,41.21793,9.262417,8.0
2,98,-8.622639,41.157414,7.468086,7.0
3,30,-8.613216,41.154417,1.73362,12.0
4,45,-8.649387,41.154345,3.605933,12.0


In [9]:
clf = SVR(C=1.0, epsilon=0.2)
clf.fit(X_train, Y_train) 

SVR(C=1.0, cache_size=200, coef0=0.0, degree=3, epsilon=0.2, gamma='auto',
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [10]:
clf.score(X_train, Y_train) 

-0.011940150766795554

In [11]:
filename_test = "D:/Program/dataset/Taxi Trip Time Prediction/test.csv"

df_t = pd.read_csv(filename_test)

df_t = df_t.drop(["CALL_TYPE","ORIGIN_CALL","ORIGIN_STAND","TRIP_ID","DAY_TYPE","MISSING_DATA"],axis=1)
df_t['TAXI_ID'] -= np.min(df_t['TAXI_ID']) 
df_t["timecost"] = df_t.apply(caltime,axis=1)
df_t["snapshots"] = df_t.apply(snapshot,axis=1)*3


df_t['initial_lon'] = df_t.apply(initial_long,axis=1)
df_t['initial_lat'] = df_t.apply(initial_lat,axis=1)
df_t['diff'] = df_t.apply(haversine,axis=1)

dss = df_t.apply(get_date,axis=1)
dss.columns = ["mon","day","hour"]
df_t = df_t.join(dss)

cols = df_t.columns.tolist()
y_label = cols[3]
del cols[3]
cols.append(y_label)
df_t = df_t[cols]

df_t = df_t.drop(["TIMESTAMP"],axis=1)
df_t = df_t.drop(["POLYLINE"],axis=1)
df_t = df_t.drop(["TAXI_ID"],axis=1)

X_test,Y_train = df_t[df.columns[:-1]],df_t[df.columns[-1]]
X_test = X_test.drop(["mon","day"],axis=1)

print(df_t.head())
X_test.head()

   snapshots  initial_lon  initial_lat      diff  mon   day  hour  timecost
0         33    -8.585676    41.148522  0.221328  8.0  14.0  17.0       150
1        120    -8.610876    41.145570  2.140926  8.0  14.0  17.0       585
2        120    -8.585739    41.148558  2.315694  8.0  14.0  17.0       585
3         24    -8.613963    41.141169  0.060305  8.0  14.0  17.0       105
4          6    -8.619903    41.148036  0.000754  8.0  14.0  17.0        15


Unnamed: 0,snapshots,initial_lon,initial_lat,diff,hour
0,33,-8.585676,41.148522,0.221328,17.0
1,120,-8.610876,41.14557,2.140926,17.0
2,120,-8.585739,41.148558,2.315694,17.0
3,24,-8.613963,41.141169,0.060305,17.0
4,6,-8.619903,41.148036,0.000754,17.0


In [14]:
preSVR = clf.predict(X_test)

In [15]:
sub = "D:/Program/dataset/Taxi Trip Time Prediction/test.csv"
submission = pd.read_csv(sub)
submission = submission.drop(submission.columns[1:],axis=1)
submission["TRAVEL_TIME"] = preSVR
submission.to_csv('submission_svm.csv', index=False)