In [150]:
import pandas as pd

In [151]:
import random

filename = "D:/Program/dataset/Taxi Trip Time Prediction/train.csv"

# Number of data records in the file (the header line is excluded).
# The context manager closes the handle instead of leaking it.
with open(filename) as csv_file:
    df_rows = sum(1 for _ in csv_file) - 1

# Sample size: keep 1 record out of every 10000, i.e. ~0.01% of the file
# (not ~10%).
size = int(df_rows / 10000)

# Lines to skip while reading. Line 0 is the header and is never skipped; the
# data records are lines 1..df_rows inclusive, so the range must end at
# df_rows + 1 -- with range(1, df_rows) the last record could never be skipped
# and therefore appeared in every sample.
# A dedicated seeded generator makes the sample reproducible across runs
# without touching the global `random` state.
sampler = random.Random(42)
skip_idx = sorted(sampler.sample(range(1, df_rows + 1), df_rows - size))

df = pd.read_csv(filename, skiprows=skip_idx)
df.head()

Unnamed: 0,TRIP_ID,CALL_TYPE,ORIGIN_CALL,ORIGIN_STAND,TAXI_ID,TIMESTAMP,DAY_TYPE,MISSING_DATA,POLYLINE
0,1373009757620000623,B,,15.0,20000623,1373009757,A,False,"[[-8.585577,41.148909],[-8.585586,41.148909],[..."
1,1372983032620000303,C,,,20000303,1372983032,A,False,"[[-8.631009,41.168646]]"
2,1373170569620000166,C,,,20000166,1373170569,A,False,"[[-8.63082,41.146389],[-8.630568,41.146416],[-..."
3,1373281067620000377,B,,27.0,20000377,1373281067,A,False,"[[-8.608752,41.147685],[-8.60859,41.147658],[-..."
4,1373474907620000686,C,,,20000686,1373474907,A,False,"[[-8.628291,41.157567],[-8.6283,41.157504],[-8..."


In [152]:
# Show the dimensions of the sampled frame as (rows, columns).
df.shape

(171, 9)

In [153]:
# Discard the fields that are not used further on; the taxi id, timestamp and
# polyline columns are kept.
df = df.drop(
    columns=[
        "CALL_TYPE",
        "ORIGIN_CALL",
        "ORIGIN_STAND",
        "TRIP_ID",
        "DAY_TYPE",
        "MISSING_DATA",
    ]
)
df.head()

Unnamed: 0,TAXI_ID,TIMESTAMP,POLYLINE
0,20000623,1373009757,"[[-8.585577,41.148909],[-8.585586,41.148909],[..."
1,20000303,1372983032,"[[-8.631009,41.168646]]"
2,20000166,1373170569,"[[-8.63082,41.146389],[-8.630568,41.146416],[-..."
3,20000377,1373281067,"[[-8.608752,41.147685],[-8.60859,41.147658],[-..."
4,20000686,1373474907,"[[-8.628291,41.157567],[-8.6283,41.157504],[-8..."


In [154]:
import numpy as np
import ast

def caltime(row):
    """Return the travel time implied by the number of points in POLYLINE.

    Each consecutive pair of points counts as 15 time units, so a polyline
    with N points yields (N - 1) * 15.
    NOTE(review): presumably seconds (one GPS sample every 15 s) -- confirm
    against the dataset description.

    Args:
        row: mapping/Series whose "POLYLINE" entry is the string form of a
            list of points, e.g. "[[lon, lat], [lon, lat], ...]".

    Returns:
        int: the travel time; 0 for a polyline with one point or none.
    """
    polylen = len(ast.literal_eval(row["POLYLINE"]))
    # An empty polyline ("[]") would otherwise give a negative duration (-15);
    # clamp the number of intervals at zero.
    intervals = max(polylen - 1, 0)
    return intervals * 15

def snapshot(row):
    """Count the points stored in the row's POLYLINE string.

    The string is parsed as a Python literal and the length of the
    resulting list is returned.
    """
    points = ast.literal_eval(row["POLYLINE"])
    return len(points)

def initial_long(row):
    """Return the longitude of the first POLYLINE point.

    The longitude is element 0 of the first point in the parsed list.
    Raises IndexError when the polyline has no points.
    """
    return ast.literal_eval(row["POLYLINE"])[0][0]

def initial_lat(row):
    """Return the latitude of the first POLYLINE point.

    The latitude is element 1 of the first point in the parsed list.
    Raises IndexError when the polyline has no points.
    """
    return ast.literal_eval(row["POLYLINE"])[0][1]


In [155]:
# Re-base the taxi ids so that the smallest id in the frame becomes 0.
df["TAXI_ID"] = df["TAXI_ID"] - df["TAXI_ID"].min()

# Row-wise features derived from the POLYLINE string: the travel time and the
# number of recorded points.
df["timecost"] = df.apply(caltime, axis="columns")
df["snapshots"] = df.apply(snapshot, axis="columns")

df.head()

Unnamed: 0,TAXI_ID,TIMESTAMP,POLYLINE,timecost,snapshots
0,622,1373009757,"[[-8.585577,41.148909],[-8.585586,41.148909],[...",8445,564
1,302,1372983032,"[[-8.631009,41.168646]]",0,1
2,165,1373170569,"[[-8.63082,41.146389],[-8.630568,41.146416],[-...",615,42
3,376,1373281067,"[[-8.608752,41.147685],[-8.60859,41.147658],[-...",180,13
4,685,1373474907,"[[-8.628291,41.157567],[-8.6283,41.157504],[-8...",1125,76


In [156]:
# Row labels of trips whose polyline holds no points at all; remove them.
is_empty = df["POLYLINE"] == "[]"
empty_poly = df[is_empty].index.tolist()
df = df.drop(index=empty_poly)

# Row labels of trips with fewer than 4 recorded points; remove them as well.
is_short = df["snapshots"] < 4
snap_short = df[is_short].index.tolist()
df = df.drop(index=snap_short)
df.head()

Unnamed: 0,TAXI_ID,TIMESTAMP,POLYLINE,timecost,snapshots
0,622,1373009757,"[[-8.585577,41.148909],[-8.585586,41.148909],[...",8445,564
2,165,1373170569,"[[-8.63082,41.146389],[-8.630568,41.146416],[-...",615,42
3,376,1373281067,"[[-8.608752,41.147685],[-8.60859,41.147658],[-...",180,13
4,685,1373474907,"[[-8.628291,41.157567],[-8.6283,41.157504],[-8...",1125,76
5,483,1373519094,"[[-8.664813,41.168916],[-8.664822,41.168925],[...",570,39


In [160]:
from math import radians, cos, sin, asin, sqrt  
  
def haversine(row):
    """Return the great-circle distance between a trip's first and last point.

    The start point is read from ``row['initial_lon']`` and
    ``row['initial_lat']``; the end point is the last ``[lon, lat]`` pair of
    the list encoded in ``row['POLYLINE']``. All coordinates are in decimal
    degrees.

    Returns:
        float: haversine distance in meters, computed with a mean Earth
        radius of 6371 km.

    Raises:
        IndexError: if the POLYLINE list is empty.
    """
    lon1 = row['initial_lon']
    lat1 = row['initial_lat']
    # Parse the polyline string once and reuse the last point for both
    # coordinates (parsing is the expensive part of this function).
    end_point = ast.literal_eval(row["POLYLINE"])[-1]
    lon2 = end_point[0]
    lat2 = end_point[1]

    # Convert decimal degrees to radians.
    lon1, lat1, lon2, lat2 = map(radians, [lon1, lat1, lon2, lat2])

    # Haversine formula.
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
    # Rounding can push `a` marginally above 1 for near-antipodal points,
    # which would make asin() raise ValueError; clamp its argument to 1.
    c = 2 * asin(min(1.0, sqrt(a)))

    # Mean radius of the Earth, in kilometers.
    r = 6371

    # Kilometers -> meters.
    return c * r * 1000

In [158]:
# Append the start coordinates and the start-to-end haversine distance.
# Keyword order matters: `diff` is computed by haversine(), which reads the
# two initial_* columns created just before it.
df = df.assign(
    initial_lon=lambda trips: trips.apply(initial_long, axis=1),
    initial_lat=lambda trips: trips.apply(initial_lat, axis=1),
    diff=lambda trips: trips.apply(haversine, axis=1),
)

df.head()

Unnamed: 0,TAXI_ID,TIMESTAMP,POLYLINE,timecost,snapshots,initial_lon,initial_lat,diff
0,622,1373009757,"[[-8.585577,41.148909],[-8.585586,41.148909],[...",8445,564,-8.585577,41.148909,105506.921391
2,165,1373170569,"[[-8.63082,41.146389],[-8.630568,41.146416],[-...",615,42,-8.63082,41.146389,6820.272513
3,376,1373281067,"[[-8.608752,41.147685],[-8.60859,41.147658],[-...",180,13,-8.608752,41.147685,1116.813514
4,685,1373474907,"[[-8.628291,41.157567],[-8.6283,41.157504],[-8...",1125,76,-8.628291,41.157567,2513.360965
5,483,1373519094,"[[-8.664813,41.168916],[-8.664822,41.168925],[...",570,39,-8.664813,41.168916,7601.246174


In [159]:
# Move the prediction target to the last column. The target is selected by
# name rather than by position (previously cols[3]), so re-running the cell or
# adding/removing a feature column cannot silently move the wrong column.
y_label = "timecost"
cols = [name for name in df.columns.tolist() if name != y_label]
cols.append(y_label)
df = df[cols]

# The raw polyline string is no longer needed once the features are derived.
df = df.drop(columns=["POLYLINE"])
df.head()

Unnamed: 0,TAXI_ID,TIMESTAMP,snapshots,initial_lon,initial_lat,diff,timecost
0,622,1373009757,564,-8.585577,41.148909,105506.921391,8445
2,165,1373170569,42,-8.63082,41.146389,6820.272513,615
3,376,1373281067,13,-8.608752,41.147685,1116.813514,180
4,685,1373474907,76,-8.628291,41.157567,2513.360965,1125
5,483,1373519094,39,-8.664813,41.168916,7601.246174,570
