In [1]:
%matplotlib inline

from datetime import datetime

import numpy as np
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point
from shapely.geometry import Polygon
from geopy.distance import vincenty
from sklearn import cross_validation
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error

# Notebook-wide display configuration: draw every figure with matplotlib's
# 'ggplot' stylesheet and let pandas show up to 100 columns of a frame.
plt.style.use('ggplot')
pd.set_option('display.max_columns', 100)

ImportError: No module named 'geopandas'

In [None]:
# Install a blanket "ignore" filter so no warning is shown by later cells.
# NOTE(review): this also hides deprecation warnings and pandas
# chained-assignment warnings; consider narrowing the filter to specific
# categories once the notebook is stable.
import warnings
warnings.simplefilter(action = "ignore")

In [None]:
def _within_flags(xs, ys, polygon):
    """Return an int array holding 1 where Point((x, y)) is within ``polygon``, else 0."""
    return np.array([Point((x, y)).within(polygon) for x, y in zip(xs, ys)],
                    dtype=int)


def feature_creation(df, sfo_polygon=None):
    """Return a copy of ``df`` with the engineered trip features added.

    The input frame is not modified. It must provide the columns
    ``departure``, ``arrival``, ``dist``, ``depx``, ``depy``, ``arrx`` and
    ``arry``. ``depx``/``arrx`` are used as x and ``depy``/``arry`` as y
    coordinates; the y value is passed first to ``vincenty``, i.e. it is
    treated as the latitude.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw trips.
    sfo_polygon : shapely Polygon, optional
        Area used for the ``dep_sfo`` / ``arr_sfo`` flags. When omitted, the
        module-level ``sfo`` polygon is looked up at call time, which is the
        original behaviour.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which ``departure`` and ``arrival`` are parsed to
        datetimes and these columns are added:

        * ``dep_hour``, ``dep_dow`` -- departure hour and weekday (Monday=0)
        * ``dep_weekend`` -- 1 if the departure is on a Saturday or Sunday
        * ``dep_peak`` -- 1 if the departure hour is 18-23 or 0-2
        * ``dep_elapsed`` -- seconds between 2012-09-01 and the departure
        * ``trip_time`` -- seconds between departure and arrival
        * ``dist_sq``, ``dist_cu``, ``dist_hour`` -- dist^2, dist^3, dist*hour
        * ``dist_geopy`` -- departure-to-arrival distance in miles
        * ``dep_sfo``, ``arr_sfo``, ``sfo`` -- endpoint-in-polygon flags and
          their sum (0, 1 or 2)
    """
    if sfo_polygon is None:
        # Resolved here rather than in the signature because the notebook
        # defines `sfo` in a cell that comes after this function.
        sfo_polygon = sfo

    df = df.copy()

    for col in ('departure', 'arrival'):
        df[col] = pd.to_datetime(df[col])

    # Departure calendar features, via the vectorized .dt accessor.
    df['dep_hour'] = df.departure.dt.hour
    df['dep_dow'] = df.departure.dt.weekday
    df['dep_weekend'] = df.dep_dow.isin([5, 6]).astype(int)
    df['dep_peak'] = df.dep_hour.isin([18, 19, 20, 21, 22, 23, 0, 1, 2]).astype(int)

    # Durations in seconds. A scalar Timestamp broadcasts over the column,
    # so no per-row array of identical timestamps is needed.
    one_second = np.timedelta64(1, 's')
    epoch = pd.Timestamp('2012-09-01')
    df['dep_elapsed'] = ((df.departure - epoch) / one_second).astype(float)
    df['trip_time'] = ((df.arrival - df.departure) / one_second).astype(float)

    # Polynomial and interaction terms on the routed distance.
    df['dist_sq'] = df.dist ** 2
    df['dist_cu'] = df.dist ** 3
    df['dist_hour'] = df.dist * df.dep_hour

    # Point-to-point distance in miles; vincenty expects (latitude, longitude).
    # NOTE(review): geopy 2.0 removed `vincenty`; `geopy.distance.geodesic`
    # is its replacement if the environment is upgraded.
    df['dist_geopy'] = [
        vincenty((dep_y, dep_x), (arr_y, arr_x)).miles
        for dep_y, dep_x, arr_y, arr_x
        in zip(df.depy, df.depx, df.arry, df.arrx)
    ]

    # Airport flags; shapely points are (x, y).
    df['dep_sfo'] = _within_flags(df.depx, df.depy, sfo_polygon)
    df['arr_sfo'] = _within_flags(df.arrx, df.arry, sfo_polygon)
    df['sfo'] = df['dep_sfo'] + df['arr_sfo']

    return df

## Data

### SFO Polygon

In [None]:
# Outline of the SFO area as (x, y) vertices -- presumably (longitude,
# latitude), given the magnitudes. The last vertex is close to, but not
# identical with, the first one, so the ring is not closed explicitly here.
sfo = Polygon([
    (-122.390356, 37.623409),
    (-122.378511, 37.618311),
    (-122.384777, 37.608928),
    (-122.394991, 37.613416),
    (-122.395163, 37.615455),
    (-122.397566, 37.618583),
    (-122.393703, 37.620078),
    (-122.390442, 37.623274),
])

### Training Data

In [None]:
# Load the raw training trips; the path is relative to the working directory.
training = pd.read_csv('data/Taxi_Train_Routed.csv')

In [None]:
# Rebind `training` to the feature-engineered copy returned by
# feature_creation (the function copies its input before adding columns).
training = feature_creation(training)

### Split the Training Data

In [None]:
# Hold out 20% of the engineered training frame as a dev set. The split is
# seeded so the RMSE values below can be compared from run to run (the
# hard-coded "Best" numbers are meaningless across different random splits).
# Each part is copied because later cells assign new columns into `dev`.
# NOTE(review): `sklearn.cross_validation` was removed in scikit-learn 0.20;
# `sklearn.model_selection.train_test_split` is the replacement.
train, dev = [
    part.copy()
    for part in cross_validation.train_test_split(
        training, test_size=0.2, random_state=8675309)
]

## Training

### Predict Time

In [None]:
# Gradient-boosted regressor used to predict trip time: many shallow trees
# combined with a small learning rate.
gbr_time = GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=4,
    min_samples_leaf=5,
)

In [None]:
# Input columns of the trip-time model. dep_hour, dist_hour, sfo and
# dep_elapsed are built by feature_creation; num, deptaz and arrtaz are not,
# so they are presumably columns of the input CSV -- verify.
tf = ['num', 'dist', 'dep_hour', 'dist_hour', 'sfo', 'dep_elapsed', 'deptaz', 'arrtaz']

In [None]:
# Feature matrix for the trip-time model, taken from the training split as a
# plain NumPy array.
time_train = train[tf].values

In [None]:
# Regression target: trip duration in seconds (arrival - departure, as
# computed by feature_creation).
time_label = train['trip_time'].values

In [None]:
# Fit the trip-time model on the training split.
gbr_time.fit(time_train, time_label)

In [None]:
# Same feature columns as time_train, taken from the held-out dev split.
time_dev = dev[tf].values

In [None]:
# Predicted trip times (seconds) for the dev split.
times = gbr_time.predict(time_dev)

In [None]:
# Predicted vs. observed trip time on the dev split; points on the blue
# diagonal are perfect predictions. Both axes are clipped to 0-4000 seconds.
fig, ax = plt.subplots()
ax.scatter(dev.trip_time, times, s=50, alpha=0.1, color='DimGray')
ax.plot([0, 4000], [0, 4000], color='#348ABD', linewidth=2)
ax.set_xlim(0, 4000)
ax.set_ylim(0, 4000)
ax.set_xlabel('Observed trip time (s)')
ax.set_ylabel('Predicted trip time (s)')
ax.set_title('Trip time: predicted vs. observed (dev split)');

In [None]:
# Reference value to compare the RMSE below against (presumably the best dev
# RMSE recorded so far). Written as a function call with an explicit format
# so the cell prints the same text on Python 2 and runs on Python 3.
print('Best: {}'.format(241.25566775514835))

In [None]:
# Root-mean-squared error (seconds) of the trip-time predictions on the dev
# split. This has to run before dev['trip_time'] is overwritten with the
# predictions further down, otherwise it compares `times` with itself.
mean_squared_error(dev['trip_time'], times) ** 0.5

In [None]:
# Replace the observed trip time in the dev split with the model's
# prediction, so the fare model (which takes 'trip_time' as a feature) is
# evaluated on predicted times.
# NOTE(review): this discards the ground truth held in `dev`; re-running the
# trip-time RMSE or scatter cell after this one is no longer meaningful.
dev['trip_time'] = times

### Predict Fare

In [None]:
# Gradient-boosted regressor used to predict the fare; same hyper-parameters
# as the trip-time model.
gbr = GradientBoostingRegressor(
    n_estimators=1000,
    learning_rate=0.01,
    max_depth=4,
    min_samples_leaf=5,
)

In [None]:
# Input columns of the fare model. trip_time, dep_hour, dist_hour, dist_sq,
# dist_cu and sfo come from feature_creation; distcost, timecost, deptaz and
# arrtaz are not built there, so presumably they are CSV columns -- verify.
features = ['dist', 'distcost', 'timecost', 'trip_time', 'dep_hour',
            'dist_hour', 'dist_sq', 'dist_cu', 'sfo', 'deptaz', 'arrtaz']

In [None]:
# Training inputs and target for the fare model. `train` still holds the
# observed trip_time here; only `dev` had it replaced by predictions.
features_train = train[features].values
label_train = train.fare.values

In [None]:
# Fit the fare model on the training split.
gbr.fit(features_train, label_train)

In [None]:
# Fare-model inputs for the dev split; its trip_time column holds the
# predicted trip time at this point.
features_dev = dev[features].values

In [None]:
# Store the predicted fare next to the observed `fare` column of the dev split.
dev['fare_predicted'] = gbr.predict(features_dev)

In [None]:
# Predicted vs. observed fare on the dev split; points on the blue diagonal
# are perfect predictions. Both axes are clipped to the range -5 to 200.
fig, ax = plt.subplots(figsize=(8, 8))

ax.scatter(dev['fare'], dev['fare_predicted'], s=35, alpha=0.1, color='DimGray')
ax.plot([0, 200], [0, 200], color='#348ABD', linewidth=2)

ax.set_xlim(-5, 200)
ax.set_ylim(-5, 200)
ax.set_xlabel('Observed fare')
ax.set_ylabel('Predicted fare')
ax.set_title('Fare: predicted vs. observed (dev split)');

In [None]:
# Reference value to compare the RMSE below against (presumably the best dev
# RMSE recorded so far). Written as a function call with an explicit format
# so the cell prints the same text on Python 2 and runs on Python 3.
print('Best: {}'.format(4.3942468777917876))

In [None]:
# Root-mean-squared error of the fare predictions on the dev split.
mean_squared_error(dev['fare'], dev['fare_predicted']) ** 0.5

## Testing

In [None]:
# Load the query trips to be scored; the path is relative to the working
# directory.
test = pd.read_csv('data/Taxi_Query_Routed.csv')

In [None]:
# Apply the same feature engineering as for the training data.
# feature_creation reads both 'departure' and 'arrival', so the query file
# must contain those columns too.
test = feature_creation(test)

In [None]:
# Trip-time model inputs for the query trips.
time_test = test[tf].values

In [None]:
# Overwrite the trip_time column produced by feature_creation with the
# model's prediction, mirroring what was done to the dev split.
test['trip_time'] = gbr_time.predict(time_test)

In [None]:
# Fare-model inputs for the query trips (trip_time is the predicted value).
features_test = test[features].values

In [None]:
# Predicted fare for every query trip.
test['fare'] = gbr.predict(features_test)

In [None]:
# Two-column result frame: trip id and predicted fare, renamed to the
# capitalised headers ID / Fare.
output = test[['id', 'fare']].rename(columns={'id': 'ID', 'fare': 'Fare'})

**TODO:** Round time?

In [None]:
# Write the predictions without the index column. The data/results/
# directory is not created here, so it must already exist.
output.to_csv('data/results/_js00.csv', index=False)