In [32]:
import numpy as np # linear algebra
import pandas as pd # CSV file I/O (e.g. pd.read_csv)
import os # reading the input files we have access to

# Show which input files are available to this notebook.
print(os.listdir('./input'))

# Explicit narrow dtypes (float32 / uint8) for the numeric training columns.
types = dict(
    fare_amount='float32',
    pickup_longitude='float32',
    pickup_latitude='float32',
    dropoff_longitude='float32',
    dropoff_latitude='float32',
    passenger_count='uint8',
)

# Columns to keep; the 'key' column is not listed and is therefore discarded.
cols = [
    'fare_amount',
    'pickup_datetime',
    'pickup_longitude',
    'pickup_latitude',
    'dropoff_longitude',
    'dropoff_latitude',
    'passenger_count',
]

# Read at most the first 30 million rows, parsing the pickup timestamp.
train_df = pd.read_csv(
    './input/train.csv',
    usecols=cols,
    dtype=types,
    parse_dates=["pickup_datetime"],
    infer_datetime_format=True,
    nrows=30_000_000,
)
print(train_df.dtypes)

def add_travel_vector_features(df):
    """Add the absolute pickup-to-dropoff coordinate differences to ``df``.

    Two new columns, 'abs_diff_longitude' and 'abs_diff_latitude', are
    written onto ``df`` in place. Together they represent the
    "Manhattan vector" from the pickup location to the dropoff location.

    Returns None; the caller's DataFrame is modified directly.
    """
    for axis in ('longitude', 'latitude'):
        delta = df['dropoff_' + axis] - df['pickup_' + axis]
        df['abs_diff_' + axis] = delta.abs()

# Derive the travel-vector features, then discard the raw coordinates
# they were computed from.
add_travel_vector_features(train_df)
for raw_col in ('pickup_longitude', 'pickup_latitude',
                'dropoff_longitude', 'dropoff_latitude'):
    del train_df[raw_col]

# Inspect the resulting schema and the first few rows.
print(train_df.dtypes)
print(train_df.head(5))

#print(train_df.isnull().sum())
# Remove every row that has a missing value in any column, reporting the
# row count before and after.
print('Old size: %d' % train_df.shape[0])
train_df = train_df.dropna(axis = 'rows', how = 'any')
print('New size: %d' % train_df.shape[0])

#plot = train_df.iloc[:2000].plot.scatter('abs_diff_longitude', 'abs_diff_latitude')
# Keep only trips whose coordinate differences are both below 5.0 degrees.
print('Old size: %d' % train_df.shape[0])
train_df = train_df[train_df['abs_diff_longitude'].lt(5.0) & train_df['abs_diff_latitude'].lt(5.0)]
print('New size: %d' % train_df.shape[0])

def add_hour_features(df):
    """Add a 'time_hour' column holding the hour of day of each pickup.

    Reads ``df['pickup_datetime']``, which must be a datetime-typed column,
    and writes the hour (0-23) onto ``df`` in place as 'time_hour'.

    Returns None; the caller's DataFrame is modified directly.
    """
    # Use the vectorised .dt accessor rather than a per-row Python lambda:
    # same values, but no Python-level call for each of the rows.
    df['time_hour'] = df['pickup_datetime'].dt.hour

# Extract the pickup hour, then drop the raw timestamp column: the input
# matrix built below only reads 'time_hour', not 'pickup_datetime'.
add_hour_features(train_df)
del train_df['pickup_datetime']
# Show the remaining columns and a sample of rows after the change.
print(train_df.dtypes)
print(train_df.head(5))

def get_input_matrix(df):
    """Build the 2-D model input array from the feature columns of ``df``.

    Columns of the result, in order: 'abs_diff_longitude',
    'abs_diff_latitude', 'passenger_count', 'time_hour', and a trailing
    column of ones. The array has one row per row of ``df``.
    """
    feature_names = ('abs_diff_longitude', 'abs_diff_latitude',
                     'passenger_count', 'time_hour')
    stacked = [df[name] for name in feature_names]
    # Constant column appended after the real features.
    stacked.append(np.ones(len(df)))
    return np.column_stack(stacked)
    
# Build the model inputs (feature matrix) and targets (fares) as NumPy
# arrays from the cleaned training frame.
train_X = get_input_matrix(train_df)
train_y = np.array(train_df['fare_amount'])

# Sanity check: both shapes should report the same number of rows.
print(train_X.shape)
print(train_y.shape)

# The DataFrame is not used again after this point; drop the reference
# and run a garbage collection pass before fitting.
del train_df
import gc
gc.collect()


['train.csv', 'sample_submission.csv', 'test.csv', 'GCP-Coupons-Instructions.rtf']
fare_amount                 float32
pickup_datetime      datetime64[ns]
pickup_longitude            float32
pickup_latitude             float32
dropoff_longitude           float32
dropoff_latitude            float32
passenger_count               uint8
dtype: object
fare_amount                  float32
pickup_datetime       datetime64[ns]
passenger_count                uint8
abs_diff_longitude           float32
abs_diff_latitude            float32
dtype: object
   fare_amount     pickup_datetime  passenger_count  abs_diff_longitude  \
0          4.5 2009-06-15 17:26:21                1            0.002701   
1         16.9 2010-01-05 16:52:16                1            0.036774   
2          5.7 2011-08-18 00:35:00                2            0.008507   
3          7.7 2012-04-21 04:30:42                1            0.004440   
4          5.3 2010-03-09 07:51:00                1            0.011436   

 

14

In [33]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
# Two-step pipeline: degree-3 polynomial feature expansion ('poly') followed
# by a linear regression fitted without its own intercept ('linear').
# NOTE(review): get_input_matrix already appends a constant column of ones,
# and the printed coefficients contain repeated values, which suggests the
# expansion produces duplicated columns — confirm whether the ones column
# (or the expansion's bias term) is still wanted.
model = Pipeline([('poly', PolynomialFeatures(degree=3)),('linear', LinearRegression(fit_intercept=False))])
model = model.fit(train_X,train_y)
# Print the fitted coefficients of the linear step.
print(model.named_steps['linear'].coef_)
# The training arrays are not used after fitting; drop the references and
# run a garbage collection pass.
del train_X
del train_y
import gc
gc.collect()

[ 9.43681884e-01  5.66752344e+01  5.54023569e+01  2.16829344e-02
 -9.21165676e-03  9.43681884e-01 -7.97896411e+01 -4.45102985e+01
  2.01488710e+00  3.92881423e+00  5.66752344e+01 -6.16724220e+01
 -2.53323756e+00 -2.86002733e+00  5.54023569e+01 -1.45910559e-02
  7.84392297e-03  2.16829331e-02  5.78150727e-03 -9.21165761e-03
  9.43681885e-01  2.53064338e+01  1.52014481e+01  9.75278944e-01
  6.88114841e-02 -7.97896411e+01  1.41203987e+01 -6.68421928e+00
 -4.37824337e-01 -4.45102985e+01 -3.62458158e-01 -5.89287188e-02
  2.01488710e+00 -3.24520607e-01  3.92881423e+00  5.66752344e+01
  2.14638681e+01  4.88714883e-01 -2.10470963e-01 -6.16724220e+01
  1.29878295e+00 -1.40994825e-02 -2.53323755e+00  2.53222366e-01
 -2.86002733e+00  5.54023569e+01  1.38517167e-04 -2.13925158e-05
 -1.45910560e-02 -6.22942750e-04  7.84392306e-03  2.16829331e-02
 -4.72819371e-04  5.78151041e-03 -9.21165740e-03  9.43681885e-01]


0

In [34]:
# Load the test set. No dtype/usecols restrictions are passed here, so the
# 'key' column is kept (it is needed for the submission file).
test_df = pd.read_csv('./input/test.csv',parse_dates=["pickup_datetime"], infer_datetime_format=True)
print(test_df.dtypes)

# Reuse the above helper functions to add our features and generate the input matrix.
# Unlike the training frame, no rows are dropped or filtered here.
add_travel_vector_features(test_df)
add_hour_features(test_df)

test_X = get_input_matrix(test_df)
# Predict fare_amount on the test set using the pipeline (`model`) fitted on the training set.
test_y_predictions = np.array(model.predict(test_X))

# Write the predictions to a CSV file which we can submit to the competition.
# NOTE(review): the file name says "30k" while the training read uses
# nrows = 30_000_000 — presumably "30M" was meant; confirm before renaming.
submission = pd.DataFrame(
    {'key': test_df.key, 'fare_amount': test_y_predictions},
    columns = ['key', 'fare_amount'])
submission.to_csv('submission26_deg3_30k.csv', index = False)

# List the working directory to confirm the submission file was written.
print(os.listdir('.'))

key                          object
pickup_datetime      datetime64[ns]
pickup_longitude            float64
pickup_latitude             float64
dropoff_longitude           float64
dropoff_latitude            float64
passenger_count               int64
dtype: object
['.ipynb_checkpoints', 'submission26_deg3_30k.csv', 'train.csv.zip', 'New York City Taxi Fare Prediction.ipynb_v26.ipynb', 'input', 'submission26_deg3_20k.csv']


## Ideas for Improvement
The output here will score an RMSE of $5.74, but you can do better than that!  Here are some suggestions:

* Use more columns from the input data.  Here we're only using the absolute differences between the start/end GPS points from columns `[pickup|dropoff]_[latitude|longitude]`, plus `passenger_count` and the pickup hour taken from `pickup_datetime`.  Try to see if other parts of `pickup_datetime` -- such as the day of the week or the year -- can help improve your results.
* Use absolute location data rather than relative.  Here we're only looking at the difference between the start and end points, but maybe the actual values -- indicating where in NYC the taxi is traveling -- would be useful.
* Use a non-linear model to capture more intricacies within the data.
* Try to find more outliers to prune, or construct useful feature crosses.
* Use the entire dataset -- here we're only reading the first 30 million rows of the training data (`nrows = 30_000_000`)!

Special thanks to Dan Becker, Will Cukierski, and Julia Elliot for reviewing this Kernel and providing suggestions!