In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import re
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelEncoder, Imputer, StandardScaler
from pandas.api.types import is_string_dtype, is_numeric_dtype
from geopy.distance import vincenty

## Functions

In [2]:
def add_datepart(df, fldname):
    """Expand a datetime column of ``df`` into calendar-part columns, in place.

    The column ``fldname`` is converted to datetime if it is not already.
    Seven integer columns are then added, named ``<prefix>Year``,
    ``<prefix>Month``, ``<prefix>Week``, ``<prefix>Day``,
    ``<prefix>Dayofweek``, ``<prefix>Dayofyear`` and ``<prefix>Hour``, where
    ``<prefix>`` is ``fldname`` with a trailing ``date``/``Date`` removed
    (``'pickup_datetime'`` is left unchanged because it does not end in
    ``date``).

    Args:
        df: DataFrame to modify in place.
        fldname: name of the column holding datetimes (or parseable strings).

    Returns:
        None; ``df`` is mutated.
    """
    fld = df[fldname]
    # Use the pandas dtype check: np.issubdtype() cannot interpret pandas
    # extension dtypes such as tz-aware datetimes.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        # `infer_datetime_format` is deprecated in pandas 2.x, so it is not passed.
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Hour'):
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # `Series.dt.week` was removed in pandas 2.x; isocalendar().week
            # is its replacement. It yields a nullable UInt32 column, so cast
            # back to the plain dtypes `.dt.week` used to produce.
            week = fld.dt.isocalendar().week
            values = week.astype('float64' if week.isna().any() else 'int64')
        else:
            values = getattr(fld.dt, n.lower())
        df[targ_pre + n] = values
        
def train_cats(df):
    """Convert every string-typed column of ``df`` to an ordered categorical, in place.

    Columns for which ``is_string_dtype`` is false are left untouched.
    """
    converted = {
        name: values.astype('category').cat.as_ordered()
        for name, values in df.items()
        if is_string_dtype(values)
    }
    for name, values in converted.items():
        df[name] = values

def apply_cats(df, trn):
    """Re-encode columns of ``df`` with the categories learned on ``trn``, in place.

    For each column of ``df`` that also exists in ``trn`` with a categorical
    dtype, the column is replaced by an ordered ``pd.Categorical`` using
    ``trn``'s category list. Values absent from that list become missing.
    """
    for name, values in df.items():
        if name not in trn.columns:
            continue
        reference = trn[name]
        if reference.dtype.name != 'category':
            continue
        df[name] = pd.Categorical(values, categories=reference.cat.categories, ordered=True)

def fix_missing(df, col, name, na_dict):
    """Fill missing values of a numeric column and record the filler used.

    Non-numeric columns are ignored. A numeric column is processed when it
    has at least one missing value or when ``name`` is already a key of
    ``na_dict``: a boolean ``<name>_na`` indicator column is added, missing
    entries are filled with ``na_dict[name]`` if present (otherwise the
    column median), and the filler is stored in ``na_dict``.

    Returns:
        The same ``na_dict`` object, possibly updated.
    """
    if not is_numeric_dtype(col):
        return na_dict

    missing = pd.isnull(col)
    if missing.sum() or (name in na_dict):
        df[name + '_na'] = missing
        if name in na_dict:
            filler = na_dict[name]
        else:
            filler = col.median()
        df[name] = col.fillna(filler)
        na_dict[name] = filler
    return na_dict

def numericalize(df, col, name, max_n_cat):
    """Replace a categorical column by its integer codes shifted by one, in place.

    Numeric columns are skipped. A non-numeric column is also skipped when
    ``max_n_cat`` is given and the column has at most that many distinct
    values. Otherwise ``df[name]`` becomes ``col.cat.codes + 1``, so missing
    values (code -1) map to 0.
    """
    if is_numeric_dtype(col):
        return
    if max_n_cat is not None and col.nunique() <= max_n_cat:
        return
    df[name] = col.cat.codes + 1

def scale_vars(df, mapper):
    """Standard-scale the numeric columns of ``df`` in place.

    Args:
        df: DataFrame whose numeric columns are overwritten with scaled values.
        mapper: a fitted ``DataFrameMapper`` to reuse, or ``None`` to fit a new
            one (one ``StandardScaler`` per numeric column) on ``df``.

    Returns:
        The mapper that was applied, so the same scaling can be reused on
        another frame.
    """
    # Imported locally: the notebook's import cell brings neither `warnings`
    # nor `sklearn.exceptions` into scope, so the original body raised NameError.
    import warnings
    from sklearn.exceptions import DataConversionWarning

    warnings.filterwarnings('ignore', category=DataConversionWarning)
    if mapper is None:
        map_f = [([n], StandardScaler()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return mapper

def proc_df(df, y_fld, skip_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    """Turn a DataFrame into a model-ready feature frame plus target array.

    Works on a copy of ``df``. Steps, in order: optional sub-sampling,
    optional ``preproc_fn(df)``, target extraction, dropping of ``skip_flds``
    and ``y_fld``, missing-value filling (``fix_missing``), optional scaling
    (``scale_vars``), categorical-to-code conversion (``numericalize``) and
    one-hot encoding of whatever is still non-numeric (``pd.get_dummies``).

    Args:
        df: input frame.
        y_fld: name of the target column.
        skip_flds: extra columns to drop; ``None``/empty means none.
        do_scale: when true, scale numeric columns and append the mapper to
            the result.
        na_dict: fillers from a previous call, or ``None`` to start empty.
        preproc_fn: optional callable applied to the copied frame.
        max_n_cat: passed to ``numericalize``.
        subset: when truthy, ``get_sample(df, subset)`` is applied first.
            NOTE(review): ``get_sample`` is not defined in the visible part
            of this notebook - confirm it exists before passing ``subset``.
        mapper: previously fitted scaler mapper, used only if ``do_scale``.

    Returns:
        ``[features, y, na_dict]`` or, when ``do_scale`` is true,
        ``[features, y, na_dict, mapper]``.
    """
    skip_flds = skip_flds if skip_flds else []
    if subset:
        df = get_sample(df, subset)
    df = df.copy()
    if preproc_fn:
        preproc_fn(df)

    y = df[y_fld].values
    df.drop(skip_flds + [y_fld], axis=1, inplace=True)

    na_dict = {} if na_dict is None else na_dict
    for name, column in df.items():
        na_dict = fix_missing(df, column, name, na_dict)
    if do_scale:
        mapper = scale_vars(df, mapper)
    for name, column in df.items():
        numericalize(df, column, name, max_n_cat)

    res = [pd.get_dummies(df, dummy_na=True), y, na_dict]
    return res + [mapper] if do_scale else res

## Data Exploration and Cleaning

In [67]:
# Directory holding the competition CSV files (relative to the notebook).
PATH = './data/'
# `data` is the labelled training set; `test` is the unlabelled set.
data = pd.read_csv(PATH+'train.csv')
test = pd.read_csv(PATH+'test.csv')

In [68]:
# Summary statistics of the numeric columns, used to spot outliers.
data.describe()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
count,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0
mean,1.53495,1.66453,-73.97349,40.75092,-73.97342,40.7518,959.4923
std,0.4987772,1.314242,0.07090186,0.03288119,0.07064327,0.03589056,5237.432
min,1.0,0.0,-121.9333,34.3597,-121.9333,32.18114,1.0
25%,1.0,1.0,-73.99187,40.73735,-73.99133,40.73588,397.0
50%,2.0,1.0,-73.98174,40.7541,-73.97975,40.75452,662.0
75%,2.0,2.0,-73.96733,40.76836,-73.96301,40.76981,1075.0
max,2.0,9.0,-61.33553,51.88108,-61.33553,43.92103,3526282.0


In [69]:
# Keep only trips strictly longer than one minute and strictly shorter than
# one hour (the thresholds assume trip_duration is expressed in seconds).
duration = data['trip_duration']
data = data[(duration < 60*60) & (duration > 60)]

In [70]:
# Add calendar features (year, month, week, day, day-of-week, day-of-year,
# hour) derived from the pickup timestamp to both frames. The new columns
# are prefixed with 'pickup_datetime', e.g. 'pickup_datetimeHour'.
add_datepart(data,'pickup_datetime')
add_datepart(test,'pickup_datetime')

In [71]:
# Calendar date of the pickup (time of day dropped); used later as the join
# key for the daily weather table.
data['pickup_date'] = data['pickup_datetime'].dt.date
test['pickup_date'] = test['pickup_datetime'].dt.date

In [72]:
# Drop the drop-off time because it is not available in the test data, and
# drop pickup_datetime because its information is already encoded in the
# columns created by add_datepart.
data.drop(columns=['dropoff_datetime', 'pickup_datetime'], inplace=True)
test.drop(columns=['pickup_datetime'], inplace=True)

In [73]:
# Preview the training frame after adding the date parts and dropping the raw timestamps.
data.head()

Unnamed: 0,id,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_datetimeYear,pickup_datetimeMonth,pickup_datetimeWeek,pickup_datetimeDay,pickup_datetimeDayofweek,pickup_datetimeDayofyear,pickup_datetimeHour,pickup_date
0,id2875421,2,1,-73.982155,40.767937,-73.96463,40.765602,N,455,2016,3,11,14,0,74,17,2016-03-14
1,id2377394,1,1,-73.980415,40.738564,-73.999481,40.731152,N,663,2016,6,23,12,6,164,0,2016-06-12
2,id3858529,2,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,2016,1,3,19,1,19,11,2016-01-19
3,id3504673,2,1,-74.01004,40.719971,-74.012268,40.706718,N,429,2016,4,14,6,2,97,19,2016-04-06
4,id2181028,2,1,-73.973053,40.793209,-73.972923,40.78252,N,435,2016,3,12,26,5,86,13,2016-03-26


In [74]:
# Clustering based on latitude and longitude: stack every pickup point and
# every drop-off point of the training data into one (lat, lon) array so a
# single set of clusters covers both ends of a trip.
places = np.vstack((data[['pickup_latitude', 'pickup_longitude']].values,
                    data[['dropoff_latitude', 'dropoff_longitude']].values))

In [75]:
# Fit 100 location clusters on all pickup/drop-off points. The seed is fixed
# so that cluster ids - and every feature derived from them - are identical
# after a kernel restart.
kmeans = MiniBatchKMeans(n_clusters=100, batch_size=10000, random_state=42).fit(places)

In [76]:
# Label both ends of every trip, in the training and test frames, with the
# id of the nearest fitted cluster centre.
for frame in (data, test):
    for end in ('pickup', 'dropoff'):
        coords = frame[[end + '_latitude', end + '_longitude']]
        frame[end + '_cluster'] = kmeans.predict(coords)

In [77]:
# Number of distinct values per column, smallest first, to see which
# columns behave like categories.
data.nunique().sort_values()

pickup_datetimeYear               1
vendor_id                         2
store_and_fwd_flag                2
pickup_datetimeMonth              6
pickup_datetimeDayofweek          7
passenger_count                   9
pickup_datetimeHour              24
pickup_datetimeWeek              27
pickup_datetimeDay               31
dropoff_cluster                 100
pickup_cluster                  100
pickup_datetimeDayofyear        182
pickup_date                     182
trip_duration                  3539
pickup_longitude              22153
dropoff_longitude             32886
pickup_latitude               44561
dropoff_latitude              61678
id                          1437533
dtype: int64

## Feature Engineering

In [78]:
# Columns available before feature engineering.
data.columns

Index(['id', 'vendor_id', 'passenger_count', 'pickup_longitude',
       'pickup_latitude', 'dropoff_longitude', 'dropoff_latitude',
       'store_and_fwd_flag', 'trip_duration', 'pickup_datetimeYear',
       'pickup_datetimeMonth', 'pickup_datetimeWeek', 'pickup_datetimeDay',
       'pickup_datetimeDayofweek', 'pickup_datetimeDayofyear',
       'pickup_datetimeHour', 'pickup_date', 'pickup_cluster',
       'dropoff_cluster'],
      dtype='object')

### Add shortest distance

In [79]:
# Use the Vincenty function from geopy to compute the shortest distance
# between pickup and drop-off. vincenty() returns a geopy Distance object,
# not a number, so `.km` is taken to store a plain float (kilometres);
# otherwise the column ends up with object dtype.
coord_cols = ['pickup_latitude', 'pickup_longitude', 'dropoff_latitude', 'dropoff_longitude']
data['short_distance'] = [
    vincenty((p_lat, p_lon), (d_lat, d_lon)).km
    for p_lat, p_lon, d_lat, d_lon in data[coord_cols].values
]

### Add Weather Data

In [80]:
# Daily weather observations, one row per date.
# Data sourced from https://www.kaggle.com/mathijs/weather-data-in-new-york-city-2016/data
weather = pd.read_csv("./data/weather_data_nyc_centralpark_2016.csv")

In [81]:
# Columns provided by the weather file.
weather.columns

Index(['date', 'maximum temperature', 'minimum temperature',
       'average temperature', 'precipitation', 'snow fall', 'snow depth'],
      dtype='object')

In [82]:
# Reduce the date column to plain `date` objects so it can be matched
# against `pickup_date`.
# NOTE(review): no explicit format/dayfirst is given to to_datetime - confirm
# that the day/month order used in the CSV is parsed as intended.
weather['date'] = pd.to_datetime(weather['date']).dt.date

# These three columns can contain the marker 'T' (presumably "trace" - TODO
# confirm); it is replaced by '0.01' before converting the column to floats.
for measure in ('precipitation', 'snow fall', 'snow depth'):
    without_marker = weather[measure].apply(lambda x: '0.01' if x == 'T' else x)
    weather[measure] = pd.to_numeric(without_marker, downcast='float')

In [83]:
# Attach the weather of the pickup day to every trip (left join keeps trips
# without a weather row), then drop both join keys.
data = (data.merge(weather, how='left', left_on='pickup_date', right_on='date')
            .drop(columns=['pickup_date', 'date']))
test = (test.merge(weather, how='left', left_on='pickup_date', right_on='date')
            .drop(columns=['pickup_date', 'date']))

In [84]:
# Target Encoding
from sklearn.model_selection import KFold

def reg_target_encoding(train, col, splits=5, target='trip_duration'):
    """Add an out-of-fold (regularized) mean-target encoding of ``col`` in place.

    The rows are split into ``splits`` consecutive folds. Each row's encoding
    is the mean of ``target`` per ``col`` value computed on the *other*
    folds, so a row never contributes to its own encoding. Values of ``col``
    unseen in the other folds get the global target mean.

    Args:
        train: training DataFrame; gains the column ``<col>_mean_enc``.
        col: name of the column to encode.
        splits: number of folds.
        target: name of the target column.

    Returns:
        ``train`` (the same object, modified in place).
    """
    kf = KFold(n_splits=splits)
    global_mean = train[target].mean()
    # Collect the encoding by position: KFold yields positional indices, which
    # only coincide with index labels when the frame has a 0..n-1 index.
    encoded = pd.Series(np.nan, index=train.index, dtype='float64')
    for fit_pos, enc_pos in kf.split(train):
        fold_means = train.iloc[fit_pos].groupby(col)[target].mean()
        mapped = train[col].iloc[enc_pos].map(fold_means)
        encoded.iloc[enc_pos] = np.asarray(mapped, dtype='float64')
    # Assign the filled result instead of calling fillna(inplace=True) on a
    # column selection, which does not update the frame under copy-on-write.
    train[col + '_mean_enc'] = encoded.fillna(global_mean)
    return train

def mean_encoding_test(test, train, col, target='trip_duration'):
    """Add a mean-target encoding of ``col`` to held-out data, in place.

    The per-value means of ``target`` are computed on the whole of ``train``
    and mapped onto ``test[col]``; values not present in ``train`` receive the
    global mean of ``target``. The same procedure serves for validation data.

    Args:
        test: DataFrame to encode; gains the column ``<col>_mean_enc``.
        train: DataFrame supplying the target statistics.
        col: name of the column to encode.
        target: name of the target column in ``train``.

    Returns:
        ``test`` (the same object, modified in place).
    """
    global_mean = train[target].mean()
    category_means = train.groupby(col)[target].mean()
    # Assign the filled Series directly; fillna(inplace=True) on a column
    # selection does not update the frame under pandas copy-on-write.
    test[col + '_mean_enc'] = test[col].map(category_means).fillna(global_mean)
    return test

In [85]:
# Hold out 20% of the rows for validation. The seed is fixed so the split
# (and everything computed from it) is reproducible, and both parts get a
# fresh 0..n-1 index without keeping the old one as a column.
train, valid = train_test_split(data, test_size=0.2, random_state=42)
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)

In [86]:
# Columns to mean-encode against trip_duration.
collist = ['vendor_id','passenger_count','pickup_cluster','dropoff_cluster',
           'store_and_fwd_flag','pickup_datetimeMonth','pickup_datetimeDayofweek','pickup_datetimeHour']
for col in collist:
    # Training rows get out-of-fold means; validation rows get the means
    # computed on the full training set. Both calls add '<col>_mean_enc'.
    reg_target_encoding(train,col=col)
    mean_encoding_test(valid,train,col=col)

In [87]:
# Check the new *_mean_enc columns on the training split.
train.head()

Unnamed: 0,id,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_datetimeYear,...,snow fall,snow depth,vendor_id_mean_enc,passenger_count_mean_enc,pickup_cluster_mean_enc,dropoff_cluster_mean_enc,store_and_fwd_flag_mean_enc,pickup_datetimeMonth_mean_enc,pickup_datetimeDayofweek_mean_enc,pickup_datetimeHour_mean_enc
0,id2566971,1,1,-73.980263,40.765717,-73.998695,40.726089,N,1561,2016,...,0.0,0.0,813.046019,807.2688,860.181973,753.718242,815.271357,830.778318,852.826867,897.261977
1,id3246535,1,1,-73.964493,40.792244,-73.945091,40.796654,N,461,2016,...,0.0,0.0,813.046019,807.2688,640.887397,753.409561,815.271357,800.575769,837.088998,865.830773
2,id3560523,2,1,-73.997681,40.691441,-73.976173,40.75898,N,1810,2016,...,0.0,0.0,819.209785,807.2688,924.940666,798.188241,815.271357,830.778318,788.231744,812.756223
3,id3040467,2,6,-73.977577,40.762032,-73.981834,40.768505,N,562,2016,...,0.0,0.0,819.209785,811.890421,860.181973,722.946423,815.271357,830.778318,774.117348,903.982383
4,id0497136,2,1,-73.994583,40.750481,-73.984322,40.75983,N,697,2016,...,0.0,0.0,819.209785,807.2688,836.586746,818.427445,815.271357,830.778318,837.088998,837.040698


In [88]:
# Check the new *_mean_enc columns on the validation split.
valid.head()

Unnamed: 0,id,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_datetimeYear,...,snow fall,snow depth,vendor_id_mean_enc,passenger_count_mean_enc,pickup_cluster_mean_enc,dropoff_cluster_mean_enc,store_and_fwd_flag_mean_enc,pickup_datetimeMonth_mean_enc,pickup_datetimeDayofweek_mean_enc,pickup_datetimeHour_mean_enc
0,id3417922,1,3,-73.954384,40.763775,-73.963005,40.756851,N,294,2016,...,0.0,0.0,813.118056,842.67111,702.047318,699.739098,815.147676,779.317144,773.171467,836.179936
1,id3126615,2,1,-73.989784,40.730267,-73.997543,40.724308,N,387,2016,...,0.0,0.0,818.913103,806.875521,761.653388,802.233728,815.147676,779.317144,853.070317,902.892622
2,id2283587,2,6,-73.977989,40.75491,-73.991493,40.750439,N,412,2016,...,0.0,0.0,818.913103,813.182067,788.984814,800.09654,815.147676,801.437,835.056281,902.892622
3,id1106505,2,1,-73.978287,40.6898,-73.952797,40.608776,N,1837,2016,...,0.0,0.0,818.913103,806.875521,830.77359,1989.157352,815.147676,829.655355,773.171467,732.148503
4,id3024151,1,3,-73.993301,40.747154,-73.998657,40.739922,N,91,2016,...,0.0,0.0,813.118056,842.67111,834.494261,643.157207,815.147676,858.549386,753.773681,836.179936


## Final Preprocessing

In [108]:
# Convert the string columns of the training split to ordered categoricals.
train_cats(train)

In [109]:
# Encode the validation split with the categories learned on the training split.
apply_cats(valid,train)

In [114]:
# proc_df returns [features, target, na_dict] for the training split.
# NOTE(review): 'id' is not passed in skip_flds, so it stays in the features
# as a numericalized category - confirm this is intended.
final_train = proc_df(train,'trip_duration')

In [116]:
# First element is the feature frame, second the target array.
X_train, y_train = final_train[0], final_train[1]