In [107]:
import numpy as np
import pandas as pd
from datetime import datetime
import re
from sklearn.cluster import MiniBatchKMeans
from sklearn.model_selection import train_test_split
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import LabelEncoder, Imputer, StandardScaler
from pandas.api.types import is_string_dtype, is_numeric_dtype

## Functions

In [2]:
def add_datepart(df, fldname):
    """Expand a date column of ``df`` into integer date-part columns, in place.

    The column ``fldname`` is converted with ``pd.to_datetime`` when it is not
    already a datetime dtype (the converted column is written back to ``df``).
    One new column is then added for each of Year, Month, Week, Day,
    Dayofweek, Dayofyear and Hour. New columns are named ``<prefix><Part>``
    where ``<prefix>`` is ``fldname`` with a trailing 'Date'/'date' removed,
    e.g. 'saleDate' -> 'saleYear'. A name such as 'pickup_datetime' does not
    end in 'date', so it is kept whole -> 'pickup_datetimeYear'.

    Args:
        df: DataFrame to modify in place.
        fldname: Name of the column holding the dates.

    Returns:
        None. The source column itself is not dropped.
    """
    fld = df[fldname]
    # is_datetime64_any_dtype also recognises tz-aware columns, for which
    # np.issubdtype(dtype, np.datetime64) raises TypeError.
    if not pd.api.types.is_datetime64_any_dtype(fld):
        # infer_datetime_format is deprecated in recent pandas (format
        # inference is the default there), so it is no longer passed.
        df[fldname] = fld = pd.to_datetime(fld)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    for n in ('Year', 'Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear', 'Hour'):
        if n == 'Week' and hasattr(fld.dt, 'isocalendar'):
            # Series.dt.week was removed in pandas 2.0; isocalendar().week is
            # the replacement. Cast so the dtype matches the other parts.
            part = fld.dt.isocalendar().week.astype(fld.dt.day.dtype)
        else:
            part = getattr(fld.dt, n.lower())
        df[targ_pre + n] = part
        
def train_cats(df):
    """Turn every string-typed column of ``df`` into an ordered categorical.

    The conversion happens in place; columns that are not detected as
    string dtype are left untouched. Returns None.
    """
    text_columns = [name for name, series in df.items() if is_string_dtype(series)]
    for name in text_columns:
        as_category = df[name].astype('category')
        df[name] = as_category.cat.as_ordered()

def apply_cats(df, trn):
    """Re-encode columns of ``df`` with the categories already used in ``trn``.

    For each column of ``df`` that also exists in ``trn`` with dtype
    'category', the column is replaced (in place) by an ordered Categorical
    built with ``trn``'s category list. Other columns are not changed.
    Returns None.
    """
    for name in list(df.columns):
        if name not in trn.columns:
            continue
        reference = trn[name]
        if reference.dtype.name != 'category':
            continue
        df[name] = pd.Categorical(df[name],
                                  categories=reference.cat.categories,
                                  ordered=True)

def fix_missing(df, col, name, na_dict):
    """Fill missing values of one numeric column and record the fill value.

    Nothing happens for non-numeric columns, or for numeric columns that have
    no nulls and no entry in ``na_dict``. Otherwise a boolean ``<name>_na``
    indicator column is added to ``df``, the nulls in ``df[name]`` are filled
    with ``na_dict[name]`` when present (else the column median), and the
    value used is stored in ``na_dict``.

    Args:
        df: DataFrame that is modified in place.
        col: The column's Series.
        name: The column's name in ``df``.
        na_dict: Mapping of column name -> fill value; updated in place.

    Returns:
        The same ``na_dict`` object.
    """
    if not is_numeric_dtype(col):
        return na_dict

    null_mask = pd.isnull(col)
    known = name in na_dict
    if not (null_mask.sum() or known):
        return na_dict

    df[name + '_na'] = null_mask
    if known:
        fill_value = na_dict[name]
    else:
        fill_value = col.median()
    df[name] = col.fillna(fill_value)
    na_dict[name] = fill_value
    return na_dict

def numericalize(df, col, name, max_n_cat):
    """Replace a categorical column of ``df`` by its integer codes plus one.

    Numeric columns are skipped. When ``max_n_cat`` is given, non-numeric
    columns with at most ``max_n_cat`` distinct values are skipped as well.
    The +1 shifts the code used for missing values (-1) to 0.
    Modifies ``df`` in place and returns None.
    """
    if is_numeric_dtype(col):
        return
    if max_n_cat is not None and not col.nunique() > max_n_cat:
        return
    codes = col.cat.codes
    df[name] = codes + 1

def scale_vars(df, mapper):
    """Standard-scale the numeric columns of ``df`` in place.

    Args:
        df: DataFrame whose numeric columns are overwritten with scaled values.
        mapper: A fitted ``DataFrameMapper`` to reuse, or None to build one
            (a ``StandardScaler`` per numeric column) and fit it on ``df``.

    Returns:
        The mapper that was applied, so the same scaling can be reused on
        another frame.
    """
    # Imported here because the notebook's import cell binds neither
    # `warnings` nor the top-level `sklearn` name; referencing them directly
    # raised NameError.
    import warnings
    from sklearn.exceptions import DataConversionWarning

    warnings.filterwarnings('ignore', category=DataConversionWarning)
    if mapper is None:
        map_f = [([n], StandardScaler()) for n in df.columns if is_numeric_dtype(df[n])]
        mapper = DataFrameMapper(map_f).fit(df)
    df[mapper.transformed_names_] = mapper.transform(df)
    return mapper

def proc_df(df, y_fld, skip_flds=None, do_scale=False, na_dict=None,
            preproc_fn=None, max_n_cat=None, subset=None, mapper=None):
    """Split ``df`` into a numeric feature frame and a target array.

    All work is done on a copy, so the caller's ``df`` is not modified.
    Steps, in order: optional random row subset, optional ``preproc_fn``,
    extract the target, drop ``skip_flds`` and the target column, fill
    numeric nulls (``fix_missing``), optional scaling (``scale_vars``),
    convert categoricals to integer codes (``numericalize``), and finally
    one-hot encode whatever is still non-numeric with ``pd.get_dummies``.

    Args:
        df: Input DataFrame.
        y_fld: Name of the target column.
        skip_flds: Column names to drop in addition to the target.
        do_scale: When True, standard-scale numeric columns.
        na_dict: Column -> fill value mapping to reuse. When passed, it is
            updated in place by ``fix_missing``.
        preproc_fn: Optional callable applied to the copied frame.
        max_n_cat: Passed to ``numericalize``; categoricals with at most this
            many levels are left for one-hot encoding.
        subset: Optional number of rows to sample at random before processing.
        mapper: Fitted scaler mapper to reuse when ``do_scale`` is True.

    Returns:
        ``[X, y, na_dict]``, with the mapper appended as a fourth element
        when ``do_scale`` is True.
    """
    skip_flds = list(skip_flds) if skip_flds else []
    if subset:
        # Sampling is done inline: the `get_sample` helper this used to call
        # is not defined in this notebook. Rows keep their original order.
        keep = sorted(np.random.permutation(len(df))[:subset])
        df = df.iloc[keep]
    df = df.copy()
    if preproc_fn: preproc_fn(df)
    y = df[y_fld].values
    df.drop(skip_flds + [y_fld], axis=1, inplace=True)
    if na_dict is None: na_dict = {}
    for n, c in df.items(): na_dict = fix_missing(df, c, n, na_dict)
    if do_scale: mapper = scale_vars(df, mapper)
    for n, c in df.items(): numericalize(df, c, n, max_n_cat)
    res = [pd.get_dummies(df, dummy_na=True), y, na_dict]
    if do_scale: res = res + [mapper]
    return res

## Data Exploration and Cleaning

In [3]:
# Folder holding the competition CSVs, relative to the working directory.
PATH = './data/'
# Read the training and test splits with the same reader.
data, test = [pd.read_csv(PATH + csv_name) for csv_name in ('train.csv', 'test.csv')]

In [4]:
# Summary statistics of the training data, to spot outliers (see the min/max rows).
data.describe()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,trip_duration
count,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0,1458644.0
mean,1.53495,1.66453,-73.97349,40.75092,-73.97342,40.7518,959.4923
std,0.4987772,1.314242,0.07090186,0.03288119,0.07064327,0.03589056,5237.432
min,1.0,0.0,-121.9333,34.3597,-121.9333,32.18114,1.0
25%,1.0,1.0,-73.99187,40.73735,-73.99133,40.73588,397.0
50%,2.0,1.0,-73.98174,40.7541,-73.97975,40.75452,662.0
75%,2.0,2.0,-73.96733,40.76836,-73.96301,40.76981,1075.0
max,2.0,9.0,-61.33553,51.88108,-61.33553,43.92103,3526282.0


In [5]:
# The drop-off timestamp does not exist in the test data, so remove it from the training data
data.drop(columns=['dropoff_datetime'], inplace=True)

In [6]:
# Keep only trips strictly longer than a minute and strictly shorter than an hour
# (trip_duration is compared against 60 and 60*60).
data = data.loc[data['trip_duration'].gt(60) & data['trip_duration'].lt(60 * 60)]

In [7]:
# Derive the date-part columns from the pickup timestamp on both frames,
# then discard the raw timestamp column.
for frame in (data, test):
    add_datepart(frame, 'pickup_datetime')
    frame.drop(columns='pickup_datetime', inplace=True)

In [8]:
# Peek at the first rows to check the new pickup_datetime* columns.
data.head()

Unnamed: 0,id,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_datetimeYear,pickup_datetimeMonth,pickup_datetimeWeek,pickup_datetimeDay,pickup_datetimeDayofweek,pickup_datetimeDayofyear,pickup_datetimeHour
0,id2875421,2,1,-73.982155,40.767937,-73.96463,40.765602,N,455,2016,3,11,14,0,74,17
1,id2377394,1,1,-73.980415,40.738564,-73.999481,40.731152,N,663,2016,6,23,12,6,164,0
2,id3858529,2,1,-73.979027,40.763939,-74.005333,40.710087,N,2124,2016,1,3,19,1,19,11
3,id3504673,2,1,-74.01004,40.719971,-74.012268,40.706718,N,429,2016,4,14,6,2,97,19
4,id2181028,2,1,-73.973053,40.793209,-73.972923,40.78252,N,435,2016,3,12,26,5,86,13


In [9]:
# Clustering based on latitude and longitude:
# stack pickup and drop-off coordinates into one (lat, lon) array.
pickup_points = data[['pickup_latitude', 'pickup_longitude']].values
dropoff_points = data[['dropoff_latitude', 'dropoff_longitude']].values
places = np.concatenate([pickup_points, dropoff_points], axis=0)

In [10]:
# Fixed random_state so that cluster ids (and every feature derived from
# them) are reproducible across kernel restarts.
kmeans = MiniBatchKMeans(n_clusters=100, batch_size=10000, random_state=42).fit(places)

In [11]:
# Assign each pickup / drop-off point to its nearest cluster centre.
# `.values` passes plain arrays: the model was fitted on a bare ndarray, and
# predicting on a DataFrame makes recent scikit-learn warn about feature names.
data['pickup_cluster'] = kmeans.predict(data[['pickup_latitude', 'pickup_longitude']].values)
data['dropoff_cluster'] = kmeans.predict(data[['dropoff_latitude', 'dropoff_longitude']].values)
test['pickup_cluster'] = kmeans.predict(test[['pickup_latitude', 'pickup_longitude']].values)
test['dropoff_cluster'] = kmeans.predict(test[['dropoff_latitude', 'dropoff_longitude']].values)

In [102]:
# Target Encoding
from sklearn.model_selection import KFold

def reg_target_encoding(train, col, splits=5, target='trip_duration'):
    """Add a regularized (out-of-fold) mean encoding of ``col`` to ``train``.

    The rows are split into ``splits`` folds with ``KFold``. Each row's
    encoding is the mean of ``target`` per ``col`` value, computed only on the
    other folds, so a row never contributes to its own encoding. Values of
    ``col`` that do not occur in the other folds get the overall target mean.

    Args:
        train: Training DataFrame; a ``<col>_mean_enc`` column is added in place.
        col: Name of the column to encode.
        splits: Number of folds.
        target: Name of the target column.

    Returns:
        ``train`` (the same object).
    """
    kf = KFold(n_splits=splits)
    global_mean = train[target].mean()
    encoded = np.full(len(train), np.nan)
    for fit_pos, holdout_pos in kf.split(train):
        fold_means = train.iloc[fit_pos].groupby(col)[target].mean()
        # KFold yields row positions, so use .iloc; the previous .loc lookup
        # was only right when the index happened to be 0..n-1.
        holdout_keys = train[col].iloc[holdout_pos]
        encoded[holdout_pos] = np.asarray(holdout_keys.map(fold_means), dtype='float64')
    # Assign the filled result directly: fillna(inplace=True) on a column
    # selection does not write back to the frame under pandas Copy-on-Write.
    train[col + '_mean_enc'] = np.where(np.isnan(encoded), global_mean, encoded)
    return train

def mean_encoding_test(test, train, col, target='trip_duration'):
    """Add a target (mean) encoding of ``col`` to held-out data.

    The per-value means of ``target`` are computed on all of ``train`` and
    mapped onto ``test[col]``. Values of ``col`` that never occur in ``train``
    get the overall training mean. Use this for validation and test frames.

    Args:
        test: DataFrame to encode; a ``<col>_mean_enc`` column is added in place.
        train: Training DataFrame supplying the target means.
        col: Name of the column to encode.
        target: Name of the target column in ``train``.

    Returns:
        ``test`` (the same object).
    """
    global_mean = train[target].mean()
    value_means = train.groupby(col)[target].mean()
    # Fill before assigning: fillna(inplace=True) on a column selection does
    # not write back to the frame under pandas Copy-on-Write.
    test[col + '_mean_enc'] = test[col].map(value_means).fillna(global_mean)
    return test

In [99]:
# Hold out 20% of the rows for validation. The seed makes the split (and
# every encoding computed from it) reproducible across kernel restarts.
train, valid = train_test_split(data, test_size=0.2, random_state=42)
# Renumber rows 0..n-1; drop=True discards the old index instead of adding
# it as an 'index' column that then has to be dropped.
train = train.reset_index(drop=True)
valid = valid.reset_index(drop=True)

In [14]:
# Number of distinct values per column, smallest first.
data.nunique().sort_values()

pickup_datetimeYear               1
vendor_id                         2
store_and_fwd_flag                2
pickup_datetimeMonth              6
pickup_datetimeDayofweek          7
passenger_count                   9
pickup_datetimeHour              24
pickup_datetimeWeek              27
pickup_datetimeDay               31
dropoff_cluster                 100
pickup_cluster                  100
pickup_datetimeDayofyear        182
trip_duration                  3539
pickup_longitude              22153
dropoff_longitude             32886
pickup_latitude               44561
dropoff_latitude              61678
id                          1437533
dtype: int64

In [103]:
# Columns that receive a mean (target) encoding.
collist = [
    'vendor_id',
    'passenger_count',
    'pickup_cluster',
    'dropoff_cluster',
    'store_and_fwd_flag',
    'pickup_datetimeMonth',
    'pickup_datetimeDayofweek',
    'pickup_datetimeHour',
]
# Out-of-fold encoding on train first, then encode valid from the train means.
for col in collist:
    reg_target_encoding(train, col=col)
    mean_encoding_test(valid, train, col=col)

In [104]:
# Inspect the *_mean_enc columns added to the training split.
train.head()

Unnamed: 0,id,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_datetimeYear,...,pickup_cluster,dropoff_cluster,vendor_id_mean_enc,passenger_count_mean_enc,pickup_cluster_mean_enc,dropoff_cluster_mean_enc,store_and_fwd_flag_mean_enc,pickup_datetimeMonth_mean_enc,pickup_datetimeDayofweek_mean_enc,pickup_datetimeHour_mean_enc
0,id1803905,1,1,-73.958267,40.778103,-73.967995,40.76796,N,322,2016,...,62,61,812.578528,806.544767,704.723126,707.303081,814.814662,778.956914,788.927792,764.980242
1,id1371183,1,1,-73.979553,40.776325,-73.974007,40.787323,N,293,2016,...,20,56,812.578528,806.544767,692.45645,653.03438,814.814662,850.520388,753.36624,877.449749
2,id1993999,1,1,-73.87117,40.773945,-73.973625,40.758194,N,2010,2016,...,67,79,812.578528,806.544767,1674.627221,787.141266,814.814662,829.556854,788.927792,863.570093
3,id2029462,1,1,-73.782356,40.644707,-73.940536,40.851795,N,1746,2016,...,3,96,812.578528,806.544767,2157.765817,1216.636158,814.814662,858.90959,788.927792,800.580341
4,id2635074,1,1,-73.952629,40.776455,-73.974754,40.74411,N,464,2016,...,87,69,812.578528,806.544767,649.10145,640.223969,814.814662,829.556854,788.927792,811.004471


In [105]:
# Inspect the *_mean_enc columns added to the validation split.
valid.head()

Unnamed: 0,id,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_datetimeYear,...,pickup_cluster,dropoff_cluster,vendor_id_mean_enc,passenger_count_mean_enc,pickup_cluster_mean_enc,dropoff_cluster_mean_enc,store_and_fwd_flag_mean_enc,pickup_datetimeMonth_mean_enc,pickup_datetimeDayofweek_mean_enc,pickup_datetimeHour_mean_enc
0,id3294378,1,1,-74.005638,40.741112,-74.00518,40.748093,N,176,2016,...,10,94,812.509755,806.510715,809.575031,701.616825,815.056096,858.660984,851.230397,785.620559
1,id1836458,1,2,-73.863678,40.769836,-73.98838,40.778481,N,2378,2016,...,13,92,812.509755,847.114641,1837.740408,722.716161,815.056096,801.177011,851.230397,877.500936
2,id0411198,1,1,-73.995926,40.73867,-73.985641,40.763222,Y,569,2016,...,71,1,812.509755,806.510715,725.320264,880.135707,1009.83395,778.971947,773.104788,863.34088
3,id2742973,1,1,-73.989433,40.757355,-73.955391,40.779301,N,858,2016,...,90,87,812.509755,806.510715,756.132124,607.184546,815.056096,801.177011,751.998039,833.344643
4,id1083752,1,1,-73.946968,40.77182,-73.973099,40.764427,N,772,2016,...,4,8,812.509755,806.510715,710.251836,778.761288,815.056096,775.990068,851.230397,863.34088


## Final Preprocessing

In [108]:
# Convert the string columns of train to ordered categoricals (in place).
train_cats(train)

In [109]:
# Give valid the same category sets as train so the integer codes match.
apply_cats(valid,train)

In [114]:
# Build the model inputs: returns [features, target, na_dict].
# NOTE(review): train still contains the 'id' column (see train.head() above);
# after train_cats it is a categorical and ends up as an integer-coded
# feature. Consider passing skip_flds=['id'] -- confirm intent.
final_train = proc_df(train,'trip_duration')

In [116]:
# First two entries of the proc_df result: feature frame and target array.
X_train, y_train = final_train[:2]