<h1><center>Kaggle Competition: <a href="https://www.kaggle.com/c/new-york-city-taxi-fare-prediction">New York City Taxi Fare Prediction</a></center></h1>

# 1. Data Ingestion & Data Cleaning

## 1.1 Data Ingestion

In [1]:
import math
import pandas as pd
import numpy as np
from IPython.display import display
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
%matplotlib inline
from tqdm import tqdm_notebook as tqdm
import subprocess
import seaborn as sns
sns.set()

import time
start_time = time.time()

In [2]:
# train_rows = 55423857
# print (f'>>> Exact number of rows: {train_rows}')

In [3]:
# Check if data/train.csv exists:
# Reading only the first 5 rows is enough to find out whether the file is present.
# When it is, this also leaves a 5-row `train_df` in the notebook namespace.
try:
    train_df = pd.read_csv('data/train.csv', nrows=5)
except FileNotFoundError:
    # Best effort: only tell the user where to get the file; nothing is re-raised.
    print('>>> You must download train.csv from: https://www.kaggle.com/c/new-york-city-taxi-fare-prediction/download/train.csv')

#### Tricks to load the data and decrease memory footprint significantly

In [4]:
# Explicit (narrow) dtype for every column that is loaded from the CSV, so the
# frame takes less memory than with pandas' default 64-bit types.
# pickup_datetime is read as a plain string at this stage.
col_types = dict(
    fare_amount='float32',
    pickup_datetime='str',
    pickup_longitude='float32',
    pickup_latitude='float32',
    dropoff_longitude='float32',
    dropoff_latitude='float32',
    passenger_count='uint8',
)

# Names of the columns to load, in the same order as they appear in col_types.
new_cols = [column_name for column_name in col_types]

In [5]:
def _parse_pickup_datetime(frame):
    """Turn the string column ``pickup_datetime`` into UTC timestamps (in place).

    Only the first 16 characters (``YYYY-MM-DD HH:MM``) of each value are kept,
    so whatever follows the minutes is discarded before parsing with a fixed
    format. Returns the same frame for convenience.
    """
    trimmed = frame['pickup_datetime'].str.slice(0, 16)
    frame['pickup_datetime'] = pd.to_datetime(trimmed, utc=True, format='%Y-%m-%d %H:%M')
    return frame


def get_raw_dataframe(read_size=0):
    """Load ``data/train.csv`` using the column subset and dtypes declared above.

    Parameters
    ----------
    read_size : int, default 0
        Number of rows to read. ``0`` means "read the whole file", which is done
        in chunks of 5,000,000 rows that are concatenated at the end.

    Returns
    -------
    pandas.DataFrame
        The raw rides with ``pickup_datetime`` parsed to UTC timestamps.
    """
    # Method #1: read the entire DataFrame from the disk, chunk by chunk
    if read_size == 0:
        chunk_reader = pd.read_csv('data/train.csv', usecols=new_cols, dtype=col_types, chunksize=5000000)
        # Each chunk is parsed as soon as it is read; per-chunk cleaning or
        # feature engineering could be plugged in at the same place.
        parsed_chunks = [_parse_pickup_datetime(df_chunk) for df_chunk in tqdm(chunk_reader)]
        train_df = pd.concat(parsed_chunks)
        del parsed_chunks
        return train_df

    # Method #2: read a predetermined amount of data
    train_df = pd.read_csv('data/train.csv', usecols=new_cols, dtype=col_types, nrows=read_size)
    return _parse_pickup_datetime(train_df)

In [6]:
# train_df.info(memory_usage='deep')

## 1.2 Data Cleaning

In [7]:
# check feature related statistics
# NOTE(review): `train_df` is whatever an earlier cell left in the namespace; when the
# notebook is run top to bottom that is the 5-row sample read while checking for the file.
pd.set_option('display.max_columns', 100)       # show up to 100 columns before pandas starts eliding them
pd.set_option('display.width', 3000)
pd.set_option('float_format', '{:f}'.format)    # fixed-point output instead of scientific notation
train_df.describe()

Unnamed: 0,fare_amount,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
count,5.0,5.0,5.0,5.0,5.0,5.0
mean,8.02,-73.959664,40.739009,-73.952068,40.75734,1.2
std,5.102156,0.066789,0.024756,0.063357,0.029084,0.447214
min,4.5,-74.016048,40.711303,-73.991567,40.712278,1.0
25%,5.3,-73.98713,40.721319,-73.991242,40.750562,1.0
50%,5.7,-73.982738,40.733143,-73.979268,40.758092,1.0
75%,7.7,-73.968095,40.76127,-73.956655,40.782004,1.0
max,16.9,-73.844311,40.768008,-73.84161,40.783762,2.0


In [8]:
def remove_geo_outliers(df):
    """Keep only the rides whose pickup AND dropoff fall inside the NY bounding box.

    The input frame is left untouched (no helper column is written into it);
    a filtered copy is returned. Rows with a missing coordinate fail the
    comparison and are therefore removed as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``pickup_longitude``, ``pickup_latitude``,
        ``dropoff_longitude`` and ``dropoff_latitude``.

    Returns
    -------
    pandas.DataFrame
        The rows inside the boundary, with the original index labels and columns.
    """
    # NY boundary (both ends inclusive)
    boundary = {
        'min_lng': -74.263242, 'min_lat': 40.573143,
        'max_lng': -72.986532, 'max_lat': 41.709555
    }

    inside = (
        df['pickup_longitude'].between(boundary['min_lng'], boundary['max_lng']) &
        df['pickup_latitude'].between(boundary['min_lat'], boundary['max_lat']) &
        df['dropoff_longitude'].between(boundary['min_lng'], boundary['max_lng']) &
        df['dropoff_latitude'].between(boundary['min_lat'], boundary['max_lat'])
    )

    # .copy() so that callers can add columns to the result without pandas
    # complaining about writing into a slice of the input.
    return df.loc[inside].copy()

In [9]:
def get_clean_dataframe(num_rows):
    """Load `num_rows` raw rides and return them with the invalid rows filtered out.

    Filters applied, in order: rows with missing values, fares that are not
    strictly positive, pickup/dropoff coordinates equal to zero, passenger
    counts above 7, and coordinates outside the NY boundary. The
    `passenger_count` column is removed from the result.
    """
    rides = get_raw_dataframe(num_rows).dropna(axis=0)

    # free and negative fares are discarded (a stricter 2..200 window was tried and disabled)
    rides = rides[rides['fare_amount'] > 0]

    # a coordinate of exactly 0 means the position is missing
    pickup_known = (rides['pickup_latitude'] != 0) & (rides['pickup_longitude'] != 0)
    dropoff_known = (rides['dropoff_latitude'] != 0) & (rides['dropoff_longitude'] != 0)
    rides = rides[pickup_known & dropoff_known]

    # NOTE(review): the lower bound is 0, so rides with zero passengers are kept
    passengers = rides['passenger_count']
    rides = rides[(passengers >= 0) & (passengers <= 7)]

    # remove any point beyond NYC border
    rides = remove_geo_outliers(rides)

    # passenger_count is not used as a feature
    return rides.drop('passenger_count', axis=1)

# 2. Feature Engineering

In [10]:
# Distance formula: https://www.movable-type.co.uk/scripts/latlong.html
def getDistance(lat1, lon1, lat2, lon2):
    """Haversine distance between two (latitude, longitude) points given in degrees.

    Uses an Earth radius of 6371e3 metres and returns the distance in
    kilometres, rounded to 3 decimals.
    """
    earth_radius_m = 6371e3
    lat1_rad = math.radians(lat1)
    lat2_rad = math.radians(lat2)
    half_dlat = math.radians(lat2 - lat1) / 2
    half_dlon = math.radians(lon2 - lon1) / 2

    sin_half_dlat = math.sin(half_dlat)
    sin_half_dlon = math.sin(half_dlon)
    hav = sin_half_dlat * sin_half_dlat + math.cos(lat1_rad) * math.cos(lat2_rad) * sin_half_dlon * sin_half_dlon
    central_angle = 2 * math.atan2(math.sqrt(hav), math.sqrt(1 - hav))

    metres = earth_radius_m * central_angle
    return round(metres / 1000, 3)    # metres -> km

In [11]:
# Add calendar related features: year, month, day, hour, day_of_week
def add_calendar(df):
    """Add calendar features derived from ``pickup_datetime`` (modifies and returns ``df``).

    New columns (all ``int8`` except ``year``, which is ``int16``):
    ``year``, ``month``, ``day``, ``hour``, ``day_of_week`` (Monday = 0),
    ``quarter``, ``week_of_year`` (ISO week number), ``business_day``
    (1 for Monday..Friday) and ``daytime`` (1 when 8 <= hour < 20).

    ``pickup_datetime`` must be a datetime column; the values are read through
    the vectorised ``.dt`` accessor rather than one Python call per row.
    """
    stamp = df['pickup_datetime'].dt

    # Series.dt.isocalendar() only exists in newer pandas; older releases
    # expose the same ISO week number as .dt.weekofyear.
    if hasattr(stamp, 'isocalendar'):
        iso_week = stamp.isocalendar().week
    else:
        iso_week = stamp.weekofyear

    df['year'] = stamp.year.astype('int16')
    df['month'] = stamp.month.astype('int8')
    df['day'] = stamp.day.astype('int8')
    df['hour'] = stamp.hour.astype('int8')
    df['day_of_week'] = stamp.dayofweek.astype('int8')
    df['quarter'] = stamp.quarter.astype('int8')
    df['week_of_year'] = iso_week.astype('int8')
    df['business_day'] = (stamp.dayofweek <= 4).astype('int8')
    df['daytime'] = ((stamp.hour >= 8) & (stamp.hour < 20)).astype('int8')
    return df

In [12]:
# Add distance in kilometers
def add_distance_km(df):
    """Add the haversine pickup->dropoff distance (modifies and returns ``df``).

    New columns:
    ``distance_km`` (float32, kilometres rounded to 3 decimals) and
    ``distance_km_round`` (int16, ``distance_km`` rounded to whole kilometres).

    The formula and the Earth radius (6371 km) are the same as in the scalar
    ``getDistance`` helper, but the whole column is computed at once with numpy
    instead of one Python call per row.
    """
    earth_radius_km = 6371.0

    # Work in float64 regardless of the storage dtype of the coordinate columns.
    lat1 = df['pickup_latitude'].values.astype('float64')
    lon1 = df['pickup_longitude'].values.astype('float64')
    lat2 = df['dropoff_latitude'].values.astype('float64')
    lon2 = df['dropoff_longitude'].values.astype('float64')

    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    half_dphi = np.radians(lat2 - lat1) / 2
    half_dlambda = np.radians(lon2 - lon1) / 2

    a = np.sin(half_dphi) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(half_dlambda) ** 2
    # Floating-point error can push `a` marginally outside [0, 1], which would
    # make the square roots below invalid.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))

    df['distance_km'] = np.round(earth_radius_km * c, 3).astype('float32')
    df['distance_km_round'] = df['distance_km'].round().astype('int16')
    return df

In [13]:
def minkowski_distance(x1, x2, y1, y2, p):
    """Minkowski distance of order `p` between the points (x1, y1) and (x2, y2).

    p=1 gives the Manhattan distance, p=2 the Euclidean distance. Works on
    scalars as well as on element-wise array-likes such as pandas Series.
    """
    delta_x = abs(x2 - x1)
    delta_y = abs(y2 - y1)
    powered_sum = delta_x ** p + delta_y ** p
    return powered_sum ** (1 / p)

def add_distances_mht_ecd(df):
    """Add Manhattan (`distance_mht`) and Euclidean (`distance_ecd`) pickup->dropoff distances.

    Both are computed directly on the longitude/latitude values (so they are
    expressed in degrees) and stored as float32. Modifies and returns `df`.
    """
    for column, order in (('distance_mht', 1), ('distance_ecd', 2)):
        raw_distance = minkowski_distance(
            df['pickup_longitude'], df['dropoff_longitude'],
            df['pickup_latitude'], df['dropoff_latitude'],
            order,
        )
        df[column] = raw_distance.astype('float32')
    return df

In [14]:
# Add geographical cluster as feature 
from sklearn.cluster import KMeans
import copy

# When this function is called with 1 arg, the arg is the training dataset. 
# When called with both args, the 1st arg is the test and the 2nd is the training dataset.
def add_geo_cluster(df, train=pd.DataFrame(), n_clusters=4, random_state=42):
    """Add a ``geo_cluster`` column with the KMeans cluster of each pickup location.

    The clustering is always *fitted on the training rides* and then used to
    label ``df``:

    * ``add_geo_cluster(train_df)``          -> fit on ``train_df``, label ``train_df``
    * ``add_geo_cluster(test_df, train_df)`` -> fit on ``train_df``, label ``test_df``

    Because the seed is fixed, both calls build the same clustering from the
    same training coordinates, so a given cluster id refers to the same region
    in the training and in the test frame. (Refitting on test+train with an
    unseeded KMeans, as was done before, gives ids that need not match the ones
    the model was trained on.) ``train`` is only read, never modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to label; needs ``pickup_longitude`` and ``pickup_latitude``.
    train : pandas.DataFrame, optional
        Training rides used to fit the clusters. When empty (or ``None``),
        ``df`` itself is used.
    n_clusters : int, default 4
        Number of clusters (4 was chosen with the Elbow method).
    random_state : int, default 42
        Seed passed to KMeans so the clustering is reproducible.

    Returns
    -------
    pandas.DataFrame
        ``df`` with the additional ``geo_cluster`` column.
    """
    coord_cols = ['pickup_longitude', 'pickup_latitude']
    reference = df if (train is None or train.empty) else train

    kmeans_geo_model = KMeans(n_clusters=n_clusters, random_state=random_state)
    kmeans_geo_model.fit(reference[coord_cols])

    # predict() returns one label per row of df, in row order
    df['geo_cluster'] = kmeans_geo_model.predict(df[coord_cols])
    return df

In [15]:
# Add geographical difference (ride_start - ride_end) as feature 
def add_geo_diff(df):
    """Add `lon_diff` and `lat_diff`: dropoff coordinate minus pickup coordinate.

    Modifies `df` in place and returns it.
    """
    for short_name, coordinate in (('lon', 'longitude'), ('lat', 'latitude')):
        df[short_name + '_diff'] = df['dropoff_' + coordinate] - df['pickup_' + coordinate]
    return df

In [16]:
# Add geographical coordinates rounded for less precision
def add_geo_coords_rounded(df):
    """Add a `<column>_round` copy of each coordinate, rounded to 3 decimals (float32).

    Columns are added in the order pickup latitude, pickup longitude,
    dropoff latitude, dropoff longitude. Modifies and returns `df`.
    """
    coordinate_columns = (
        'pickup_latitude',
        'pickup_longitude',
        'dropoff_latitude',
        'dropoff_longitude',
    )
    for column in coordinate_columns:
        rounded = df[column].apply(lambda value: round(value, 3))
        df[column + '_round'] = rounded.astype('float32')
    return df

In [17]:
# Add features that define if a trip was to a particular airport OR neighbourhood
def setLocationSrcDst(df, boundary, srcName, dstName):
    """Flag rides that start and/or end inside a rectangular area.

    `boundary` is a dict with the keys min_lat, max_lat, min_lng and max_lng
    (limits are inclusive). Column `srcName` is 1 when the pickup point lies
    inside the rectangle, `dstName` is 1 when the dropoff point does; both are
    0 otherwise and stored as int8. Modifies and returns `df`.
    """
    def _within(latitude, longitude):
        lat_ok = (latitude >= boundary['min_lat']) & (latitude <= boundary['max_lat'])
        lng_ok = (longitude >= boundary['min_lng']) & (longitude <= boundary['max_lng'])
        return lat_ok & lng_ok

    df[srcName] = _within(df.pickup_latitude, df.pickup_longitude).astype('int8')
    df[dstName] = _within(df.dropoff_latitude, df.dropoff_longitude).astype('int8')
    return df


def add_airports(df):
    """Add `<airport>_src` / `<airport>_dst` flags for JFK, EWR and LGA.

    Each airport is described by a bounding box; a flag is 1 when the pickup
    (src) or dropoff (dst) point lies inside that box.
    """
    airport_boxes = (
        ('jfk', { 'min_lng': -73.8352, 'min_lat': 40.6195, 'max_lng': -73.7401, 'max_lat': 40.6659 }),
        ('ewr', { 'min_lng': -74.1925, 'min_lat': 40.6700, 'max_lng': -74.1531, 'max_lat': 40.7081 }),
        ('lga', { 'min_lng': -73.8895, 'min_lat': 40.7664, 'max_lng': -73.8550, 'max_lat': 40.7931 }),
    )
    for code, box in airport_boxes:
        df = setLocationSrcDst(df, box, code + '_src', code + '_dst')
    return df


def add_locations(df):
    """Add `<borough>_src` / `<borough>_dst` flags for the five NYC boroughs.

    Every borough is approximated by a bounding box (the boxes overlap), so a
    ride can be flagged for more than one borough.
    """
    borough_boxes = (
        ('man',  { 'min_lng': -74.0479, 'min_lat': 40.6829, 'max_lng': -73.9067, 'max_lat': 40.8820 }),   # Manhattan
        ('qns',  { 'min_lng': -73.9630, 'min_lat': 40.5431, 'max_lng': -73.7004, 'max_lat': 40.8007 }),   # Queens
        ('bkny', { 'min_lng': -74.0421, 'min_lat': 40.5707, 'max_lng': -73.8334, 'max_lat': 40.7395 }),   # Brooklyn
        ('bx',   { 'min_lng': -73.9339, 'min_lat': 40.7855, 'max_lng': -73.7654, 'max_lat': 40.9176 }),   # Bronx
        ('si',   { 'min_lng': -74.2558, 'min_lat': 40.4960, 'max_lng': -74.0522, 'max_lat': 40.6490 }),   # Staten Island
    )
    for prefix, box in borough_boxes:
        df = setLocationSrcDst(df, box, prefix + '_src', prefix + '_dst')

    return df

#### Bad features

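These features shouldn't be used because they:
- Don't improve the score;
- Are mostly derived from `fare_amount`, which is the target itself and is therefore not available for the rows whose fare has to be predicted.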

In [18]:
def add_peak_hour(df):
    """Add a boolean ``peak_hour`` column (modifies and returns ``df``).

    A ride is in a peak hour when its ``hour`` is 7..9 or 16..18 (inclusive).
    The flag is computed from the frame that was passed in, so the function
    works for any frame with an ``hour`` column, not only for the global
    training frame.
    """
    hour = df['hour']
    df['peak_hour'] = (hour.between(7, 9) | hour.between(16, 18)).astype('bool')
    return df

In [19]:
# Add ratio for (rides_per_hour and rides_per_day) (only works for the training set)
def add_ride_statistics(df):
    """Add ``rides_per_hour`` and ``rides_per_day``: how many rides of ``df`` share
    the same (year, month, day, hour) and the same (year, month, day).

    The counts are merged back onto the rides with a left join, so the result
    has one row per input ride, in the same order, with a fresh 0..n-1 index.
    Both columns are stored as int32: an int16 would silently wrap around for
    counts above 32767.

    Requires the calendar columns ``year``, ``month``, ``day`` and ``hour``.
    """
    hour_keys = ['year', 'month', 'day', 'hour']
    day_keys = ['year', 'month', 'day']

    hourly_counts = df.groupby(hour_keys).size().reset_index(name='rides_per_hour')
    daily_counts = df.groupby(day_keys).size().reset_index(name='rides_per_day')

    df = pd.merge(df, hourly_counts, how='left', on=hour_keys)
    df = pd.merge(df, daily_counts, how='left', on=day_keys)

    df['rides_per_hour'] = df['rides_per_hour'].astype('int32')
    df['rides_per_day'] = df['rides_per_day'].astype('int32')
    return df

In [20]:
# Add mean/median/min/max fare (only works well for the training/test set, not for the Kaggle scoring system)
def add_fare_statistics(df):
    """Add ``mean_fare``, ``median_fare``, ``min_fare`` and ``max_fare`` per group of
    (year, month, day_of_week, hour, distance_km_round, rides-per-hour).

    The statistics are computed from ``fare_amount`` and merged back with a left
    join, so the result has one row per input ride and a fresh 0..n-1 index.

    The rides-per-hour key is ``taxis_per_hour`` when that column exists;
    otherwise ``rides_per_hour`` (the name produced by ``add_ride_statistics``)
    is used. Grouping on ``taxis_per_hour`` unconditionally fails with a
    KeyError on frames built by the functions of this notebook.
    """
    rides_col = 'taxis_per_hour' if 'taxis_per_hour' in df.columns else 'rides_per_hour'
    group_keys = ['year', 'month', 'day_of_week', 'hour', 'distance_km_round', rides_col]

    fare_stats = df.groupby(group_keys)['fare_amount'].agg(['mean', 'median', 'min', 'max']).reset_index()
    fare_stats.columns = group_keys + ['mean_fare', 'median_fare', 'min_fare', 'max_fare']

    return pd.merge(df, fare_stats, how='left', on=group_keys)

In [None]:
# Add cost per KM (only works well for the training/test set, not for the Kaggle scoring system)
def add_cost_per_km(df):
    """Add a float32 ``cost_per_km`` column (modifies and returns ``df``).

    The value is ``distance_km_round / fare_amount``, computed for the whole
    column at once instead of row by row.

    NOTE(review): despite the column name this ratio is kilometres per fare
    unit, not fare per kilometre. It is kept as is so that existing results do
    not change; confirm whether the inverse was intended.
    """
    df['cost_per_km'] = (df['distance_km_round'] / df['fare_amount']).astype('float32')
    return df

# 3. Model Training

In [None]:
%%time

### Retrieve a clean dataframe from the disk with N rows (2M here),
### then add the relevant features, one step after the other
train_df = (
    get_clean_dataframe(2_000_000)
    .pipe(add_calendar)
    .pipe(add_distance_km)
    .pipe(add_distances_mht_ecd)
    .pipe(add_geo_coords_rounded)
    .pipe(add_geo_cluster)
    .pipe(add_geo_diff)
    .pipe(add_airports)
    .pipe(add_locations)
)

# bad features
# train_df = add_ride_statistics(train_df)
# train_df = add_peak_hour(train_df)
# train_df = add_fare_statistics(train_df)
# train_df = add_cost_per_km(train_df)

In [None]:
# Summary statistics of the cleaned training frame, engineered features included.
train_df.describe()

In [None]:
# Column dtypes and the real memory footprint of the training frame.
train_df.info(memory_usage='deep')

In [None]:
# train_subset = train_df[:6000]
# sns.pairplot(train_subset, vars=['fare_amount','distance_km_round','lon_diff','lat_diff','geo_cluster','year','month','hour'])
# plt.show()

In [None]:
# NOTE(review): same call as the describe() cell a few cells above; train_df is not modified in between.
train_df.describe()

In [None]:
%%time

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectPercentile, f_regression, RFECV
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from itertools import compress

# Separate the target (fare_amount) from the inputs. pickup_datetime is dropped too,
# so only the remaining columns are used as model features.
# `df` is read again by later cells (feature-importance plot, final model).
target = train_df[['fare_amount']]
df = train_df.drop(['fare_amount', 'pickup_datetime'], axis=1)

X = df
y = target.values.ravel()   # single-column frame -> flat 1-D array

# Split X,y into training and test data
# (30% of the rows are held out; random_state=0 is passed to make the split repeatable)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=0)

print('>>> Train/Test Split done!')

# Create the model using RandomForestRegressor
# The forest is fitted on log1p(fare), so its predictions must be mapped back with expm1.
# max_features is set to the number of feature columns.
rfr_model = RandomForestRegressor(n_jobs=7, n_estimators=25, max_features=len(df.columns), max_depth=25, min_samples_split=3, min_samples_leaf=3, random_state=24)
rfr_model.fit(X_train, np.log1p(y_train))
print('>>> Model Training done!')

## 3.1 Hyperparameter Tuning

Change `EXEC_GRID_SEARCH` to `True` to run RandomizedSearchCV and find optimal parameters:

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# Off by default, so that running the whole notebook keeps the forest fitted above
# instead of launching the search (100 sampled settings x 3 folds).
# Set to True to run it; the best estimator then replaces `rfr_model`.
EXEC_GRID_SEARCH = False

if (EXEC_GRID_SEARCH):
    param_grid = {
        'n_estimators': np.linspace(10, 100).astype(int),
        'max_depth': [None] + list(np.linspace(5, 30).astype(int)),
        # 'auto' is not listed: for RandomForestRegressor it meant "all features",
        # which None already covers, and recent scikit-learn releases reject it.
        'max_features': ['sqrt', None] + list(np.arange(0.5, 1, 0.1)),
        'max_leaf_nodes': [None] + list(np.linspace(10, 50, 500).astype(int)),
        'min_samples_split': [2, 5, 10],
#         'bootstrap': [True, False]
    }

    # Estimator for use in random search
    estimator = RandomForestRegressor(random_state=42)

    # Create the random search model (same log1p target as the model above)
    rs_cv = RandomizedSearchCV(estimator, param_grid, n_jobs=7, scoring='neg_mean_absolute_error', cv=3, n_iter=100, verbose=10, random_state=24)
    rs_cv.fit(X_train, np.log1p(y_train))

    rfr_model = rs_cv.best_estimator_
    print(f'The best parameters were {rs_cv.best_params_} with a negative mae of {rs_cv.best_score_}')

# 4. Model Evaluation

In [None]:
from lib import gfx  # gfx is another local module under the 'lib' directory
from sklearn import metrics
from math import sqrt

# Only the random-forest model is evaluated here. The disabled lines below belong to
# an earlier variant that averaged it with a second (gradient-boosting) model.
# gbr_fare = np.expm1(gbr_model.predict(X_test))  # model 1
rfr_fare = np.expm1(rfr_model.predict(X_test))   # model 2: expm1 undoes the log1p applied to the target at fit time

# y_pred = (gbr_visitors + rfr_visitors) / 2
# y_pred = gbr_fare
y_pred = rfr_fare

# Root mean squared error between the held-out fares and the predictions
rmse_score = sqrt(metrics.mean_squared_error(y_test, y_pred))
print('\n>>> RMSE Score:', rmse_score, '\n') # 2.5M -> 3.6734

# Plot Actual fare data against Predicted fare
# (plot_actual_vs_predicted comes from the local lib/gfx module, which is not part of this notebook)
gfx.plot_actual_vs_predicted(y, y_test, y_pred)

In [None]:
# Horizontal bar chart of the forest's feature importances.
# `df` is the feature frame created in the model-training cell.
features = df.columns[:df.shape[1]]   # i.e. every column of df
importances = rfr_model.feature_importances_
indices = np.argsort(importances)     # ascending order of importance

plt.figure(figsize=(12,8))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='b', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

# 5. Submission

## 5.1 Feature Engineering for the Test dataset

In [None]:
# Add ride-based features for the test set, copying from the training set
def test_add_ride_statistics(test, train):
    short_train_df = train[['year','month','day','hour','rides_per_hour']]
    tmp_hour = short_train_df.groupby(['year','month','day','hour']).agg({'rides_per_hour' : [np.mean]}).reset_index()
    tmp_hour.columns = ['year','month','day','hour','rides_per_hour']
    test = pd.merge(test, tmp_hour, how='left', on=['year','month','day','hour']) 

    short_train_df = train[['year','month','day','rides_per_day']]
    tmp_day = short_train_df.groupby(['year','month','day']).agg({'rides_per_day' : [np.mean]}).reset_index()
    tmp_day.columns = ['year','month','day','rides_per_day']
    test = pd.merge(test, tmp_day, how='left', on=['year','month','day'])
    
    # At this point, there might be some NaNs laying around    
    test["rides_per_day"].fillna(method='ffill', inplace=True) 
    test["rides_per_hour"].fillna(method='ffill', inplace=True) 
    return test

In [None]:
# Add mean/median/min/max fare statistics for the test set
def test_add_statistics(test, train):
    # Score 7.82    
#     short_train_df = train[['year','month','day_of_week','hour','distance_km_round','taxis_per_hour','mean_fare','median_fare','min_fare','max_fare']]
#     tmp = short_train_df.groupby(['year','month','day_of_week','hour','distance_km_round','taxis_per_hour']).agg({'mean_fare':[np.mean],'median_fare':[np.mean],'min_fare':[np.mean],'max_fare':[np.mean]}).reset_index()
#     tmp.columns = ['year','month','day_of_week','hour','distance_km_round','taxis_per_hour','mean_fare','median_fare','min_fare','max_fare']    
#     test = pd.merge(test, tmp, how='left', on=['year','month','day_of_week','hour','distance_km_round','taxis_per_hour'])          

    # Score 6.59
    short_train_df = train[['hour','distance_km_round','taxis_per_hour','mean_fare','median_fare','min_fare','max_fare']]
    tmp = short_train_df.groupby(['hour','distance_km_round','taxis_per_hour']).agg({'mean_fare':[np.mean],'median_fare':[np.mean],'min_fare':[np.mean],'max_fare':[np.mean]}).reset_index()
    tmp.columns = ['hour','distance_km_round','taxis_per_hour','mean_fare','median_fare','min_fare','max_fare']
    test = pd.merge(test, tmp, how='left', on=['hour','distance_km_round','taxis_per_hour'])  

    # Score 7.58
#     short_train_df = train[['day_of_week','hour','distance_km_round','taxis_per_hour','mean_fare','median_fare','min_fare','max_fare']]
#     tmp = short_train_df.groupby(['day_of_week','hour','distance_km_round','taxis_per_hour']).agg({'mean_fare':[np.mean],'median_fare':[np.mean],'min_fare':[np.mean],'max_fare':[np.mean]}).reset_index()
#     tmp.columns = ['day_of_week','hour','distance_km_round','taxis_per_hour','mean_fare','median_fare','min_fare','max_fare']
#     test = pd.merge(test, tmp, how='left', on=['day_of_week','hour','distance_km_round','taxis_per_hour'])  
    
    return test

In [None]:
def test_add_cost_per_km(test):    
    test['cost_per_km'] = test.apply(lambda row: row['distance_km_round'] / row['mean_fare'], axis=1)
    test['cost_per_km'] = test['cost_per_km'].astype('float32')
    return test

## 5.2 Load test data and add new features

In [None]:
# loads a sample submission file in the correct format (columns key and fare_amount). 
submission_df = pd.read_csv('data/sample_submission.csv')

# Test dataset used for the submission. passenger_count is discarded right away,
# as it is for the training data.
test_df = pd.read_csv('data/test.csv').drop('passenger_count', axis=1)

print('>>> test_df shape:', test_df.shape)

In [None]:
# Add common features
# Parse the timestamps exactly like the training data: keep 'YYYY-MM-DD HH:MM' only
pickup_minutes = test_df['pickup_datetime'].str.slice(0, 16)
test_df['pickup_datetime'] = pd.to_datetime(pickup_minutes, utc=True, format='%Y-%m-%d %H:%M')

# Same feature steps, in the same order, as for the training frame.
# The geo clusters additionally receive the training frame as second argument.
test_df = (
    test_df
    .pipe(add_calendar)
    .pipe(add_distance_km)
    .pipe(add_distances_mht_ecd)
    .pipe(add_geo_coords_rounded)
    .pipe(add_geo_cluster, train_df)
    .pipe(add_geo_diff)
    .pipe(add_airports)
    .pipe(add_locations)
)

# Bad features
# test_df = test_add_ride_statistics(test_df, train_df)
# test_df = add_peak_hour(test_df)

display(test_df.head())

In [None]:
# test_df = add_fare_statistics(test_df, train_df)        
# display(test_df.head())
# print(test_df.isnull().sum())               # check missing data
# test_df = test_df.fillna(test_df.mean())    # handle missing data


# test_df = test_add_statistics(test_df, train_df)        
# display(test_df.head())
# print(test_df.isnull().sum())               # check missing data
# test_df = test_df.fillna(test_df.mean())    # handle missing data


# test_df = test_add_cost_per_km(test_df)
# display(test_df.head())

# Show the first test rows that contain at least one missing value
# (an empty table means none of the features has a NaN)
# print(test_df.isnull().sum())               # check missing data
display(test_df[test_df.isnull().any(axis=1)].head())

In [None]:
import unittest

# Sanity check before fitting the final model: the training and the test frame must
# carry the same features. Comparing only the number of columns is not enough (two
# different column sets can have the same size), so the actual names are compared.
L1 = train_df.columns.tolist()
L2 = test_df.columns.tolist()
print('>>> Only *fare_amount* can show up as the difference between both datasets.\n>>> Found:', list(set(L1) - set(L2)))
print('>>> Number of Features for training=', len(L1), 'and testing=', len(L2))

tc = unittest.TestCase('__init__')

# fare_amount (the target) exists only in the training frame, key only in the test frame
tc.assertEqual(set(L1) - set(L2), {'fare_amount'})
tc.assertEqual(set(L2) - set(L1), {'key'})

# The columns that reach the model must also come in the same order in both frames
train_features = [name for name in L1 if name not in ('fare_amount', 'pickup_datetime')]
test_features = [name for name in L2 if name not in ('key', 'pickup_datetime')]
tc.assertEqual(train_features, test_features)

## 5.3 Create (full) model and predict the fare

In [None]:
%%time

### Prepare the data (this time every cleaned training row is used, without a hold-out split)
X = train_df.drop(['fare_amount', 'pickup_datetime'], axis=1)
y = train_df[['fare_amount']].values.ravel()

### Create model
# max_features is taken from X itself rather than from a frame defined in an earlier
# cell, so this cell only depends on train_df and test_df.
rfr_model = RandomForestRegressor(n_jobs=7, n_estimators=25, max_features=X.shape[1], max_depth=25, min_samples_split=3, min_samples_leaf=3, random_state=42)
# rfr_model = RandomForestRegressor(n_jobs=6, bootstrap=True, n_estimators=41, max_features=0.5, max_depth=22, min_samples_split=2, min_samples_leaf=3, max_leaf_nodes=49, random_state=42)
rfr_model.fit(X, np.log1p(y))   # the target is log1p-transformed ...

### Predict fare
# Select the test features by the training column names: this fixes their order and
# fails with a KeyError if a feature is missing from the test frame.
X_test = test_df.drop(['key', 'pickup_datetime'], axis=1)[X.columns]
y_pred = np.expm1(rfr_model.predict(X_test))   # ... so predictions are mapped back with expm1

In [None]:
### Save submission file
# The predictions are written into the sample submission frame by position.
# NOTE(review): this assumes the rows of sample_submission.csv are in the same order as test.csv — confirm.
submission_df['fare_amount'] = y_pred
submission_df.to_csv('submission.csv', index=False)   # index=False: no extra index column in the file

display(submission_df.head())
print('>>> submission_df shape=', submission_df.shape)
print('>>> File saved sucessfully.')

**Print total notebook runtime for debugging purposes:**

In [None]:
from datetime import timedelta

# Seconds elapsed since `start_time` was recorded in the first code cell,
# printed as H:MM:SS via timedelta.
elapsed_time = (time.time() - start_time)
print('>>> Runtime:', str(timedelta(seconds=elapsed_time)))