In [14]:
import pandas as pd
import sklearn
import numpy as np
import os
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn import svm
from sklearn.compose import ColumnTransformer
from sklearn.metrics import f1_score, accuracy_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn import tree
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from joblib import dump, load
from sklearn.kernel_approximation import RBFSampler


After the preliminary analysis, we can proceed with the training. We are using 3 extra features that were created, along with all numeric columns. The instructions are used as well, with the index, order_id, customer_id geohash and postalcode being dropped.

In [2]:
deliveries_data = pd.read_csv('file_for_training.csv')
deliveries_data.head()

Unnamed: 0,index,order_id,order_value,order_items_count,customer_id,created_timestamp_local,has_instruction,logistics_dropoff_distance,delivery_postal_code,delivery_geohash_precision8,...,has_call_instruction,has_leave_instruction,has_lift_instruction,has_lobby_instruction,has_gate_instruction,has_knock_instruction,has_bell_instruction,hour,is_w21xz,is_w23b7
0,618,b8ds-tqhf,40.44,10,h6huq0td,2021-03-11T20:19:50,True,58.656138,828868,w23bh248,...,False,False,False,False,False,False,False,20,False,False
1,619,x5xx-j2d2,35.6,10,t2nod4rq,2021-03-12T18:42:48,True,39.025148,530548,w21zu55k,...,False,False,False,False,False,False,False,18,False,False
2,620,x6qu-14kt,42.49,12,m4ycd3tm,2021-03-27T11:29:17,True,40.067811,650248,w21z9jc4,...,False,True,False,False,False,False,False,11,False,False
3,621,x3yw-rhc5,14.4,11,sgmnf9sb,2021-03-02T22:08:52,True,44.986785,550105,w21zg97y,...,False,False,False,False,False,False,False,22,False,False
4,622,y9zp-58q7,38.15,10,x4rvv3iy,2021-03-23T20:35:26,True,71.520828,102112,w21z61q6,...,False,False,False,False,False,False,False,20,False,False


In [3]:
num_features = ['order_value', 'order_items_count']
fact_features = ["hour","is_w21xz","is_w23b7",'has_instruction', 'has_phone_number', 'has_call_instruction', 'has_leave_instruction', 'has_lift_instruction', 'has_lobby_instruction', 'has_gate_instruction', 'has_knock_instruction', 'has_bell_instruction']

target = ['logistics_dropoff_distance']
# print(deliveries_data.columns)


numeric_features = deliveries_data.select_dtypes(include=['int64', 'float64']).drop(['index', 'logistics_dropoff_distance'], axis=1).columns
categorical_features = deliveries_data.select_dtypes(include=['object', 'bool']).drop(['delivery_geohash_precision8','created_timestamp_local', 'delivery_postal_code', 'order_id', 'customer_id'], axis = 1).columns



In [4]:
#shuffle plus splits
deliveries_data = deliveries_data.sample(frac=1, random_state=19).reset_index(drop=True)
X_train, X_test, y_train, y_test = train_test_split(deliveries_data[num_features + fact_features], deliveries_data['logistics_dropoff_distance'].tolist(), test_size=0.20, random_state=19)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.20,
#                                                                               random_state=19)


In [5]:
X_train.columns

Index(['order_value', 'order_items_count', 'hour', 'is_w21xz', 'is_w23b7',
       'has_instruction', 'has_phone_number', 'has_call_instruction',
       'has_leave_instruction', 'has_lift_instruction',
       'has_lobby_instruction', 'has_gate_instruction',
       'has_knock_instruction', 'has_bell_instruction'],
      dtype='object')

We obtain a baseline error for both RMSE and MAE. This will help set benchmarks and help us understand the benefits of models. Evaluations selected for continous variables are Mean Absolute Error and Root Mean Square Error. MAE will help understand the actual average error whereas RMSE helps from the persepctive of model improvement.

In [6]:
#setting two baselines - 
y_pred_test_base1 = [np.mean(y_train)]*len(y_test)

#Evaluate using Mean Absolute Error & RMSE
print((mean_squared_error(y_pred_test_base1, y_test, squared=False)))

print((mean_absolute_error(y_pred_test_base1, y_test)))

76.78102851459481
38.10485845954867


Using the mean of the entire dataset, we get an error of 76.53m for RMSE and 38.09m for MAE

We've earlier seen the distribution and are aware that more than 90% of distances are within 100m and 97% are within 200m. We use a weighted mean to predict the dropoff distance

In [7]:
#setting baseline with weighted mean of 97% of observations
'''
obs between 0-100, 100-200
print(len([item for item in y_train if item < 100]))
print(len([item for item in y_train if 100 <= item < 201]))
'''
def calculate_weighted_mean(list_of_values, ranges_list, counts):
    assert len(counts)==len(ranges_list) 
    weighted_mean = 0
    total_count = len([val for val in list_of_values if val <200])
    for n,interval in enumerate(ranges_list):
        weight = counts[n]/total_count
        list_subset = [v for v in list_of_values if interval[0] <= v < interval[1]]
        weighted_mean += weight*np.mean(list_subset)
        
        
        
    return weighted_mean

weighted_mean = calculate_weighted_mean(y_train, [(0,100), (100,200)], [1484737,114919])
    

#Evaluate using Mean Absolute Error & RMSE

y_pred_test_base2 = [weighted_mean]*len(y_test)
print((mean_squared_error(y_pred_test_base2, y_test, squared=False)))

print((mean_absolute_error(y_pred_test_base2, y_test)))


79.25556496206427
34.438615305002855


Using the mean of the main chunk of the dataset, we have significantly reduced the average error to 34.40m for MAE. Note that we get penalized for huge distances, thus RMSE increased to 78.99m

Baseline Accuracy

In [8]:
baseline = pd.DataFrame({'Model': ['Entire dataset mean', 'Below 200m mean'], 'RMSE':[76.53, 78.99], 'MAE':[38.09, 34.40]})
baseline.head()

Unnamed: 0,Model,RMSE,MAE
0,Entire dataset mean,76.53,38.09
1,Below 200m mean,78.99,34.4


In [9]:
oe = OneHotEncoder()
numeric_transformer = Pipeline(steps=[
    ('scaler', RobustScaler())])
categorical_transformer = Pipeline(steps=[('encoder',oe )])
preprocessor = ColumnTransformer(
transformers=[
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features)])

linear_classifiers = [LinearRegression(), Ridge(alpha=0.1, max_iter=5000 ,solver='sparse_cg'), Lasso(selection='random', warm_start=True,positive=True)]

lr_text= ['Linear Regression', 'Ridge Regression', 'Lasso Regression']
for n,classifier in enumerate(linear_classifiers):
    linear_dict = {}

    print(classifier)
    linear_dict['Model'] = (lr_text[n])
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                  ('classifier', classifier)])
    
    model = pipe.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print("Root Mean Squared Error is..")
    print(mean_squared_error(y_pred, y_test, squared=False))
    linear_dict['RMSE'] = mean_squared_error(y_pred, y_test, squared=False)
    print("Mean Absolute Error is..")
    print((mean_absolute_error(y_pred, y_test)))
    linear_dict['MAE'] = mean_absolute_error(y_pred, y_test)
    baseline = baseline.append(pd.Series(linear_dict), ignore_index = True)

LinearRegression()
Root Mean Squared Error is..
76.74259062448374
Mean Absolute Error is..
38.06008007220691
Ridge(alpha=0.1, max_iter=5000, solver='sparse_cg')
Root Mean Squared Error is..
76.74256992903723
Mean Absolute Error is..
38.060105607939775
Lasso(positive=True, selection='random', warm_start=True)
Root Mean Squared Error is..
76.77391629837416
Mean Absolute Error is..
38.09765332558676


In [10]:
baseline.head()

Unnamed: 0,Model,RMSE,MAE
0,Entire dataset mean,76.53,38.09
1,Below 200m mean,78.99,34.4
2,Linear Regression,76.742591,38.06008
3,Ridge Regression,76.74257,38.060106
4,Lasso Regression,76.773916,38.097653


Looking at the figures, it indicates this cannot be modelled effectively with linear models and we might need to make use of non linear kernel based models

SVR and RF take time to train hence we are using 10% of the data to reduce the training time. To represent the data faithfully, we take 10 percent of samples from each geohash precision level 6 so that the most occuring ones are greater in numbers in the training set as well and vice versa

In [11]:
def stratified_sample(df, against, fraction=0.1):
    if df[against].dtype == np.int64 or df[against].dtype == np.float64:
        #throw some error or convert to categorical
        return None
    unique_values = pd.unique(df[against].values)
    sampled = pd.DataFrame()
    for value in unique_values:
        subset = df[df[against] == value]
        subset = subset.sample(frac=fraction, random_state=19)
        sampled = sampled.append(subset)
    return sampled
deliveries_data['delivery_geohash_precision6'] = deliveries_data['delivery_geohash_precision8'].apply(lambda x: x[:6])
sampled = stratified_sample(deliveries_data, against='delivery_geohash_precision6')


print(len(sampled))



257625


In [12]:
if 'delivery_geohash_precision6' in sampled.columns:
    sampled.drop(['delivery_geohash_precision6'], inplace=True, axis=1)
    
X_train_small, X_test_small, y_train_small, y_test_small = train_test_split(sampled[num_features + fact_features], sampled['logistics_dropoff_distance'].tolist(), test_size=0.20, random_state=19)



In [22]:
classifier = svm.SVR(max_iter=50000)


pipe = Pipeline(steps=[('preprocessor', preprocessor),
                  ('classifier', classifier)])
    
model = pipe.fit(X_train_small, y_train_small)

y_pred = model.predict(X_test_small)



In [24]:
linear_dict = {}
linear_dict['Model'] = 'SVR'
print("Root Mean Squared Error is..")
print(mean_squared_error(y_pred, y_test_small, squared=False))
linear_dict['RMSE'] = mean_squared_error(y_pred, y_test_small, squared=False)
print("Mean Absolute Error is..")
print((mean_absolute_error(y_pred, y_test_small)))
linear_dict['MAE'] = mean_absolute_error(y_pred, y_test_small)
baseline = baseline.append(pd.Series(linear_dict), ignore_index = True)

Root Mean Squared Error is..
81.98866607440205
Mean Absolute Error is..
37.14570226792991


In [25]:
y_pred = model.predict(X_test)
linear_dict = {}
linear_dict['Model'] = 'SVR'
print("Root Mean Squared Error is..")
print(mean_squared_error(y_pred, y_test, squared=False))
linear_dict['RMSE'] = mean_squared_error(y_pred, y_test, squared=False)
print("Mean Absolute Error is..")
print((mean_absolute_error(y_pred, y_test)))
linear_dict['MAE'] = mean_absolute_error(y_pred, y_test)
baseline = baseline.append(pd.Series(linear_dict), ignore_index = True)

Root Mean Squared Error is..
83.28661393767834
Mean Absolute Error is..
37.72021894013785


In [29]:
baseline.head()

Unnamed: 0,Model,RMSE,MAE
0,Entire dataset mean,76.53,38.09
1,Below 200m mean,78.99,34.4
2,Linear Regression,76.742591,38.06008
3,Ridge Regression,76.74257,38.060106
4,Lasso Regression,76.773916,38.097653


In [13]:
# dump(model, '../../model/svr.joblib') 

Using SVR we get an MAE of 37.72m when training on 10% of the dataset and RMSE of 83.2m

To reduce the time we make use of Kernel Estimations

In [15]:
tree_classifiers = [tree.DecisionTreeRegressor(min_samples_split = 10000,criterion = "absolute_error", max_depth = 20)]

for classifier in tree_classifiers:
    print(classifier)
    pipe = Pipeline(steps=[('preprocessor', preprocessor),
                  ('classifier', classifier)])
    linear_dict = {}
    linear_dict['Model'] = 'Decision Trees'
    model = pipe.fit(X_train_small, y_train_small)
    print("Train complete..")

    y_pred = model.predict(X_test_small)
    print("Root Mean Squared Error is..")
    print((mean_squared_error(y_pred, y_test_small, squared=False)))
    linear_dict['RMSE'] = mean_squared_error(y_pred, y_test_small, squared=False)

    print("Mean Absolute Error is..")
    print((mean_absolute_error(y_pred, y_test_small)))
    linear_dict['MAE'] = mean_absolute_error(y_pred, y_test_small)
    baseline = baseline.append(pd.Series(linear_dict), ignore_index = True)



DecisionTreeRegressor(criterion='absolute_error', max_depth=20,
                      min_samples_split=10000)
Train complete..
Root Mean Squared Error is..
77.57070208367331
Mean Absolute Error is..
33.84662477024509


In [17]:
# dump(model, '../../model/dt_model.joblib')

['../../model/dt_model.joblib']

In [19]:
print(baseline)

                 Model       RMSE        MAE
0  Entire dataset mean  76.530000  38.090000
1      Below 200m mean  78.990000  34.400000
2    Linear Regression  76.742591  38.060080
3     Ridge Regression  76.742570  38.060106
4     Lasso Regression  76.773916  38.097653
5                  SVR  81.980000  37.720000
6       Decision Trees  77.570700  33.840000


In [21]:
classifier = RandomForestRegressor(n_estimators=16, criterion='absolute_error',min_samples_leaf=10,min_samples_split=1000, n_jobs=16)

pipe = Pipeline(steps=[('preprocessor', preprocessor),
              ('classifier', classifier)])
linear_dict = {}
linear_dict['Model'] = 'Random Forest'
model = pipe.fit(X_train_small, y_train_small)
print("Train complete..")

y_pred = model.predict(X_test_small)
print("Root Mean Squared Error is..")
print((mean_squared_error(y_pred, y_test_small, squared=False)))
linear_dict['RMSE'] = mean_squared_error(y_pred, y_test_small, squared=False)

print("Mean Absolute Error is..")
print((mean_absolute_error(y_pred, y_test_small)))
linear_dict['MAE'] = mean_absolute_error(y_pred, y_test_small)
baseline = baseline.append(pd.Series(linear_dict), ignore_index = True)



Train complete..
Root Mean Squared Error is..
77.53440615687191
Mean Absolute Error is..
33.8552488657673


In [22]:
dump(model, '../../model/rf_model.joblib')

['../../model/rf_model.joblib']

In [23]:
print(baseline)

                 Model       RMSE        MAE
0  Entire dataset mean  76.530000  38.090000
1      Below 200m mean  78.990000  34.400000
2    Linear Regression  76.742591  38.060080
3     Ridge Regression  76.742570  38.060106
4     Lasso Regression  76.773916  38.097653
5                  SVR  81.980000  37.720000
6       Decision Trees  77.570700  33.840000
7        Random Forest  77.534406  33.855249


## **Conclusion:**
1. We started with the baseline from two different mean measures and see how we could improve
2. Linear models after parameter tweaks could only improve the error marginally implying that - Given the set of features, the mapping between the input and output variables aren't linear. 
3. Of course certain more features extracted could have improved the predictions a bit
4. We proceed with SVR using an rbf kernel which was too time consuming. We used a stratified sample of the data to reduce the training time and were able to obtain a slightly better model
5. Decision Trees and Random Forests provided a lower error. I decided to proceed with Trees Ensemble i.e Random Forest as it uses aggregation to avoid overfitting and would generalise better on unseen data. Since there are many categorical variables, Trees outperform other models in terms of MAE(slightly) and Random Forests are much more robust to unseen data.  

## **Way Forward:**
1. Better Features i.e more features could be extracted from geohash and timestamps
2. Extracted features could be clustered for sub-100m for an improved prediction
3. Missing information on customer's nth order and rider history which can help improve significantly
4. Turning this problem into a classification problem and identifying the actual group for which the problem needs to be solved. <50m is okay given the inaccuracies in gps and dropped pins to the nearest reachable road. Also serveral classification metrics could be used to identify key areas to focus
5. Experimenting with evaluation metrics apart from absolute error- like squared errors and logs
6. Experimenting with kernel approximations
7. Adding confidence to predictions  
8. Using better feature engineering, the error can be further reduced, or rather from a business perspective, it could be used as a classification problem to group the distance into bins with a dedicated model for each bin.



## Front End Suggestions:

**Problems**  
The main problem according to me are the outliers and the distances exceeding 60m  
Anything greater than 800m to 1km is an outlier, and the possible reasons for this are:      
Customer enters the address and the suggestions map some place which is a few kms away or even in a different country (which can explain why we have a 50000+ m value too). Customer selects it unaware that it doesn’t match his gps location  
Location permission is perhaps turned off  
Customer is ordering for someone else   
Customer is travelling  
Random Reasons could influence someone wanting the rider to drop off because of certain set of rules within places for eg, a housing colony doesn’t allow outsiders to enter and the security guards pick it up at the gate for the customer  
Issue with address suggestions which aren’t reflective of actual location on map  


**Solutions:**  
In general, location data should be compared with the dropped pin and in case of a distance mismatch, there should be a pop-up or prompt message to highlight to them that the location is 200+ or n+ metres away from their actual location. This will assist in tackling issues of locations set far away by users unknowingly  
Saved addresses or preselected addresses should then display the distance of the dropped pin from location of user to highlight the actual distance as seen below  
Highlight instruction fields (to encourage user to fill)  in case dropped pin is more than a certain distance away from the actual location  
More granular pins for areas with high order density such that dropoff distances are minimised  
Suggestion ranking score when an address is typed, with confidence that is inversely proportional to the distance

