In [81]:
import pandas as pd
import seaborn as sns
import missingno as msno
import matplotlib.pyplot as plt
import numpy as np

In [82]:
# Load the training frame from the local pickle. Unpickling executes arbitrary
# code, so only read pickle files that come from a trusted source.
train = pd.read_pickle("clean-data/train.pkl")

# Preview the first five rows.
train.head(n=5)

Unnamed: 0,order_id,user_id,platform,personal_or_business,placement_date,placement_weekday,placement_time,confirm_date,confirm_weekday,confirm_time,...,distance,temperature,pickup_lat,pickup_long,dest_lat,dest_long,rider_id,pickup_to_arrival_time,placement_to_confirm_time,confirm_to_pickup_time
0,4211,633,3,Business,9,5,34546,9,5,34810,...,4,20.4,-1.317755,36.83037,-1.300406,36.829741,432,745,264,2840
1,25375,2285,3,Personal,12,5,40576,12,5,41001,...,16,26.4,-1.351453,36.899315,-1.295004,36.814358,856,1993,425,1248
2,1899,265,3,Business,30,2,45565,30,2,45764,...,3,23.258889,-1.308284,36.843419,-1.300921,36.828195,155,455,199,619
3,9336,1402,3,Business,15,5,33934,15,5,33965,...,9,19.2,-1.281301,36.832396,-1.257147,36.795063,855,1341,31,1021
4,27883,1737,1,Personal,13,1,35718,13,1,35778,...,9,15.4,-1.266597,36.792118,-1.295041,36.809817,770,1214,60,545


In [83]:
# Combine the rider attributes with the orders in `train`, joining on rider_id.
riders = pd.read_pickle("clean-data/riders.pkl")

# how="inner" spells out the default join: orders whose rider_id has no match
# in `riders` are dropped. validate="many_to_one" makes pandas raise a
# MergeError if rider_id is duplicated in `riders`, which would otherwise
# silently duplicate order rows.
train = train.merge(riders, on="rider_id", how="inner", validate="many_to_one")

In [84]:
# Drop the raw date/weekday/time columns together with the identifier columns.
dropped = [
    # order placement
    "placement_date", "placement_weekday", "placement_time",
    # order confirmation
    "confirm_date", "confirm_weekday", "confirm_time",
    # rider arrival at pickup
    "arrival_pickup_date", "arrive_pickup_weekday", "arrive_time",
    # pickup
    "pickup_date", "pickup_weekday", "pickup_time",
    # arrival at destination
    "arrival_dest_date", "arrival_dest_weekday", "arrival_dest_time",
    # identifiers
    "order_id", "user_id", "rider_id",
]

# Rebind instead of using inplace=True: the result is the same frame contents,
# but the frame produced by the previous cell is not mutated behind the
# reader's back. A missing column still raises KeyError, so typos in the list
# above are not hidden.
train = train.drop(columns=dropped)

In [85]:
# Preview the first five rows of `train` at this stage.
train.head(n=5)

Unnamed: 0,platform,personal_or_business,distance,temperature,pickup_lat,pickup_long,dest_lat,dest_long,pickup_to_arrival_time,placement_to_confirm_time,confirm_to_pickup_time,no_of_orders,age,average_rating,no_of_ratings
0,3,Business,4,20.4,-1.317755,36.83037,-1.300406,36.829741,745,264,2840,1637,1309,13.8,549
1,3,Personal,20,24.5,-1.326774,36.787807,-1.356237,36.904295,2886,13,1927,1637,1309,13.8,549
2,3,Business,6,24.7,-1.255189,36.782203,-1.273412,36.818206,2615,1268,1219,1637,1309,13.8,549
3,3,Personal,18,15.2,-1.290315,36.757377,-1.22352,36.802061,2986,19,1577,1637,1309,13.8,549
4,2,Personal,7,19.2,-1.273524,36.79922,-1.300431,36.752427,1602,86,1036,1637,1309,13.8,549


In [73]:
# Replace the listed numerical columns with their quartile bins: pd.qcut with
# q=4 assigns every value to one of four equal-frequency intervals.
cols = [
    "temperature",
    "age",
    "no_of_orders",
    "placement_to_confirm_time",
    "confirm_to_pickup_time",
    "no_of_ratings",
]

# Build all binned columns in one pass; assign() overwrites the existing
# columns in their current positions.
train = train.assign(**{col: pd.qcut(train[col], q=4) for col in cols})

In [74]:
# Preview the first five rows of `train` at this stage.
train.head(n=5)

Unnamed: 0,platform,personal_or_business,distance,temperature,pickup_lat,pickup_long,dest_lat,dest_long,pickup_to_arrival_time,placement_to_confirm_time,confirm_to_pickup_time,no_of_orders,age,average_rating,no_of_ratings
0,3,Business,4,"(11.199, 21.4]",-1.317755,36.83037,-1.300406,36.829741,745,"(69.0, 289.0]","(1939.0, 9836.0]","(1212.0, 2311.0]","(1236.0, 3764.0]",13.8,"(495.0, 2298.0]"
1,3,Personal,20,"(23.259, 25.3]",-1.326774,36.787807,-1.356237,36.904295,2886,"(-0.001, 27.0]","(1269.0, 1939.0]","(1212.0, 2311.0]","(1236.0, 3764.0]",13.8,"(495.0, 2298.0]"
2,3,Business,6,"(23.259, 25.3]",-1.255189,36.782203,-1.273412,36.818206,2615,"(289.0, 176725.0]","(836.0, 1269.0]","(1212.0, 2311.0]","(1236.0, 3764.0]",13.8,"(495.0, 2298.0]"
3,3,Personal,18,"(11.199, 21.4]",-1.290315,36.757377,-1.22352,36.802061,2986,"(-0.001, 27.0]","(1269.0, 1939.0]","(1212.0, 2311.0]","(1236.0, 3764.0]",13.8,"(495.0, 2298.0]"
4,2,Personal,7,"(11.199, 21.4]",-1.273524,36.79922,-1.300431,36.752427,1602,"(69.0, 289.0]","(836.0, 1269.0]","(1212.0, 2311.0]","(1236.0, 3764.0]",13.8,"(495.0, 2298.0]"


In [79]:
# Dry run of the one-hot encoding: show the dtype of every column the encoded
# frame would have. `train` itself is not reassigned here.
pd.get_dummies(data=train).dtypes

distance                                         int64
pickup_lat                                     float64
pickup_long                                    float64
dest_lat                                       float64
dest_long                                      float64
pickup_to_arrival_time                           int64
average_rating                                 float64
platform_1                                       uint8
platform_2                                       uint8
platform_3                                       uint8
platform_4                                       uint8
personal_or_business_Business                    uint8
personal_or_business_Personal                    uint8
temperature_(11.199, 21.4]                       uint8
temperature_(21.4, 23.259]                       uint8
temperature_(23.259, 25.3]                       uint8
temperature_(25.3, 32.1]                         uint8
placement_to_confirm_time_(-0.001, 27.0]         uint8
placement_

In [80]:
# One-hot encode `train` with the pandas defaults (no explicit `columns`
# list) and rebind `train` to the encoded frame.
train = pd.get_dummies(data=train)

In [88]:
# Split `train` into the model inputs and the label column.
# NOTE(review): the recorded execution counts of the binning/encoding cells
# (73-80) are lower than that of the cell that reloads `train` (82), so the
# frame split here may not include those steps -- confirm with
# Restart Kernel -> Run All before relying on the saved pickles.
features = train.drop(columns=["pickup_to_arrival_time"])

# Selecting with a one-element list keeps the target as a single-column
# DataFrame rather than a Series.
target = train.loc[:, ["pickup_to_arrival_time"]]

In [89]:
# Persist the target frame next to the other cleaned data files.
target.to_pickle(path="clean-data/target.pkl")

In [90]:
# Persist the feature frame next to the other cleaned data files.
features.to_pickle(path="clean-data/features.pkl")