In [1]:
import os

import pandas as pd

# Step up to the project root exactly once per kernel session.
# A bare os.chdir("../") climbs one more directory every time this cell is
# re-run, after which the relative "data/..." paths used by later cells no
# longer resolve. The sentinel keeps the cell idempotent: on a fresh kernel it
# behaves exactly like the unguarded call; on a re-run it pins the working
# directory back to the root recorded the first time.
if "PROJECT_ROOT" not in globals():
    os.chdir("../")
    PROJECT_ROOT = os.getcwd()
else:
    os.chdir(PROJECT_ROOT)

# NOTE(review): this import presumably relies on the working directory being
# the project root (where the `src` package lives) — confirm.
from src import display_df

In [2]:
# Load the two raw tables (completed trips and driver locations at request
# time) from the project's data directory, in that order.
completed_orders, drivers_location = (
    pd.read_csv(csv_path)
    for csv_path in (
        "data/completed_orders.csv",
        "data/drivers_location_during_request.csv",
    )
)

In [3]:
# Preview the first seven completed trips to see the raw column layout.
completed_orders.head(n=7)

Unnamed: 0,Trip ID,Trip Origin,Trip Destination,Trip Start Time,Trip End Time
0,391996,"6.508813001668548,3.37740316890347","6.650969799999999,3.3450307",2021-07-01 07:28:04,2021-07-01 07:29:37
1,391997,"6.4316714,3.4555375","6.4280814653326,3.4721885847586",2021-07-01 06:38:04,2021-07-01 07:07:28
2,391998,"6.631679399999999,3.3388976","6.508324099999999,3.3590397",2021-07-01 06:21:02,2021-07-01 07:02:23
3,391999,"6.572757200000001,3.3677082","6.584881099999999,3.3614073",2021-07-01 07:16:07,2021-07-01 07:29:42
4,392001,"6.6010417,3.2766339","6.4501069,3.3916154",2021-07-01 09:30:59,2021-07-01 09:34:36
5,392005,"6.565087699999999,3.3844415","6.499696300000001,3.3509075",2021-07-01 10:53:36,2021-07-01 11:27:51
6,392009,"6.6636484,3.3082058","6.6185421,3.301634",2021-07-01 06:39:51,2021-07-01 07:41:12


In [4]:
# Preview the first ten driver-location records to see the raw column layout.
drivers_location.head(n=10)

Unnamed: 0,id,order_id,driver_id,driver_action,lat,lng,created_at,updated_at
0,1,392001,243828,accepted,6.602207,3.270465,,
1,2,392001,243588,rejected,6.592097,3.287445,,
2,3,392001,243830,rejected,6.596133,3.281784,,
3,4,392001,243539,rejected,6.596142,3.280526,,
4,5,392001,171653,rejected,6.609232,3.2888,,
5,6,392001,245662,rejected,6.593095,3.287759,,
6,7,392001,171651,rejected,6.613538,3.270779,,
7,8,392001,243906,rejected,6.591838,3.276124,,
8,9,392001,226949,rejected,6.610941,3.276126,,
9,10,392001,243769,rejected,6.617199,3.271723,,


#### Analyze completed orders information

In [5]:
# Summarize the completed-orders frame: column dtypes, non-null counts and memory usage
completed_orders.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 536020 entries, 0 to 536019
Data columns (total 5 columns):
 #   Column            Non-Null Count   Dtype 
---  ------            --------------   ----- 
 0   Trip ID           536020 non-null  int64 
 1   Trip Origin       536020 non-null  object
 2   Trip Destination  536020 non-null  object
 3   Trip Start Time   534369 non-null  object
 4   Trip End Time     536019 non-null  object
dtypes: int64(1), object(4)
memory usage: 20.4+ MB


In [6]:
# Count the missing entries in each column of completed_orders
completed_orders.isna().sum()

Trip ID                0
Trip Origin            0
Trip Destination       0
Trip Start Time     1651
Trip End Time          1
dtype: int64

#### Analyze drivers location information

In [7]:
# Count the missing entries in each column of drivers_location
drivers_location.isna().sum()

id                     0
order_id               0
driver_id              0
driver_action          0
lat                    0
lng                    0
created_at       1557740
updated_at       1557740
dtype: int64

### Perform data preprocessing

In [8]:
from src.preprocess_data import GokadaDataPreprocessor

In [9]:
# Locations of the two raw CSVs handed to the preprocessor; both paths are
# relative to the current working directory.
completed_orders_path, drivers_location_path = (
    "data/completed_orders.csv",
    "data/drivers_location_during_request.csv",
)

# Build the preprocessor from the two source paths (orders first, then
# driver locations).
preprocessor = GokadaDataPreprocessor(
    completed_orders_path,
    drivers_location_path,
)


In [10]:
# Run the preprocessing pipeline and keep the returned frame for inspection.
# NOTE(review): judging by the method name, the result is presumably also
# written to the given CSV path — confirm in src/preprocess_data.py.
preprocessed_df = preprocessor.preprocess_and_save(
    "processed_data.csv",
)

In [11]:
# Render the preprocessed frame with the project's `display_df` helper (imported from `src`).
# NOTE(review): the recorded output is a markdown-style table; how many rows the
# helper prints is not visible here — confirm in src before running on the full frame.
display_df(preprocessed_df)

| order_id   | trip_origin         | trip_destination    | trip_start_time     | trip_end_time       | id   | driver_id   | driver_action   | drivers_lat   | drivers_lon   | day_of_week   | hour_of_day   | day_of_month   | month   | trip_start_date   | trip_end_date   | trip_origin_latitude   | trip_origin_longitude   | trip_destination_latitude   | trip_destination_longitude   |
|:-----------|:--------------------|:--------------------|:--------------------|:--------------------|:-----|:------------|:----------------|:--------------|:--------------|:--------------|:--------------|:---------------|:--------|:------------------|:----------------|:-----------------------|:------------------------|:----------------------------|:-----------------------------|
| 392001     | 6.6010417,3.2766339 | 6.4501069,3.3916154 | 2021-07-01 09:30:59 | 2021-07-01 09:34:36 | 1    | 243828      | accepted        | 6.60221       | 3.27046       | Thursday      | 9             | 1              | July    | 2

In [12]:
# Confirm how many missing entries remain in each column after preprocessing
preprocessed_df.isna().sum()

order_id                      0
trip_origin                   0
trip_destination              0
trip_start_time               0
trip_end_time                 0
id                            0
driver_id                     0
driver_action                 0
drivers_lat                   0
drivers_lon                   0
day_of_week                   0
hour_of_day                   0
day_of_month                  0
month                         0
trip_start_date               0
trip_end_date                 0
trip_origin_latitude          0
trip_origin_longitude         0
trip_destination_latitude     0
trip_destination_longitude    0
dtype: int64

In [13]:
# Inspect column dtypes and non-null counts after preprocessing.
# NOTE(review): the recorded output lists `order_id` as object and `id` /
# `driver_id` as float64, although the raw previews show integer ids —
# confirm whether these should be cast back to integer types.
preprocessed_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1557740 entries, 4 to 1585113
Data columns (total 20 columns):
 #   Column                      Non-Null Count    Dtype         
---  ------                      --------------    -----         
 0   order_id                    1557740 non-null  object        
 1   trip_origin                 1557740 non-null  object        
 2   trip_destination            1557740 non-null  object        
 3   trip_start_time             1557740 non-null  datetime64[ns]
 4   trip_end_time               1557740 non-null  datetime64[ns]
 5   id                          1557740 non-null  float64       
 6   driver_id                   1557740 non-null  float64       
 7   driver_action               1557740 non-null  object        
 8   drivers_lat                 1557740 non-null  float64       
 9   drivers_lon                 1557740 non-null  float64       
 10  day_of_week                 1557740 non-null  object        
 11  hour_of_day             