# Understanding the data (Data Loading and Initial Exploration)

In [60]:
#import needed libraries

import pandas as pd #for data manipulation(rows, columns,..etc)
import numpy as np 
import seaborn as sns #for visualization
%matplotlib inline

In [61]:
#load the Uber request records from the .csv file into a DataFrame

data = pd.read_csv('Uber Request Data.csv', sep = ',') #sep = ',' because the file is comma-separated

In [62]:
data.head() #preview the first five rows of the dataset

Unnamed: 0,Request id,Pickup point,Driver id,Status,Request timestamp,Drop timestamp
0,619,Airport,1.0,Trip Completed,11/7/2016 11:51,11/7/2016 13:00
1,867,Airport,1.0,Trip Completed,11/7/2016 17:57,11/7/2016 18:47
2,1807,City,1.0,Trip Completed,12/7/2016 9:17,12/7/2016 9:58
3,2532,Airport,1.0,Trip Completed,12/7/2016 21:08,12/7/2016 22:03
4,3112,City,1.0,Trip Completed,13-07-2016 08:33:16,13-07-2016 09:25:47


In [63]:
data.tail() #preview the last five rows of the dataset

Unnamed: 0,Request id,Pickup point,Driver id,Status,Request timestamp,Drop timestamp
6740,6745,City,,No Cars Available,15-07-2016 23:49:03,
6741,6752,Airport,,No Cars Available,15-07-2016 23:50:05,
6742,6751,City,,No Cars Available,15-07-2016 23:52:06,
6743,6754,City,,No Cars Available,15-07-2016 23:54:39,
6744,6753,Airport,,No Cars Available,15-07-2016 23:55:03,


In [64]:
data.info() #summarize the column dtypes, non-null counts and total number of entries

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6745 entries, 0 to 6744
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Request id         6745 non-null   int64  
 1   Pickup point       6745 non-null   object 
 2   Driver id          4095 non-null   float64
 3   Status             6745 non-null   object 
 4   Request timestamp  6745 non-null   object 
 5   Drop timestamp     2831 non-null   object 
dtypes: float64(1), int64(1), object(4)
memory usage: 316.3+ KB


In [65]:
data.isna().sum() #count missing values per column: 2650 missing driver ids and 3914 missing drop timestamps
#missing driver ids presumably correspond to "No Cars Available" requests, where no driver could be assigned - confirm against Status
#missing drop timestamps presumably cover both "No Cars Available" and cancelled requests, since neither ends in a drop-off - confirm against Status

Request id              0
Pickup point            0
Driver id            2650
Status                  0
Request timestamp       0
Drop timestamp       3914
dtype: int64

In [66]:
data.describe() #summary statistics for the numeric columns
#there are 6745 entries (rows), so we would expect the request ids to run from 1 to 6745,
#but the maximum request id is 6766, which suggests that some request ids are missing from the data

Unnamed: 0,Request id,Driver id
count,6745.0,4095.0
mean,3384.644922,149.501343
std,1955.099667,86.051994
min,1.0,1.0
25%,1691.0,75.0
50%,3387.0,149.0
75%,5080.0,224.0
max,6766.0,300.0


# Data Preprocessing

In [67]:
data.sort_values(by = ['Request id']) #display the data sorted by request id (the result is not assigned, so data itself keeps its original order)

Unnamed: 0,Request id,Pickup point,Driver id,Status,Request timestamp,Drop timestamp
2700,1,Airport,285.0,Trip Completed,11/7/2016 0:20,11/7/2016 0:51
4098,2,Airport,,No Cars Available,11/7/2016 0:23,
776,3,Airport,80.0,Trip Completed,11/7/2016 0:24,11/7/2016 1:31
4101,4,City,,No Cars Available,11/7/2016 0:37,
2506,5,Airport,264.0,Trip Completed,11/7/2016 0:36,11/7/2016 1:35
...,...,...,...,...,...,...
2534,6762,Airport,267.0,Trip Completed,15-07-2016 00:07:29,15-07-2016 00:52:50
2137,6763,City,224.0,Trip Completed,15-07-2016 00:04:44,15-07-2016 01:06:42
2324,6764,City,243.0,Trip Completed,15-07-2016 00:06:12,15-07-2016 01:17:53
6165,6765,Airport,,No Cars Available,15-07-2016 00:09:09,


In [68]:
duplicateValues = data.duplicated() #boolean mask marking rows that fully repeat an earlier row
#dropna=False keeps duplicated rows that contain NaN in the tally; by default value_counts skips such rows,
#which would hide duplicates among requests with a missing driver id or drop timestamp
data[duplicateValues].value_counts(dropna=False) #empty series means no duplicate records

Series([], Name: count, dtype: int64)

In [69]:
data = data.dropna(subset=['Driver id']) #drop rows with a null driver id, as a request without an assigned driver is treated as invalid here
#NOTE(review): this presumably removes the "No Cars Available" requests entirely, since those rows have no driver id - confirm this is intended
#rows with a null drop timestamp are kept; the intent is to treat them as cancelled requests later on
data.isna().sum()

Request id              0
Pickup point            0
Driver id               0
Status                  0
Request timestamp       0
Drop timestamp       1264
dtype: int64

In [73]:
#driver id was loaded as float64, so check whether any id has a fractional part, which would not be a valid id
#(a float dtype alone does not prove this: pandas also stores integer columns as float when they contain NaN)
def checkDecimal(data, column):
    """Return a copy of ``data`` without the rows whose ``column`` value has a fractional part.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to validate. It is not modified.
    column : str
        Name of a numeric column that is expected to hold whole numbers only.

    Returns
    -------
    pandas.DataFrame
        A new frame with every row dropped where ``data[column]`` is not a whole
        number (for example 12.5). Rows holding NaN or infinite values are kept,
        because they are not decimal values; handle them separately.
    """
    values = data[column]

    #the remainder of a division by 1 is the fractional part; it is NaN for NaN/inf inputs
    fractionalPart = values % 1
    hasFraction = (fractionalPart.notna() & (fractionalPart != 0)).to_numpy()

    #collect the index labels of the invalid rows and drop them in one call
    indciesToDrop = values.index[hasFraction]

    return data.drop(indciesToDrop, axis=0)

data = checkDecimal(data, 'Driver id')

#there were 4095 driver id records before this check; a smaller count here would mean rows with invalid decimal ids were dropped
data['Driver id'].info()

<class 'pandas.core.series.Series'>
Index: 4095 entries, 0 to 4094
Series name: Driver id
Non-Null Count  Dtype  
--------------  -----  
4095 non-null   float64
dtypes: float64(1)
memory usage: 64.0 KB


In [76]:
#no decimal driver ids were found (the row count stayed at 4095), so the column can safely be cast to integer
data['Driver id'] = data['Driver id'].astype(int)
data['Driver id'].info()

<class 'pandas.core.series.Series'>
Index: 4095 entries, 0 to 4094
Series name: Driver id
Non-Null Count  Dtype
--------------  -----
4095 non-null   int64
dtypes: int64(1)
memory usage: 64.0 KB
