## Data cleaning, preparing for ML models

In [3]:
import pandas as pd
from taxipred.utils.constants import DATA_PATH
import matplotlib.pyplot as plt

# Load the raw taxi pricing data and preview the first rows.
csv_path = DATA_PATH / "taxi_trip_pricing.csv"
df = pd.read_csv(csv_path)
df.head()


Unnamed: 0,Trip_Distance_km,Time_of_Day,Day_of_Week,Passenger_Count,Traffic_Conditions,Weather,Base_Fare,Per_Km_Rate,Per_Minute_Rate,Trip_Duration_Minutes,Trip_Price
0,19.35,Morning,Weekday,3.0,Low,Clear,3.56,0.8,0.32,53.82,36.2624
1,47.59,Afternoon,Weekday,1.0,High,Clear,,0.62,0.43,40.57,
2,36.87,Evening,Weekend,1.0,High,Clear,2.7,1.21,0.15,37.27,52.9032
3,30.33,Evening,Weekday,4.0,Low,,3.48,0.51,0.15,116.81,36.4698
4,,Evening,Weekday,3.0,High,Clear,2.93,0.63,0.32,22.64,15.618


## Selecting Columns for the Prediction Model


In [4]:
# Check column dtypes and non-null counts before choosing the model features.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 11 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       950 non-null    float64
 1   Time_of_Day            950 non-null    object 
 2   Day_of_Week            950 non-null    object 
 3   Passenger_Count        950 non-null    float64
 4   Traffic_Conditions     950 non-null    object 
 5   Weather                950 non-null    object 
 6   Base_Fare              950 non-null    float64
 7   Per_Km_Rate            950 non-null    float64
 8   Per_Minute_Rate        950 non-null    float64
 9   Trip_Duration_Minutes  950 non-null    float64
 10  Trip_Price             951 non-null    float64
dtypes: float64(7), object(4)
memory usage: 86.1+ KB


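### I chose to drop Base_Fare, Per_Km_Rate, and Per_Minute_Rate because, although they would increase the model's accuracy, they would give unrealistically good performance due to target leakage: in the sample rows above, Trip_Price equals Base_Fare + Trip_Distance_km × Per_Km_Rate + Trip_Duration_Minutes × Per_Minute_Rate. I also drop Passenger_Count and Time_of_Day, which are similar to Day_of_Week but feel less important.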

In [5]:
# Keep only the columns used for modelling. Base_Fare, Per_Km_Rate and
# Per_Minute_Rate are left out because they leak the target; Time_of_Day and
# Passenger_Count are left out as less important features.
# Selecting the kept columns (instead of dropping the rejected ones) makes this
# cell safe to re-run: a second df.drop(columns=[...]) would raise KeyError
# because the columns are already gone, while a typo here still fails loudly.
MODEL_COLUMNS = [
    "Trip_Distance_km",
    "Day_of_Week",
    "Traffic_Conditions",
    "Weather",
    "Trip_Duration_Minutes",
    "Trip_Price",
]
# .copy() gives an independent frame so later assignments do not warn about
# writing to a slice of the original.
df = df[MODEL_COLUMNS].copy()

In [6]:
# Confirm that only the six retained columns are left.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       950 non-null    float64
 1   Day_of_Week            950 non-null    object 
 2   Traffic_Conditions     950 non-null    object 
 3   Weather                950 non-null    object 
 4   Trip_Duration_Minutes  950 non-null    float64
 5   Trip_Price             951 non-null    float64
dtypes: float64(3), object(3)
memory usage: 47.0+ KB


### Cleaning Trip_Price

In [7]:
# Summary statistics of the target, to see its range and spot outliers.
df["Trip_Price"].describe()

count    951.000000
mean      56.874773
std       40.469791
min        6.126900
25%       33.742650
50%       50.074500
75%       69.099350
max      332.043689
Name: Trip_Price, dtype: float64

In [8]:
# Rows without a Trip_Price have no target to learn from, so keep only the
# rows where the price is present.
has_price = df["Trip_Price"].notna()
df = df[has_price]


In [9]:
# Check how many rows remain after dropping the missing prices.
df["Trip_Price"].shape

(951,)

In [10]:
# List the 50 highest prices to see where the expensive tail starts.
df["Trip_Price"].nlargest(50)

616    332.043689
287    329.913004
225    328.871769
141    327.217665
302    325.098950
481    322.725996
64     320.958664
532    296.088697
747    283.645201
27     280.877302
268    276.840597
110    274.535087
338    248.295209
588    247.598318
797    239.171407
108    233.008285
835    224.914663
267    206.699570
410    206.508652
22     201.869509
795    138.642500
671    129.535600
751    128.654900
522    124.241600
411    123.928000
725    122.418000
245    118.032100
278    117.746800
385    116.420600
478    114.941700
140    110.254400
560    109.896500
441    109.479600
810    109.430400
351    108.973400
758    108.232500
265    107.702500
437    106.296300
115    106.004200
93     105.941800
248    105.144000
76     104.242100
50     104.176400
717    104.155500
869    102.885000
200    102.880600
403    102.724600
615    102.430200
683    102.331600
577    101.991400
Name: Trip_Price, dtype: float64

In [11]:
# Re-check the summary statistics after removing rows with a missing price.
df["Trip_Price"].describe()

count    951.000000
mean      56.874773
std       40.469791
min        6.126900
25%       33.742650
50%       50.074500
75%       69.099350
max      332.043689
Name: Trip_Price, dtype: float64

### There are only 20 rows with a Trip_Price over 150. We remove these high-priced outliers (presumably the longest trips) but keep the low-priced trips.

In [12]:
# Count the trips priced above the 150 cut-off before removing them.
price_cutoff = 150
df["Trip_Price"].gt(price_cutoff).sum()

np.int64(20)

In [13]:
# Drop the high-price outliers: keep rows priced at or below the cut-off,
# then inspect what is left.
max_trip_price = 150
within_cutoff = df["Trip_Price"].le(max_trip_price)
df = df.loc[within_cutoff]
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 931 entries, 0 to 999
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Trip_Distance_km       881 non-null    float64
 1   Day_of_Week            885 non-null    object 
 2   Traffic_Conditions     881 non-null    object 
 3   Weather                886 non-null    object 
 4   Trip_Duration_Minutes  885 non-null    float64
 5   Trip_Price             931 non-null    float64
dtypes: float64(3), object(3)
memory usage: 50.9+ KB


### Cleaning the rest of the NaN values

In [14]:
# Remove every row that still lacks a value in any of the feature columns.
feature_columns = [
    "Trip_Distance_km",
    "Day_of_Week",
    "Traffic_Conditions",
    "Weather",
    "Trip_Duration_Minutes",
]
df = df.dropna(subset=feature_columns, how="any")


In [16]:
# Confirm that no missing values remain in any column.
df.isna().sum()

Trip_Distance_km         0
Day_of_Week              0
Traffic_Conditions       0
Weather                  0
Trip_Duration_Minutes    0
Trip_Price               0
dtype: int64