# Feature Engineering

In [31]:
import pandas as pd
import numpy as np

In [32]:
df = pd.read_csv("cleaned_dataset.csv")
df

Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count
0,2015-05-07 19:52:06.0000003,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1
1,2009-07-17 20:04:56.0000002,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.994710,40.750325,1
2,2009-08-24 21:45:00.00000061,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.740770,-73.962565,40.772647,1
3,2009-06-26 08:22:21.0000001,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3
4,2014-08-28 17:47:00.000000188,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5
...,...,...,...,...,...,...,...,...
199994,2012-10-28 10:49:00.00000053,3.0,2012-10-28 10:49:00+00:00,-73.987042,40.739367,-73.986525,40.740297,1
199995,2014-03-14 01:09:00.0000008,7.5,2014-03-14 01:09:00+00:00,-73.984722,40.736837,-74.006672,40.739620,1
199996,2009-06-29 00:42:00.00000078,30.9,2009-06-29 00:42:00+00:00,-73.986017,40.756487,-73.858957,40.692588,2
199997,2015-05-20 14:56:25.0000004,14.5,2015-05-20 14:56:25+00:00,-73.997124,40.725452,-73.983215,40.695415,1


In [33]:
df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'], errors='coerce')

In [34]:
df['hour'] = df['pickup_datetime'].dt.hour
df['day'] = df['pickup_datetime'].dt.day
df['month'] = df['pickup_datetime'].dt.month

In [35]:
df['day_of_week'] = df['pickup_datetime'].dt.dayofweek
df['day_name'] = df['pickup_datetime'].dt.day_name()


In [36]:
def classify_peak_hour(hour):
    if 7 <= hour <= 10 or 16 <= hour <= 20:
        return 'Peak'
    else:
        return 'Off-Peak'

df['peak_time'] = df['hour'].apply(classify_peak_hour)


In [37]:
df['is_weekend'] = df['day_of_week'].apply(lambda x: 1 if x >= 5 else 0)


In [38]:
print(df[['pickup_datetime', 'hour', 'day', 'month', 'day_of_week', 'day_name', 'peak_time', 'is_weekend']].head())


            pickup_datetime  hour  day  month  day_of_week  day_name  \
0 2015-05-07 19:52:06+00:00    19    7      5            3  Thursday   
1 2009-07-17 20:04:56+00:00    20   17      7            4    Friday   
2 2009-08-24 21:45:00+00:00    21   24      8            0    Monday   
3 2009-06-26 08:22:21+00:00     8   26      6            4    Friday   
4 2014-08-28 17:47:00+00:00    17   28      8            3  Thursday   

  peak_time  is_weekend  
0      Peak           0  
1      Peak           0  
2  Off-Peak           0  
3      Peak           0  
4      Peak           0  


In [39]:
# Auto-detect object or category columns
categorical_cols = df.select_dtypes(include=['object', 'category']).columns.tolist()
print("Categorical columns:", categorical_cols)


Categorical columns: ['key', 'day_name', 'peak_time']


In [40]:
from sklearn.preprocessing import LabelEncoder

label_encoders = {}
for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le  # Store encoder for inverse transformation if needed


In [41]:
df.to_csv("enhanced_dataset.csv", index=False)


In [42]:
print("Enhanced dataset shape:", df.shape)
print("Sample rows:")
display(df.head())


Enhanced dataset shape: (199999, 15)
Sample rows:


Unnamed: 0,key,fare_amount,pickup_datetime,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,passenger_count,hour,day,month,day_of_week,day_name,peak_time,is_weekend
0,195991,7.5,2015-05-07 19:52:06+00:00,-73.999817,40.738354,-73.999512,40.723217,1,19,7,5,3,4,1,0
1,16590,7.7,2009-07-17 20:04:56+00:00,-73.994355,40.728225,-73.99471,40.750325,1,20,17,7,4,0,1,0
2,19677,12.9,2009-08-24 21:45:00+00:00,-74.005043,40.74077,-73.962565,40.772647,1,21,24,8,0,1,0,0
3,14916,5.3,2009-06-26 08:22:21+00:00,-73.976124,40.790844,-73.965316,40.803349,3,8,26,6,4,0,1,0
4,176170,16.0,2014-08-28 17:47:00+00:00,-73.925023,40.744085,-73.973082,40.761247,5,17,28,8,3,4,1,0
