# Notebook 4: Feature Engineering
---

In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the compiled dataset from parquet into `df`
compiled_path = '../clean_data/compiled_data.parquet'
df = pd.read_parquet(compiled_path)

In [3]:
# Summarize the loaded frame: index range, column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 43824 entries, 2018-06-01 00:00:00 to 2023-05-31 23:00:00
Data columns (total 30 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   nyc_trips_casual        43824 non-null  float64
 1   nyc_trips_member        43824 non-null  float64
 2   brooklyn_start_all      43824 non-null  float64
 3   manhattan_start_all     43824 non-null  float64
 4   queens_start_all        43824 non-null  float64
 5   brooklyn_end_all        43824 non-null  float64
 6   manhattan_end_all       43824 non-null  float64
 7   queens_end_all          43824 non-null  float64
 8   nyc_trips_all           43824 non-null  float64
 9   brooklyn_start_casual   43824 non-null  float64
 10  brooklyn_end_casual     43824 non-null  float64
 11  brooklyn_start_member   43824 non-null  float64
 12  brooklyn_end_member     43824 non-null  float64
 13  manhattan_start_casual  43824 non-null  float64
 14  man

In [4]:
# Copy the hour, weekday, month, and year components of the datetime index
# into same-named columns
for part in ('hour', 'weekday', 'month', 'year'):
    df[part] = getattr(df.index, part)

In [5]:
# Derive the season from the month number
months_by_season = {
    'winter': (12, 1, 2),
    'spring': (3, 4, 5),
    'summer': (6, 7, 8),
    'autumn': (9, 10, 11),
}
# Invert into the month -> season lookup, ordered by month number
seasons = dict(sorted(
    (month, season)
    for season, months in months_by_season.items()
    for month in months
))
df['season'] = df['month'].map(seasons)

In [6]:
# Flag weekends: weekday numbers below 5 get 0, everything else gets 1
df['weekend'] = np.where(df.index.weekday < 5, 0, 1).astype('int64')

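>NYC weekday rush hours run from 7:30 AM to 9:30 AM and from 5 PM to 7 PM. Because the data is hourly, the code below approximates these windows with whole hours (7–10 and 17–19, inclusive).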
>
>Citation:
>
>Fodor's Travel. (n.d.). New York City: Tips and Etiquette. The New York Times. Retrieved June 28, 2023, from https://archive.nytimes.com/www.nytimes.com/fodors/top/features/travel/destinations/unitedstates/newyork/newyorkcity/fdrs_tips_111_2.html

In [7]:
def categorize_hour(day_type, hour, rush_windows=((7, 10), (17, 19))):
    """
    Categorize an hour as rush hour or not based on the day type and hour value.

    Args:
        day_type (int): Weekend indicator (1 for a weekend day, 0 for a weekday).
            Hours on a weekend day are never categorized as rush hour.
        hour (int): The hour value (24-hour format) to be categorized.
        rush_windows (iterable of (int, int)): Inclusive (first_hour, last_hour)
            pairs that count as rush hour on weekdays. The default keeps the
            windows this function has always used: 7-10 and 17-19.

    Returns:
        str: 'rush hour' if the hour falls inside any window on a weekday,
        otherwise 'not rush hour'.
    """
    # NOTE(review): the default windows include the 10:00 and 19:00 hours, which
    # is wider than the 7:30-9:30 AM / 5-7 PM range cited in this notebook. Pass
    # rush_windows=((7, 9), (17, 18)) if the narrower definition is intended.
    if day_type == 1:
        return 'not rush hour'
    if any(first <= hour <= last for first, last in rush_windows):
        return 'rush hour'
    return 'not rush hour'

In [8]:
# Label every row as rush hour or not. Zipping the two columns avoids the
# per-row Series that iterrows() would build just to read two scalars.
df['hour_type'] = [
    categorize_hour(is_weekend, hour_of_day)
    for is_weekend, hour_of_day in zip(df['weekend'], df['hour'])
]

In [9]:
# Base-10 log of the `nyc_bike_counts` column.
# NOTE(review): a count of 0 maps to -inf here; confirm the column has no zeros.
bike_counts = df['nyc_bike_counts']
df['bike_counts_log'] = np.log10(bike_counts)

In [10]:
# Re-inspect the frame now that the engineered columns have been added
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 43824 entries, 2018-06-01 00:00:00 to 2023-05-31 23:00:00
Data columns (total 38 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   nyc_trips_casual        43824 non-null  float64
 1   nyc_trips_member        43824 non-null  float64
 2   brooklyn_start_all      43824 non-null  float64
 3   manhattan_start_all     43824 non-null  float64
 4   queens_start_all        43824 non-null  float64
 5   brooklyn_end_all        43824 non-null  float64
 6   manhattan_end_all       43824 non-null  float64
 7   queens_end_all          43824 non-null  float64
 8   nyc_trips_all           43824 non-null  float64
 9   brooklyn_start_casual   43824 non-null  float64
 10  brooklyn_end_casual     43824 non-null  float64
 11  brooklyn_start_member   43824 non-null  float64
 12  brooklyn_end_member     43824 non-null  float64
 13  manhattan_start_casual  43824 non-null  float64
 14  man

Source for the pandemic-period cut-off dates used below: https://www.investopedia.com/historical-timeline-of-covid-19-in-new-york-city-5071986

In [11]:
def categorize_date(date):
    """
    Label a timestamp with the pandemic period it falls in.

    The periods are consecutive; each cut-off date below is the exclusive
    upper bound of its period, and anything on or after the last cut-off
    is labelled 'post-pandemic'.

    Args:
        date: A datetime-like value comparable with the result of
            pd.to_datetime (e.g. a pandas Timestamp).

    Returns:
        str: One of 'pre-pandemic', 'lockdown', 'reopening', 'post-vaccine',
        or 'post-pandemic'.
    """
    # (exclusive end date, label) pairs, in chronological order
    period_ends = (
        ('2020-03-07', 'pre-pandemic'),
        ('2020-06-08', 'lockdown'),
        ('2021-04-06', 'reopening'),
        ('2022-01-01', 'post-vaccine'),
    )
    for cutoff, label in period_ends:
        if date < pd.to_datetime(cutoff):
            return label
    return 'post-pandemic'

In [12]:
# Tag each timestamp in the index with its pandemic period
df['pandemic_period'] = [categorize_date(stamp) for stamp in df.index]

In [13]:
# Review the DataFrame: preview the first rows, including the newly engineered columns
df.head()

Unnamed: 0_level_0,nyc_trips_casual,nyc_trips_member,brooklyn_start_all,manhattan_start_all,queens_start_all,brooklyn_end_all,manhattan_end_all,queens_end_all,nyc_trips_all,brooklyn_start_casual,...,nyc_bike_counts,hour,weekday,month,year,season,weekend,hour_type,bike_counts_log,pandemic_period
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-06-01 00:00:00,30.0,368.0,95.0,280.0,23.0,106.0,272.0,20.0,398.0,5.0,...,707.0,0,4,6,2018,summer,0,not rush hour,2.849419,pre-pandemic
2018-06-01 01:00:00,21.0,169.0,49.0,135.0,6.0,59.0,124.0,7.0,190.0,6.0,...,365.0,1,4,6,2018,summer,0,not rush hour,2.562293,pre-pandemic
2018-06-01 02:00:00,15.0,100.0,24.0,87.0,4.0,31.0,82.0,3.0,115.0,4.0,...,173.0,2,4,6,2018,summer,0,not rush hour,2.238046,pre-pandemic
2018-06-01 03:00:00,8.0,60.0,17.0,52.0,5.0,20.0,52.0,3.0,74.0,2.0,...,114.0,3,4,6,2018,summer,0,not rush hour,2.056905,pre-pandemic
2018-06-01 04:00:00,8.0,60.0,17.0,52.0,3.0,17.0,52.0,3.0,74.0,3.0,...,149.0,4,4,6,2018,summer,0,not rush hour,2.173186,pre-pandemic


In [14]:
# Persist the feature-engineered frame as parquet
engineered_path = '../clean_data/engineered_data.parquet'
df.to_parquet(engineered_path)