# Notebook 5: Data Transformation
---

In [1]:
import numpy as np
import pandas as pd
import datetime as dt

# Silence every warning category for the rest of the notebook.
# NOTE(review): this blanket filter also hides any pandas warnings raised by
# the cells below — consider narrowing it to specific categories.
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Read the engineered dataset from the clean_data directory into `df`.
df = pd.read_parquet(path='../clean_data/engineered_data.parquet')

In [3]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0_level_0,nyc_trips_casual,nyc_trips_member,brooklyn_start_all,manhattan_start_all,queens_start_all,brooklyn_end_all,manhattan_end_all,queens_end_all,nyc_trips_all,brooklyn_start_casual,...,nyc_bike_counts,hour,weekday,month,year,season,weekend,hour_type,bike_counts_log,pandemic_period
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2018-06-01 00:00:00,30.0,368.0,95.0,280.0,23.0,106.0,272.0,20.0,398.0,5.0,...,707.0,0,4,6,2018,summer,0,not rush hour,2.849419,pre-pandemic
2018-06-01 01:00:00,21.0,169.0,49.0,135.0,6.0,59.0,124.0,7.0,190.0,6.0,...,365.0,1,4,6,2018,summer,0,not rush hour,2.562293,pre-pandemic
2018-06-01 02:00:00,15.0,100.0,24.0,87.0,4.0,31.0,82.0,3.0,115.0,4.0,...,173.0,2,4,6,2018,summer,0,not rush hour,2.238046,pre-pandemic
2018-06-01 03:00:00,8.0,60.0,17.0,52.0,5.0,20.0,52.0,3.0,74.0,2.0,...,114.0,3,4,6,2018,summer,0,not rush hour,2.056905,pre-pandemic
2018-06-01 04:00:00,8.0,60.0,17.0,52.0,3.0,17.0,52.0,3.0,74.0,3.0,...,149.0,4,4,6,2018,summer,0,not rush hour,2.173186,pre-pandemic


In [4]:
# Keep only the columns used from here on: the trip-count target, weather
# readings, holiday flags, bike counts and calendar features. Everything not
# listed is dropped — e.g. the per-borough and casual/member trip counts shown
# in the preview above and (per the original note) the holiday name column.
keep_columns = [
    'nyc_trips_all', 'temp', 'humidity', 'precip', 'windspeed', 'visibility',
    'isPaidTimeOff', 'isHoliday',
    'nyc_bike_counts', 'bike_counts_log',
    'hour', 'weekday', 'month', 'year', 'season',
    'weekend', 'hour_type', 'pandemic_period',
]

# .copy() makes `df` an independent frame. Without it pandas treats the
# selection as a possible slice of the loaded frame, and the column
# assignments in the following cells raise SettingWithCopyWarning.
df = df[keep_columns].copy()

## Perform Label Encoding for ordinal variables
---

In [5]:
# Replace each pandemic_period label with its position in the list below
# (0 for 'pre-pandemic' through 4 for 'post-pandemic').
# Series.map turns any label that is missing from the mapping into NaN.
pandemic_phases = ['pre-pandemic', 'lockdown', 'reopening', 'post-vaccine', 'post-pandemic']
pp = {phase: code for code, phase in enumerate(pandemic_phases)}
df['pandemic_period'] = df['pandemic_period'].map(pp)

In [6]:
# Replace each season name with its position in the list below
# (0 for 'spring' through 3 for 'winter').
# Series.map turns any name that is missing from the mapping into NaN.
season_order = ['spring', 'summer', 'autumn', 'winter']
seasons = {season_name: code for code, season_name in enumerate(season_order)}
df['season'] = df['season'].map(seasons)

## Perform Cyclical Encoding for time-based variables
---

In [7]:
# Map the hour of day onto a circle with period 24 and store its sine and
# cosine, so the last and first hours of the day end up next to each other.
hour_angle = 2 * np.pi * df['hour'] / 24
df['hour_sin'] = np.sin(hour_angle)
df['hour_cos'] = np.cos(hour_angle)

In [8]:
# Map the season code onto a circle with period 4 and store its sine and
# cosine. This relies on `season` already holding the integer codes assigned
# by the label-encoding cell earlier in the notebook.
season_angle = 2 * np.pi * df['season'] / 4
df['season_sin'] = np.sin(season_angle)
df['season_cos'] = np.cos(season_angle)

## Perform One-Hot Encoding for nominal variables
---

In [9]:
# Build indicator columns for every hour_type value, then keep only the
# 'rush hour' indicator as the new rush_hour feature.
# NOTE(review): the indicator dtype follows the get_dummies default, which
# differs between pandas versions (bool vs. uint8) — confirm what downstream
# notebooks expect.
hour_type_dummies = pd.get_dummies(df['hour_type'])
df['rush_hour'] = hour_type_dummies['rush hour']

In [10]:
# Remove the text hour_type column from `df` (mutates the frame in place).
df.drop(columns=['hour_type'], inplace=True)

In [11]:
# Merge the two holiday indicators into a single is_holiday column: add them
# together and turn a sum of 2 (both flags set) back into 1.
holiday_flag_sum = df['isPaidTimeOff'] + df['isHoliday']
df['is_holiday'] = holiday_flag_sum.replace(2, 1)

# The two source columns are no longer needed once is_holiday exists.
df.drop(columns=['isPaidTimeOff', 'isHoliday'], inplace=True)

In [12]:
# Print a summary of the transformed frame (index range, column names,
# non-null counts and dtypes) as a check before it is written to disk.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 43824 entries, 2018-06-01 00:00:00 to 2023-05-31 23:00:00
Data columns (total 21 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   nyc_trips_all    43824 non-null  float64
 1   temp             43824 non-null  float64
 2   humidity         43824 non-null  float64
 3   precip           43824 non-null  float64
 4   windspeed        43824 non-null  float64
 5   visibility       43824 non-null  float64
 6   nyc_bike_counts  43824 non-null  float64
 7   bike_counts_log  43824 non-null  float64
 8   hour             43824 non-null  int64  
 9   weekday          43824 non-null  int64  
 10  month            43824 non-null  int64  
 11  year             43824 non-null  int64  
 12  season           43824 non-null  int64  
 13  weekend          43824 non-null  int64  
 14  pandemic_period  43824 non-null  int64  
 15  hour_sin         43824 non-null  float64
 16  hour_cos         43824 

In [13]:
# Write the transformed frame to the clean_data directory as parquet.
df.to_parquet(path='../clean_data/transformed_data.parquet')