# Notebook 3: Data Aggregation
---

In [1]:
import datetime as dt
import numpy as np
import pandas as pd
from scipy.stats import zscore
import warnings
# NOTE(review): this suppresses every warning for the rest of the session,
# not only the noisy ones — consider narrowing the filter to the specific
# warning categories that actually need silencing.
warnings.filterwarnings('ignore')

## Load the Citi Bike, Weather, Holiday, and NYC Bike Counts Data
---

In [2]:
# Read the four cleaned inputs (Citi Bike trips, weather, holidays and
# NYC bike counts) from the clean_data directory, in that order.
tripdata, weather, holiday, bike_counts = (
    pd.read_parquet(parquet_path)
    for parquet_path in (
        '../clean_data/hourly_tripdata.parquet',
        '../clean_data/clean_weather.parquet',
        '../clean_data/holidays.parquet',
        '../clean_data/nyc_bike_counts.parquet',
    )
)

In [3]:
# Give every frame's index the same name, 'date', so that the frames can be
# merged on it. The loop variable is deliberately not called `df`: that name
# is used for the merged frame later on, and reusing it here would leave `df`
# pointing at `bike_counts` until the merge cell runs.
for frame in (tripdata, weather, holiday, bike_counts):
    frame.index.name = 'date'

## Merge the Citi Bike, Weather, Holiday, and NYC Bike Counts Data
---

In [4]:
# Merge the DataFrames on the shared 'date' index (inner joins, the pandas
# default). `validate='one_to_one'` makes pandas raise a MergeError if any
# frame contains a duplicated timestamp, instead of silently multiplying rows.
df = (
    tripdata
    .merge(weather, on='date', validate='one_to_one')
    .merge(holiday, on='date', validate='one_to_one')
    .merge(bike_counts, on='date', validate='one_to_one')
)

In [5]:
# Count the missing values in each column of the merged frame
missing_per_column = df.isna().sum()
missing_per_column

nyc_trips_casual          0
nyc_trips_member          0
brooklyn_start_all        0
manhattan_start_all       0
queens_start_all          0
brooklyn_end_all          0
manhattan_end_all         0
queens_end_all            0
nyc_trips_all             0
brooklyn_start_casual     0
brooklyn_end_casual       0
brooklyn_start_member     0
brooklyn_end_member       0
manhattan_start_casual    0
manhattan_end_casual      0
manhattan_start_member    0
manhattan_end_member      0
queens_start_casual       0
queens_end_casual         0
queens_start_member       0
queens_end_member         0
temp                      0
humidity                  0
precip                    0
windspeed                 0
visibility                0
holidayName               0
isPaidTimeOff             0
isHoliday                 0
nyc_bike_counts           0
dtype: int64

In [6]:
# Summarize the merged frame: index range, column dtypes and non-null counts
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 43824 entries, 2018-06-01 00:00:00 to 2023-05-31 23:00:00
Data columns (total 30 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   nyc_trips_casual        43824 non-null  float64
 1   nyc_trips_member        43824 non-null  float64
 2   brooklyn_start_all      43824 non-null  float64
 3   manhattan_start_all     43824 non-null  float64
 4   queens_start_all        43824 non-null  float64
 5   brooklyn_end_all        43824 non-null  float64
 6   manhattan_end_all       43824 non-null  float64
 7   queens_end_all          43824 non-null  float64
 8   nyc_trips_all           43824 non-null  int64  
 9   brooklyn_start_casual   43824 non-null  float64
 10  brooklyn_end_casual     43824 non-null  float64
 11  brooklyn_start_member   43824 non-null  float64
 12  brooklyn_end_member     43824 non-null  float64
 13  manhattan_start_casual  43824 non-null  float64
 14  man

In [7]:
# Build a working frame without the three holiday columns; only the remaining
# columns go through the outlier check and winsorization below.
# NOTE(review): presumably dropped because they are not numeric measurements — confirm.
holiday_columns = ['holidayName', 'isPaidTimeOff', 'isHoliday']
df1 = df.drop(columns=holiday_columns)

In [8]:
# Check for outliers: count, per column, the values whose z-score lies
# outside [-3, 3]. The z-scores are computed once and reused, rather than
# being recomputed for each side of the comparison and for the selection.
z_scores = zscore(df1)
(z_scores.abs() > 3).sum()

nyc_trips_casual          1125
nyc_trips_member           497
brooklyn_start_all         679
manhattan_start_all        504
queens_start_all           638
brooklyn_end_all           751
manhattan_end_all          489
queens_end_all             609
nyc_trips_all              526
brooklyn_start_casual     1142
brooklyn_end_casual       1128
brooklyn_start_member      531
brooklyn_end_member        572
manhattan_start_casual    1094
manhattan_end_casual      1088
manhattan_start_member     508
manhattan_end_member       491
queens_start_casual       1142
queens_end_casual         1113
queens_start_member        634
queens_end_member          606
temp                         6
humidity                     0
precip                     113
windspeed                  420
visibility                1748
nyc_bike_counts            417
dtype: int64

In [9]:
def winsorize_feature(feature, lower_percentile, upper_percentile):
    """
    Winsorize a feature by clamping it to two of its own percentiles.

    Values below the lower percentile are replaced by that percentile's value
    and values above the upper percentile by the upper percentile's value;
    everything in between is returned unchanged. NaN entries are ignored when
    the percentiles are computed and come back as NaN in the result.

    Args:
        feature (array-like): The feature values, e.g. a numpy.ndarray or a
            pandas Series.
        lower_percentile (float): The lower percentile (0-100) used as the floor.
        upper_percentile (float): The upper percentile (0-100) used as the cap.

    Returns:
        numpy.ndarray: The winsorized feature values, in the original order.

    Raises:
        ValueError: If lower_percentile is greater than upper_percentile
            (NumPy itself rejects percentiles outside the 0-100 range).
    """
    if lower_percentile > upper_percentile:
        raise ValueError(
            "lower_percentile must not be greater than upper_percentile"
        )

    values = np.asarray(feature)

    # nanpercentile skips NaN entries; plain np.percentile would return NaN
    # bounds as soon as one NaN is present, and nothing would be clipped.
    lower_value, upper_value = np.nanpercentile(
        values, [lower_percentile, upper_percentile]
    )

    # Clamp to [lower_value, upper_value] in a single pass.
    return np.clip(values, lower_value, upper_value)

In [10]:
# Summary statistics before winsorizing, for comparison with the same table afterwards
df1.describe()

Unnamed: 0,nyc_trips_casual,nyc_trips_member,brooklyn_start_all,manhattan_start_all,queens_start_all,brooklyn_end_all,manhattan_end_all,queens_end_all,nyc_trips_all,brooklyn_start_casual,...,queens_start_casual,queens_end_casual,queens_start_member,queens_end_member,temp,humidity,precip,windspeed,visibility,nyc_bike_counts
count,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0,...,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0
mean,434.835068,1720.539704,528.833607,1505.481449,121.059716,531.243656,1503.826282,120.304833,2155.374772,129.78708,...,23.164453,22.893369,97.895263,97.411464,57.503142,63.960504,0.003585,6.634465,9.399744,1567.002989
std,553.760156,1578.042468,510.203412,1413.383124,123.468301,520.092461,1408.740263,119.643284,2029.766138,168.058829,...,31.547424,30.635991,97.618336,94.994902,16.878033,18.458432,0.063389,4.096415,1.604257,1295.631593
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,5.9,10.38,0.0,0.0,0.0,4.0
25%,58.0,376.0,113.0,327.0,25.0,114.0,322.0,26.0,467.0,16.0,...,3.0,3.0,20.0,21.0,43.9,49.86,0.0,4.0,9.9,460.75
50%,212.0,1350.0,382.0,1137.0,81.0,378.0,1134.0,81.0,1609.0,62.0,...,10.0,10.0,69.0,69.0,57.0,63.58,0.0,5.9,9.9,1296.0
75%,595.0,2613.0,803.0,2266.0,179.0,793.25,2268.0,179.0,3256.0,178.0,...,31.0,31.0,147.0,146.0,71.9,79.29,0.0,8.9,9.9,2357.25
max,4207.0,9937.0,2841.0,8697.0,795.0,3084.0,8494.0,760.0,12055.0,1167.0,...,378.0,270.0,668.0,663.0,98.0,100.0,8.785,31.3,9.9,8306.0


In [11]:
# Winsorize outliers: clamp every column of df1 to its own 5th-95th
# percentile range.
# NOTE(review): the same bounds are applied to all columns, including sparse
# ones — in the summary tables the maximum of `precip` goes from 8.785 to
# 0.01 — confirm that this is intended.
df1 = df1.assign(**{
    column_name: winsorize_feature(df1[column_name], 5, 95)
    for column_name in df1.columns
})

In [12]:
# Summary statistics after winsorizing; min/max now show the clamped values
df1.describe()

Unnamed: 0,nyc_trips_casual,nyc_trips_member,brooklyn_start_all,manhattan_start_all,queens_start_all,brooklyn_end_all,manhattan_end_all,queens_end_all,nyc_trips_all,brooklyn_start_casual,...,queens_start_casual,queens_end_casual,queens_start_member,queens_end_member,temp,humidity,precip,windspeed,visibility,nyc_bike_counts
count,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0,...,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0,43824.0
mean,409.497399,1666.021545,511.16961,1459.518346,116.503126,512.076396,1459.317931,116.17082,2088.872882,121.559191,...,21.703633,21.412879,94.140585,93.766384,57.60316,64.13787,0.000759,6.511151,9.515907,1528.840065
std,471.810811,1423.282893,460.270717,1281.61929,110.051484,464.773109,1282.388587,107.581429,1841.623657,141.169608,...,26.661535,25.723596,86.219772,84.132988,16.005159,17.785818,0.002525,3.747648,1.12827,1191.391793
min,8.0,60.0,17.0,52.0,3.0,17.0,52.0,3.0,74.0,2.0,...,0.0,0.0,3.0,3.0,31.7,33.8315,0.0,0.2,5.4,92.0
25%,58.0,376.0,113.0,327.0,25.0,114.0,322.0,26.0,467.0,16.0,...,3.0,3.0,20.0,21.0,43.9,49.86,0.0,4.0,9.9,460.75
50%,212.0,1350.0,382.0,1137.0,81.0,378.0,1134.0,81.0,1609.0,62.0,...,10.0,10.0,69.0,69.0,57.0,63.58,0.0,5.9,9.9,1296.0
75%,595.0,2613.0,803.0,2266.0,179.0,793.25,2268.0,179.0,3256.0,178.0,...,31.0,31.0,147.0,146.0,71.9,79.29,0.0,8.9,9.9,2357.25
max,1668.0,4776.85,1578.0,4349.0,374.0,1603.0,4336.0,367.0,6250.85,498.0,...,95.0,91.0,294.0,286.0,83.7,92.78,0.01,14.5,9.9,4085.0


In [13]:
# Write the winsorized columns back into the merged frame (this modifies `df`
# in place); the holiday columns, which are not part of df1, are left untouched.
df[df1.columns] = df1

In [14]:
# Save the merged data, including the winsorized columns, as a parquet file
df.to_parquet('../clean_data/compiled_data.parquet')