This notebook contains code to add features for a complete time series.
These features include:
1) Hour (int)
2) Day of Week (one hot encoding)
3) Holiday (boolean)
4) Weather condition (Rainy, Sunny, Windy, etc) (one hot encoding)
5) Is weekend? (boolean)

In [None]:
%run Jan-Aug.ipynb
%run Sep-Dec.ipynb

In [223]:
import datetime
import pandas as pd
from pandas import Timestamp
from pandas.tseries.holiday import USFederalHolidayCalendar
import requests
from dotenv import load_dotenv
import os

In [220]:
# this function extracts day of week, hour, holiday, and is weekend features
def extract_time_features(df):
    # create new time feature df
    time_features_df = pd.DataFrame(df["started_at"].tolist(), columns=["start_date_time"])

    # extract start date and time info 
    start_info = df["started_at"].str.split(" ", expand=True)
    start_date, start_time = pd.to_datetime(start_info[0]), start_info[1]

    # append day_of_week feature to df (0:Monday - 6:Sunday)
    weekday = start_date.dt.weekday
    time_features_df["day_of_week"] = weekday
    # print(start_date[0], time_features_df["day_of_week"][0])

    # append hour feature to df (military time)
    time_features_df["hour"] = start_time.str.split(":", expand=True)[0]

    # append holiday feature to df
    cal = USFederalHolidayCalendar()
    holidays = cal.holidays()
    time_features_df['holiday'] = start_date.isin(holidays)
    # print(time_features_df[time_features_df["holiday"] == True])

    # append is_weekend feature to df
    time_features_df['is_weekend'] = ((weekday == 5) | (weekday == 6)).astype(int)

    return time_features_df
    

In [221]:
time_features_df = extract_time_features(jan)
print(time_features_df)

                 start_date_time  day_of_week hour  holiday  is_weekend
0        2024-01-22 18:43:19.012            0   18    False           0
1        2024-01-11 19:19:18.721            3   19    False           0
2        2024-01-30 19:17:41.693            1   19    False           0
3        2024-01-27 11:27:01.759            5   11    False           1
4        2024-01-16 15:15:41.000            1   15    False           0
...                          ...          ...  ...      ...         ...
1888080  2024-01-29 07:40:32.831            0   07    False           0
1888081  2024-01-29 11:56:47.527            0   11    False           0
1888082  2024-01-12 16:51:37.231            4   16    False           0
1888083  2024-01-26 09:32:45.932            4   09    False           0
1888084  2024-01-29 17:29:55.879            0   17    False           0

[1888085 rows x 5 columns]


In [None]:
def extract_weather_features(df):
    # extract date range
    start_info = df["started_at"].str.split(" ", expand=True)
    start_date = pd.to_datetime(start_info[0])
    min_date, max_date = start_date.min().date(), start_date.max().date()

    # api config and request
    load_dotenv()
    api_key = os.getenv("API_KEY")
    location = "New York"
    url = f"https://weather.visualcrossing.com/VisualCrossingWebServices/rest/services/timeline/{location}/{min_date}/{max_date}"
    params = {"unitGroup": "metric", "include": "hours", "key": api_key, "contentType": "json"}

    response = requests.get(url, params=params)
    data = response.json()

    # parse data and create df
    historical_data = []
    for day in data.get("days", []):
        for hour in day.get("hours", []):
            historical_data.append({
                "hour": hour["datetime"],
                "date": day["datetime"],
                "temp": hour.get("temp"),
                "precip": hour.get("precip"),
                "conditions": hour.get("conditions")
            })

    weather_df = pd.DataFrame(historical_data)
    weather_df["datetime"] = pd.to_datetime(weather_df["date"] + " " + weather_df["datetime"])

    return weather_df


In [None]:
extract_weather_features(jan)