In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pprint import pprint
from tqdm import tqdm
from functools import reduce

# Loading weather data

In [4]:
import openmeteo_requests

import requests_cache
from retry_requests import retry

In [5]:
def load_weather_data(lat=40.715422, lon=-74.01122, start_date="2015-01-01", end_date="2019-12-31",
                      variables=None, timezone="America/New_York"):
    """Download hourly historical weather from the Open-Meteo archive API.

    Parameters
    ----------
    lat, lon : float
        Coordinates sent to the API as ``latitude`` / ``longitude``.
    start_date, end_date : str
        Sent verbatim as ``start_date`` / ``end_date``. The defaults use the
        ``YYYY-MM-DD`` form.
    variables : sequence of str, optional
        Names of the hourly variables to request. ``None`` (the default)
        requests the same 14 variables this function has always requested.
        A single string is treated as a one-element list.
    timezone : str, optional
        Value of the ``timezone`` request parameter. It does not change the
        returned ``date`` column, which is always built as UTC.

    Returns
    -------
    pandas.DataFrame
        A ``date`` column of timezone-aware UTC timestamps followed by one
        column per requested variable, in request order.

    Raises
    ------
    ValueError
        If ``variables`` is given but empty.
    """
    # Validate before any session/network setup so bad input fails fast.
    if variables is None:
        variables = [
            "temperature_2m",
            "cloud_cover", "cloud_cover_low",
            "snow_depth", "wind_speed_10m", "wind_speed_100m", "pressure_msl",
            "surface_pressure", "relative_humidity_2m", "precipitation", "rain",
            "snowfall", "dew_point_2m", "apparent_temperature",
        ]
    elif isinstance(variables, str):
        # A bare string would otherwise be enumerated character by character below.
        variables = [variables]
    else:
        variables = list(variables)
    if not variables:
        raise ValueError("variables must contain at least one hourly variable name")

    # Open-Meteo client backed by an on-disk HTTP cache ('.cache') with retries.
    # expire_after=-1 asks requests-cache to never expire entries, so re-running
    # the notebook does not re-download the same archive request.
    cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
    retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
    openmeteo = openmeteo_requests.Client(session = retry_session)

    url = "https://archive-api.open-meteo.com/v1/archive"
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": start_date,
        "end_date": end_date,
        "hourly": variables,
        "timezone": timezone
    }
    responses = openmeteo.weather_api(url, params=params)

    # Only one location is requested, so only the first response is used.
    response = responses[0]
    hourly = response.Hourly()

    # Rebuild the time axis from the response's start, end and step (all in
    # seconds); inclusive="left" excludes the end instant itself.
    hourly_data = {"date": pd.date_range(
        start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
        end = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
        freq = pd.Timedelta(seconds = hourly.Interval()),
        inclusive = "left"
    )}

    # Values are addressed by position, so the i-th entry of `variables` is
    # paired with hourly.Variables(i); the column order follows the request order.
    for i, variable in enumerate(variables):
        hourly_data[variable] = hourly.Variables(i).ValuesAsNumpy()

    hourly_dataframe = pd.DataFrame(data = hourly_data)
    return hourly_dataframe

In [6]:
# Fetch hourly weather using the function's default coordinates and date range.
df_weather = load_weather_data()

In [7]:
# Persist the weather frame as Parquet; the path is relative to the notebook's working directory.
df_weather.to_parquet('../data/01_raw/weather.parquet')

# Loading bike data

In [2]:
def load_bike_data():
    """Fetch the bike-trip dataset from OpenML (data id 43526) and tidy it.

    Steps, in order: drop the ``Unnamed:_0`` column, parse ``Start_Time`` and
    ``Stop_Time`` as datetimes, cast the user-type and station-name columns
    to ``category``, lower-case every column name, and remove trips that
    start or end at station 3442 (a station with no geo-location).

    Returns
    -------
    pandas.DataFrame
        The cleaned trip frame with lower-case column names.
    """
    from sklearn.datasets import fetch_openml

    bunch = fetch_openml(data_id=43526, as_frame=True)

    trips = bunch['frame'].drop(columns=['Unnamed:_0'])

    # Parse both timestamp columns from the frame as downloaded.
    trips = trips.assign(
        Start_Time=pd.to_datetime(trips['Start_Time']),
        Stop_Time=pd.to_datetime(trips['Stop_Time']),
    )

    categorical_dtypes = {
        'User_Type': 'category',
        'End_Station_Name': 'category',
        'Start_Station_Name': 'category',
    }
    trips = trips.astype(categorical_dtypes)

    # Column names are lower-cased only after the casts above, which still
    # refer to the original mixed-case names.
    lowered = {name: name.lower() for name in trips.columns}
    trips = trips.rename(columns=lowered)

    # station with no geo-location
    trips = trips.query('start_station_id != 3442 and end_station_id != 3442')

    return trips

In [3]:
# Download and clean the bike-trip frame (fetched from OpenML inside load_bike_data).
data = load_bike_data()

In [4]:
# Persist the cleaned bike frame as Parquet in the same raw-data directory as the weather file.
data.to_parquet('../data/01_raw/bike.parquet')