In [1]:
import http.client
import json
from datetime import datetime, timedelta
from pathlib import Path

import pandas as pd
import yaml
from sklearn.model_selection import train_test_split

In [2]:
# Disable pandas' display truncation: a limit of None renders every column and every row.
pd.options.display.max_columns = None
pd.options.display.max_rows = None

In [3]:
# Load the ingestion settings from the YAML config file.
# safe_load builds only plain Python types (dict, list, str, numbers, ...), which is
# all a settings file needs, and never constructs arbitrary objects from YAML tags.
# The encoding is pinned so parsing does not depend on the platform default.
with open('config.yaml', 'r', encoding='utf-8') as file:
    config = yaml.safe_load(file)

hostname = config['data_ingestion']['hostname']

# Ingestion window: from 30 days before today through today.
# Both dates are sent to the API as start_date / end_date, so with inclusive endpoints
# the window covers 31 calendar dates, not 30.
# NOTE: `end` (and therefore `start`) keeps the current time of day; only the
# '%Y-%m-%d' part is used when the dates are formatted.
date = datetime.now()
end = date
start = end - timedelta(days=30)

print(f"Ingestion range: {start.strftime('%Y-%m-%d')} to {end.strftime('%Y-%m-%d')}")

Ingestion range: 2026-01-21 to 2026-02-20


In [4]:
# Fetch the daily weather history for every configured city over the same date window
# and stack the per-city results into a single DataFrame, `raw_data`.
# One HTTPS connection is reused for all requests and is always closed afterwards,
# including when a request fails.
conn = http.client.HTTPSConnection(hostname)
payload = ''
headers = {}

# The window is identical for every city, so format the dates once.
start_date = start.strftime('%Y-%m-%d')
end_date = end.strftime('%Y-%m-%d')

# Collect one frame per city and concatenate once at the end; growing a DataFrame
# with pd.concat inside the loop re-copies all previous rows on every iteration.
city_frames = []
try:
    for city_key in config['cities']:
        city = config['cities'][city_key]
        lat = city['latitude']
        lon = city['longitude']

        # API request
        req_url = (
            f"/v1/archive?latitude={lat}&longitude={lon}"
            f"&start_date={start_date}&end_date={end_date}"
            "&daily=weathercode,temperature_2m_max,temperature_2m_min,precipitation_sum"
            "&timezone=Europe%2FLondon"
        )

        conn.request("GET", req_url, payload, headers)
        res = conn.getresponse()
        # The body must be read in full before the connection can send another request.
        body = res.read().decode("utf-8")
        if res.status != 200:
            # Fail with the server's own message instead of a bare KeyError on 'daily'.
            raise RuntimeError(
                f"Weather request for city {city['id']!r} failed with "
                f"HTTP {res.status} {res.reason}: {body[:200]}"
            )
        data = json.loads(body)

        # Format into DataFrame and tag each row with the city's id
        df_city = pd.DataFrame(data['daily'])
        df_city["city"] = city['id']
        city_frames.append(df_city)
finally:
    conn.close()

# ignore_index=True yields a fresh 0..n-1 index across all cities.
# pd.concat raises on an empty list, so an empty city config gives an empty frame.
raw_data = pd.concat(city_frames, ignore_index=True) if city_frames else pd.DataFrame()

In [5]:
# Sanity check: print (rows, columns) of the combined frame, then show its first two rows.
n_rows, n_cols = raw_data.shape
print((n_rows, n_cols))
raw_data.iloc[:2]

(155, 6)


Unnamed: 0,time,weathercode,temperature_2m_max,temperature_2m_min,precipitation_sum,city
0,2026-01-21,0,0.9,-6.4,0.0,1
1,2026-01-22,1,-0.9,-6.7,0.0,1


In [None]:
# Split the ingested window chronologically into train / eval / holdout so that every
# evaluation row is strictly later than every training row (no temporal leakage),
# then write each split to data/raw/.

# Parse the date strings and order rows by city, then by date
raw_data['time'] = pd.to_datetime(raw_data['time'])
raw_data = raw_data.sort_values(by=['city', 'time'])

# Split sizes in whole calendar days, counted back from the last day of the window.
# Holdout: last 5 days | Eval: the 5 days before that | Train: everything earlier
# (21 days for the 31-date window that runs from `end` - 30 days through `end`).
HOLDOUT_DAYS = 5
EVAL_DAYS = 5

# `end` carries a time of day, while every row is stamped at midnight. Cutoffs taken
# directly from `end` would fall mid-day, so which day lands in which split would
# depend on the clock. Aligning to midnight makes the boundaries exact calendar dates.
last_day = pd.Timestamp(end).normalize()
cutoff_date_holdout = last_day - timedelta(days=HOLDOUT_DAYS - 1)  # first holdout day
cutoff_date_eval = cutoff_date_holdout - timedelta(days=EVAL_DAYS)  # first eval day

# Apply the strict cutoffs
train_df = raw_data[raw_data["time"] < cutoff_date_eval]
eval_df = raw_data[(raw_data["time"] >= cutoff_date_eval) & (raw_data["time"] < cutoff_date_holdout)]
holdout_df = raw_data[raw_data["time"] >= cutoff_date_holdout]

# The three splits must partition the data: no row lost, none duplicated.
assert len(train_df) + len(eval_df) + len(holdout_df) == len(raw_data), \
    "train/eval/holdout do not partition raw_data (unparseable dates?)"

print("Train shape:", train_df.shape)
print("Eval shape:", eval_df.shape)
print("Holdout shape:", holdout_df.shape)

# Save to the local data directory, creating it first so a fresh checkout works
output_dir = Path("data/raw")
output_dir.mkdir(parents=True, exist_ok=True)
train_df.to_csv(output_dir / "train.csv", index=False)
eval_df.to_csv(output_dir / "eval.csv", index=False)
holdout_df.to_csv(output_dir / "holdout.csv", index=False)

Train shape: (105, 6)
Eval shape: (25, 6)
Holdout shape: (25, 6)
