In [1]:
import os, re, json, math, hashlib, calendar
from datetime import datetime, date, timedelta
from typing import List, Literal, Optional

import pandas as pd

# ! pip install openmeteo_requests requests_cache retry_requests
import openmeteo_requests
import requests_cache
from retry_requests import retry


In [2]:
def save_jsonl(data: list, file_path: str) -> None:
    """Write a list of JSON-serialisable objects to a JSONL file.

    Each element of ``data`` is serialised with ``json.dumps`` and written on
    its own line. The file is opened in write mode, so an existing file at
    ``file_path`` is overwritten.

    Args:
        data: Objects to serialise, one per output line.
        file_path: Destination path of the JSONL file.

    Returns:
        None. (The function never returned a value; the annotation now says so.)
    """
    with open(file_path, "w", encoding="utf-8") as f:
        for obj in data:
            # ensure_ascii=False keeps non-ASCII text readable in the file
            # instead of escaping it as \uXXXX sequences.
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")


In [3]:
def load_jsonl(file_path: str) -> list:
    """Read a JSONL file and return its parsed records.

    Every non-blank line is decoded with ``json.loads``; lines that are empty
    or contain only whitespace are skipped. The result is a list holding one
    parsed JSON value per remaining line, in file order.
    """
    with open(file_path, "r", encoding="utf-8") as handle:
        return [json.loads(raw) for raw in handle if raw.strip()]

In [4]:
def build_openmeteo_client(
    cache_path: str = ".cache",
    expire_after: int = -1,
    retries: int = 5,
    backoff_factor: float = 0.2,
) -> openmeteo_requests.Client:
    """Create an Open-Meteo client backed by an HTTP cache and automatic retries.

    Args:
        cache_path: Name/path handed to ``requests_cache.CachedSession``.
        expire_after: Cache lifetime passed to ``CachedSession``. The default
            of -1 is kept from the original code; in requests-cache this means
            cached responses never expire. Pass a finite lifetime when the
            client is used for forecast data, otherwise a cached forecast
            response would be served indefinitely.
        retries: Number of retries passed to ``retry_requests.retry``.
        backoff_factor: Back-off factor passed to ``retry_requests.retry``.

    Returns:
        An ``openmeteo_requests.Client`` that uses the cached, retrying session.
    """
    cache_session = requests_cache.CachedSession(cache_path, expire_after=expire_after)
    retry_session = retry(cache_session, retries=retries, backoff_factor=backoff_factor)
    return openmeteo_requests.Client(session=retry_session)

In [5]:
 # Helper to normalize names
def normalize(text: str) -> str:
    """Turn free text into a lowercase token of ``a-z``, ``0-9`` and ``_``.

    The input is lower-cased and stripped, each run of whitespace becomes a
    single underscore, and every remaining character outside ``[a-z0-9_]`` is
    removed.
    """
    token = text.lower().strip()
    # Applied in order: whitespace runs -> "_", then drop anything unsafe.
    for pattern, replacement in ((r"\s+", "_"), (r"[^a-z0-9_]", "")):
        token = re.sub(pattern, replacement, token)
    return token

In [6]:
def make_location_filename(data: dict, ext: str = 'json') -> str:
    """Build a deterministic file name for a location record.

    The name has the form
    ``<country>_<state>_<district>_<market>_<lat>_<lon>_<hash>.<ext>`` where

    * country/state/district come from ``country_code``, ``state_name`` and
      ``district_name`` (defaults ``"in"``/``"unknown"``/``"unknown"`` when the
      key is absent), each passed through ``normalize``;
    * market is ``market_name``, falling back to ``apmc_name`` and then
      ``"unknown"`` when the value is missing or falsy;
    * lat/lon are ``latitude``/``longitude`` (default 0.0) rounded to 3 decimals;
    * hash is the first 8 hex digits of the SHA-256 of
      ``resolved_name`` + lat + lon.
    """
    # Administrative parts, normalised in a fixed order.
    admin_parts = [
        normalize(data.get(key, fallback))
        for key, fallback in (
            ("country_code", "in"),
            ("state_name", "unknown"),
            ("district_name", "unknown"),
        )
    ]
    market_part = normalize(
        data.get("market_name") or data.get("apmc_name") or "unknown"
    )

    # Rounded coordinates help tell apart markets that share a name.
    lat_r, lon_r = (
        round(float(data.get(axis, 0.0)), 3) for axis in ("latitude", "longitude")
    )

    # Short digest over resolved_name + coordinates for extra uniqueness.
    digest_input = f"{data.get('resolved_name', '')}{lat_r}{lon_r}"
    short_hash = hashlib.sha256(digest_input.encode()).hexdigest()[:8]

    pieces = [*admin_parts, market_part, lat_r, lon_r, short_hash]
    return "_".join(format(piece) for piece in pieces) + f".{ext}"

In [7]:
def add_suffix(filename: str, suffix: str) -> str:
    """Insert ``_<suffix>`` between a file name's stem and its extension.

    The split is done with ``os.path.splitext``, so only the last extension
    is treated as the extension.
    """
    stem, extension = os.path.splitext(filename)
    return "{}_{}{}".format(stem, suffix, extension)

In [8]:
# Daily variables requested from Open-Meteo. The order is significant: the
# fetch code in this notebook reads the response positionally
# (`daily.Variables(i)`), pairing the i-th entry of this list with the i-th
# returned series. Do not reorder without updating that code.
DAILY_VARS = [
    "temperature_2m_mean", "temperature_2m_max", "temperature_2m_min",
    "cloud_cover_mean",
    "relative_humidity_2m_max", "relative_humidity_2m_min", "relative_humidity_2m_mean",
    "wind_speed_10m_max", "wind_speed_10m_min", "wind_speed_10m_mean", 
    "wet_bulb_temperature_2m_min", "wet_bulb_temperature_2m_max", "wet_bulb_temperature_2m_mean",
    "wind_direction_10m_dominant",
    "rain_sum", "precipitation_sum"
]

# Open-Meteo endpoints: historical archive and forecast.
ARCHIVE_URL  = "https://archive-api.open-meteo.com/v1/archive"
FORECAST_URL = "https://api.open-meteo.com/v1/forecast"


In [9]:
def fetch_daily(client: openmeteo_requests.Client,
        url: str,
        lat: float,
        lon: float,
        start_date: date,
        end_date: date,
        timezone: str
    ) -> pd.DataFrame:
    """Fetch daily values for ``DAILY_VARS`` from an Open-Meteo endpoint.

    Args:
        client: Open-Meteo client used to issue the request.
        url: Endpoint to query (e.g. ``ARCHIVE_URL`` or ``FORECAST_URL``).
        lat: Latitude of the location.
        lon: Longitude of the location.
        start_date: First calendar date to keep.
        end_date: Last calendar date to keep.
        timezone: Time zone name sent to the API and used to derive the
            calendar date of each returned step.

    Returns:
        A DataFrame with a ``date`` column (Python ``date`` objects) followed
        by one column per entry of ``DAILY_VARS``, restricted to
        ``start_date <= date <= end_date``. An empty frame with those columns
        is returned when ``start_date > end_date`` or the API yields no
        responses.
    """
    if start_date > end_date:
        return pd.DataFrame(columns=["date", *DAILY_VARS])

    params = {
        "latitude": lat, "longitude": lon,
        "start_date": start_date.isoformat(), "end_date": end_date.isoformat(),
        "daily": DAILY_VARS, "timezone": timezone,
    }

    responses = client.weather_api(url, params=params)
    if not responses:
        return pd.DataFrame(columns=["date", *DAILY_VARS])
    daily = responses[0].Daily()

    # Time()/TimeEnd() are interpreted as epoch seconds in UTC; Interval() is
    # the step length in seconds.
    t_start_utc = pd.to_datetime(daily.Time(), unit="s", utc=True)
    t_end_utc   = pd.to_datetime(daily.TimeEnd(), unit="s", utc=True)
    interval_s  = int(daily.Interval())

    # One timestamp per step (end exclusive), converted to the requested time
    # zone and reduced to the local calendar date.
    time_utc = pd.date_range(start=t_start_utc, end=t_end_utc,
                             freq=pd.Timedelta(seconds=interval_s), inclusive="left")
    date_local = time_utc.tz_convert(timezone).normalize().date  # array of date objects

    daily_data = {"date": pd.Series(date_local)}

    # Relies on the response listing variables in the order they were
    # requested, i.e. the order of DAILY_VARS.
    # (Fix: this previously iterated over an undefined name `daily_vars`.)
    for i, key in enumerate(DAILY_VARS):
        daily_data[key] = daily.Variables(i).ValuesAsNumpy()

    df = pd.DataFrame(daily_data)

    # Defensive clamp to the requested window.
    # (Fix: this previously used undefined `start_date_local`/`end_date_local`.)
    in_window = (df["date"] >= start_date) & (df["date"] <= end_date)
    return df[in_window].reset_index(drop=True)
   

In [None]:
def _write_partitioned_by_month(
    df: pd.DataFrame,
    base_dir: str,
    location_meta: dict,
    filename_suffix: Optional[str] = None  # e.g., "forecast"
) -> List[str]:
    """Write ``df`` as one JSON file per calendar month.

    Rows are grouped by the year and month of their ``date`` column and each
    group is written to ``<base_dir>/YYYY/Mon/<filename[_suffix]>.json``, where
    the file name comes from ``make_location_filename(location_meta)``.
    Directories are created as needed. The input frame is not modified.

    Args:
        df: Frame with a ``date`` column parseable by ``pd.to_datetime``.
        base_dir: Root directory of the year/month tree.
        location_meta: Location record used to build the file name.
        filename_suffix: Optional suffix inserted before the extension.

    Returns:
        Paths of the files written, in year/month order; empty if ``df`` is empty.
    """
    saved_paths: List[str] = []
    if df.empty:
        return saved_paths

    # Build the file name once; it is the same for every month.
    fname = make_location_filename(location_meta, ext="json")
    if filename_suffix:
        fname = add_suffix(fname, filename_suffix)

    # Group on derived year/month keys instead of adding helper columns, so
    # the caller's DataFrame is left untouched.
    stamps = pd.to_datetime(df["date"])
    group_keys = [stamps.dt.year.rename("_Y"), stamps.dt.month.rename("_M")]

    # Grouping by year+month handles spans that cross a year boundary.
    for (yy, mm), group in df.groupby(group_keys, sort=True):
        month_df = group.reset_index(drop=True)

        year_dir = os.path.join(base_dir, f"{int(yy):04d}")
        month_dir = os.path.join(year_dir, calendar.month_abbr[int(mm)])  # 'Apr'
        os.makedirs(month_dir, exist_ok=True)

        fpath = os.path.join(month_dir, fname)
        month_df.to_json(fpath, orient="records", date_format="iso")
        saved_paths.append(fpath)

    return saved_paths

In [None]:
def get_openmeteo_month(
    location_meta: dict,
    year: int,
    mode: Literal["archive", "forecast", "hybrid"] = "archive",
    base_dir: str = "../data/meteo",
    timezone: str = "Asia/Kolkata",
    client: Optional[openmeteo_requests.Client] = None,
    horizon_days: int = 16,   # only used by forecast mode
) -> List[str]:
    """Fetch daily Open-Meteo data and save it as per-month JSON files.

    Files are written under ``<base_dir>/YYYY/Mon/<filename>.json``.

    mode:
      - 'archive'  : archive endpoint for 1 Jan .. 31 Dec of ``year``, with the
                     end clamped to today-5.
      - 'forecast' : forecast endpoint from today to today+``horizon_days``;
                     files get a ``_forecast`` suffix.
      - 'hybrid'   : accepted by the type hint but not implemented; raises
                     ``ValueError`` like any other unsupported mode.

    Args:
        location_meta: Location record; must contain ``latitude`` and
            ``longitude``. Also used to build the output file name.
        year: Calendar year to fetch (archive mode). If 1 Jan of ``year`` is
            after today, nothing is fetched in any mode.
        mode: See above.
        base_dir: Root directory of the year/month output tree.
        timezone: Time zone name passed to the API.
        client: Optional pre-built client; one is created when omitted.
        horizon_days: Days after today to request in forecast mode.

    Returns:
        List of paths written (possibly empty).

    Raises:
        ValueError: if coordinates are missing or ``mode`` is unsupported.
    """
    if "latitude" not in location_meta or "longitude" not in location_meta:
        raise ValueError("location_meta must include 'latitude' and 'longitude'.")

    lat, lon = float(location_meta["latitude"]), float(location_meta["longitude"])

    # Intended date window for the year.
    start_date_local = date(year, 1, 1)
    end_date_local   = date(year, 12, 31)

    # NOTE(review): "today" comes from the machine's clock (naive
    # datetime.now()), not from `timezone`.
    today_local = datetime.now().date()
    archive_tail = today_local - timedelta(days=5)

    # Do not request archive data newer than today-5.
    if end_date_local > archive_tail:
        end_date_local = archive_tail

    # If the entire year is in the future -> nothing to fetch
    if start_date_local > today_local:
        return []

    # Setup Open-Meteo client with cache + retries
    openmeteo = client or build_openmeteo_client()

    if mode == "archive":
        # fetch_daily returns an empty frame when start > end (first days of
        # the current year), which in turn yields no files.
        df = fetch_daily(openmeteo, ARCHIVE_URL, lat, lon,
                         start_date_local, end_date_local, timezone)
        # Fix: the helper is named `_write_partitioned_by_month`.
        return _write_partitioned_by_month(df, base_dir, location_meta)

    elif mode == "forecast":
        start = today_local  # today onwards
        # NOTE(review): confirm the forecast endpoint accepts an end date of
        # today + horizon_days for the default horizon.
        end   = today_local + timedelta(days=horizon_days)
        df = fetch_daily(openmeteo, FORECAST_URL, lat, lon, start, end, timezone)
        return _write_partitioned_by_month(df, base_dir, location_meta,
                                           filename_suffix="forecast")

    else:
        raise ValueError("mode must be one of: 'archive', 'forecast'")

In [None]:
def save_openmeteo_month_json(
    location_meta: dict,
    year: int,
    base_dir: str = "../data/meteo",
    timezone: str = "Asia/Kolkata",
    client: Optional[openmeteo_requests.Client] = None,
) -> List[str]:
    """Fetch a year of daily Open-Meteo archive data and save it month by month.

    Requests ``DAILY_VARS`` from the archive endpoint for 1 Jan .. 31 Dec of
    ``year`` (end clamped to today-3) and writes one JSON file per month to
    ``<base_dir>/YYYY/Mon/<filename>.json``, where the file name comes from
    ``make_location_filename(location_meta)``.

    Args:
        location_meta: Location record; must contain ``latitude`` and
            ``longitude``.
        year: Calendar year to fetch.
        base_dir: Root directory of the year/month output tree.
        timezone: Time zone name passed to the API and used to derive dates.
        client: Optional pre-built client; one is created when omitted.

    Returns:
        List of file paths written; empty when the clamped window is empty
        (future year, or the first days of the current year) or no rows
        fall inside it.

    Raises:
        ValueError: if ``location_meta`` lacks coordinates.
        RuntimeError: if the API returns no responses.
    """
    if "latitude" not in location_meta or "longitude" not in location_meta:
        raise ValueError("location_meta must include 'latitude' and 'longitude'.")

    lat = float(location_meta["latitude"])
    lon = float(location_meta["longitude"])

    # Intended date window for the year.
    start_date_local = date(year, 1, 1)
    end_date_local   = date(year, 12, 31)

    # NOTE(review): "today" comes from the machine's clock (naive
    # datetime.now()), not from `timezone`.
    today_local = datetime.now().date()
    archive_tail = today_local - timedelta(days=3)

    # Do not request data newer than today-3.
    if end_date_local > archive_tail:
        end_date_local = archive_tail

    # Empty window -> nothing to fetch. This covers a year that lies entirely
    # in the future and also the first days of the current year, where the
    # clamped end would fall before 1 Jan and the request would be invalid.
    if start_date_local > end_date_local:
        return []

    # Setup Open-Meteo client with cache + retries
    openmeteo = client or build_openmeteo_client()

    # Fix: `daily_vars` was undefined; the module constant is DAILY_VARS.
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": start_date_local.isoformat(),
        "end_date": end_date_local.isoformat(),
        "daily": DAILY_VARS,
        "timezone": timezone,
    }

    responses = openmeteo.weather_api(ARCHIVE_URL, params=params)
    if not responses:
        raise RuntimeError("Open-Meteo returned no responses.")
    daily = responses[0].Daily()

    # Time()/TimeEnd() are interpreted as epoch seconds in UTC; Interval() is
    # the step length in seconds.
    t_start_utc = pd.to_datetime(daily.Time(), unit="s", utc=True)
    t_end_utc   = pd.to_datetime(daily.TimeEnd(), unit="s", utc=True)
    interval_s  = int(daily.Interval())

    # One timestamp per step (end exclusive), converted to the requested time
    # zone and reduced to the local calendar date.
    time_utc = pd.date_range(start=t_start_utc, end=t_end_utc,
                             freq=pd.Timedelta(seconds=interval_s), inclusive="left")
    date_local = time_utc.tz_convert(timezone).normalize().date  # array of date objects

    daily_data = {"date": pd.Series(date_local)}

    # Relies on the response listing variables in the order of DAILY_VARS.
    for i, key in enumerate(DAILY_VARS):
        daily_data[key] = daily.Variables(i).ValuesAsNumpy()

    df = pd.DataFrame(daily_data)

    # Final clamp (defensive)
    in_window = (df["date"] >= start_date_local) & (df["date"] <= end_date_local)
    df = df[in_window].reset_index(drop=True)

    saved_paths: List[str] = []
    if df.empty:
        return saved_paths

    fname = make_location_filename(location_meta, ext="json")
    year_dir = os.path.join(base_dir, f"{year:04d}")

    # All rows belong to `year` after the clamp, so grouping by month alone is
    # enough. Grouping on a derived key avoids adding helper columns to df.
    month_key = pd.to_datetime(df["date"]).dt.month
    for m, group in df.groupby(month_key, sort=True):
        month_df = group.reset_index(drop=True)

        # <base_dir>/YYYY/Mon/ with 'Jan', 'Feb', ..., 'Dec'
        month_dir = os.path.join(year_dir, calendar.month_abbr[int(m)])
        os.makedirs(month_dir, exist_ok=True)

        fpath = os.path.join(month_dir, fname)
        month_df.to_json(fpath, orient="records", date_format="iso")
        saved_paths.append(fpath)

    return saved_paths

In [None]:
# Load the location records from the JSONL file; the bare `len(locs)`
# displays how many were read.
locs = load_jsonl("mandies_20250907.jsonl")
len(locs)

In [None]:
# Preview the file name generated for the first location record.
make_location_filename(locs[0])

In [None]:
# Inspect the first location record.
locs[0]

In [None]:
# Fetch and save the 2025 archive data for the first location.
# Note: save_openmeteo_month_json is annotated to return a list of saved file
# paths (one per month), so `out_path` holds a list rather than a single path.
client = build_openmeteo_client()
out_path = save_openmeteo_month_json(
        location_meta=locs[0],
        year=2025,
        base_dir="../data/meteo",
        client = client
    )