In [28]:
import os, re, json, math, hashlib, requests, calendar
from datetime import datetime, date, timedelta
import pandas as pd
from typing import List

# ! pip install openmeteo_requests requests_cache retry_requests
import openmeteo_requests
import requests_cache
from retry_requests import retry


In [2]:
def save_jsonl(data: list, file_path: str) -> list:
    """Save a list of dicts to JSONL."""
    with open(file_path, "w", encoding="utf-8") as f:
        for obj in data:
            f.write(json.dumps(obj, ensure_ascii=False) + "\n")


In [3]:
def load_jsonl(file_path: str) -> list:
    """Load a JSONL file and return a list of locations (strings)."""
    records = []
    with open(file_path, "r", encoding="utf-8") as f:
        for line in f:
            if line.strip():  # skip empty lines
                obj = json.loads(line)
                records.append(obj)
    return records

In [4]:
def make_location_filename(data: dict, ext: str = 'json') -> str:
    """
    Generate a unique, filesystem-safe filename for location data.
    
    Parameters
    ----------
    data : dict
        Dictionary containing fields like country_code, state_name, district_name, market_name,
        latitude, longitude, etc.
    ext : str, optional
        File extension (default: 'jsonl').
        
    Returns
    -------
    str
        Unique filename string.
    """
    # Helper to normalize names
    def normalize(text: str) -> str:
        text = text.lower().strip()
        text = re.sub(r"\s+", "_", text)      # spaces -> underscores
        text = re.sub(r"[^a-z0-9_]", "", text)  # keep only safe chars
        return text

    # Extract and normalize key parts
    country   = normalize(data.get("country_code", "xx"))
    state     = normalize(data.get("state_name", "unknown"))
    district  = normalize(data.get("district_name", "unknown"))
    
    # Prefer market_name, fallback to apmc_name
    market_raw = data.get("market_name") or data.get("apmc_name") or "unknown"
    market = normalize(market_raw)
    
    # Lat/long rounded to 3 decimals for disambiguation
    lat = round(float(data.get("latitude", 0.0)), 3)
    lon = round(float(data.get("longitude", 0.0)), 3)

    # Short hash from resolved_name+lat+lon for uniqueness
    unique_str = f"{data.get('resolved_name', '')}{lat}{lon}"
    hash_part = hashlib.sha256(unique_str.encode()).hexdigest()[:8]

    # Build filename
    filename = f"{country}_{state}_{district}_{market}_{lat}_{lon}_{hash_part}.{ext}"
    return filename

In [12]:
def build_openmeteo_client(cache_path: str = ".cache") -> openmeteo_requests.Client:
    cache_session = requests_cache.CachedSession(cache_path, expire_after=-1)
    retry_session = retry(cache_session, retries=5, backoff_factor=0.2)
    return openmeteo_requests.Client(session=retry_session)

In [36]:
def save_openmeteo_month_json(
    location_meta: dict,
    year: int,
    base_dir: str = "../data/meteo",
    timezone: str = "Asia/Kolkata",
    client: openmeteo_requests.Client | None = None,   
) -> List[str]:
    """
    Fetch daily Open-Meteo archive for a given lat/lon and month, save as JSON,
    and return the saved file path.
    """

    if "latitude" not in location_meta or "longitude" not in location_meta:
        raise ValueError("location_meta must include 'latitude' and 'longitude'.")

    lat = float(location_meta["latitude"])
    lon = float(location_meta["longitude"])

    
    # Compute date range
    
    # Compute intended local date window for the year
    start_date_local = date(year, 1, 1)
    end_date_local   = date(year, 12, 31)

    # Sanity clamp: do not request beyond "today" (local)
    today_local = datetime.now().date()
    yesterday_local = today_local - timedelta(days=3)
    
    if end_date_local > yesterday_local:
        end_date_local = yesterday_local

    # If the entire year is in the future -> nothing to fetch
    if start_date_local > today_local:
        return []
        
    # Setup Open-Meteo client with cache + retries
    openmeteo = client or build_openmeteo_client()

    # url = "https://archive-api.open-meteo.com/v1/archive"
    url = "https://api.open-meteo.com/v1/forecast"
    
    # Keep the same variables/order as your snippet
    daily_vars = [
        "temperature_2m_mean", "temperature_2m_max", "temperature_2m_min",
        "cloud_cover_mean",
        "relative_humidity_2m_max", "relative_humidity_2m_min", "relative_humidity_2m_mean",
        "wind_speed_10m_max", "wind_speed_10m_min", "wind_speed_10m_mean", 
        "wet_bulb_temperature_2m_min", "wet_bulb_temperature_2m_max", "wet_bulb_temperature_2m_mean",
        "wind_direction_10m_dominant",
        "rain_sum", "precipitation_sum"
    ]

    params = {
        "latitude": lat,
        "longitude": lon,
        # "start_date": start_date_local.isoformat(),
        # "end_date": end_date_local.isoformat(),
        "start_date" : "2025-06-07",
        "end_date": "2025-09-23",
        "daily": daily_vars,
        "timezone": timezone,
    }

    responses = openmeteo.weather_api(url, params=params)
    if not responses:
        raise RuntimeError("Open-Meteo returned no responses.")
    response = responses[0]
    
    # Daily block (order must match daily_vars)
    daily = response.Daily()
    
    # Build UTC series -> convert to local tz -> take local calendar date
    t_start_utc = pd.to_datetime(daily.Time(), unit="s", utc=True)
    t_end_utc   = pd.to_datetime(daily.TimeEnd(), unit="s", utc=True)
    interval_s  = int(daily.Interval())

    # Create UTC range, then convert to local tz and take just the calendar date
    time_utc = pd.date_range(start=t_start_utc, end=t_end_utc,
                             freq=pd.Timedelta(seconds=interval_s), inclusive="left")
    time_local = time_utc.tz_convert(timezone)
    date_local = time_local.normalize().date  # array of python date objects

    daily_data = {"date": pd.Series(date_local)}


    # Assign in the same order
    for i, key in enumerate(daily_vars):
        daily_data[key] = daily.Variables(i).ValuesAsNumpy()

    df = pd.DataFrame(daily_data)
    
    # Final clamp (defensive)
    df = df[(df["date"] >= start_date_local) & (df["date"] <= end_date_local)].reset_index(drop=True)

    
    # Partition by month and write
    saved_paths: List[str] = []
    fname = make_location_filename(location_meta, ext="json")

    if df.empty:
        return saved_paths

    # Add Year/Month columns to group easily
    df["_Y"] = pd.to_datetime(df["date"]).dt.year
    df["_M"] = pd.to_datetime(df["date"]).dt.month

    print(df)
    for m in sorted(df["_M"].unique()):
        month_df = df[df["_M"] == m].drop(columns=["_Y", "_M"]).reset_index(drop=True)

        # Prepare directories ./data/meteo/YYYY/MM/
        year_dir = os.path.join(base_dir, f"{year:04d}")
        month_name = calendar.month_abbr[m]  # 'Jan', 'Feb', ..., 'Dec'
        month_dir = os.path.join(year_dir, month_name)
        os.makedirs(month_dir, exist_ok=True)

        fpath = os.path.join(month_dir, fname)
        month_df.to_json(fpath, orient="records", date_format="iso")
        saved_paths.append(fpath)

    return saved_paths

In [6]:
locs = load_jsonl("mandies_20250907.jsonl")
len(locs)

3049

In [8]:
make_location_filename(locs[0])

'in_andhra_pradesh_kurnool_kurnool_15.831_78.043_9cac68de.json'

In [9]:
locs[0]

{'state_name': 'Andhra Pradesh',
 'district_name': 'Kurnool',
 'market_name': 'Kurnool',
 'resolved_name': 'Kurnool, Andhra Pradesh, 518001, India',
 'latitude': 15.8309251,
 'longitude': 78.0425373,
 'country_code': 'IN',
 'place_class': 'place',
 'place_type': 'city',
 'importance': 1,
 'source': 'locationiq',
 'resolution_strategy': 'name'}

In [37]:
client = build_openmeteo_client()
out_path = save_openmeteo_month_json(
        location_meta=locs[0],
        year=2025,
        base_dir="../data/meteo",
        client = client
    )

          date  temperature_2m_mean  temperature_2m_max  temperature_2m_min  \
0   2025-06-07                  NaN                 NaN                 NaN   
1   2025-06-08                  NaN                 NaN                 NaN   
2   2025-06-09                  NaN                 NaN                 NaN   
3   2025-06-10                  NaN                 NaN                 NaN   
4   2025-06-11                  NaN                 NaN                 NaN   
..         ...                  ...                 ...                 ...   
87  2025-09-02            26.538164           29.421501           24.521502   
88  2025-09-03            27.063169           31.071501           24.521502   
89  2025-09-04            27.790247           32.221500           24.321501   
90  2025-09-05            28.117334           32.521500           23.871500   
91  2025-09-06            28.152754           32.771500           23.621500   

    cloud_cover_mean  relative_humidity_2m_max  rel