# In [None]:
import pandas as pd
import requests
import time
import os

# 1. Configuration & NASA POWER API Helper
def fetch_nasa_weather(lat, lon, start_date="20180101", end_date="20241231",
                       max_retries=5, retry_wait=60, session=None):
    """
    Fetch daily weather data for one point from the NASA POWER API.

    Requests the T2M, RH2M and PRECTOTCORR parameters (AG community) and
    returns them as one row per day.

    Args:
        lat: Latitude of the point.
        lon: Longitude of the point.
        start_date: First day to fetch, in YYYYMMDD format.
        end_date: Last day to fetch, in YYYYMMDD format.
        max_retries: How many times to retry after an HTTP 429 response
            before giving up on this point.
        retry_wait: Seconds to wait before each retry after an HTTP 429.
        session: Optional object exposing a requests-compatible
            ``get(url, params=..., timeout=...)`` (e.g. a
            ``requests.Session`` for connection reuse). Defaults to the
            ``requests`` module itself.

    Returns:
        A DataFrame with columns ``date``, ``temp``, ``humidity``, ``rain``,
        ``latitude`` and ``longitude``, or ``None`` if the request failed,
        the rate limit persisted, or the response could not be parsed.
    """
    url = "https://power.larc.nasa.gov/api/temporal/daily/point"
    params = {
        "latitude": lat,
        "longitude": lon,
        "start": start_date,
        "end": end_date,
        "community": "AG",  # AG = Agriculture
        "parameters": "T2M,RH2M,PRECTOTCORR",
        "format": "JSON",
    }
    http = requests if session is None else session

    try:
        # Bounded retry loop: the rate-limit path must not recurse without
        # limit, otherwise a persistent 429 never terminates.
        retries_left = max(0, max_retries)
        while True:
            response = http.get(url, params=params, timeout=60)
            if response.status_code != 429:
                break
            if retries_left == 0:
                print(f"Skipping {lat}, {lon} - rate limit persisted after {max_retries} retries")
                return None
            retries_left -= 1
            print(f"Rate limit reached. Waiting {retry_wait}s...")
            time.sleep(retry_wait)

        if response.status_code != 200:
            print(f"Skipping {lat}, {lon} - API Error: {response.status_code}")
            return None

        # NASA POWER structure: properties -> parameter -> {PARAM} -> {DATE: VALUE}
        params_data = response.json()['properties']['parameter']

        # Build each column as a date-indexed Series so values are aligned
        # by date rather than by the position of keys in each mapping.
        weather = pd.DataFrame({
            "temp": pd.Series(params_data['T2M'], dtype="float64"),
            "humidity": pd.Series(params_data['RH2M'], dtype="float64"),
            "rain": pd.Series(params_data['PRECTOTCORR'], dtype="float64"),
        })

        # NASA uses -999 for missing data; replace with NaN. Only the weather
        # columns are masked, and float NaN keeps them numeric (pd.NA would
        # upcast the columns to object dtype).
        weather = weather.mask(weather == -999)

        df = weather.rename_axis("date").reset_index()
        df["date"] = pd.to_datetime(df["date"], format='%Y%m%d')
        df["latitude"] = lat
        df["longitude"] = lon
        return df

    except Exception as e:
        # Deliberate best-effort boundary: one failing location is reported
        # and skipped so the caller's batch loop can continue.
        print(f"Error at {lat}, {lon}: {e}")
        return None

# 2. Load Data
# Note: Ensure the path matches your local environment
csv_path = "mandi_coordinates (1).csv"
df_locations = pd.read_csv(csv_path).dropna(subset=['latitude', 'longitude'])
output_filename = "historical_weather_nasa_2018_2024.parquet"

# 3. Execution Loop with Checkpointing
# Resume support: collect the (mandi, latitude, longitude) keys already present
# in the output file so a restarted run skips them instead of appending
# duplicate rows.
completed = set()
if os.path.exists(output_filename):
    saved_keys = pd.read_parquet(
        output_filename,
        engine='fastparquet',
        columns=['mandi', 'latitude', 'longitude'],
    ).drop_duplicates()
    completed = set(saved_keys.itertuples(index=False, name=None))

total = len(df_locations)
failed = []
print(f"Starting data collection for {total} locations...")

# enumerate() gives a true 1-based position; the DataFrame index labels are
# no longer contiguous after dropna(), so they cannot be used for progress.
for position, (_, row) in enumerate(df_locations.iterrows(), start=1):
    location_key = (row['mandi'], row['latitude'], row['longitude'])
    if location_key in completed:
        print(f"[{position}/{total}] Already saved, skipping: {row['mandi']}")
        continue

    # Progress indicator
    print(f"[{position}/{total}] Fetching NASA data for: {row['mandi']} ({row['latitude']}, {row['longitude']})")

    df_result = fetch_nasa_weather(row['latitude'], row['longitude'])

    if df_result is None:
        failed.append(row['mandi'])
    else:
        # Add mandi name to the results
        df_result['mandi'] = row['mandi']

        # SAVE IMMEDIATELY: create the file on the first successful fetch,
        # append to it afterwards.
        df_result.to_parquet(
            output_filename,
            engine='fastparquet',
            index=False,
            append=os.path.exists(output_filename),
        )
        completed.add(location_key)

    # Respectful delay for NASA API (approx 1.5 - 2 seconds)
    time.sleep(1.5)

if failed:
    print(f"Process complete with {len(failed)} failed location(s): {failed}")
    print(f"Re-run to retry them. Data collected so far is saved in {output_filename}")
else:
    print(f"Success! Process complete. Data is saved in {output_filename}")