# Anameka South - ACCESS CM2 Daily Data Extraction

This notebook extracts daily time series from ACCESS CM2 NetCDF files at the grid point nearest to a target coordinate, and exports one CSV file per variable and scenario.

## Variables: tasmax, tasmin, pr
## Scenarios: SSP585, SSP245


In [None]:
import pandas as pd
import numpy as np
import xarray as xr
import os
from pathlib import Path
import glob
from tqdm import tqdm
import time

# ---------------------------------------------------------------------------
# Notebook configuration: input/output locations and what to extract
# ---------------------------------------------------------------------------

# Scenarios and variables handled by the extraction cells
SCENARIOS = ['SSP585', 'SSP245']
VARIABLES = ['tasmax', 'tasmin', 'pr']

# Maximum accepted offset (in degrees) between the requested coordinate and
# the grid point that is actually used
COORD_TOLERANCE = 0.01  # Approximately 1.1 km

# Root folders holding the NetCDF inputs, one per scenario
SSP245_DIR = r"C:\Users\ibian\Desktop\ClimAdapt\CMIP6\ACCESS CM2 SSP245"
SSP585_DIR = r"C:\Users\ibian\Desktop\ClimAdapt\CMIP6\ACCESS CM2 SSP585"

# Folder that receives the exported CSV files; created up front if missing
OUTPUT_DIR = r"C:\Users\ibian\Desktop\ClimAdapt\Anameka"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Echo the active settings so the run is self-documenting
print(
    "\n".join(
        [
            "Configuration loaded:",
            f"  - SSP585 directory: {SSP585_DIR}",
            f"  - SSP245 directory: {SSP245_DIR}",
            f"  - Output directory: {OUTPUT_DIR}",
            f"  - Variables: {', '.join(VARIABLES)}",
            f"  - Scenarios: {', '.join(SCENARIOS)}",
        ]
    )
)


## 2. Specify Target Coordinate

**Enter the latitude and longitude for the grid point you want to extract:**


In [None]:
# Grid point to extract, in degrees.
# NOTE(review): the longitude looks like the float32 representation of 117.6,
# presumably copied from the dataset's longitude axis - confirm.
TARGET_LON = 117.5999984741211   # Longitude (degrees)
TARGET_LAT = -31.75  # Latitude (degrees)

# Report the request together with the matching tolerance configured earlier
print(
    "\n".join(
        [
            "Target coordinate:",
            f"  Latitude: {TARGET_LAT}",
            f"  Longitude: {TARGET_LON}",
            f"  Tolerance: {COORD_TOLERANCE} degrees",
        ]
    )
)

In [None]:
def _resolve_variable_name(candidates, variable):
    """Return the entry of *candidates* that best matches *variable*, or None.

    Exact matches win over case-insensitive matches, which in turn win over
    substring matches, so that e.g. ``pr`` is not bound to an unrelated data
    variable that merely contains those letters and happens to come first.
    """
    names = list(candidates)
    if variable in names:
        return variable

    wanted = variable.lower()
    for name in names:
        if name.lower() == wanted:
            return name

    # Last resort: substring match in either direction (e.g. "tasmax_day")
    for name in names:
        lowered = name.lower()
        if wanted in lowered or lowered in wanted:
            return name
    return None


def extract_daily_data_from_netcdf(netcdf_dir, variable, target_lat, target_lon, tolerance=0.01):
    """
    Extract daily time series data for a specific coordinate from NetCDF files.

    The grid indices nearest to the target coordinate are determined once from
    the first file and reused for every other file. Dates are NOT decoded from
    the files: each file is assumed to hold one value per calendar day of the
    year given by the first 4-digit number in its file name. Files whose number
    of values does not match that calendar year are skipped with a warning.

    Parameters:
    -----------
    netcdf_dir : str
        Directory containing NetCDF files for the variable
    variable : str
        Variable name (tasmax, tasmin, or pr); also used as the file name
        filter ``*{variable}*.nc``
    target_lat : float
        Target latitude
    target_lon : float
        Target longitude
    tolerance : float
        Coordinate matching tolerance in degrees. Exceeding it only produces
        a warning; the nearest grid point is used regardless.

    Returns:
    --------
    pd.DataFrame or None
        DataFrame with columns: date, value (sorted by date, one row per
        date), or None when no files are found, the file structure cannot be
        determined, or no file yields data.
    """
    # Function-scope import: `re` is not among the notebook's top-level imports
    import re

    start_time = time.time()

    # Find all NetCDF files in the directory
    nc_files = sorted(glob.glob(os.path.join(netcdf_dir, f"*{variable}*.nc")))
    if not nc_files:
        print(f"  ERROR: No NetCDF files found in {netcdf_dir}")
        return None

    print(f"  Found {len(nc_files)} NetCDF files")

    # Structure discovered from the first file and reused for all others
    var_name = None
    lat_name = None
    lon_name = None
    lat_idx = None
    lon_idx = None

    try:
        # The context manager closes the file even if inspection fails
        with xr.open_dataset(nc_files[0], decode_times=False) as ds_sample:
            var_name = _resolve_variable_name(ds_sample.data_vars, variable)

            # Only one-dimensional coordinates can serve as index axes; the
            # first match wins so later look-alikes cannot override it
            for coord in ds_sample.coords:
                if ds_sample[coord].ndim != 1:
                    continue
                coord_lower = coord.lower()
                if lat_name is None and 'lat' in coord_lower:
                    lat_name = coord
                elif lon_name is None and 'lon' in coord_lower:
                    lon_name = coord

            if lat_name and lon_name:
                # Find nearest grid point (cache indices)
                lat_values = ds_sample[lat_name].values
                lon_values = ds_sample[lon_name].values
                lat_idx = np.abs(lat_values - target_lat).argmin()
                lon_idx = np.abs(lon_values - target_lon).argmin()

                actual_lat = float(lat_values[lat_idx])
                actual_lon = float(lon_values[lon_idx])

                # Check if within tolerance
                if abs(actual_lat - target_lat) > tolerance or abs(actual_lon - target_lon) > tolerance:
                    print(f"  Warning: Nearest point ({actual_lat:.4f}, {actual_lon:.4f}) is outside tolerance")
                else:
                    print(f"  Using grid point: ({actual_lat:.4f}, {actual_lon:.4f})")
    except Exception as e:
        # Best effort: a broken sample file is reported and handled below
        print(f"  Warning: Could not read sample file: {e}")

    if var_name is None or lat_idx is None or lon_idx is None:
        print(f"  ERROR: Could not determine coordinate structure")
        return None

    # Compiled once; the first 4-digit run in the file name is taken as the year
    year_pattern = re.compile(r'(\d{4})')
    point_selector = {lat_name: lat_idx, lon_name: lon_idx}

    # One DataFrame per successfully processed file
    all_data = []

    # Process all files with progress bar
    print(f"  Processing files...")
    for nc_file in tqdm(nc_files, desc=f"  {variable}", unit="file"):
        file_label = os.path.basename(nc_file)
        try:
            # Times are not decoded (speed); values are loaded into memory
            # before the file is closed by the context manager
            with xr.open_dataset(nc_file, decode_times=False) as ds:
                point = ds[var_name].isel(point_selector)
                values = np.asarray(point.values).ravel()

            year_match = year_pattern.search(file_label)
            if year_match:
                year = int(year_match.group(1))
                # Create daily dates for the year (handles leap years automatically)
                time_values = pd.date_range(start=f'{year}-01-01', end=f'{year}-12-31', freq='D')
            else:
                # Without a year every such file gets the same placeholder
                # dates, so only the first survives de-duplication: say so
                tqdm.write(
                    f"    Warning: no 4-digit year in {file_label}; "
                    f"dates assumed to start at 2000-01-01"
                )
                time_values = pd.date_range(start='2000-01-01', periods=len(values), freq='D')

            if len(values) != len(time_values):
                # Previously dropped silently; typical causes are partial
                # files or a non-standard (e.g. 360/365-day) calendar
                tqdm.write(
                    f"    Warning: skipping {file_label}: {len(values)} values "
                    f"but {len(time_values)} calendar days expected"
                )
                continue

            all_data.append(pd.DataFrame({'date': time_values, 'value': values}))

        except Exception as e:
            tqdm.write(f"    Error processing {file_label}: {e}")
            continue

    if not all_data:
        print(f"  ERROR: No data extracted")
        return None

    # Combine all data
    print(f"  Combining data from {len(all_data)} files...")
    combined_df = pd.concat(all_data, ignore_index=True)

    # Stable sort so that, for duplicate dates, the row from the earliest
    # file (in sorted file order) is the one kept below
    combined_df = combined_df.sort_values('date', kind='mergesort')
    combined_df = combined_df.drop_duplicates(subset='date', keep='first')
    combined_df = combined_df.reset_index(drop=True)

    elapsed_time = time.time() - start_time
    print(f"  ✓ Extracted {len(combined_df):,} daily records in {elapsed_time:.1f} seconds")
    print(f"  Date range: {combined_df['date'].min()} to {combined_df['date'].max()}")

    return combined_df


In [None]:
# Process each scenario and variable: extract the point time series and
# write one CSV per (variable, scenario) pair. Successful tasks are recorded
# in `results_summary`, keyed by "<variable>_<scenario>".
results_summary = {}
total_start_time = time.time()

# Base input directory for every supported scenario label
SCENARIO_DIRS = {
    'SSP585': SSP585_DIR,
    'SSP245': SSP245_DIR,
}

task_num = 0
total_tasks = len(SCENARIOS) * len(VARIABLES)

print(f"\n{'='*70}")
print(f"STARTING EXTRACTION PROCESS")
print(f"{'='*70}")
# Counts are derived from the configuration rather than hard-coded
print(f"Total tasks: {total_tasks} ({len(SCENARIOS)} scenarios × {len(VARIABLES)} variables)")
print(f"Target coordinate: ({TARGET_LAT}, {TARGET_LON})")
print(f"{'='*70}\n")

for scenario in SCENARIOS:
    print(f"\n{'='*70}")
    print(f"Processing Scenario: {scenario}")
    print(f"{'='*70}")

    # Select directory based on scenario
    base_dir = SCENARIO_DIRS.get(scenario)
    if base_dir is None:
        print(f"  ERROR: Unknown scenario {scenario}")
        continue

    for variable in VARIABLES:
        task_num += 1
        print(f"\n{'-'*70}")
        print(f"Task {task_num}/{total_tasks}: Processing {variable} ({scenario})")
        print(f"{'-'*70}")

        task_start_time = time.time()

        # Input folders follow the pattern "<variable>_ACCESS CM2 <scenario>"
        var_dir = os.path.join(base_dir, f"{variable}_ACCESS CM2 {scenario}")

        if not os.path.isdir(var_dir):
            print(f"  ERROR: Directory not found: {var_dir}")
            continue

        # Extract daily data
        daily_data = extract_daily_data_from_netcdf(
            var_dir,
            variable,
            TARGET_LAT,
            TARGET_LON,
            tolerance=COORD_TOLERANCE
        )

        if daily_data is None or len(daily_data) == 0:
            print(f"  WARNING: No data extracted for {variable} ({scenario})")
            continue

        # Prepare output filename
        output_filename = f"Anameka South_ACCESS CM2_{variable}_{scenario}.csv"
        output_path = os.path.join(OUTPUT_DIR, output_filename)

        # Export to CSV (keep only date and value columns for tidy format)
        print(f"  Exporting to CSV...")
        output_df = daily_data[['date', 'value']].copy()
        output_df.to_csv(
            output_path,
            index=False,
            encoding='utf-8',
            float_format='%.6f'
        )

        task_elapsed = time.time() - task_start_time
        print(f"  ✓ Exported to CSV: {os.path.basename(output_path)}")
        print(f"  ✓ Rows: {len(output_df):,} | Time: {task_elapsed:.1f}s")

        # Store summary (only tasks that actually produced a CSV get an entry)
        key = f"{variable}_{scenario}"
        results_summary[key] = {
            'rows': len(output_df),
            'date_range': f"{output_df['date'].min()} to {output_df['date'].max()}",
            'output_file': output_filename,
            'time_seconds': task_elapsed
        }

total_elapsed = time.time() - total_start_time
print(f"\n{'='*70}")
print(f"ALL TASKS COMPLETED")
print(f"{'='*70}")
# Make skipped or failed tasks visible instead of implying full success
print(f"Successful tasks: {len(results_summary)}/{total_tasks}")
print(f"Total processing time: {total_elapsed:.1f} seconds ({total_elapsed/60:.1f} minutes)")
print(f"{'='*70}")


## 5. Summary Statistics


In [None]:
# Final report: one entry per task recorded in `results_summary`, followed by
# an overall status that reflects whether every configured task produced a CSV.
expected_exports = len(SCENARIOS) * len(VARIABLES)

print("\n" + "="*70)
print("EXTRACTION SUMMARY")
print("="*70)
print(f"\nTarget coordinate: ({TARGET_LAT}, {TARGET_LON})")
print(f"\nFiles processed:")
if not results_summary:
    print("\n  (none)")
for key, summary in results_summary.items():
    # Keys have the form "<variable>_<scenario>"; split on the first "_" only
    var, scen = key.split('_', 1)
    print(f"\n  {var.upper()} ({scen}):")
    print(f"      Rows: {summary['rows']}")
    print(f"      Date range: {summary['date_range']}")
    print(f"      Output file: {summary['output_file']}")

print(f"\nOutput directory: {OUTPUT_DIR}")
# Only claim success when every scenario/variable combination was exported
if len(results_summary) == expected_exports:
    print("\nAll CSV files exported successfully!")
else:
    print(
        f"\nWARNING: only {len(results_summary)} of {expected_exports} "
        f"CSV files were exported; check the extraction log for errors."
    )
