# CHIRPS Data Downloader and Slicer

This notebook downloads CHIRPS precipitation data for specific coordinates and date ranges, then extracts the values into a pandas DataFrame.

## 📦 Installation Note for Windows Users

**GDAL installation on Windows can be tricky!** Here are your options:

### Option 1: Use Conda (Recommended)
```bash
conda install -c conda-forge gdal
```

### Option 2: Download Pre-compiled Wheel
1. Visit: https://www.lfd.uci.edu/~gohlke/pythonlibs/#gdal
2. Download the wheel for your Python version (e.g., `GDAL-3.x.x-cp311-cp311-win_amd64.whl`)
3. Install: `pip install path\to\GDAL-3.x.x-cp311-cp311-win_amd64.whl`

### Option 3: Use Alternative Libraries (Below)
We provide an alternative implementation using `xarray` and `rioxarray` that's easier to install on Windows.

## Alternative Implementation (Windows-Friendly)

This version uses `xarray` instead of GDAL, which is much easier to install on Windows.

In [15]:
# Alternative imports - Windows-friendly (no GDAL required)
import os
import sys
import requests
import pandas as pd
import numpy as np
import xarray as xr
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

print("✅ All packages imported successfully!")
print(f"xarray version: {xr.__version__}")
print(f"pandas version: {pd.__version__}")

✅ All packages imported successfully!
xarray version: 2025.9.1
pandas version: 2.3.1


In [16]:
# Alternative extract function using xarray (no GDAL needed)
def extract_chirps_data_xarray(nc_files, lat, lon, start_date, end_date):
    """
    Extract CHIRPS precipitation data using xarray (Windows-friendly, no GDAL)
    
    Parameters:
    -----------
    nc_files : list
        List of NetCDF file paths
    lat : float
        Latitude of the point
    lon : float
        Longitude of the point
    start_date : datetime
        Start date for extraction
    end_date : datetime
        End date for extraction
    
    Returns:
    --------
    pandas.Series : Time series of precipitation values with dates as index
    """
    nc_files.sort()  # Sort files chronologically
    
    all_data = []
    
    for nc_file in nc_files:
        if not nc_file.endswith('.nc'):
            continue
        
        print(f'Processing {os.path.basename(nc_file)}...')
        
        try:
            # Open NetCDF file with xarray
            ds = xr.open_dataset(nc_file)
            
            # Select nearest point to lat/lon
            ds_point = ds.sel(latitude=lat, longitude=lon, method='nearest')
            
            # Extract precipitation variable (usually 'precip' or 'precipitation')
            precip_var = None
            for var_name in ['precip', 'precipitation', 'prec', 'rain']:
                if var_name in ds_point.data_vars:
                    precip_var = var_name
                    break
            
            if precip_var is None:
                print(f"  Warning: Could not find precipitation variable in {nc_file}")
                print(f"  Available variables: {list(ds_point.data_vars)}")
                continue
            
            # Convert to DataFrame
            df_temp = ds_point[precip_var].to_dataframe(name='precip')
            
            # Filter by date range
            df_temp = df_temp[(df_temp.index >= start_date) & (df_temp.index <= end_date)]
            
            all_data.append(df_temp)
            
            ds.close()
            
        except Exception as e:
            print(f"  Error processing {nc_file}: {e}")
            continue
    
    if not all_data:
        print("⚠️ No data extracted!")
        return pd.Series(dtype=float)
    
    # Combine all data
    combined_df = pd.concat(all_data)
    
    # Convert to Series and handle duplicates
    precip_series = combined_df['precip']
    
    # Remove duplicates (keep first occurrence)
    if precip_series.index.duplicated().any():
        precip_series = precip_series[~precip_series.index.duplicated(keep='first')]
    
    # Sort by date
    precip_series = precip_series.sort_index()
    
    # Replace fill values with NaN
    precip_series = precip_series.replace(-9999.0, np.nan)
    
    return precip_series

In [None]:
# Visualization function
import matplotlib.pyplot as plt

def plot_chirps_data(df, title='CHIRPS Precipitation Data'):
    """
    Plot CHIRPS precipitation time series
    """
    if len(df) == 0:
        print("No data to plot!")
        return
    
    fig, ax = plt.subplots(figsize=(14, 6))
    
    for column in df.columns:
        ax.plot(df.index, df[column], marker='o', markersize=4, label=column, linewidth=1.5)
    
    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('Precipitation (mm)', fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.legend(loc='best')
    ax.grid(True, alpha=0.3)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

print("✅ Visualization function defined!")

NameError: name 'df_chirps' is not defined

In [None]:
# Test with a single location - Ames, Iowa (your project location)
coordinates_test = [
    {'id': 'Ames_IA', 'lat': 42.0, 'lon': -93.5}
]

# Get data for January 2024
start = '2024-01-01'
end = '2024-01-31'

print("🧪 Testing CHIRPS data extraction (xarray version, no GDAL needed)...\n")
df_chirps = get_chirps_dataframe_xarray(coordinates_test, start, end)

if len(df_chirps) > 0:
    print("\n📊 Sample data:")
    print(df_chirps.head(10))
    print("\n📈 Statistics:")
    print(df_chirps.describe())
else:
    print("\n⚠️ No data retrieved. Check download status above.")

## 🚀 Test the Alternative Implementation

Let's test the xarray-based functions with a sample location.

In [None]:
# Main function using xarray
def get_chirps_dataframe_xarray(coordinates, start_date, end_date, data_dir='./chirps_data'):
    """
    Download and extract CHIRPS data using xarray (Windows-friendly, no GDAL)
    
    Parameters:
    -----------
    coordinates : list of tuples or dict
        List of (lat, lon) tuples or dict with 'lat' and 'lon' keys
        Example: [(28.5, -81.5), (29.0, -82.0)]
        Or: [{'id': 'Point1', 'lat': 28.5, 'lon': -81.5}]
    start_date : str or datetime
        Start date (format: 'YYYY-MM-DD' or datetime object)
    end_date : str or datetime
        End date (format: 'YYYY-MM-DD' or datetime object)
    data_dir : str
        Base directory for storing downloaded files
    
    Returns:
    --------
    pandas.DataFrame : DataFrame with dates as index and locations as columns
                      (precipitation values in mm)
    """
    # Convert dates to datetime objects if needed
    if isinstance(start_date, str):
        start_date = datetime.strptime(start_date, '%Y-%m-%d')
    if isinstance(end_date, str):
        end_date = datetime.strptime(end_date, '%Y-%m-%d')
    
    print(f"Fetching CHIRPS data from {start_date.date()} to {end_date.date()}")
    print("="*70)
    
    # Download corrected data
    print("\n📥 Downloading CHIRPS corrected data...")
    corrected_dir = os.path.join(data_dir, 'corrected')
    corrected_files = download_chirps_corrected(start_date, end_date, corrected_dir)
    
    # For simplicity, we'll just use corrected data for now
    # You can add preliminary data logic if needed
    all_files = corrected_files
    
    if not all_files:
        print("❌ No data files available!")
        return pd.DataFrame()
    
    # Extract data for each coordinate
    print("\n🔍 Extracting precipitation data for coordinates...")
    print("="*70)
    
    df_dict = {}
    
    for idx, coord in enumerate(coordinates):
        # Parse coordinate
        if isinstance(coord, dict):
            lat = coord['lat']
            lon = coord['lon']
            coord_id = coord.get('id', f'Point_{idx+1}')
        else:
            lat, lon = coord
            coord_id = f'Point_{idx+1}'
        
        print(f"\nProcessing {coord_id}: Lat={lat}, Lon={lon}")
        
        # Extract data using xarray
        precip_series = extract_chirps_data_xarray(all_files, lat, lon, start_date, end_date)
        
        df_dict[coord_id] = precip_series
        print(f"  ✓ Extracted {len(precip_series)} days of data")
    
    # Create DataFrame
    df = pd.DataFrame(df_dict)
    
    print("\n" + "="*70)
    print(f"✅ Complete! DataFrame shape: {df.shape}")
    if len(df) > 0:
        print(f"   Date range: {df.index[0].date()} to {df.index[-1].date()}")
    print(f"   Locations: {len(df.columns)}")
    
    return df

In [18]:
# Download functions (same as before, no GDAL dependency)
def download_chirps_corrected(start_date, end_date, output_dir='./chirps_data/corrected'):
    """
    Download corrected CHIRPS data (monthly basis)
    """
    os.makedirs(output_dir, exist_ok=True)
    
    s = requests.Session()
    s.mount("https://data.chc.ucsb.edu", requests.adapters.HTTPAdapter(max_retries=10))
    
    diff_month = (end_date.year - start_date.year) * 12 + (end_date.month - start_date.month)
    downloaded_files = []
    
    for n in range(diff_month + 1):
        yymm = start_date + relativedelta(months=+n)
        yy = yymm.strftime("%Y")
        mm = yymm.strftime("%m")
        
        filename = f'chirps-v2.0.{yy}.{mm}.days_p05.nc'
        filepath = os.path.join(output_dir, filename)
        
        if os.path.exists(filepath):
            print(f'{filename} already exists. Skipping download.')
            downloaded_files.append(filepath)
            continue
        
        try:
            url = f'https://data.chc.ucsb.edu/products/CHIRPS-2.0/global_daily/netcdf/p05/by_month/{filename}'
            print(f'Downloading {filename}...')
            response = s.get(url, timeout=80)
            response.raise_for_status()
            
            with open(filepath, 'wb') as f:
                f.write(response.content)
            print(f'{filename} downloaded successfully.')
            downloaded_files.append(filepath)
            
        except requests.exceptions.HTTPError as err:
            print(f'Failed to download {filename}: {err}')
        finally:
            if 'response' in locals():
                response.close()
    
    return downloaded_files


def download_chirps_preliminary(start_date, end_date, output_dir='./chirps_data/preliminary'):
    """
    Download preliminary CHIRPS data (yearly basis)
    """
    os.makedirs(output_dir, exist_ok=True)
    
    s = requests.Session()
    s.mount("https://data.chc.ucsb.edu", requests.adapters.HTTPAdapter(max_retries=10))
    
    downloaded_files = []
    
    for year in range(start_date.year, end_date.year + 1):
        filename = f'chirps-v2.0.{year}.days_p05.nc'
        filepath = os.path.join(output_dir, filename)
        
        if os.path.exists(filepath):
            print(f'{filename} already exists. Skipping download.')
            downloaded_files.append(filepath)
            continue
        
        try:
            url = f'https://data.chc.ucsb.edu/products/CHIRPS-2.0/prelim/global_daily/fixed/netcdf/{filename}'
            print(f'Downloading {filename}...')
            response = s.get(url, timeout=80)
            response.raise_for_status()
            
            with open(filepath, 'wb') as f:
                f.write(response.content)
            print(f'{filename} downloaded successfully.')
            downloaded_files.append(filepath)
            
        except requests.exceptions.HTTPError as err:
            print(f'Failed to download {filename}: {err}')
        finally:
            if 'response' in locals():
                response.close()
    
    return downloaded_files

In [None]:
import os
import sys
import requests
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta
from osgeo import gdal
from osgeo.gdalconst import GA_ReadOnly

# Register GDAL drivers
gdal.AllRegister()

ModuleNotFoundError: No module named 'osgeo'

## Step 1: Download CHIRPS NetCDF Files

Download corrected and preliminary CHIRPS data for the specified date range.

In [None]:
def download_chirps_corrected(start_date, end_date, output_dir='./chirps_data/corrected'):
    """
    Download corrected CHIRPS data (monthly basis)
    
    Parameters:
    -----------
    start_date : datetime
        Start date for data download
    end_date : datetime
        End date for data download
    output_dir : str
        Directory to save downloaded NetCDF files
    
    Returns:
    --------
    list : List of downloaded file paths
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    s = requests.Session()
    s.mount("https://data.chc.ucsb.edu", requests.adapters.HTTPAdapter(max_retries=10))
    
    diff_month = (end_date.year - start_date.year) * 12 + (end_date.month - start_date.month)
    downloaded_files = []
    
    for n in range(diff_month + 1):
        yymm = start_date + relativedelta(months=+n)
        yy = yymm.strftime("%Y")
        mm = yymm.strftime("%m")
        
        filename = f'chirps-v2.0.{yy}.{mm}.days_p05.nc'
        filepath = os.path.join(output_dir, filename)
        
        # Skip if file already exists
        if os.path.exists(filepath):
            print(f'{filename} already exists. Skipping download.')
            downloaded_files.append(filepath)
            continue
        
        try:
            url = f'https://data.chc.ucsb.edu/products/CHIRPS-2.0/global_daily/netcdf/p05/by_month/{filename}'
            print(f'Downloading {filename}...')
            response = s.get(url, timeout=80)
            response.raise_for_status()
            
            with open(filepath, 'wb') as f:
                f.write(response.content)
            print(f'{filename} downloaded successfully.')
            downloaded_files.append(filepath)
            
        except requests.exceptions.HTTPError as err:
            print(f'Failed to download {filename}: {err}')
        finally:
            if 'response' in locals():
                response.close()
    
    return downloaded_files


def download_chirps_preliminary(start_date, end_date, output_dir='./chirps_data/preliminary'):
    """
    Download preliminary CHIRPS data (yearly basis)
    
    Parameters:
    -----------
    start_date : datetime
        Start date for data download
    end_date : datetime
        End date for data download
    output_dir : str
        Directory to save downloaded NetCDF files
    
    Returns:
    --------
    list : List of downloaded file paths
    """
    # Create output directory if it doesn't exist
    os.makedirs(output_dir, exist_ok=True)
    
    s = requests.Session()
    s.mount("https://data.chc.ucsb.edu", requests.adapters.HTTPAdapter(max_retries=10))
    
    downloaded_files = []
    
    for year in range(start_date.year, end_date.year + 1):
        filename = f'chirps-v2.0.{year}.days_p05.nc'
        filepath = os.path.join(output_dir, filename)
        
        # Skip if file already exists
        if os.path.exists(filepath):
            print(f'{filename} already exists. Skipping download.')
            downloaded_files.append(filepath)
            continue
        
        try:
            url = f'https://data.chc.ucsb.edu/products/CHIRPS-2.0/prelim/global_daily/fixed/netcdf/{filename}'
            print(f'Downloading {filename}...')
            response = s.get(url, timeout=80)
            response.raise_for_status()
            
            with open(filepath, 'wb') as f:
                f.write(response.content)
            print(f'{filename} downloaded successfully.')
            downloaded_files.append(filepath)
            
        except requests.exceptions.HTTPError as err:
            print(f'Failed to download {filename}: {err}')
        finally:
            if 'response' in locals():
                response.close()
    
    return downloaded_files

## Step 2: Extract CHIRPS Data at Specific Coordinates

Extract precipitation values from NetCDF files for given lat/lon coordinates.

In [None]:
def extract_chirps_data(nc_files, lat, lon, start_date, end_date):
    """
    Extract CHIRPS precipitation data for specific coordinates and date range
    
    Parameters:
    -----------
    nc_files : list
        List of NetCDF file paths
    lat : float
        Latitude of the point
    lon : float
        Longitude of the point
    start_date : datetime
        Start date for extraction
    end_date : datetime
        End date for extraction
    
    Returns:
    --------
    pandas.Series : Time series of precipitation values with dates as index
    """
    nc_files.sort()  # Sort files chronologically
    
    precip_data = {}
    
    for nc_file in nc_files:
        if not nc_file.endswith('.nc'):
            continue
        
        print(f'Processing {os.path.basename(nc_file)}...')
        
        # Open NetCDF file
        dataset = gdal.Open(nc_file, GA_ReadOnly)
        
        if dataset is None:
            print(f'Could not open {nc_file}')
            continue
        
        # Get metadata
        meta_nc = dataset.GetMetadata()
        date_start_str = meta_nc['time#units'][-19:]  # e.g., '1980-1-1 0:0:0'
        datetime_origin = datetime.strptime(date_start_str, '%Y-%m-%d %H:%M:%S')
        
        # Get band time values
        bands_time_str = meta_nc['NETCDF_DIM_time_VALUES'][1:-1]
        bands_time = list(map(int, bands_time_str.split(',')))
        
        # Get geotransformation
        gt = dataset.GetGeoTransform()
        
        # Convert lat/lon to pixel coordinates
        px = int((lon - gt[0]) / gt[1])
        py = int((lat - gt[3]) / gt[5])
        
        # Get number of bands
        num_bands = dataset.RasterCount
        
        # Extract data from each band
        for i in range(1, num_bands + 1):
            # Calculate date for this band
            band_date = datetime_origin + timedelta(days=bands_time[i - 1])
            
            # Check if date is within requested range
            if band_date < start_date or band_date > end_date:
                continue
            
            # Read precipitation value at the pixel
            band = dataset.GetRasterBand(i)
            data = band.ReadAsArray(px, py, 1, 1)
            
            if data is None:
                precip_value = np.nan
            else:
                precip_value = data[0][0]
                if precip_value == -9999.0:
                    precip_value = np.nan
            
            # Store with date as key
            date_str = band_date.strftime('%Y-%m-%d')
            precip_data[date_str] = precip_value
        
        dataset = None  # Close dataset
    
    # Convert to pandas Series
    precip_series = pd.Series(precip_data)
    precip_series.index = pd.to_datetime(precip_series.index)
    precip_series = precip_series.sort_index()
    
    return precip_series

## Step 3: Main Function - Get CHIRPS Data for Multiple Coordinates

Process multiple coordinates and return a DataFrame.

In [None]:
def get_chirps_dataframe(coordinates, start_date, end_date, data_dir='./chirps_data'):
    """
    Download and extract CHIRPS data for multiple coordinates
    
    Parameters:
    -----------
    coordinates : list of tuples or dict
        List of (lat, lon) tuples or dict with 'lat' and 'lon' keys
        Example: [(28.5, -81.5), (29.0, -82.0)]
        Or: [{'id': 'Point1', 'lat': 28.5, 'lon': -81.5}]
    start_date : str or datetime
        Start date (format: 'YYYY-MM-DD' or datetime object)
    end_date : str or datetime
        End date (format: 'YYYY-MM-DD' or datetime object)
    data_dir : str
        Base directory for storing downloaded files
    
    Returns:
    --------
    pandas.DataFrame : DataFrame with dates as index and locations as columns
                      (precipitation values in mm)
    """
    # Convert dates to datetime objects if needed
    if isinstance(start_date, str):
        start_date = datetime.strptime(start_date, '%Y-%m-%d')
    if isinstance(end_date, str):
        end_date = datetime.strptime(end_date, '%Y-%m-%d')
    
    print(f"Fetching CHIRPS data from {start_date.date()} to {end_date.date()}")
    print("="*70)
    
    # Download corrected data
    print("\n📥 Downloading CHIRPS corrected data...")
    corrected_dir = os.path.join(data_dir, 'corrected')
    corrected_files = download_chirps_corrected(start_date, end_date, corrected_dir)
    
    # Determine if we need preliminary data
    # Check the last available date in corrected data
    if corrected_files:
        last_corrected_file = corrected_files[-1]
        dataset = gdal.Open(last_corrected_file, GA_ReadOnly)
        if dataset:
            meta_nc = dataset.GetMetadata()
            date_start_str = meta_nc['time#units'][-19:]
            datetime_origin = datetime.strptime(date_start_str, '%Y-%m-%d %H:%M:%S')
            bands_time_str = meta_nc['NETCDF_DIM_time_VALUES'][1:-1]
            bands_time = list(map(int, bands_time_str.split(',')))
            last_corrected_date = datetime_origin + timedelta(days=bands_time[-1])
            dataset = None
            
            print(f"Last date in corrected data: {last_corrected_date.date()}")
            
            # If end_date is after last corrected date, download preliminary data
            if end_date > last_corrected_date:
                print("\n📥 Downloading CHIRPS preliminary data...")
                preliminary_dir = os.path.join(data_dir, 'preliminary')
                preliminary_files = download_chirps_preliminary(
                    last_corrected_date + timedelta(days=1), 
                    end_date, 
                    preliminary_dir
                )
                all_files = corrected_files + preliminary_files
            else:
                all_files = corrected_files
        else:
            all_files = corrected_files
    else:
        all_files = []
    
    if not all_files:
        print("❌ No data files available!")
        return pd.DataFrame()
    
    # Extract data for each coordinate
    print("\n🔍 Extracting precipitation data for coordinates...")
    print("="*70)
    
    df_dict = {}
    
    for idx, coord in enumerate(coordinates):
        # Parse coordinate
        if isinstance(coord, dict):
            lat = coord['lat']
            lon = coord['lon']
            coord_id = coord.get('id', f'Point_{idx+1}')
        else:
            lat, lon = coord
            coord_id = f'Point_{idx+1}'
        
        print(f"\nProcessing {coord_id}: Lat={lat}, Lon={lon}")
        
        # Extract data
        precip_series = extract_chirps_data(all_files, lat, lon, start_date, end_date)
        
        df_dict[coord_id] = precip_series
        print(f"  ✓ Extracted {len(precip_series)} days of data")
    
    # Create DataFrame
    df = pd.DataFrame(df_dict)
    
    print("\n" + "="*70)
    print(f"✅ Complete! DataFrame shape: {df.shape}")
    print(f"   Date range: {df.index[0].date()} to {df.index[-1].date()}")
    print(f"   Locations: {len(df.columns)}")
    
    return df

## Example Usage

Let's test the functions with sample coordinates.

In [None]:
# Example 1: Single coordinate as tuple
coordinates_example1 = [
    (28.5384, -81.3789),  # Orlando, FL
]

# Example 2: Multiple coordinates with IDs
coordinates_example2 = [
    {'id': 'Orlando_FL', 'lat': 28.5384, 'lon': -81.3789},
    {'id': 'Gainesville_FL', 'lat': 29.6516, 'lon': -82.3248},
    {'id': 'Miami_FL', 'lat': 25.7617, 'lon': -80.1918},
]

# Define date range
start = '2024-01-01'
end = '2024-01-31'

# Uncomment to run:
# df_chirps = get_chirps_dataframe(coordinates_example2, start, end)
# print(df_chirps.head())
# print(df_chirps.describe())

## Visualization

Plot the precipitation time series.

In [None]:
import matplotlib.pyplot as plt

def plot_chirps_data(df, title='CHIRPS Precipitation Data'):
    """
    Plot CHIRPS precipitation time series
    
    Parameters:
    -----------
    df : pandas.DataFrame
        DataFrame with dates as index and locations as columns
    title : str
        Plot title
    """
    fig, ax = plt.subplots(figsize=(14, 6))
    
    for column in df.columns:
        ax.plot(df.index, df[column], marker='o', markersize=3, label=column, linewidth=1.5)
    
    ax.set_xlabel('Date', fontsize=12)
    ax.set_ylabel('Precipitation (mm)', fontsize=12)
    ax.set_title(title, fontsize=14, fontweight='bold')
    ax.legend(loc='best')
    ax.grid(True, alpha=0.3)
    plt.xticks(rotation=45)
    plt.tight_layout()
    plt.show()

# Uncomment to plot:
plot_chirps_data(df_chirps)

NameError: name 'df_chirps' is not defined