# CHIRPS V3 Download Performance Comparison

Comparing sequential vs multithreaded download performance for CHIRPS V3 data (366 daily files for year 2020).

In [1]:
import os
import shutil
import time
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import datetime
from pathlib import Path

import pandas as pd
import requests

In [2]:
# Configuration
# Root URL of the daily files; each file is requested as <base>/<year>/<filename>.
CHIRPS_V3_BASE_URL = 'https://data.chc.ucsb.edu/products/CHIRPS/v3.0/daily/final/rnl'
# First and last day to download; both ends are included in the generated date list.
START_DATE = '2020-01-01'
END_DATE = '2020-12-31'
# One output directory per download method. Each directory is deleted and recreated
# by its download cell, so do not point these at a folder holding data you want to keep.
DATA_DIR_SEQUENTIAL = './chirps_v3_sequential'
DATA_DIR_MULTITHREADED = './chirps_v3_multithreaded'
MAX_WORKERS = 10  # Number of concurrent download threads

In [3]:
def generate_date_list(start_date, end_date):
    """Return every daily timestamp from start_date through end_date, inclusive.

    Both arguments are parsed with ``pd.to_datetime``. The result is a plain
    list of ``pd.Timestamp`` objects spaced exactly one day apart, beginning at
    ``start_date``; it is empty when ``start_date`` is not on or before
    ``end_date``.
    """
    first = pd.to_datetime(start_date)
    last = pd.to_datetime(end_date)
    one_day = pd.Timedelta(days=1)

    # Nothing to generate unless the range is non-empty (also covers NaT inputs,
    # for which every comparison is False).
    if not first <= last:
        return []

    # Number of whole one-day steps that fit between the endpoints.
    whole_days = (last - first) // one_day
    return [first + offset * one_day for offset in range(whole_days + 1)]

# Build the list of daily timestamps for the configured period and summarise it
all_dates = generate_date_list(start_date=START_DATE, end_date=END_DATE)
first_date, last_date = all_dates[0].date(), all_dates[-1].date()
print(f"Total files to download: {len(all_dates)}")
print(f"Date range: {first_date} to {last_date}")

Total files to download: 366
Date range: 2020-01-01 to 2020-12-31


## Method 1: Sequential Download (One by One)

In [4]:
def download_file_sequential(date, data_dir, base_url):
    """Download a single CHIRPS V3 daily ``.tif`` file into ``data_dir``.

    The response body is streamed into a temporary ``<filename>.part`` file that
    is renamed to its final name only after the whole body has been written, so
    an interrupted transfer never leaves a truncated ``.tif`` behind. The HTTP
    response is always closed, whether the download succeeds or fails.

    Args:
        date: Object exposing ``year``, ``month`` and ``day`` attributes.
        data_dir: Existing directory that receives the file.
        base_url: URL prefix; the file is requested from
            ``{base_url}/{year}/{filename}``.

    Returns:
        ``(True, filename)`` on success, or ``(False, "<filename>: <error>")``
        when any exception is raised. Exceptions are not propagated.
    """
    year = date.year
    month = date.month
    day = date.day

    filename = f'chirps-v3.0.rnl.{year}.{month:02d}.{day:02d}.tif'
    filepath = Path(data_dir) / filename
    partial_path = filepath.with_name(filename + '.part')
    url = f'{base_url}/{year}/{filename}'

    try:
        # With stream=True the connection stays open until the response is closed,
        # so the context manager guarantees it is released on every path.
        with requests.get(url, stream=True, timeout=300) as response:
            response.raise_for_status()

            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        # Publish the file under its real name only once it is complete.
        os.replace(partial_path, filepath)
        return True, filename
    except Exception as e:
        # Best-effort removal of the incomplete file; it may not exist at all if the
        # request failed before anything was written.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return False, f"{filename}: {str(e)}"

def download_all_sequential(dates, data_dir, base_url):
    """Fetch the file for every entry in ``dates``, one request at a time.

    ``data_dir`` is created (including parents) if it does not exist. A progress
    line is printed whenever the 50th, 100th, ... entry downloads successfully.

    Returns:
        dict with ``'downloaded'`` (success count), ``'failed'`` (failure count)
        and ``'failed_files'`` (the failure messages, in input order).
    """
    Path(data_dir).mkdir(parents=True, exist_ok=True)

    succeeded, errors = [], []

    for position, day in enumerate(dates, start=1):
        ok, outcome = download_file_sequential(day, data_dir, base_url)

        if not ok:
            errors.append(outcome)
            continue

        succeeded.append(outcome)
        # Milestones are only announced when the entry at that position succeeded.
        if position % 50 == 0:
            print(f"Downloaded {position}/{len(dates)} files...")

    return {'downloaded': len(succeeded), 'failed': len(errors), 'failed_files': errors}

In [None]:
# Remove any output directory left by a previous run so this run starts empty
if Path(DATA_DIR_SEQUENTIAL).exists():
    shutil.rmtree(DATA_DIR_SEQUENTIAL)

print("Starting Sequential Download...")
print(f"Downloading {len(all_dates)} files one by one...")
print("=" * 60)

# Time only the download call; the directory cleanup above is excluded
start_sequential = time.perf_counter()
result_sequential = download_all_sequential(all_dates, DATA_DIR_SEQUENTIAL, CHIRPS_V3_BASE_URL)
elapsed_sequential = time.perf_counter() - start_sequential

print("=" * 60)
print("Sequential Download Complete!")
print(f"Downloaded: {result_sequential['downloaded']} files")
print(f"Failed: {result_sequential['failed']} files")
print(f"Elapsed: {elapsed_sequential:.1f} s")

# The counts alone hide why a download failed, so list each failure message
for failure in result_sequential['failed_files']:
    print(f"  FAILED {failure}")

Starting Sequential Download...
Downloading 366 files one by one...


## Method 2: Multithreaded Download (Concurrent)

In [None]:
def download_file_threaded(date, data_dir, base_url):
    """Download a single CHIRPS V3 daily ``.tif`` file; intended to run in a worker thread.

    The response body is streamed into a temporary ``<filename>.part`` file that
    is renamed to its final name only after the whole body has been written, so
    an interrupted transfer never leaves a truncated ``.tif`` behind. Both the
    temporary and the final path are derived from ``date``, so calls for
    different dates never write to the same file. The HTTP response is always
    closed, whether the download succeeds or fails.

    Args:
        date: Object exposing ``year``, ``month`` and ``day`` attributes.
        data_dir: Existing directory that receives the file.
        base_url: URL prefix; the file is requested from
            ``{base_url}/{year}/{filename}``.

    Returns:
        ``(True, filename)`` on success, or ``(False, "<filename>: <error>")``
        when any exception is raised. Exceptions are not propagated.
    """
    year = date.year
    month = date.month
    day = date.day

    filename = f'chirps-v3.0.rnl.{year}.{month:02d}.{day:02d}.tif'
    filepath = Path(data_dir) / filename
    partial_path = filepath.with_name(filename + '.part')
    url = f'{base_url}/{year}/{filename}'

    try:
        # With stream=True the connection stays open until the response is closed,
        # so the context manager guarantees it is released on every path.
        with requests.get(url, stream=True, timeout=300) as response:
            response.raise_for_status()

            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    if chunk:
                        f.write(chunk)

        # Publish the file under its real name only once it is complete.
        os.replace(partial_path, filepath)
        return True, filename
    except Exception as e:
        # Best-effort removal of the incomplete file; it may not exist at all if the
        # request failed before anything was written.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return False, f"{filename}: {str(e)}"

def download_all_multithreaded(dates, data_dir, base_url, max_workers=10):
    """Fetch the file for every entry in ``dates`` using a pool of worker threads.

    ``data_dir`` is created (including parents) if it does not exist. One job per
    date is submitted to a ``ThreadPoolExecutor`` with ``max_workers`` threads,
    and results are collected in completion order. A progress line is printed
    after every 50 finished jobs, whether they succeeded or failed.

    Returns:
        dict with ``'downloaded'`` (success count), ``'failed'`` (failure count)
        and ``'failed_files'`` (the failure messages, in completion order).
    """
    Path(data_dir).mkdir(parents=True, exist_ok=True)

    succeeded, errors = [], []

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        # Queue every download up front; the pool decides when each one runs
        pending = [
            pool.submit(download_file_threaded, day, data_dir, base_url)
            for day in dates
        ]

        # Handle results as soon as each job finishes, not in submission order
        for finished_count, job in enumerate(as_completed(pending), start=1):
            ok, outcome = job.result()
            (succeeded if ok else errors).append(outcome)

            if finished_count % 50 == 0:
                print(f"Downloaded {finished_count}/{len(dates)} files...")

    return {'downloaded': len(succeeded), 'failed': len(errors), 'failed_files': errors}

In [None]:
# Remove any output directory left by a previous run so this run starts empty
if Path(DATA_DIR_MULTITHREADED).exists():
    shutil.rmtree(DATA_DIR_MULTITHREADED)

print("Starting Multithreaded Download...")
print(f"Downloading {len(all_dates)} files with {MAX_WORKERS} concurrent threads...")
print("=" * 60)

# Time only the download call; the directory cleanup above is excluded
start_multithreaded = time.perf_counter()
result_multithreaded = download_all_multithreaded(
    all_dates,
    DATA_DIR_MULTITHREADED,
    CHIRPS_V3_BASE_URL,
    MAX_WORKERS
)
elapsed_multithreaded = time.perf_counter() - start_multithreaded

print("=" * 60)
print("Multithreaded Download Complete!")
print(f"Downloaded: {result_multithreaded['downloaded']} files")
print(f"Failed: {result_multithreaded['failed']} files")
print(f"Elapsed: {elapsed_multithreaded:.1f} s")

# The counts alone hide why a download failed, so list each failure message
for failure in result_multithreaded['failed_files']:
    print(f"  FAILED {failure}")

## Performance Summary

Check the execution time shown by Jupyter for each download cell above.

**Expected Results:**
- Sequential: Downloads files one at a time (slower)
- Multithreaded: Downloads multiple files simultaneously (significantly faster)

**Optimization Recommendations:**
1. **ThreadPoolExecutor** (implemented above) - Best for I/O-bound tasks like downloading
2. **Adjust MAX_WORKERS** - Try values between 5 and 20, depending on network bandwidth and server rate limits
3. **Connection pooling** - Use `requests.Session()` to reuse connections
4. **Async/await** - Use `aiohttp` for even better performance with many files

The multithreaded approach should be 5-10x faster depending on network conditions and server limits.

## Method 3: Async Download (Most Optimized) - OPTIONAL

Using `aiohttp` with async/await for maximum performance.

In [None]:
# Uncomment to install aiohttp into this kernel's environment if needed
# %pip install aiohttp

import asyncio
import aiohttp

DATA_DIR_ASYNC = './chirps_v3_async'

async def download_file_async(session, date, data_dir, base_url):
    """Download a single CHIRPS V3 daily ``.tif`` file through an aiohttp session.

    The response body is streamed into a temporary ``<filename>.part`` file that
    is renamed to its final name only after the whole body has been written, so
    an interrupted transfer never leaves a truncated ``.tif`` behind.

    Args:
        session: Object whose ``get(url, timeout=...)`` returns an async context
            manager yielding the response (an ``aiohttp.ClientSession``).
        date: Object exposing ``year``, ``month`` and ``day`` attributes.
        data_dir: Existing directory that receives the file.
        base_url: URL prefix; the file is requested from
            ``{base_url}/{year}/{filename}``.

    Returns:
        ``(True, filename)`` on success, or ``(False, "<filename>: <error>")``
        when any exception is raised. If the exception has no message (as with
        timeouts), its class name is used as the error text. Exceptions are not
        propagated.
    """
    year = date.year
    month = date.month
    day = date.day

    filename = f'chirps-v3.0.rnl.{year}.{month:02d}.{day:02d}.tif'
    filepath = Path(data_dir) / filename
    partial_path = filepath.with_name(filename + '.part')
    url = f'{base_url}/{year}/{filename}'

    try:
        # aiohttp expects a ClientTimeout object; a bare number is deprecated.
        # total=300 keeps the same overall 300-second limit per request.
        request_timeout = aiohttp.ClientTimeout(total=300)
        async with session.get(url, timeout=request_timeout) as response:
            response.raise_for_status()

            with open(partial_path, 'wb') as f:
                async for chunk in response.content.iter_chunked(8192):
                    f.write(chunk)

        # Publish the file under its real name only once it is complete.
        os.replace(partial_path, filepath)
        return True, filename
    except Exception as e:
        # Best-effort removal of the incomplete file; it may not exist at all if the
        # request failed before anything was written.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        # Timeout errors stringify to '', which would leave the report blank.
        return False, f"{filename}: {str(e) or type(e).__name__}"

async def download_all_async(dates, data_dir, base_url, max_concurrent=20):
    """Fetch the file for every entry in ``dates`` concurrently on the event loop.

    ``data_dir`` is created (including parents) if it does not exist. All
    downloads share one ``aiohttp.ClientSession``; a semaphore and the
    connector's connection limit are both set to ``max_concurrent``. A progress
    line is printed after every 50 finished downloads, whether they succeeded or
    failed.

    Returns:
        dict with ``'downloaded'`` (success count), ``'failed'`` (failure count)
        and ``'failed_files'`` (the failure messages, in completion order).
    """
    Path(data_dir).mkdir(parents=True, exist_ok=True)

    succeeded, errors = [], []

    # Gate shared by all downloads: only max_concurrent may hold it at once
    gate = asyncio.Semaphore(max_concurrent)

    async def _guarded_download(session, day):
        async with gate:
            return await download_file_async(session, day, data_dir, base_url)

    # One session for every request, with its connection limit matching the gate
    pool = aiohttp.TCPConnector(limit=max_concurrent)
    async with aiohttp.ClientSession(connector=pool) as session:
        jobs = [_guarded_download(session, day) for day in dates]

        # Results arrive in completion order, not in the order of `dates`
        for finished_count, next_done in enumerate(asyncio.as_completed(jobs), start=1):
            ok, outcome = await next_done
            (succeeded if ok else errors).append(outcome)

            if finished_count % 50 == 0:
                print(f"Downloaded {finished_count}/{len(dates)} files...")

    return {'downloaded': len(succeeded), 'failed': len(errors), 'failed_files': errors}

In [None]:
# Clean directory if exists
if Path(DATA_DIR_ASYNC).exists():
    shutil.rmtree(DATA_DIR_ASYNC)

print("Starting Async Download...")
print(f"Downloading {len(all_dates)} files with async/await (max 20 concurrent)...")
print("=" * 60)

# Run async download
result_async = await download_all_async(
    all_dates, 
    DATA_DIR_ASYNC, 
    CHIRPS_V3_BASE_URL,
    max_concurrent=20
)

print("=" * 60)
print(f"Async Download Complete!")
print(f"Downloaded: {result_async['downloaded']} files")
print(f"Failed: {result_async['failed']} files")

## Final Comparison

Compare the execution times shown above for each method:

| Method | Description | Expected Performance |
|--------|-------------|---------------------|
| **Sequential** | Downloads one file at a time | Baseline (slowest) |
| **Multithreaded** | 10 concurrent threads | 5-10x faster |
| **Async** | 20 concurrent connections | 8-15x faster |

**Why Multithreading/Async is Faster:**
- Network I/O is the bottleneck, not CPU
- While waiting for one file, other downloads proceed
- Better utilization of network bandwidth
- Reduced idle time between requests

**Note:** Actual speedup depends on:
- Server rate limits
- Network bandwidth
- Latency to server
- File sizes