# Downloading Data

In [None]:
import requests
import os
from requests.exceptions import ConnectionError, Timeout

import time  # Import the time module

# Prefix for every archive URL; download_day appends "<date>-<hour>.json.gz" to it.
base_url = "https://data.gharchive.org/"
MAX_RETRIES = 10  # Total number of attempts made per file (the first try counts as attempt 1)
RETRY_DELAY = 5  # Base wait in seconds; download_day multiplies it by the attempt number

def _fetch_to_file(url, filepath):
    """Stream ``url`` to ``filepath``, publishing the file only once it is complete.

    The body is first written to a sibling ``.part`` file and renamed into place
    after the last chunk, so an interrupted transfer never leaves a truncated
    archive that a later run would mistake for a finished download.

    Raises:
        requests.RequestException: On connection, timeout, HTTP-status or
            mid-stream errors.
        OSError: If the file cannot be written or renamed.
    """
    partial_path = f"{filepath}.part"
    try:
        # The context manager releases the connection even when an error occurs.
        with requests.get(url, stream=True, timeout=10) as response:
            response.raise_for_status()  # Turn 4xx/5xx answers into HTTPError
            with open(partial_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    f.write(chunk)
        os.replace(partial_path, filepath)
    finally:
        # After a successful rename there is nothing to remove; otherwise this
        # deletes the incomplete leftover.
        try:
            os.remove(partial_path)
        except FileNotFoundError:
            pass


def download_day(day, month, year=2023, download_dir="D:/GhArchive/July_2023/files"):
    """Download the 24 hourly archives of one day, skipping files already on disk.

    Each hour is attempted up to ``MAX_RETRIES`` times, waiting
    ``RETRY_DELAY * attempt`` seconds between attempts (linear backoff). An hour
    that still fails is reported and skipped so the remaining hours are still
    fetched. Every hour is checked individually against the directory listing,
    so a missing hour 0 is downloaded even when later hours already exist.

    Args:
        day: Day of the month (1-based).
        month: Month number (1-12).
        year: Four-digit year; defaults to 2023.
        download_dir: Directory the files are stored in; created if missing.

    Raises:
        OSError: If the directory cannot be created or a file cannot be written.
    """
    date = f"{year}-{month:02d}-{day:02d}"  # YYYY-MM-DD

    os.makedirs(download_dir, exist_ok=True)

    # Snapshot of this day's files; a set gives constant-time membership tests.
    prefix = f"gharchive-{date}-"
    existing_files = {name for name in os.listdir(download_dir) if name.startswith(prefix)}

    for hour in range(24):
        # The remote name uses the unpadded hour, the local name is zero-padded.
        url = f"{base_url}{date}-{hour}.json.gz"
        filename = f"gharchive-{date}-{hour:02d}.json.gz"

        if filename in existing_files:
            print(f"Skipping download: {filename} (already exists)")
            continue

        filepath = os.path.join(download_dir, filename)

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                _fetch_to_file(url, filepath)
            except requests.RequestException as e:
                print(f"Error downloading {url} (attempt {attempt}/{MAX_RETRIES}): {e}")

                # Client errors such as 404 will not fix themselves; only
                # 408 (request timeout) and 429 (rate limit) are worth retrying.
                status = getattr(e.response, "status_code", None)
                if status is not None and 400 <= status < 500 and status not in (408, 429):
                    print(f"Download failed for {url}: HTTP {status} is not retryable.")
                    break

                if attempt == MAX_RETRIES:
                    print(f"Download failed for {url} after {MAX_RETRIES} attempts.")
                else:
                    delay = RETRY_DELAY * attempt  # Linear backoff
                    print(f"Retrying in {delay} seconds...")
                    time.sleep(delay)
            else:
                print(f"Downloaded: {filename}")
                break

# Fetch July 2023 one day at a time (an earlier partial download is assumed).
DAYS_IN_JULY = 31
for day in range(1, DAYS_IN_JULY + 1):
    download_day(day, month=7)

print("Download complete!")