In [7]:
import os
import requests
import cloudinary
import cloudinary.api
from dotenv import load_dotenv
import time
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

In [8]:
# Load variables from a .env file so that the os.getenv() lookups used for the
# Cloudinary configuration can find the credentials.
load_dotenv()

True

In [9]:
# Configure the Cloudinary SDK. Each setting is read from the environment
# variable paired with it below; a missing variable yields None for that setting.
cloudinary.config(**{
    setting: os.getenv(env_var)
    for setting, env_var in (
        ('cloud_name', 'CLOUDINARY_CLOUD_NAME'),
        ('api_key', 'CLOUDINARY_API_KEY'),
        ('api_secret', 'CLOUDINARY_API_SECRET'),
    )
})

<cloudinary.Config at 0x170d35f01c0>

In [10]:
def create_requests_session(retries=3, backoff_factor=0.3, status_forcelist=(500, 502, 504)):
    """
    Build a ``requests`` session whose HTTP and HTTPS traffic is retried.

    Args:
        retries: Retry budget, applied to the total, read and connect counters.
        backoff_factor: Backoff factor handed to the retry policy.
        status_forcelist: HTTP status codes handed to the retry policy as
            statuses that should trigger a retry.

    Returns:
        A ``requests.Session`` with one shared retrying adapter mounted on
        both the ``http://`` and ``https://`` prefixes.
    """
    retry_policy = Retry(
        total=retries,
        read=retries,
        connect=retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    retrying_adapter = HTTPAdapter(max_retries=retry_policy)

    http = requests.Session()
    # The same adapter instance serves both schemes.
    for scheme in ('http://', 'https://'):
        http.mount(scheme, retrying_adapter)
    return http

def download_image(url, download_path, session=None):
    """
    Download ``url`` and write the response body to ``download_path``.

    Args:
        url: Address of the image to fetch.
        download_path: Destination file path. Missing parent directories are
            created; a bare file name (no directory part) is written to the
            current working directory.
        session: Optional session object exposing ``get(url, timeout=...)``.
            When omitted, a temporary ``requests.Session`` is created and
            closed before returning.

    Returns:
        True if the server answered 200 and the file was written, otherwise
        False. Request errors and file-system errors are reported via
        ``print`` and turned into a False result instead of being raised.
    """
    owns_session = session is None
    if owns_session:
        session = requests.Session()

    try:
        # (connect timeout, read timeout) so a stalled server cannot hang the run
        response = session.get(url, timeout=(10, 30))

        if response.status_code != 200:
            print(f"Failed to download image from {url}. Status code: {response.status_code}")
            return False

        # os.path.dirname() returns '' for a bare file name and
        # os.makedirs('') raises, so only create a directory when there is one.
        target_dir = os.path.dirname(download_path)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        with open(download_path, 'wb') as f:
            f.write(response.content)
        print(f"Downloaded: {download_path}")
        return True

    except requests.exceptions.RequestException as e:
        print(f"Error downloading {url}: {e}")
        return False
    except OSError as e:
        # Disk / permission problems: report as a failed download rather than
        # letting the exception abort the caller's whole batch.
        print(f"Error saving {download_path}: {e}")
        return False
    finally:
        # Only close a session this function created itself.
        if owns_session:
            session.close()

def _list_cloudinary_resources(folder_name, page_size=500):
    """
    Return every uploaded resource whose public_id starts with ``folder_name``.

    The listing is requested one page at a time; whenever a response carries
    a ``next_cursor`` value, it is sent back to request the following page,
    so the result is not limited to a single page.
    """
    collected = []
    cursor = None
    while True:
        query = {"type": "upload", "prefix": folder_name, "max_results": page_size}
        if cursor:
            query["next_cursor"] = cursor
        page = cloudinary.api.resources(**query)
        collected.extend(page.get('resources') or [])
        cursor = page.get('next_cursor')
        if not cursor:
            return collected


def fetch_and_download_images(folder_name, download_dir, max_retries=3):
    """
    Download every Cloudinary upload under ``folder_name`` into ``download_dir``.

    Each resource is saved at ``download_dir/<public_id without the leading
    "<folder_name>/">.<format>``; the extension falls back to ``jpg`` when the
    resource reports no format. A progress line is printed per file and a
    summary at the end; URLs that could not be downloaded are written to
    ``failed_downloads.txt`` inside ``download_dir``.

    Args:
        folder_name: public_id prefix (folder) to list.
        download_dir: Local directory that receives the files.
        max_retries: Download attempts per image, with exponential backoff
            (1s, 2s, 4s, ...) between attempts.

    Returns:
        None. Any exception is caught and reported via ``print``.
    """
    # One session with retry capabilities, reused for every download
    session = create_requests_session()

    try:
        resources = _list_cloudinary_resources(folder_name)

        if not resources:
            print(f"No resources found in the folder {folder_name}.")
            return

        # Ensure the download directory exists
        os.makedirs(download_dir, exist_ok=True)

        # Track download progress
        total_images = len(resources)
        downloaded_images = 0
        failed_images = []

        folder_prefix = folder_name.rstrip('/') + '/'

        for resource in resources:
            image_url = resource['secure_url']
            public_id = resource['public_id']

            # Drop only the leading "<folder_name>/"; a later occurrence of
            # the same text inside the public_id must be kept.
            if public_id.startswith(folder_prefix):
                local_path = public_id[len(folder_prefix):]
            else:
                local_path = public_id

            # Name the file after the format Cloudinary reports for the asset
            extension = resource.get('format') or 'jpg'
            download_path = os.path.join(download_dir, f"{local_path}.{extension}")

            # Attempt download with retries
            success = False
            for attempt in range(max_retries):
                if download_image(image_url, download_path, session):
                    success = True
                    downloaded_images += 1
                    break

                # Exponential backoff, but no pointless wait after the last attempt
                if attempt < max_retries - 1:
                    time.sleep(2 ** attempt)

            if not success:
                failed_images.append(image_url)
                print(f"Failed to download {image_url} after {max_retries} attempts")

            # Small delay between downloads to reduce load
            time.sleep(0.1)

        # Print summary
        print("\nDownload Summary:")
        print(f"Total Images: {total_images}")
        print(f"Successfully Downloaded: {downloaded_images}")
        print(f"Failed Downloads: {len(failed_images)}")

        # Log failed image URLs so they can be retried later
        if failed_images:
            with open(os.path.join(download_dir, 'failed_downloads.txt'), 'w') as f:
                f.write("\n".join(failed_images))

    except Exception as e:
        print(f"Error fetching resources: {e}")
    finally:
        # Release pooled connections whether the run succeeded or not
        session.close()

# Cloudinary folder to fetch (passed to the API as a public_id prefix) and the
# local directory that receives the downloaded files
folder_name = "financial_data"
download_dir = "./downloaded_financial_data"

# Run the download; progress and a summary are printed as the cell output
fetch_and_download_images(folder_name, download_dir)

Downloaded: ./downloaded_financial_data\bank_statements/image1.jpg
Downloaded: ./downloaded_financial_data\bank_statements/image10.jpg
Downloaded: ./downloaded_financial_data\bank_statements/image100.jpg
Downloaded: ./downloaded_financial_data\bank_statements/image101.jpg
Downloaded: ./downloaded_financial_data\bank_statements/image102.jpg
Downloaded: ./downloaded_financial_data\bank_statements/image103.jpg
Downloaded: ./downloaded_financial_data\bank_statements/image104.jpg
Downloaded: ./downloaded_financial_data\bank_statements/image105.jpg
Downloaded: ./downloaded_financial_data\bank_statements/image106.jpg
Downloaded: ./downloaded_financial_data\bank_statements/image11.jpg
Downloaded: ./downloaded_financial_data\bank_statements/image110.jpg
Downloaded: ./downloaded_financial_data\bank_statements/image111.jpg
Downloaded: ./downloaded_financial_data\bank_statements/image113.jpg
Downloaded: ./downloaded_financial_data\bank_statements/image114.jpg
Downloaded: ./downloaded_financial_dat