In [3]:
import io
import os
import re
import sys
import time
from pathlib import Path
from tempfile import NamedTemporaryFile
from typing import Dict, List
from zipfile import ZipFile

import pandas as pd
from dotenv import load_dotenv
from google.cloud import storage
from google.cloud.storage import Client, transfer_manager

sys.path.append("..")
from function.pdf_loader import *

load_dotenv()

True

In [4]:
import pandas as pd
import requests
import os
from urllib.parse import urlparse
from pathlib import Path

In [2]:
def download_image(url, save_path, timeout=30):
    """
    Download an image from a URL and save it to the specified path.

    The file extension is taken from the path component of the URL (the query
    string is ignored); ".jpg" is used when the URL path has no extension.
    The extension is appended to ``save_path``.

    Args:
        url: Address of the image to fetch.
        save_path: Destination path without the extension (``str`` or ``Path``).
        timeout: Seconds handed to ``requests.get`` so a stalled server cannot
            block the caller forever.

    Returns:
        True when the file was written, False when anything went wrong (the
        error is printed, not raised).
    """
    try:
        # Get file extension from URL or default to .jpg
        parsed_url = urlparse(url)
        file_extension = os.path.splitext(parsed_url.path)[1]
        if not file_extension:
            file_extension = ".jpg"

        # Append the extension instead of using with_suffix(): with_suffix()
        # replaces everything after the last dot in the name, so "photo.1" and
        # "photo.2" would both collapse to "photo.jpg" and overwrite each other.
        save_path = Path(save_path)
        full_save_path = save_path.with_name(save_path.name + file_extension)

        # The context manager closes the streamed response on every exit path,
        # including when raise_for_status() or the file write fails.
        with requests.get(url, stream=True, timeout=timeout) as response:
            response.raise_for_status()

            # Save the image
            with open(full_save_path, "wb") as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)

        return True
    except Exception as e:
        # Deliberately best-effort: one bad URL must not abort a batch run.
        print(f"Error downloading {url}: {str(e)}")
        return False


def process_venues_and_photos(excel_path, base_folder):
    """
    Process the Excel file and download images to corresponding venue folders.

    Every column whose label contains "photo" (case-insensitive) is treated as
    a column of image URLs. For each row, images are saved under
    ``<base_folder>/<venue_name>/figures/<column label>`` via
    ``download_image(url, save_path)``.

    Args:
        excel_path: Path of the workbook; it must have a ``venue_name`` column.
        base_folder: Root directory under which venue folders are created.

    Rows without a venue name are skipped (with a printed notice) instead of
    being written to a folder literally called "nan".
    """
    # Read the Excel file
    df = pd.read_excel(excel_path)

    # str() keeps non-string header labels (numbers, dates) from crashing .lower()
    photo_columns = [col for col in df.columns if "photo" in str(col).lower()]

    # Process each venue
    for index, row in df.iterrows():
        raw_venue_name = row["venue_name"]  # Adjust column name if different
        if pd.isna(raw_venue_name) or not str(raw_venue_name).strip():
            print(f"Skipping row {index}: missing venue_name")
            continue
        venue_name = str(raw_venue_name)

        # Create venue folder path
        venue_folder = Path(base_folder) / venue_name / "figures"

        # Create folders if they don't exist
        venue_folder.mkdir(parents=True, exist_ok=True)

        # Process each photo column
        for photo_col in photo_columns:
            url = row[photo_col]

            # Skip if URL is empty or NaN
            if pd.isna(url) or not url:
                continue

            # Create save path for the image (extension is added by download_image)
            save_path = venue_folder / f"{photo_col}"

            # Download the image
            success = download_image(url, save_path)
            if success:
                print(f"Successfully downloaded {photo_col} for {venue_name}")
            else:
                print(f"Failed to download {photo_col} for {venue_name}")


In [5]:
def list_files(bucket_name, filter=None):
    """
    Return the names of the blobs in a GCS bucket.

    Args:
        bucket_name: Name of the bucket to list.
        filter: Optional regular expression; when given, only blob names in
            which it matches somewhere (case-insensitively) are returned.

    Returns:
        List of blob names, in the order the bucket listing yields them.
    """
    pattern = None if filter is None else re.compile(filter, re.IGNORECASE)

    bucket = storage.Client().bucket(bucket_name)

    names = []
    for blob in bucket.list_blobs():
        if pattern is None or pattern.search(blob.name):
            names.append(blob.name)
    return names

In [None]:
from google.cloud import storage
import pandas as pd
import requests
import io
from urllib.parse import urlparse
import os


def download_image(url, timeout=30):
    """
    Download an image from a URL and return the bytes.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds handed to ``requests.get`` so a stalled server cannot
            block the caller forever.

    Returns:
        The response body as ``bytes``, or None when the request fails (the
        error is printed, not raised).
    """
    try:
        # No stream=True: the whole body is read right away, and a streamed
        # response would keep its connection checked out if raise_for_status()
        # raised before the body was consumed.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.content
    except Exception as e:
        # Deliberately best-effort: one bad URL must not abort a batch run.
        print(f"Error downloading {url}: {str(e)}")
        return None


def get_file_extension(url):
    """Return the extension of the URL's path component (dot included), or ".jpg" if it has none."""
    _, suffix = os.path.splitext(urlparse(url).path)
    return suffix or ".jpg"


def process_venues_and_photos(bucket_name, excel_blob_name="wedding_venue.xlsx"):
    """
    Process the Excel file from GCS and upload the referenced images to
    the corresponding venue folders in the same bucket.

    Every column whose label contains "photo" (case-insensitive) is treated as
    a column of image URLs. Each image that downloads successfully is uploaded
    to ``processed/<venue_name>/figures/extra_<column label><extension>``.

    Args:
        bucket_name: Bucket that holds the workbook and receives the images.
        excel_blob_name: Object name of the workbook inside the bucket; it
            must have a ``venue_name`` column.

    Rows without a venue name are skipped (with a printed notice) instead of
    being uploaded under a folder literally called "nan".
    """
    # Local import: only this function needs it, for the upload Content-Type.
    import mimetypes

    # Initialize GCS client
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)

    # Read Excel file from GCS
    excel_blob = bucket.blob(excel_blob_name)
    excel_content = excel_blob.download_as_bytes()
    df = pd.read_excel(io.BytesIO(excel_content))

    # str() keeps non-string header labels (numbers, dates) from crashing .lower()
    photo_columns = [col for col in df.columns if "photo" in str(col).lower()]

    # Process each venue
    for index, row in df.iterrows():
        raw_venue_name = row["venue_name"]  # Adjust column name if different
        if pd.isna(raw_venue_name) or not str(raw_venue_name).strip():
            print(f"Skipping row {index}: missing venue_name")
            continue
        venue_name = str(raw_venue_name)

        # Process each photo column
        for photo_col in photo_columns:
            url = row[photo_col]

            # Skip if URL is empty or NaN
            if pd.isna(url) or not url:
                continue

            # Download the image
            image_content = download_image(url)
            if not image_content:
                print(f"Failed to download {photo_col} for {venue_name}")
                continue

            # Get file extension from URL
            file_extension = get_file_extension(url)

            # Create GCS path for the image
            gcs_path = f"processed/{venue_name}/figures/extra_{photo_col}{file_extension}"

            # Look the MIME type up rather than pasting the extension after
            # "image/": ".jpg" must be sent as image/jpeg, not image/jpg.
            content_type, _ = mimetypes.guess_type(f"image{file_extension.lower()}")
            if content_type is None:
                # Unknown extension: fall back to the previous naming scheme.
                content_type = f"image/{file_extension[1:].lower()}"

            # Upload to GCS
            blob = bucket.blob(gcs_path)
            blob.upload_from_string(image_content, content_type=content_type)

            print(f"Successfully uploaded {photo_col} for {venue_name}")


In [None]:
def main():
    """
    Run the venue photo import against the "001-wedding-venue" bucket.

    Any exception raised during processing is reported on stdout rather than
    propagated to the caller.
    """
    target_bucket = "001-wedding-venue"

    try:
        process_venues_and_photos(target_bucket)
        print("Processing completed!")
    except Exception as err:
        print(f"An error occurred: {err!s}")


# Entry point: run the import when this code executes as the main module.
if __name__ == "__main__":
    main()

In [6]:
# NOTE(review): this notebook defines download_image in more than one cell;
# whichever definition was executed last is the one later cells call.
def download_image(url, timeout=30):
    """
    Download an image from a URL and return the bytes.

    Args:
        url: Address of the image to fetch.
        timeout: Seconds handed to ``requests.get`` so a stalled server cannot
            block the caller forever.

    Returns:
        The response body as ``bytes``, or None when the request fails (the
        error is printed, not raised).
    """
    try:
        # No stream=True: the whole body is read right away, and a streamed
        # response would keep its connection checked out if raise_for_status()
        # raised before the body was consumed.
        response = requests.get(url, timeout=timeout)
        response.raise_for_status()
        return response.content
    except Exception as e:
        # Deliberately best-effort: one bad URL must not abort a batch run.
        print(f"Error downloading {url}: {str(e)}")
        return None

In [None]:
# NOTE(review): download_image is defined with a required `url` parameter, so
# this call without arguments raises TypeError as written — pass a URL here or
# remove the cell.
download_image()