In [1]:
import base64
import hashlib
import io
import json
import os
import time
import urllib.parse

import pandas as pd
from dotenv import load_dotenv
from google.cloud import storage
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

load_dotenv(override=True)


class ChromeDriver:
    def __init__(self):
        chrome_options = Options()
        chrome_options.add_argument("--headless")  # Run in headless mode
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920x1080")
        # Add user agent
        chrome_options.add_argument(
            "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        )
        self.driver = webdriver.Chrome(options=chrome_options)

        # self.driver = webdriver.Chrome(options=chrome_options)


class SeleniumDownloader:
    def __init__(self):
        # chrome_options = Options()
        # chrome_options.add_argument("--headless")  # Run in headless mode
        # chrome_options.add_argument("--no-sandbox")
        # chrome_options.add_argument("--disable-dev-shm-usage")
        # chrome_options.add_argument("--disable-gpu")
        # chrome_options.add_argument("--window-size=1920x1080")
        # # Add user agent
        # chrome_options.add_argument(
        #     "user-agent=Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36"
        # )

        self.driver = ChromeDriver()

    def get_image(self, url):
        """Download image using Selenium"""
        try:
            # First visit WeddingWire to set up cookies
            self.driver.get("https://www.weddingwire.com")
            time.sleep(2)  # Wait for cookies to be set

            # Now get the image
            self.driver.get(url)
            time.sleep(2)  # Wait for image to load

            # Get the image as base64
            img_base64 = self.driver.execute_script("""
                var c = document.createElement('canvas');
                var ctx = c.getContext('2d');
                var img = document.querySelector('img');
                
                if (!img) return null;
                
                c.height = img.naturalHeight;
                c.width = img.naturalWidth;
                ctx.drawImage(img, 0, 0);
                
                return c.toDataURL('image/jpeg').split(',')[1];
            """)

            if img_base64:
                return base64.b64decode(img_base64)
            return None

        except Exception as e:
            print(f"Error downloading {url}: {str(e)}")
            return None

    def __del__(self):
        """Clean up the browser when done"""
        try:
            self.driver.quit()
        except:
            pass


class ChangeTracker:
    def __init__(self, bucket, tracker_path="image_tracker.json"):
        self.bucket = bucket
        self.tracker_path = tracker_path
        self.tracked_images = self.load_tracker()

    def load_tracker(self):
        try:
            blob = self.bucket.blob(self.tracker_path)
            content = blob.download_as_string()
            return json.loads(content)
        except Exception:
            return {}

    def save_tracker(self):
        blob = self.bucket.blob(self.tracker_path)
        blob.upload_from_string(json.dumps(self.tracked_images, indent=2))

    def get_url_hash(self, url):
        return hashlib.md5(url.encode()).hexdigest()

    def should_download(self, url, venue_name):
        url_hash = self.get_url_hash(url)
        venue_data = self.tracked_images.get(venue_name, {})
        return url_hash not in venue_data

    def mark_downloaded(self, url, venue_name, filename):
        url_hash = self.get_url_hash(url)
        if venue_name not in self.tracked_images:
            self.tracked_images[venue_name] = {}
        self.tracked_images[venue_name][url_hash] = {
            "filename": filename,
            "download_date": time.strftime("%Y-%m-%d %H:%M:%S"),
            "url": url,
        }


def get_filename_from_url(url, photo_col):
    parsed_url = urllib.parse.urlparse(url)
    path = parsed_url.path
    original_filename = os.path.basename(path)

    if original_filename and len(original_filename) > 10:
        return f"extra_{original_filename}"
    else:
        extension = os.path.splitext(parsed_url.path)[1] or ".jpg"
        return f"extra_{photo_col}{extension}"


def process_venues_and_photos(bucket_name):
    storage_client = storage.Client()
    bucket = storage_client.bucket(bucket_name)
    downloader = SeleniumDownloader()
    tracker = ChangeTracker(bucket)

    # Read Excel file
    excel_blob = bucket.blob("Wedding Venues.xlsx")
    excel_content = excel_blob.download_as_bytes()
    df = pd.read_excel(io.BytesIO(excel_content))

    photo_columns = [col for col in df.columns if "photo" in col.lower()]
    results = {"success": 0, "failed": 0, "skipped": 0}

    try:
        for index, row in df.iterrows():
            venue_name = str(row["Venue name"])
            print(f"\nProcessing venue: {venue_name}")

            for photo_col in photo_columns:
                url = row[photo_col]

                if pd.isna(url) or not url:
                    continue

                if not tracker.should_download(url, venue_name):
                    print(
                        f"Skipping already downloaded image for {venue_name}: {photo_col}"
                    )
                    results["skipped"] += 1
                    continue

                print(f"Downloading new image {photo_col}...")

                image_content = downloader.get_image(url)
                if image_content:
                    filename = get_filename_from_url(url, photo_col)
                    gcs_path = (
                        f"processed/adobe_extracted/{venue_name}/figures/{filename}"
                    )

                    try:
                        blob = bucket.blob(gcs_path)
                        blob.upload_from_string(
                            image_content, content_type="image/jpeg"
                        )
                        tracker.mark_downloaded(url, venue_name, filename)
                        print(f"✓ Successfully uploaded {filename}")
                        results["success"] += 1
                    except Exception as e:
                        print(f"✗ Failed to upload {filename}: {str(e)}")
                        results["failed"] += 1
                else:
                    print(f"✗ Failed to download {photo_col}")
                    results["failed"] += 1

        # Save the updated tracker
        tracker.save_tracker()

    finally:
        # Make sure we clean up the browser
        del downloader

    return results


In [2]:
def main():
    bucket_name = "wedding-venues-001"

    print("Starting image download and upload process...")
    results = process_venues_and_photos(bucket_name)
    print("\nProcessing completed!")
    print(f"Successfully processed: {results['success']} images")
    print(f"Skipped (already downloaded): {results['skipped']} images")
    print(f"Failed to process: {results['failed']} images")

    # print(f"An error occurred: {str(e)}")


if __name__ == "__main__":
    main()

Starting image download and upload process...

Processing venue: 94th Aero Squadron Restaurant_

Processing venue: a.o.c. Brentwood

Processing venue: Ace Hotel & Swim Club

Processing venue: Agua Hedionda Nature Center

Processing venue: Carousel House Santa Barbara

Processing venue: Alcazar Palm Springs

Processing venue: The Bank Ballroom

Processing venue: Aliso Viejo by Wedgewood Weddings

Processing venue: Aliso Viejo Country Club
Skipping already downloaded image for Aliso Viejo Country Club: Photo 1
Skipping already downloaded image for Aliso Viejo Country Club: Photo 2
Skipping already downloaded image for Aliso Viejo Country Club: Photo 3
Skipping already downloaded image for Aliso Viejo Country Club: Photo 4
Skipping already downloaded image for Aliso Viejo Country Club: Photo 5
Skipping already downloaded image for Aliso Viejo Country Club: Photo 6
Skipping already downloaded image for Aliso Viejo Country Club: Photo 7

Processing venue: Almansor Court
Skipping already dow

In [1]:
import sys

sys.path.append("..")

from function.retriever import initialize_retriever
from function.cloud import list_files

retriever = initialize_retriever()


Loaded existing FAISS index from faiss_db


In [6]:
all_image_paths_on_cloud

['Eden Gardens Moorpark/fileoutpart0.png',
 'Eden Gardens Moorpark/fileoutpart1.png',
 'Eden Gardens Moorpark/fileoutpart2.png',
 'Eden Gardens Moorpark/fileoutpart3.png',
 'Eden Gardens Moorpark/fileoutpart4.png',
 'Eden Gardens Moorpark/fileoutpart5.png',
 'Eden Gardens Moorpark/fileoutpart6.png',
 'Fig House Venue/fileoutpart0.png',
 'Fig House Venue/fileoutpart1.png',
 'Fig House Venue/fileoutpart12.png',
 'Fig House Venue/fileoutpart13.png',
 'Fig House Venue/fileoutpart14.png',
 'Fig House Venue/fileoutpart15.png',
 'Fig House Venue/fileoutpart2.png',
 'Fig House Venue/fileoutpart3.png',
 'Fig House Venue/fileoutpart4.png',
 'Fig House Venue/fileoutpart5.png',
 'Fig House Venue/fileoutpart6.png',
 'Fig House Venue/fileoutpart7.png',
 'Fig House Venue/fileoutpart8.png',
 'Fig House Venue/fileoutpart9.png',
 'Kimpton Shorebreak Resort/fileoutpart0.png',
 'Kimpton Shorebreak Resort/fileoutpart1.png',
 'Lakehouse Resort San Marcos/fileoutpart0.png',
 'Lakehouse Resort San Marcos/file

In [16]:
import random

downloads = random.sample(all_image_paths, 10)
destinations = ["downloads/" + path for path in downloads]
download_files(downloads, destinations)


[None, None, None, None, None, None, None, None, None, None]

In [18]:
list_files(r"Eden Gardens Moorxpark/figures/.*")


[]

In [38]:
import sys
import os

sys.path.append("..")

from function.retriever import initialize_retriever


def get_venue_images_from_cloud(venue):
    images = list_files(f"{venue}/figures/.*")
    return images


def download_to_folder(files, destination_folder):
    download_files(files, [destination_folder + "/" + file for file in files])


def get_venue_images_from_receiver(venue):
    receiver = initialize_retriever()
    docs = receiver.vectorstore.docstore._dict.values()
    image_docs = filter(
        lambda doc: doc.metadata["type"] == "image"
        and doc.metadata["company"] == venue,
        docs,
    )
    image_paths = [doc.metadata["image_path"] for doc in image_docs]
    image_names = [os.path.basename(path) for path in image_paths]
    return image_names


get_venue_images_from_receiver("The Los Angeles Arboretum")

Loaded existing FAISS index from faiss_db


['fileoutpart42.png',
 'fileoutpart8.png',
 'fileoutpart9.png',
 'fileoutpart36.png',
 'fileoutpart24.png',
 'fileoutpart26.png',
 'fileoutpart10.png',
 'fileoutpart7.png',
 'fileoutpart4.png',
 'fileoutpart5.png',
 'fileoutpart1.png',
 'fileoutpart0.png',
 'fileoutpart2.png',
 'fileoutpart3.png']

In [69]:
list_files(r"venues/.*")

['venues/94th Aero Squadron Restaurant_/94th Aero Squadron Restaurant_.pdf',
 'venues/Ace Hotel Palm Springs/Ace Hotel Palm Springs.pdf',
 'venues/Agua Hedionda Nature Center/Agua Hedionda Nature Center.pdf',
 'venues/Alcazar Palm Springs/Alcazar Palm Springs.pdf',
 'venues/Aliso Viejo Country Club/Aliso Viejo Country Club.pdf',
 'venues/Aliso Viejo Wedgewood/Aliso Viejo Wedgewood.pdf',
 'venues/Almansor Court/Almansor Court.pdf',
 'venues/Alta Vista Country Club/Alta Vista Country Club.pdf',
 'venues/Altar Long Beach/Altar Long Beach.pdf',
 'venues/Anaheim Hills Golf Course/Anaheim Hills Golf Course.pdf',
 'venues/Andaz San Diego/Andaz San Diego.pdf',
 'venues/Andrei_s/Andrei_s.pdf',
 'venues/Aquarium of the Pacific/Aquarium of the Pacific.pdf',
 'venues/Arroyo Trabuco Golf Club/Arroyo Trabuco Golf Club.pdf',
 'venues/Avenue of the Arts/Avenue of the Arts.pdf',
 'venues/Avila Lighthouse Suites/Avila Lighthouse Suites.pdf',
 'venues/Bahia Resort Hotel/Bahia Resort Hotel.pdf',
 'venues/

In [71]:
venue_paths = list_files(r"venues/.*")

['94th Aero Squadron Restaurant_',
 'Ace Hotel Palm Springs',
 'Agua Hedionda Nature Center',
 'Alcazar Palm Springs',
 'Aliso Viejo Country Club',
 'Aliso Viejo Wedgewood',
 'Almansor Court',
 'Alta Vista Country Club',
 'Altar Long Beach',
 'Anaheim Hills Golf Course',
 'Andaz San Diego',
 'Andrei_s',
 'Aquarium of the Pacific',
 'Arroyo Trabuco Golf Club',
 'Avenue of the Arts',
 'Avila Lighthouse Suites',
 'Bahia Resort Hotel',
 'Balboa Yacht Club',
 'Bali Hai Restaurant',
 'Bel Air-Bay Club',
 'Bella Blanca Event Center',
 'Beverly Hills Presbyterian Church',
 'Boulder Oaks Country Club',
 'Brand Park Community Center',
 'Cabrillo Pavilion Arts Center',
 'Cal-A-View Health Spa',
 'Calamigos Ranch',
 'California Center for the Arts, Escondido',
 'California Country Club',
 'Camarillo Ranch',
 'Cambria Pines Lodge',
 'Canopy Grove Wedgewood',
 'Cape Rey Carlsbad Beach_',
 'Carlsbad Windmill Wedgewood',
 'Carlton Oaks Country Club',
 'Carousel House Santa Barbara',
 'Carrillo Ballroo

In [59]:
from pathlib import Path
from tempfile import TemporaryDirectory
import uuid
from function.cloud import delete_file, download_file, upload_directory
from function.process_image import (
    generate_image_descriptions,
    is_photo,
    load_is_photo_classifier,
)
from function.retriever import PERSIST_DIRECTORY, load_venue_metadata
from function.retriever import add_documents_to_retriever


venue_metadata = load_venue_metadata()
photo_classifier = load_is_photo_classifier()

venue = "The Oviatt"
cloud_images = get_venue_images_from_cloud(venue)
receiver_images = get_venue_images_from_receiver(venue)

temp_output_dir = TemporaryDirectory().name
# with TemporaryDirectory(delete=False) as temp_output_dir:
if len(cloud_images) > 0:
    root = os.path.dirname(cloud_images[0])
    cloud_image_names = set([os.path.basename(image) for image in cloud_images])
    receiver_image_names = set(receiver_images)
    images_not_in_receiver = cloud_image_names - receiver_image_names

for image in images_not_in_receiver:
    image_on_disk = temp_output_dir + "/" + image
    download_file(root + "/" + image, image_on_disk)
    if not is_photo(photo_classifier, image_on_disk):
        delete_file(root + "/" + image)
        Path(image_on_disk).unlink()

image_descriptions = generate_image_descriptions(
    base_dir=temp_output_dir,
    venue=venue,
)

doc_id = str(uuid.uuid4())
# Include venue metadata in document_info
venue_info = venue_metadata.get(venue, {})
document_info = {
    "doc_id": doc_id,
    "text_content": "",
    "image_descriptions": image_descriptions,
    "metadata": venue_info,
}

add_documents_to_retriever({venue: document_info}, retriever, venue_metadata)
retriever.vectorstore.save_local(PERSIST_DIRECTORY)
upload_directory(PERSIST_DIRECTORY, "tmp/")


Loaded existing FAISS index from faiss_db
Generating image descriptions...
   (1/49) /var/folders/g6/gqdf2y4n3jj4f0vml_0s_q1r0000gn/T/tmp14twu23l/fileoutpart69.png
   (2/49) /var/folders/g6/gqdf2y4n3jj4f0vml_0s_q1r0000gn/T/tmp14twu23l/fileoutpart41.png
   (3/49) /var/folders/g6/gqdf2y4n3jj4f0vml_0s_q1r0000gn/T/tmp14twu23l/fileoutpart40.png
   (4/49) /var/folders/g6/gqdf2y4n3jj4f0vml_0s_q1r0000gn/T/tmp14twu23l/fileoutpart42.png
   (5/49) /var/folders/g6/gqdf2y4n3jj4f0vml_0s_q1r0000gn/T/tmp14twu23l/fileoutpart43.png
   (6/49) /var/folders/g6/gqdf2y4n3jj4f0vml_0s_q1r0000gn/T/tmp14twu23l/fileoutpart47.png
   (7/49) /var/folders/g6/gqdf2y4n3jj4f0vml_0s_q1r0000gn/T/tmp14twu23l/fileoutpart46.png
   (8/49) /var/folders/g6/gqdf2y4n3jj4f0vml_0s_q1r0000gn/T/tmp14twu23l/fileoutpart9.png
   (9/49) /var/folders/g6/gqdf2y4n3jj4f0vml_0s_q1r0000gn/T/tmp14twu23l/fileoutpart44.png
   (10/49) /var/folders/g6/gqdf2y4n3jj4f0vml_0s_q1r0000gn/T/tmp14twu23l/fileoutpart45.png
   (11/49) /var/folders/g6/gqdf2y4n

In [68]:
# add_documents_to_retriever({venue: document_info}, retriever, venue_metadata)
from function.secrets import secrets
from function.cloud import upload_directory

retriever.vectorstore.save_local(secrets.DATABASE_DIR.get_secret_value())
upload_directory(secrets.DATABASE_DIR.get_secret_value(), "tmp/")

['https://storage.googleapis.com/wedding-venues-001/tmp/index.faiss',
 'https://storage.googleapis.com/wedding-venues-001/tmp/index.pkl']

In [61]:
from function.retriever import add_documents_to_retriever

add_documents_to_retriever({venue: document_info}, retriever, venue_metadata)

Processed document: The Oviatt


In [63]:
for doc in retriever.vectorstore.docstore._dict.values():
    print(doc.metadata["company"], doc.metadata["type"], doc.metadata.get("image_path"))


dummy text None
The Los Angeles Arboretum text None
The Los Angeles Arboretum image The Los Angeles Arboretum/fileoutpart42.png
The Los Angeles Arboretum image The Los Angeles Arboretum/fileoutpart8.png
The Los Angeles Arboretum image The Los Angeles Arboretum/fileoutpart9.png
The Los Angeles Arboretum image The Los Angeles Arboretum/fileoutpart36.png
The Los Angeles Arboretum image The Los Angeles Arboretum/fileoutpart24.png
The Los Angeles Arboretum image The Los Angeles Arboretum/fileoutpart26.png
The Los Angeles Arboretum image The Los Angeles Arboretum/fileoutpart10.png
The Los Angeles Arboretum image The Los Angeles Arboretum/fileoutpart7.png
The Los Angeles Arboretum image The Los Angeles Arboretum/fileoutpart4.png
The Los Angeles Arboretum image The Los Angeles Arboretum/fileoutpart5.png
The Los Angeles Arboretum image The Los Angeles Arboretum/fileoutpart1.png
The Los Angeles Arboretum image The Los Angeles Arboretum/fileoutpart0.png
The Los Angeles Arboretum image The Los Ang

In [None]:
from tempfile import TemporaryDirectory

venue = "The Oviatt"

with TemporaryDirectory() as temp_output_dir:
    cloud_images = get_venue_images_from_cloud(venue)
    receiver_images = get_venue_images_from_receiver(venue)
    download_to_folder(cloud_images, temp_output_dir)
    download_to_folder(receiver_images, temp_output_dir)

    if not extracted_figure_folder.exists():
        print(f"no images found for {venue}.pdf")
        image_descriptions = []
    else:
        print(f"generating image descriptions for {venue}.pdf")
        image_descriptions = generate_image_descriptions(
            base_dir=extracted_figure_folder,
            venue=venue,
        )
print("uploading adobe_extracted_directory to google cloud")
upload_directory(temp_output_dir, f"/processed/adobe_extracted/{venue}/")

doc_id = str(uuid.uuid4())
# Include venue metadata in document_info
venue_info = venue_metadata.get(venue, {})
document_info = {
    "doc_id": doc_id,
    "text_content": text_content,
    "image_descriptions": image_descriptions,
    "metadata": venue_info,
}

In [22]:
from pathlib import Path

p = Path("tmp/The Oviatt/figures/")
paths = p.glob("*")
venue_image_names = [path.name for path in paths]
venue_image_names


['fileoutpart69.png',
 'fileoutpart41.png',
 'fileoutpart55.png',
 'fileoutpart40.png',
 'fileoutpart68.png',
 'fileoutpart56.png',
 'fileoutpart42.png',
 'fileoutpart43.png',
 'fileoutpart57.png',
 'fileoutpart8.png',
 'fileoutpart47.png',
 'fileoutpart46.png',
 'fileoutpart52.png',
 'fileoutpart9.png',
 'fileoutpart44.png',
 'fileoutpart50.png',
 'fileoutpart51.png',
 'fileoutpart45.png',
 'fileoutpart22.png',
 'fileoutpart36.png',
 'fileoutpart37.png',
 'fileoutpart23.png',
 'fileoutpart35.png',
 'fileoutpart21.png',
 'fileoutpart20.png',
 'fileoutpart34.png',
 'extra_0636-240229-tjd23398_51_2050505-173274204980834.jpeg',
 'extra_untitled-0809_51_2050505-173274204499986.jpeg',
 'fileoutpart30.png',
 'fileoutpart24.png',
 'fileoutpart18.png',
 'extra_20240224-karinaanddylanwedding-342_51_2050505-173274203068307.jpeg',
 'fileoutpart19.png',
 'fileoutpart25.png',
 'fileoutpart31.png',
 'fileoutpart27.png',
 'fileoutpart33.png',
 'fileoutpart32.png',
 'fileoutpart26.png',
 'fileoutpart1

In [7]:
import re

# r"/([anything that isn't /].*?)/figures/(.*)"
pattern = re.compile(r"/([^\/]*?)/figures/(.*)")
all_image_paths = list_files(r"processed/adobe_extracted/.*/figures/.*")
all_images_destination = [
    "tmp/" + pattern.findall(path)[0][0] + "/figures/" + pattern.findall(path)[0][1]
    for path in all_image_paths
]
all_images_destination


['tmp/Eden Gardens Moorpark/figures/fileoutpart0.png',
 'tmp/Eden Gardens Moorpark/figures/fileoutpart1.png',
 'tmp/Eden Gardens Moorpark/figures/fileoutpart2.png',
 'tmp/Eden Gardens Moorpark/figures/fileoutpart3.png',
 'tmp/Eden Gardens Moorpark/figures/fileoutpart4.png',
 'tmp/Eden Gardens Moorpark/figures/fileoutpart5.png',
 'tmp/Eden Gardens Moorpark/figures/fileoutpart6.png',
 'tmp/Fig House Venue/figures/fileoutpart0.png',
 'tmp/Fig House Venue/figures/fileoutpart1.png',
 'tmp/Fig House Venue/figures/fileoutpart12.png',
 'tmp/Fig House Venue/figures/fileoutpart13.png',
 'tmp/Fig House Venue/figures/fileoutpart14.png',
 'tmp/Fig House Venue/figures/fileoutpart15.png',
 'tmp/Fig House Venue/figures/fileoutpart2.png',
 'tmp/Fig House Venue/figures/fileoutpart3.png',
 'tmp/Fig House Venue/figures/fileoutpart4.png',
 'tmp/Fig House Venue/figures/fileoutpart5.png',
 'tmp/Fig House Venue/figures/fileoutpart6.png',
 'tmp/Fig House Venue/figures/fileoutpart7.png',
 'tmp/Fig House Venue/f

In [10]:
# add all files in all_images_destination to a zip file and upload to the cloud
import zipfile
from function.cloud import upload_file
from tqdm import tqdm

with zipfile.ZipFile("tmp/all_images.zip", "w") as zipf:
    for file in tqdm(all_images_destination):
        zipf.write(file, arcname=file)

upload_file("tmp/all_images.zip", "tmp/all_images.zip")


  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_write(zinfo, force_zip64=force_zip64)
  return self._open_to_w

KeyboardInterrupt: 

In [None]:
# download zip from google cloud
from function.cloud import download_file

download_file("tmp/all_images.zip", "tmp/all_images.zip")

In [None]:
# unzip the file
with zipfile.ZipFile("tmp/all_images.zip", "r") as zipf:
    zipf.extractall("tmp/all_images")

In [8]:
from function.cloud import download_files

download_files(all_image_paths, all_images_destination)


[None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,
 None,

In [9]:
res = _

set(res)

{None}

In [10]:
xlsx_images = [filepath for filepath in all_images if "fileoutpart" not in filepath]

In [2]:
from function.retriever import initialize_retriever

retriever = initialize_retriever()

all_images_in_retriever = list(
    filter(
        lambda doc: doc.metadata["type"] == "image",
        retriever.vectorstore.docstore._dict.values(),
    )
)
all_image_paths_in_retriever = [
    doc.metadata["image_path"] for doc in all_images_in_retriever
]
all_image_paths_in_retriever


Loaded existing FAISS index from faiss_db


['The Los Angeles Arboretum/fileoutpart42.png',
 'The Los Angeles Arboretum/fileoutpart8.png',
 'The Los Angeles Arboretum/fileoutpart9.png',
 'The Los Angeles Arboretum/fileoutpart36.png',
 'The Los Angeles Arboretum/fileoutpart24.png',
 'The Los Angeles Arboretum/fileoutpart26.png',
 'The Los Angeles Arboretum/fileoutpart10.png',
 'The Los Angeles Arboretum/fileoutpart7.png',
 'The Los Angeles Arboretum/fileoutpart4.png',
 'The Los Angeles Arboretum/fileoutpart5.png',
 'The Los Angeles Arboretum/fileoutpart1.png',
 'The Los Angeles Arboretum/fileoutpart0.png',
 'The Los Angeles Arboretum/fileoutpart2.png',
 'The Los Angeles Arboretum/fileoutpart3.png']

In [3]:
all_image_paths_on_cloud = list_files(r"processed/adobe_extracted/.*/figures/.*")
all_image_paths_on_cloud = [
    path.split("processed/adobe_extracted/")[1].replace("/figures/", "/")
    for path in all_image_paths_on_cloud
]
root_path_cloud = "processed/adobe_extracted/"


In [13]:
print(len(set(all_image_paths_on_cloud) - set(all_image_paths_in_retriever)))
print(set(all_image_paths_in_retriever) - set(all_image_paths_on_cloud))


8870
set()


In [11]:
list_files(r"processed/adobe_extracted/The Los Angeles Arboretum/figures/.*")

['/processed/adobe_extracted/The Los Angeles Arboretum/figures/fileoutpart0.png',
 '/processed/adobe_extracted/The Los Angeles Arboretum/figures/fileoutpart1.png',
 '/processed/adobe_extracted/The Los Angeles Arboretum/figures/fileoutpart10.png',
 '/processed/adobe_extracted/The Los Angeles Arboretum/figures/fileoutpart11.png',
 '/processed/adobe_extracted/The Los Angeles Arboretum/figures/fileoutpart12.png',
 '/processed/adobe_extracted/The Los Angeles Arboretum/figures/fileoutpart13.png',
 '/processed/adobe_extracted/The Los Angeles Arboretum/figures/fileoutpart14.png',
 '/processed/adobe_extracted/The Los Angeles Arboretum/figures/fileoutpart15.png',
 '/processed/adobe_extracted/The Los Angeles Arboretum/figures/fileoutpart16.png',
 '/processed/adobe_extracted/The Los Angeles Arboretum/figures/fileoutpart17.png',
 '/processed/adobe_extracted/The Los Angeles Arboretum/figures/fileoutpart2.png',
 '/processed/adobe_extracted/The Los Angeles Arboretum/figures/fileoutpart22.png',
 '/proc

In [11]:
from pathlib import Path

from tempfile import NamedTemporaryFile
from function.cloud import delete_file, download_file
from function.process_image import is_photo, load_is_photo_classifier

image_path = xlsx_images[0]

suffix = Path(image_path).suffix
with NamedTemporaryFile(suffix=suffix) as temp_image_path:
    download_file(image_path, temp_image_path.name)
    photo_classifier = load_is_photo_classifier()
    if not is_photo(photo_classifier, temp_image_path.name):
        delete_file(image_path)
    else:
        info = process_image(temp_image_path.name)


['processed/adobe_extracted/Aliso Viejo Country Club/figures/extra_ht-sneakpeek-0019_51_413915-168824969439071.jpeg',
 'processed/adobe_extracted/Aliso Viejo Country Club/figures/extra_jd-ceremony-36-websize_51_413915-172030940775100.jpeg',
 'processed/adobe_extracted/Aliso Viejo Country Club/figures/extra_jd-couple-107-websize_51_413915-172030935613408.jpeg',
 'processed/adobe_extracted/Aliso Viejo Country Club/figures/extra_jd-couple-52-websize_51_413915-172030935812055.jpeg',
 'processed/adobe_extracted/Aliso Viejo Country Club/figures/extra_jd-couple-60-websize_51_413915-172030941469797.jpeg',
 'processed/adobe_extracted/Aliso Viejo Country Club/figures/extra_jd-couple-63-websize_51_413915-172030939425275.jpeg',
 'processed/adobe_extracted/Aliso Viejo Country Club/figures/extra_jd-reception-213-websize_51_413915-172030937491770.jpeg',
 'processed/adobe_extracted/Almansor Court/figures/extra_1416370050888-072a7963-3.jpeg',
 'processed/adobe_extracted/Almansor Court/figures/extra_141

In [2]:
import sys

sys.path.append("..")

from function.cloud import delete_file, list_files

files = list_files(r"app.*.py")

for file in files:
    delete_file(file)
