## Copyright 2023 Google LLC,
### Jena Jordahl Gen AI Blackbelt
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    https://www.apache.org/licenses/LICENSE-2.0

 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License.


In [1]:
! pip install --upgrade google-cloud-aiplatform --upgrade

Collecting google-cloud-aiplatform
  Downloading google_cloud_aiplatform-1.32.0-py2.py3-none-any.whl (2.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.9/2.9 MB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
Collecting google-cloud-resource-manager<3.0.0dev,>=1.3.3 (from google-cloud-aiplatform)
  Downloading google_cloud_resource_manager-1.10.3-py2.py3-none-any.whl (320 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m321.0/321.0 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting shapely<2.0.0 (from google-cloud-aiplatform)
  Downloading Shapely-1.8.5.post1-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (2.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: shapely, google-cloud-resource-manager, google-cloud-aiplatform
  Attempting uninstall: shapely
    Found existing installation: shapely 2.0.1
    Uninstalling shapel

In [2]:
# # Automatically restart kernel after installs so that your environment can access the new packages
import IPython

app = IPython.Application.instance()
app.kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

In [1]:
import os

PROJECT_ID = "jena-genai-bb"  # @param {type:"string"}

if PROJECT_ID == "" or PROJECT_ID is None:
    # Get your GCP project id from gcloud


    # Get your Google Cloud project ID from gcloud
    if not os.getenv("IS_TESTING"):
        shell_output = !gcloud config list --format 'value(core.project)' 2>/dev/null
        PROJECT_ID = "" # shell_output[0]
        print("Project ID: ", PROJECT_ID)

In [2]:
! gcloud config set project $PROJECT_ID
print(f"$PROJECT_ID={PROJECT_ID}")

REGION = "[your-region]"  # @param {type: "string"}

if REGION == "[your-region]":
    REGION = "us-central1"
print(f"REGION={REGION}")


Updated property [core/project].
$PROJECT_ID=jena-genai-bb
REGION=us-central1


In [3]:
# If you are running this notebook in Colab, run this cell and follow the
# instructions to authenticate your GCP account. This provides access to your
# Cloud Storage bucket and lets you submit training jobs and prediction
# requests.
import sys

# If on Vertex AI Workbench, then don't execute this code
IS_COLAB = "google.colab" in sys.modules
if not os.path.exists("/opt/deeplearning/metadata/env_version") and not os.getenv( "DL_ANACONDA_HOME"):
    if "google.colab" in sys.modules:
        from google.colab import auth as google_auth

        google_auth.authenticate_user()

    # If you are running this notebook locally, replace the string below with the
    # path to your service account key and run this cell to authenticate your GCP
    # account.
    elif not os.getenv("IS_TESTING"):
        %env GOOGLE_APPLICATION_CREDENTIALS ''

In [6]:
import pandas as pd
import sys
import json
from PIL import Image
import requests
from io import BytesIO
import numpy as np
from IPython.display import display

# Utility Functions

# Make the directory to hold the combined images local temp storage
bucket_str = "airbnb_image_compare"
folder_nm = 'merged_images'
bucket_nm = f"gs://{bucket_str}"
# bucket_path =f"{bucket_nm}/{folder_nm}"
!gsutil mb -l us-central1 $bucket_nm
!gsutil cp /content/pairwise_training_data.csv $bucket_nm

path_to_new_images = f'/content/{folder_nm}/'
!mkdir -p {path_to_new_images}


def make_df(csv_name) -> pd.DataFrame:
    df = pd.read_csv(csv_name)
    df.columns = ["url1", "url2", "match-score"]
    return df



def get_images(idx, img1_url, img_url2, broken_url_dict):

    try:
      response1 = requests.get(img1_url)
      try:
          response2 = requests.get(img_url2)
          img1 = Image.open(BytesIO(response1.content))
          img2 = Image.open(BytesIO(response2.content))
      except:
          print(f"error: could NOT display image {idx} 2 on URL {row.url2}")
          broken_url_dict[f'{idx}.2']= row.url2
          return False, img1, None, broken_url_dict
    except:
        print(f"error: could not display image {idx} 1 on URL {row.url1}")
        broken_url_dict[f'{idx}.1']= row.url1
        return False, None, None, broken_url_dict

    return True, img1, img2, broken_url_dict



def make_combo(img1, img2, idx, img_path, imgsz_dict):

    # print(f"{img1.size} vs {img2.size}")
    #Transform images to the same height, if necessary
    temp_imgsz = img1.size


    if img1.size[1] != img2.size[1]:
        img1 = img1.resize((img1.size[0], img2.size[1]))

    # Create a new image combining both
    new_img = Image.new('RGB', (img1.size[0] + img2.size[0], img1.size[1]))
    new_img.paste(img1, (0, 0))
    new_img.paste(img2, (img1.size[0], 0))

    imgsz_dict[idx] = (temp_imgsz, img1.size, img2.size)

    # Save new image
    temp_img_nm = img_path + "test_sample" + str(idx).zfill(6) + ".jpg"
    new_img.save(temp_img_nm)

    # print(f"{img1.size} vs {img2.size}")
    # print(f"{img1.size[1]} vs {img2.size[1]}
    return new_img, imgsz_dict


def batch_update(debug, path_batch, bucket_path):
    done = False
    # !echo gsutil -m cp -r $path_batch $bucket_path
    # !ls -l $path_batch
    !gsutil -m cp -r $path_batch $bucket_path
    !rm -r $path_batch

    # recreate directory
    !mkdir -p '/content/merged_images/'

    # for prod create a test to ensure 50 were transfered using worked boolean
    return done

Creating gs://airbnb_image_compare/...
ServiceException: 409 A Cloud Storage bucket named 'airbnb_image_compare' already exists. Try another name. Bucket names must be globally unique across all Google Cloud projects, including those outside of your organization.
Copying file:///content/pairwise_training_data.csv [Content-Type=text/csv]...
\
Operation completed over 1 objects/23.8 MiB.                                     


In [None]:
debug = False
test_size = 10
dictsz = {}
broken_urls = {}


# import data into a dataframe
url_label_df = make_df(f'{bucket_nm}/pairwise_training_data.csv')
# !gsutil cp /content/pairwise_training_data.csv $bucket_path/pairwise_training_data.csv

if debug: print(url_label_df.head()); example_size = test_size # number of images to create
else: example_size = len(url_label_df) # number of images to create


# Look at the URLs for small sample review
for i,(idx,row) in enumerate(url_label_df.iterrows()):
  if i < example_size:
      # print(f"{i}: {row.url1} vs {row.url2}")
      both, img1, img2, broken_urls = get_images(idx, row.url1, row.url2, broken_urls.copy() )
      if not both: continue
      if debug: print(f"SAMPLE {idx} Both images retrieved? {both}")
      new_image, dictsz = make_combo(img1, img2, idx, path_to_new_images, dictsz.copy())
      if not new_image: continue
      if debug: print(f"SAMPLE {idx} new image created? {new_image}")
      if debug: print(dictsz)
      # Update CSV with img1.size and img2.size
      if debug: display(new_image)
      if i%50 == 0:
          worked = batch_update(debug, path_to_new_images, bucket_nm)
          # for prod create a test to ensure 50 were transfered using worked boolean

with open('drive/MyDrive/dictsz.json', 'w') as f:
    json.dump(dictsz, f)

with open('drive/MyDrive/broken_urls.json', 'w') as fp:
    json.dump(broken_urls, fp)







Copying file:///content/merged_images/test_sample000000.jpg [Content-Type=image/jpeg]...
/ [1/1 files][320.5 KiB/320.5 KiB] 100% Done                                    
Operation completed over 1 objects/320.5 KiB.                                    
error: could NOT display image 9 2 on URL https://a0.muscache.com/im//pictures/1013e840-3a4a-449e-af1c-4bba50d3fac1.jpg
error: could not display image 9 1 on URL https://a0.muscache.com/im//pictures/3f7371d6-141b-4825-a403-3c11b5b52064.jpg
error: could NOT display image 12 2 on URL https://a0.muscache.com/im/pictures/miso/Hosting-28046802/original/4034c92a-1443-4466-a53e-0f7a2e0341cf.jpeg
Copying file:///content/merged_images/test_sample000035.jpg [Content-Type=image/jpeg]...
Copying file:///content/merged_images/test_sample000045.jpg [Content-Type=image/jpeg]...
Copying file:///content/merged_images/test_sample000041.jpg [Content-Type=image/jpeg]...
Copying file:///content/merged_images/test_sample000022.jpg [Content-Type=image/jpeg]...


# setup
## create a bunch of combined images.
## initialize the model
# for the images test two outcomes
## test each image for room type
## test each combined image for same roomness
## test