## **Fetching 100 images for each of the 10 chosen classes using BigQuery**

In [3]:
# Google Cloud project id used by the gcloud configuration and the BigQuery client
PROJECT_ID = "imposing-kite-140412"

In [4]:
# authenticating to Google Cloud
# `auth` comes from the google.colab package; authenticate_user() is called
# before any gcloud / BigQuery usage in the cells that follow.
from google.colab import auth
auth.authenticate_user()
# IPython shell escape: {PROJECT_ID} is interpolated from the Python variable,
# so gcloud's core/project property is set to our project.
!gcloud config set project {PROJECT_ID}

Updated property [core/project].


In [10]:
# BigQuery client bound to our project; the query cell issues its jobs through it
from google.cloud import bigquery as bq
client = bq.Client(project=PROJECT_ID)

In [8]:
import pandas as pd

# Empty frame defining the two result columns: the class name and the image id.
result_columns = ['Classe', 'ImageID']
df = pd.DataFrame(columns=result_columns)

In [11]:
# creating a class vector with 10 different classes to extract from the database
classe = ["Dog", "Cat", "Bird", "Horse", "Fish", "Antelope", "Deer", "Lizard", "Snake", "Spider"]

# Query returning up to 100 (class description, image id) pairs for one class.
# The class name and the already-selected image ids are passed as query
# parameters instead of being spliced into the SQL text, so no manual quoting
# is needed and an empty exclusion list is handled naturally.
# An inner JOIN is used: the original LEFT JOIN was already filtered on
# b.ImageID, which dropped the rows without a matching image.
IMAGES_FOR_CLASS_SQL = """
      SELECT
        a.Description,
        b.ImageID

      FROM `bdcc22project.openimages.classes` as a
      JOIN `bdcc22project.openimages.image_labels` as b on (a.Label = b.Label)

      WHERE a.Description = @description

      and b.ImageID not in UNNEST(@seen_ids)
      LIMIT 100
  """

seen_ids = []  # image ids already picked, so later classes never reuse an image
records = []   # one dict per selected image; turned into the dataframe once at the end

# iterating over each one of those classes and bringing 100 image ids per class
for class_name in classe:
  job_config = bq.QueryJobConfig(
      query_parameters=[
          bq.ScalarQueryParameter("description", "STRING", class_name),
          # pass a copy: seen_ids keeps growing while the rows are consumed below
          bq.ArrayQueryParameter("seen_ids", "STRING", list(seen_ids)),
      ]
  )
  query_job = client.query(IMAGES_FOR_CLASS_SQL, job_config=job_config)
  results = query_job.result()  # Waits for job to complete.

  for row in results:
    records.append({"Classe": row.Description, "ImageID": row.ImageID})
    seen_ids.append(row.ImageID)

# Build the frame in one go: DataFrame.append was removed in pandas 2.0 and
# appending row by row is quadratic. Rebuilding df here also makes the cell
# safe to re-run (no duplicated rows).
df = pd.DataFrame(records, columns=["Classe", "ImageID"])

In [13]:
# Cloud Storage URI of every selected image, derived from its ImageID
image_ids = df["ImageID"]
df["path"] = "gs://bdccall2022bd/images/" + image_ids + ".jpg"

In [14]:
# creating a label column to identify the TRAIN (80 images for each class),
# VALIDATION (10 images for each class) and TEST (10 images for each class)
# division
TRAIN_PER_CLASS = 80
VALIDATION_PER_CLASS = 10
TEST_PER_CLASS = 10
SPLIT_CYCLE = TRAIN_PER_CLASS + VALIDATION_PER_CLASS + TEST_PER_CLASS

# The split is decided from each row's position *within its own class* rather
# than from its absolute row number. With exactly 100 rows per class this gives
# the same 80/10/10 labelling as before, but it no longer raises IndexError or
# shifts the boundaries when a class has fewer rows.
rows_seen_per_class = {}
split_labels = []
for class_name in df["Classe"]:
  position = rows_seen_per_class.get(class_name, 0)
  rows_seen_per_class[class_name] = position + 1

  # repeat the 80/10/10 pattern should a class ever hold more than 100 rows
  slot = position % SPLIT_CYCLE
  if slot < TRAIN_PER_CLASS:
    split_labels.append("TRAIN")
  elif slot < TRAIN_PER_CLASS + VALIDATION_PER_CLASS:
    split_labels.append("VALIDATION")
  else:
    split_labels.append("TEST")

# assign by column name instead of relying on LABEL being the 4th column
df["LABEL"] = split_labels

In [15]:
# saving the data to be used in the autoML model
# The AutoML import CSV is expected to contain only data rows of the form
#   SET,gs://bucket/path.jpg,label
# so the pandas index column and the header row must not be written.
df[["LABEL", "path", "Classe"]].to_csv("automl.csv", index=False, header=False)

## **Copying the 1000 chosen images from the source bucket to our bucket**

In [16]:
import os
import subprocess
# setting our bucket name which we'll copy the images chosen above from the origin bucket.
bucket_from = 'bdcc_open_images_dataset'
bucket_to = 'bdccall2022bd'

# Copy all images chosen from origin bucket to our bucket.
# Full reference: https://cloud.google.com/storage/docs/gsutil/commands/cp
# One gsutil invocation handles every image: `cp -I` reads the source URLs from
# stdin and `-m` copies them in parallel, instead of starting a new shell and a
# new gsutil process per image.
source_urls = [f'gs://{bucket_from}/images/{image_id}.jpg' for image_id in df['ImageID']]

if source_urls:  # nothing to copy when no image was selected
  subprocess.run(
      ['gsutil', '-m', 'cp', '-I', f'gs://{bucket_to}/images/'],
      input='\n'.join(source_urls) + '\n',
      text=True,
      check=True,  # raise CalledProcessError instead of silently ignoring failed copies
  )