## Ingest Kaggle data to GCS

This notebook downloads an image dataset from Kaggle and stages it in Cloud Storage for use in model training later.



## Setup

In [None]:
# Ask gcloud for the active project; the `!` capture returns the command output as a list of lines.
project = !gcloud config get-value project
PROJECT_ID = project[0]  # the first output line is taken as the project ID
PROJECT_ID

In [None]:
# Location passed to bucket creation; REGION carries the same value.
LOCATION = "us-central1"
REGION = "us-central1"

# Staging bucket name, prefixed with the active project ID.
BUCKET_NAME = PROJECT_ID + "-fruit-and-veg-image-model-kaggle-data-staging"

# Model training: the label folders that are converted and uploaded.
# Each entry is used verbatim as a folder name under the dataset directory.
DESIRED_LABELS = [
    "Apple__Healthy",
    "Apple__Rotten",
    "Banana__Healthy",
    "Banana__Rotten",
    "Bellpepper__Healthy",
    "Bellpepper__Rotten",
]
NUM_CLASSES = len(DESIRED_LABELS)

### Packages

In [None]:
# Data Ingestion
import os
from pathlib import Path

from google.cloud import storage
from google.cloud.exceptions import NotFound

from kaggle.api.kaggle_api_extended import KaggleApi

# Data pre-processing
from PIL import Image  
from concurrent.futures import ThreadPoolExecutor
import subprocess

### Parameters

In [None]:
# GCS URI of the staging bucket, and the local scratch tree the download is unpacked into.
URI = "gs://" + BUCKET_NAME
DIR = "temp"
LOCAL_DATA_DIR = DIR + "/data"

### Create a local directory for staging files


In [None]:
# Start clean: remove any previous local staging directory (destructive), then recreate it.
! rm -rf $LOCAL_DATA_DIR
! mkdir -p $LOCAL_DATA_DIR

## Clients 

In [None]:
# Cloud Storage client bound to the active project; check_and_create_bucket() reads this global.
storage_client = storage.Client(project=PROJECT_ID)

## Create Storage Bucket

In [None]:
def check_and_create_bucket(bucket_name, location):
    """Make sure a Cloud Storage bucket exists, creating it when it is missing.

    Uses the module-level ``storage_client``. A one-line status message is
    printed in either case.

    Args:
        bucket_name: Name of the bucket to look up (no ``gs://`` prefix).
        location: Location passed to ``create_bucket`` when the bucket has to
            be created; ignored if the bucket already exists.

    Returns:
        None.

    Raises:
        Any exception other than ``NotFound`` raised by the lookup, and any
        exception raised by the creation call, propagates to the caller.
    """
    try:
        storage_client.get_bucket(bucket_name)
    except NotFound:
        # Only a missing bucket triggers creation; the returned bucket object is not needed.
        storage_client.create_bucket(bucket_or_name=bucket_name, location=location)
        print(f"Bucket {bucket_name} created.")
    else:
        print(f"Bucket {bucket_name} already exists.")

In [None]:
# Ensure the staging bucket exists (created in LOCATION if it is missing).
check_and_create_bucket(BUCKET_NAME, LOCATION)

## Get Data from Kaggle

### Setup Kaggle credentials

You will need a Kaggle account and a `kaggle.json` credentials file located in the directory `/home/jupyter/.config/kaggle`.

Steps:

* Manually download your credential file from kaggle.com -> Profile
* run this command in terminal to move it to the correct location: `mv kaggle.json .config/kaggle/kaggle.json`


### Download images 

In [None]:
# Kaggle credentials are deliberately NOT set in this cell.
# authenticate() uses KAGGLE_USERNAME / KAGGLE_KEY when both are present in the
# environment and only otherwise falls back to kaggle.json. Writing placeholder
# values into os.environ here would therefore shadow a valid kaggle.json and make
# authentication fail. Provide real credentials through kaggle.json or by
# exporting the two variables before starting the kernel; never commit them in
# the notebook.

# Initialize the Kaggle API client and authenticate.
api = KaggleApi()
api.authenticate()

# Dataset to download, given as "<owner>/<dataset-name>".
dataset_slug = 'muhammad0subhan/fruit-and-vegetable-disease-healthy-vs-rotten'

# Download the dataset archive into the local staging directory and unzip it there.
api.dataset_download_files(dataset_slug, path=LOCAL_DATA_DIR, unzip=True)

### Convert images

In [None]:
def convert_image_to_rgb_and_jpeg(image_path):
    """Re-encode one image file in place as an RGB JPEG.

    The file is written back to ``image_path`` itself, so it keeps its original
    name and extension even though its content becomes JPEG data.

    This is best-effort: any error is reported with ``print`` and swallowed so
    that one unreadable file does not stop a batch run.

    Args:
        image_path: Path (``str`` or ``pathlib.Path``) of the image to convert.

    Returns:
        None.
    """
    try:
        # The context manager guarantees the source file handle is closed before
        # the same path is reopened for writing (multi-frame formats such as GIF
        # otherwise keep the handle open).
        with Image.open(image_path) as source_image:
            # convert() returns a new, fully loaded image (a copy when the mode
            # is already RGB), so it remains usable after the source is closed.
            rgb_image = source_image.convert('RGB')

        rgb_image.save(image_path, format='JPEG')  # Overwrite the original

    except Exception as e:
        print(f'Error processing {image_path}: {e}')

def process_directory(root_dir, subdirs_to_convert, max_workers=None):
    """Convert every image found under ``root_dir`` using a thread pool.

    The tree is walked top-down. At every level only child directories whose
    name appears in ``subdirs_to_convert`` are descended into; image files in
    each visited directory (including ``root_dir`` itself) are handed to
    ``convert_image_to_rgb_and_jpeg``. The call returns once all submitted
    conversions have finished.

    Args:
        root_dir: Directory to start walking from.
        subdirs_to_convert: Collection of directory names that may be entered.
        max_workers: Passed through to ``ThreadPoolExecutor``.
    """
    image_suffixes = ('.png', '.jpg', '.jpeg', '.bmp', '.gif')  # Add more extensions if needed

    with ThreadPoolExecutor(max_workers=max_workers) as pool:
        for current_dir, child_dirs, filenames in os.walk(root_dir):
            # Prune in place so os.walk never enters directories that are not listed.
            child_dirs[:] = [name for name in child_dirs if name in subdirs_to_convert]

            current_path = Path(current_dir)
            for filename in filenames:
                if not filename.lower().endswith(image_suffixes):
                    continue
                pool.submit(convert_image_to_rgb_and_jpeg, current_path / filename)

In [None]:
# Dataset folder inside the local staging directory; the label folders are expected directly
# beneath it (presumably created by unzipping the Kaggle download — verify after download).
root_directory = f"{LOCAL_DATA_DIR}/Fruit And Vegetable Diseases Dataset"
# Only the label folders selected for training are converted.
subdirectories_to_convert = DESIRED_LABELS

# Converts the images in place; runs until all worker threads have finished.
process_directory(root_directory, subdirectories_to_convert)

## Load to GCS

Load only a subset of images (set by the `DESIRED_LABELS` list) for demonstration purposes. To load every image in the Kaggle dataset, update `DESIRED_LABELS` to include all of its labels.

In [None]:
# Loop over each subdirectory (label) and copy its contents to the staging bucket using gsutil.
for subdir in DESIRED_LABELS:
    # The wildcard is handed to gsutil unexpanded (as it was when the path was
    # double-quoted for the shell); gsutil expands it itself.
    source = f"{LOCAL_DATA_DIR}/Fruit And Vegetable Diseases Dataset/{subdir}/*"
    destination = f"{URI}/{subdir}/"
    print(destination)

    # Argument list instead of a shell string: the spaces in the dataset folder
    # name and any special characters in a label need no manual quoting, and no
    # shell ever interprets the values.
    try:
        process = subprocess.run(
            ["gsutil", "-m", "cp", "-r", source, destination],
            stdout=subprocess.DEVNULL,  # progress output stays hidden
            stderr=subprocess.PIPE,     # kept so a failure can be explained
            text=True,
            errors="replace",
        )
    except OSError as e:
        # gsutil missing or not executable: report it and move on to the next label.
        print(f"Failed to copy {subdir}")
        print(e)
        continue

    if process.returncode == 0:
        print(f"Successfully copied {subdir}")
    else:
        print(f"Failed to copy {subdir}")
        # Show the last few stderr lines so the cause is visible without flooding the cell.
        print("\n".join(process.stderr.strip().splitlines()[-5:]))

## Copying to a new bucket

For use in model training

In [37]:
# Name of the destination bucket (no gs:// prefix); hard-coded here, so change it for your own project.
GCS_DESTINATION = "demos-vertex-ai-dev"

In [None]:
# Recursively sync the staging bucket into the data/ prefix of the destination bucket.
!gsutil -m rsync -r gs://$BUCKET_NAME gs://$GCS_DESTINATION/data

Building synchronization state...
At source listing 10000...
Starting synchronization...
Copying gs://demos-vertex-ai-fruit-and-veg-image-model-kaggle-data-staging/Apple__Healthy/FreshApple (118).jpg [Content-Type=image/jpeg]...
Copying gs://demos-vertex-ai-fruit-and-veg-image-model-kaggle-data-staging/Apple__Healthy/FreshApple (12).jpg [Content-Type=image/jpeg]...
Copying gs://demos-vertex-ai-fruit-and-veg-image-model-kaggle-data-staging/Apple__Healthy/FreshApple (1).jpg [Content-Type=image/jpeg]...
Copying gs://demos-vertex-ai-fruit-and-veg-image-model-kaggle-data-staging/Apple__Healthy/FreshApple (135).jpg [Content-Type=image/jpeg]...
Copying gs://demos-vertex-ai-fruit-and-veg-image-model-kaggle-data-staging/Apple__Healthy/FreshApple (103).jpg [Content-Type=image/jpeg]...
Copying gs://demos-vertex-ai-fruit-and-veg-image-model-kaggle-data-staging/Apple__Healthy/FreshApple (104).jpg [Content-Type=image/jpeg]...
Copying gs://demos-vertex-ai-fruit-and-veg-image-model-kaggle-data-staging