In [1]:
import os
import uuid
from google.cloud import storage

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../red-freedom-426709-a7-6904e9a53b27.json'
storage_client = storage.Client()
bucket_name = 'datalakes-ing3'

def upload_images_from_folder(folder_path):

    uuids = []

    if folder_path and os.path.isdir(folder_path):
        for filename in os.listdir(folder_path):
            img_path = os.path.join(folder_path, filename)
            unique_id = str(uuid.uuid4())
            uuids.append(unique_id)

            bucket = storage_client.bucket(bucket_name)
            destination_blob_name = f'0_raw/{unique_id}'
            blob = bucket.blob(destination_blob_name)
            blob.upload_from_filename(img_path)
            print(f"File {img_path} uploaded to {destination_blob_name}.")

    with open('../UUIDs.txt', 'w') as file:
        for unique_id in uuids:
            file.write(f"{unique_id}\n")

    return uuids

In [2]:
from google.cloud import storage
from rembg import new_session, remove
from pathlib import Path
from PIL import Image
import io
import os

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../red-freedom-426709-a7-6904e9a53b27.json'
storage_client = storage.Client()
bucket_name = 'datalakes-ing3'

session = new_session()
bucket = storage_client.bucket(bucket_name)

input_dir = '0_raw/'
output_dir = '1_staging/'

def background_removal(uuid_list):
    blobs = storage_client.list_blobs(bucket_name, prefix=input_dir)

    for blob in blobs:
        if Path(blob.name).stem in uuid_list:
            input_content = blob.download_as_bytes()
            output_content = remove(input_content, session=session)
            output_image = Image.open(io.BytesIO(output_content))
            
            if output_image.mode == 'RGBA':
                output_image = output_image.convert('RGB')
            
            output_image = output_image.resize((224, 224), Image.LANCZOS)
            
            output_buffer = io.BytesIO()
            output_image.save(output_buffer, format='JPEG')
            output_buffer.seek(0)
            
            filename = Path(blob.name).stem + '.nbg.jpg'
            output_blob_name = f'{output_dir}{filename}'

            output_blob = bucket.blob(output_blob_name)
            output_blob.upload_from_file(output_buffer, content_type='image/jpeg')

  "class": algorithms.Blowfish,


In [3]:
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.models import Model
from keras.preprocessing import image

import numpy as np
from PIL import Image
import io
from pinecone import Pinecone
from google.cloud import storage
import os
from pathlib import Path
from dotenv import find_dotenv, load_dotenv

os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = '../red-freedom-426709-a7-6904e9a53b27.json'

storage_client = storage.Client()
bucket_name = 'datalakes-ing3'
base_model = VGG16(weights='imagenet', include_top=True)
model = Model(inputs=base_model.input, outputs=base_model.get_layer('fc2').output)

dotenv_path = find_dotenv("../keys.env", raise_error_if_not_found=True, usecwd=True)
load_dotenv(dotenv_path, override=True)

pinecone = Pinecone(api_key=os.getenv('PINECONE_API_KEY'))
index = pinecone.Index('datalakes-ing3-curated')

def vectorize_images_from_uuid(uuid_list, bucket_name='datalakes-ing3'):

    modified_uuid_list = [f"{uuid}.nbg" for uuid in uuid_list]

    bucket = storage_client.bucket(bucket_name)

    def extract_features(image_bytes):
        img = Image.open(io.BytesIO(image_bytes))
        img = img.resize((224, 224))
        img_data = np.expand_dims(image.img_to_array(img), axis=0)
        img_data = preprocess_input(img_data)
        
        vgg16_feature = model.predict(img_data)
        flat_feature = vgg16_feature.flatten()
        return flat_feature

    blobs = storage_client.list_blobs(bucket_name, prefix='1_staging/', delimiter='/')

    for blob in blobs:
        file_uuid = Path(blob.name).stem.split('.')[0]
        print(file_uuid, uuid_list)
        if file_uuid in uuid_list:
            try:
                img_content = blob.download_as_bytes()
                features = extract_features(img_content)
                features /= np.linalg.norm(features)
                index.upsert([(blob.name, features)])
                print(f"Processed {blob.name}")

            except Exception as e:
                print(f"Error processing {blob.name}: {e}")

In [4]:
uuids = upload_images_from_folder("E:\AI\projets\projet_final_datalakes\data-to-add\leo")

  uuids = upload_images_from_folder("E:\AI\projets\projet_final_datalakes\data-to-add\leo")


File E:\AI\projets\projet_final_datalakes\data-to-add\leo\20240906_172658.jpg uploaded to 0_raw/e91b5bc8-da59-4522-a807-f66676bd1b2c.
File E:\AI\projets\projet_final_datalakes\data-to-add\leo\20240906_172703.jpg uploaded to 0_raw/1deca580-5f13-42c8-b627-ee41bb8d19f0.
File E:\AI\projets\projet_final_datalakes\data-to-add\leo\20240906_172705.jpg uploaded to 0_raw/31b65f22-c5d8-4e5d-a79f-6088c8d66dbd.
File E:\AI\projets\projet_final_datalakes\data-to-add\leo\20240906_172708.jpg uploaded to 0_raw/832996b1-2739-49cd-a598-752190d70cb0.
File E:\AI\projets\projet_final_datalakes\data-to-add\leo\20240906_172711.jpg uploaded to 0_raw/560b9d5f-3ff7-412b-8c22-3daa2f34ba30.
File E:\AI\projets\projet_final_datalakes\data-to-add\leo\20240906_172713.jpg uploaded to 0_raw/a0ce53af-d668-43d8-9d86-7decef7b7bdf.
File E:\AI\projets\projet_final_datalakes\data-to-add\leo\20240906_172715.jpg uploaded to 0_raw/b1b6f5d5-4988-494e-9012-1933489df924.
File E:\AI\projets\projet_final_datalakes\data-to-add\leo\2024

In [5]:
background_removal(uuids)

In [6]:
vectorize_images_from_uuid(uuids)

063e87fa-7d64-44f1-be7a-fa39f20ee537 ['e91b5bc8-da59-4522-a807-f66676bd1b2c', '1deca580-5f13-42c8-b627-ee41bb8d19f0', '31b65f22-c5d8-4e5d-a79f-6088c8d66dbd', '832996b1-2739-49cd-a598-752190d70cb0', '560b9d5f-3ff7-412b-8c22-3daa2f34ba30', 'a0ce53af-d668-43d8-9d86-7decef7b7bdf', 'b1b6f5d5-4988-494e-9012-1933489df924', '10a24ed2-6754-4e96-bcc1-f106d9e1a30d', 'bdb4b663-994a-4cf1-b1fe-ad644636316a', 'd2f089c2-2785-4fba-8d12-5fb46622c187', '063e87fa-7d64-44f1-be7a-fa39f20ee537', '470a2b7d-ca35-4c1f-9a61-20c34ca0766f', 'c40b158d-178f-48ea-b5e0-4fabddf4b5d9', 'c86222ed-a5ef-46d4-9e67-ceb933157603', '354c805e-a14f-4ad0-a626-a65124d43459', 'bebc05d5-e7a0-4262-a146-7cc2bd0849d4', '75dab0ff-4980-4068-9dcd-926e7b641c5b', 'cb009775-dfc0-4d4a-828b-c4accc395323', '81c46259-f682-4659-af4c-0454260acb02', '49707e6d-8390-4aee-a56f-8117caaa82ff', 'e397a55f-7e38-40e3-a5cc-2f362bde6412', 'e924e68b-8622-487e-a09c-a72aec591f07', '7e7c6f12-79de-47cd-83e4-b00e052d5096', '2f2b42ff-941d-439e-a6a9-f8c032e233b4', 'd