In [23]:
import os
import base64
from openai import OpenAI
from pymongo import MongoClient
import json
from PIL import Image
import re

In [13]:


def extract_json_schema(file_path):
    """Load the JSON document embedded in a fenced ```json block of a text file.

    The file is read as UTF-8. The content between the first ```json fence
    line and the next bare ``` fence line is parsed as JSON.

    Fence lines are compared after stripping surrounding whitespace, so a
    closing fence that is the last line of a file without a trailing newline,
    or a fence followed by stray spaces, is still recognised.

    Args:
        file_path: Path of the text file containing the fenced JSON block.

    Returns:
        The value produced by ``json.loads`` for the fenced content.

    Raises:
        ValueError: If the opening or the closing fence is missing.
        json.JSONDecodeError: If the fenced content is not valid JSON
            (this is a ``ValueError`` subclass).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    # Compare on stripped copies; slice the untouched lines for parsing.
    markers = [line.strip() for line in lines]

    try:
        start = markers.index('```json') + 1
    except ValueError:
        raise ValueError(f"No opening ```json fence found in {file_path}") from None

    try:
        end = markers.index('```', start)
    except ValueError:
        raise ValueError(f"No closing ``` fence found in {file_path}") from None

    return json.loads(''.join(lines[start:end]))

def convert_to_mongodb_schema(file_path):
    """Return ``{schema_name: schema}`` for a single schema text file.

    The schema is loaded with ``extract_json_schema``. Its name is the part of
    the file name (the text after the last ``/``) that precedes the first
    ``_``.
    """
    loaded_schema = extract_json_schema(file_path)
    file_name = file_path.rsplit('/', 1)[-1]
    name_prefix = file_name.partition('_')[0]
    return {name_prefix: loaded_schema}

# Example schema files, one per product category. The usage example below is
# kept commented out, so this list is not read by the other cells shown here.
file_paths = [
    'pdf_products/Atomiseurs/Atomiseurs_schema.txt',
    'pdf_products/Autres Comestibles/Autres Comestibles_schema.txt'
]

# NOTE: convert_to_mongodb_schema() takes ONE path, not a list, so the list
# has to be iterated:

# for path in file_paths:
#     for name, schema in convert_to_mongodb_schema(path).items():
#         print(f"{name} Schema:")
#         print(json.dumps(schema, indent=4, ensure_ascii=False))

In [9]:
def _fit_dimensions(width, height, short_max=768, long_max=2000):
    """Compute the target size so the short side is <= short_max and the long side <= long_max.

    The aspect ratio is kept (up to ``int()`` truncation). Sizes already
    within both limits are returned unchanged. Each returned side is at least
    1 pixel.

    Returns:
        ``(new_width, new_height)``.
    """
    # Work on (short, long) so both orientations share one code path.
    # A square image is handled like a landscape one.
    portrait = width < height
    short_side, long_side = (width, height) if portrait else (height, width)

    if short_side > short_max:
        new_short = short_max
        new_long = int(long_side * (short_max / short_side))
        if new_long > long_max:
            # Capping the short side left the long side too large:
            # scale from the long side instead.
            new_short = int(short_side * (long_max / long_side))
            new_long = long_max
    elif long_side > long_max:
        new_short = int(short_side * (long_max / long_side))
        new_long = long_max
    else:
        new_short, new_long = short_side, long_side

    # int() truncation can reach 0 for extreme aspect ratios (e.g. 1x5000);
    # a 0-pixel side is not a usable resize target.
    new_short = max(1, new_short)
    new_long = max(1, new_long)

    return (new_short, new_long) if portrait else (new_long, new_short)


def resize_image(image_path,folder_path):
    """Write a size-limited copy of an image and return the copy's path.

    The copy is saved in ``folder_path`` as ``resized_<original file name>``.
    Its short side is limited to 768 px and its long side to 2000 px. If that
    file already exists it is returned as-is and the source image is not
    opened.

    Args:
        image_path: Path of the source image.
        folder_path: Folder in which the resized copy is stored.

    Returns:
        Path of the resized copy.
    """
    resized_image_path = os.path.join(folder_path, f"resized_{os.path.basename(image_path)}")

    # Cached result from a previous call.
    if os.path.exists(resized_image_path):
        return resized_image_path

    with Image.open(image_path) as img:
        target_size = _fit_dimensions(*img.size)
        resized = img.resize(target_size, Image.LANCZOS)
        resized.save(resized_image_path)

    return resized_image_path

In [40]:
def extraire_produits_details(response):
    """Extract and parse the first ```json fenced block of a model response.

    The fence tag is matched case-insensitively. Trailing spaces after the tag
    and CRLF line endings are accepted.

    Args:
        response: Text returned by the model. ``None`` (or any non-string
            value) is handled like a response that contains no JSON block.

    Returns:
        The parsed JSON value, or ``None`` when no fenced block is found or
        when its content is not valid JSON. Both failures are reported with
        ``print``.
    """
    product_block = None
    # re.search() raises TypeError on None, so only search real strings.
    if isinstance(response, str):
        product_block = re.search(
            r'```json[ \t]*\r?\n(.*?)\r?\n[ \t]*```',
            response,
            re.DOTALL | re.IGNORECASE,
        )

    if product_block is None:
        print("No valid JSON block found in API response")
        print(f"Response content: {response}")
        return None

    product_details = product_block.group(1).strip()
    try:
        return json.loads(product_details)
    except json.JSONDecodeError as e:
        print(f"Error decoding JSON: {e}")
        print(f"JSON content: {product_details}")
        return None

In [41]:


# Model name passed to the chat-completion request in process_folder().
MODEL = "gpt-4o-mini"
# The OpenAI key is read from the `sqdc_api_key` environment variable;
# os.getenv() returns None when that variable is not set.
API_KEY = os.getenv("sqdc_api_key")


# Shared clients, created once when this cell runs: the OpenAI client and a
# MongoDB connection to localhost:27017. `db` is the handle on the 'sqdc'
# database that check_collection() reads.
api_client = OpenAI(api_key=API_KEY)
mongo_client = MongoClient("mongodb://localhost:27017/")
db = mongo_client['sqdc'] 


def encode_image(image_path):
    """Read a file in binary mode and return its content as a base64 text string."""
    with open(image_path, "rb") as image_file:
        raw_bytes = image_file.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode("utf-8")

def insert_into_mongo(collection, data):
    """Insert one document into ``collection`` and report the outcome on stdout.

    Any exception raised by ``insert_one`` is caught and printed together with
    the offending data; nothing is re-raised and nothing is returned.
    """
    try:
        collection.insert_one(data)
    except Exception as error:
        print(f"An error occurred while inserting :  {data}  :", error)
    else:
        print("Data inserted successfully.")

def check_collection(collection_name,validation_schema):
    """Return the named collection of the module-level ``db``, creating it if absent.

    A missing collection is created with ``validation_schema`` as its
    validator and ``validationAction='warn'``. An existing collection is
    returned as-is; ``validation_schema`` is not applied to it.
    """
    already_present = collection_name in db.list_collection_names()

    if not already_present:
        new_collection = db.create_collection(collection_name, validator=validation_schema, validationAction='warn')
        print(f"La collection '{collection_name}' a été créé.")
        return new_collection

    print(f"La collection '{collection_name}' existe déjà.")
    return db[collection_name]

def process_folder(folder_path):
    """Send every source image of a product folder to the model and store the answers.

    The folder is expected to contain ``.png`` images and one ``.txt`` schema
    file. The schema is loaded with ``convert_to_mongodb_schema`` and the
    target collection, named after the folder, is obtained through
    ``check_collection``. Each image is then resized, base64-encoded and sent
    to the chat-completion API. The JSON extracted from the answer is
    inserted into the collection.

    Files whose name starts with ``resized_`` are ignored, and a folder
    without a ``.txt`` file is skipped with a message.

    Args:
        folder_path: Path of the product folder to process.
    """
    # Sorted for reproducible runs; os.listdir() order is arbitrary.
    entries = sorted(os.listdir(folder_path))

    # resize_image() stores its output as "resized_<name>" in this same
    # folder. Without this filter a re-run would treat those cached copies as
    # new source images, causing duplicate API calls and inserts.
    image_files = [
        name for name in entries
        if name.endswith('.png') and not name.startswith('resized_')
    ]
    schema_files = [name for name in entries if name.endswith('.txt')]
    if not schema_files:
        print(f"No schema (.txt) file found in '{folder_path}', folder skipped.")
        return

    # Joined with '/' on purpose: convert_to_mongodb_schema() derives the
    # schema name by splitting the path on '/'.
    extracted_validation_schema = convert_to_mongodb_schema(f"{folder_path}/{schema_files[0]}")

    # normpath() drops a trailing separator, which would otherwise make
    # basename() return an empty collection name.
    collection_name = os.path.basename(os.path.normpath(folder_path))
    collection = check_collection(collection_name, extracted_validation_schema)

    # Identical for every image, so built once.
    system_message = {
        "role": "system",
        "content": f"You are a database specialist assistant, that will respond only with MongoDB schema using this validation : {extracted_validation_schema}!",
    }

    for image_file in image_files:
        image_path = os.path.join(folder_path, image_file)
        resized_image_path = resize_image(image_path, folder_path)
        base64_image = encode_image(resized_image_path)

        messages = [
            system_message,
            {"role": "user", "content": [{"type": "image_url", "image_url": {"url": f"data:image/png;base64,{base64_image}"}}]},
        ]

        response = api_client.chat.completions.create(
            model=MODEL,
            messages=messages,
            temperature=0.0,
        )

        # The message content is optional in the API response; fall back to
        # an empty string so the extraction step always receives text.
        answer_text = response.choices[0].message.content or ""
        print(answer_text)
        product_schema = extraire_produits_details(answer_text)

        # Extraction already reported the problem; do not insert None.
        if product_schema is None:
            print(f"No document extracted for '{image_file}', nothing inserted.")
            continue

        insert_into_mongo(collection, product_schema)

def main(specific_folder="Atomiseurs", base_folder_path="pdf_products"):
    """Process one product folder, or every sub-folder of the base folder.

    Args:
        specific_folder: Name of the sub-folder of ``base_folder_path`` to
            process. Pass ``None`` to process every sub-directory instead.
        base_folder_path: Folder that contains one sub-folder per product
            category.
    """
    if specific_folder is None:
        folder_names = sorted(os.listdir(base_folder_path))
    else:
        folder_names = [specific_folder]

    for folder_name in folder_names:
        collection_path = os.path.join(base_folder_path, folder_name)

        if not os.path.isdir(collection_path):
            # Only an explicitly requested folder is worth a message; plain
            # files met while scanning the base folder are simply ignored.
            if specific_folder is not None:
                print(f"The folder '{specific_folder}' does not exist in the base folder.")
            continue

        process_folder(collection_path)
        # Reported per folder, and only when the folder was actually processed.
        print(f"{collection_path} [DONE]")

# Execute the main function
if __name__ == "__main__":
    main()


La collection 'Atomiseurs' existe déjà.
```json
{
  "Atomiseurs": {
    "type": "object",
    "properties": {
      "produit": {
        "type": "string",
        "enum": ["BIG BANG"]
      },
      "marque": {
        "type": "string",
        "enum": ["OLLOPA"]
      },
      "volume": {
        "type": "string",
        "enum": ["15ML"]
      },
      "caracteristiques": {
        "type": "string",
        "enum": [
          "Pourrait créer l'impression de ressentir des sensations stimulantes",
          "Créerait un sentiment de joie",
          "Pourrait stimuler certaines fonctions cérébrales",
          "Créerait un sentiment d'euphorie"
        ]
      },
      "ingredients": {
        "type": "string",
        "enum": ["Triglycérides à chaîne moyenne, extrait d'huile de cannabis"]
      },
      "extraction": {
        "type": "string",
        "enum": ["Éthanol cryogénique. Fait à partir d'un distillat de THC."]
      },
      "utilisation": {
        "type": "string",
     

In [47]:
import pandas as pd
from pymongo import MongoClient

def export_to_excel(db_name, collection_name, output_file):
    """Export every document of a MongoDB collection to an Excel file.

    Documents are flattened with ``pandas.json_normalize``. Columns whose
    name ends with ``.type`` and the ``_id`` column are dropped before
    writing. When the collection holds no document, nothing is written and a
    message is printed instead of a success report.

    Args:
        db_name: Name of the database on the local MongoDB server.
        collection_name: Name of the collection to export.
        output_file: Path of the Excel file to write.
    """
    client = MongoClient("mongodb://localhost:27017/")
    try:
        collection = client[f'{db_name}'][f'{collection_name}']
        # Materialise the cursor while the connection is still open.
        data = list(collection.find())
    finally:
        # Release the connection even when the query fails.
        client.close()

    # A misspelled or empty collection yields no documents; writing an empty
    # workbook and reporting success would hide that.
    if not data:
        print(f"No documents found in '{db_name}.{collection_name}'; nothing exported.")
        return

    df = pd.json_normalize(data)

    columns_to_drop = [col for col in df.columns if col.endswith('.type')]
    # The MongoDB identifier is not part of the exported data.
    if '_id' in df.columns:
        columns_to_drop.append('_id')
    df = df.drop(columns=columns_to_drop)

    df.to_excel(output_file, index=False)
    print(f"Data exported successfully to {output_file}")
    

# Example usage.
# process_folder() stores documents in a collection named after the folder
# ("Atomiseurs", plural), so that is the collection to export; "Atomiseur"
# matches no collection written by this notebook.
export_to_excel("sqdc", "Atomiseurs", "testAtomiseurExcel.xlsx")

Data exported successfully to testAtomiseurExcel.xlsx
