## Initialization Script
This cell contains the initialization script for setting up the Flask application with SQLAlchemy and Flask-Migrate.


In [None]:
from flask import Flask
from flask_sqlalchemy import SQLAlchemy
from flask_migrate import Migrate
from config import Config
import logging

# Initialize extensions
db = SQLAlchemy()
migrate = Migrate()

def create_app():
    """Build and configure the Flask application (application factory).

    Loads settings from ``Config``, binds the module-level ``db`` and
    ``migrate`` extensions to the new app, configures root logging, imports
    the package's ``routes`` and ``models`` modules, and installs handlers
    for 404 and 500 errors.

    Returns:
        Flask: The configured application instance.
    """
    # Create and configure the Flask application
    app = Flask(__name__)
    app.config.from_object(Config)

    # Initialize extensions
    db.init_app(app)
    migrate.init_app(app, db)

    # Set up logging
    logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

    # Import and register blueprints
    # NOTE(review): imported inside the factory, presumably so these modules
    # can import `db` from this package without a circular import -- confirm.
    from . import routes, models

    # Example of how to register a blueprint
    # from .routes import main as main_blueprint
    # app.register_blueprint(main_blueprint)

    @app.errorhandler(500)
    def internal_error(error):
        # Discard any half-finished transaction so the session stays usable.
        db.session.rollback()
        # The status code must be returned explicitly; a bare string would
        # be sent to the client as HTTP 200.
        return "500 error", 500

    @app.errorhandler(404)
    def not_found_error(error):
        return "404 error", 404

    return app



## Enhanced OpenAI Integration Script
This cell contains the enhanced script for integrating with the OpenAI API, with added flexibility for model selection, adjustable token limits, and improved error handling.


In [None]:
import openai
import os
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Set the OpenAI API key from environment variables
openai.api_key = os.getenv("OPENAI_API_KEY")

def ask_openai(prompt, model="text-davinci-003", max_tokens=150):
    """
    Query an OpenAI completion model and hand back its reply.

    Args:
        prompt (str): Text sent to the model as the completion prompt.
        model (str, optional): Model identifier, forwarded to the API as the
            ``engine`` argument. Defaults to "text-davinci-003".
        max_tokens (int, optional): Upper bound on generated tokens. Defaults to 150.

    Returns:
        str or None: The first choice's text with surrounding whitespace
        stripped, or None when the call fails (the failure is logged).
    """
    request_args = {"engine": model, "prompt": prompt, "max_tokens": max_tokens}
    try:
        completion = openai.Completion.create(**request_args)
        first_choice = completion.choices[0]
        return first_choice.text.strip()
    except openai.error.OpenAIError as e:
        logging.error(f"OpenAI API error: {e}")
    except Exception as e:
        logging.error(f"Unexpected error: {e}")
    # Reached only after one of the handlers above has logged a failure.
    return None

# Example usage: runs only when this file is executed as a script, not on import.
if __name__ == "__main__":
    prompt = "Write a short story about a cat who can talk."
    response = ask_openai(prompt, model="text-davinci-003", max_tokens=200)
    # ask_openai returns None on failure, so nothing is printed in that case
    # (an empty-string reply is skipped as well).
    if response:
        print(response)


## Combined OCR Module
This cell contains the combined OCR module that uses Google Vision for OCR and OpenAI for text enhancement. It includes improved logging, error handling, and flexible configuration.


In [None]:
import logging
from google_vision_ocr import perform_ocr, set_credentials
from openai_post_processing import enhance_ocr_text

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def process_image(image_path, credentials_path=None):
    """
    Run OCR on an image and pass every detected text through OpenAI enhancement.

    Args:
        image_path (str): Location of the image file to read.
        credentials_path (str, optional): Google Cloud credentials JSON file;
            when given it is applied via ``set_credentials`` before OCR runs.

    Returns:
        list: One enhanced text per OCR annotation, or an empty list when any
        step fails (the failure is logged rather than raised).
    """
    try:
        if credentials_path:
            set_credentials(credentials_path)
            logging.info(f"Credentials set from: {credentials_path}")

        annotations = perform_ocr(image_path)
        logging.info(f"OCR completed successfully for image: {image_path}")

        enhanced_texts = []
        for annotation in annotations:
            enhanced_texts.append(enhance_ocr_text(annotation.description))
        logging.info("Text enhancement completed successfully.")
    except FileNotFoundError:
        failure = f"Image file not found: {image_path}"
    except ValueError as e:
        failure = f"Error during OCR or text enhancement: {e}"
    except Exception as e:
        failure = f"An unexpected error occurred: {e}"
    else:
        return enhanced_texts

    # Every failure path ends the same way: log the reason, return nothing.
    logging.error(failure)
    return []

# Interactive demo; skipped when the module is imported.
if __name__ == "__main__":
    image_path = input("Enter the path to your image: ")
    credentials_path = input("Enter the path to your Google Cloud credentials JSON file (optional): ").strip()
    try:
        # A blank answer means "no explicit credentials file".
        results = process_image(image_path, credentials_path or None)
        if not results:
            print("No texts were enhanced.")
        else:
            print("\nEnhanced Texts:")
            print(*results, sep="\n")
    except Exception as e:
        logging.error(f"An error occurred: {e}")


# VEDA OCR Processing Setup

This Jupyter Notebook will guide you through setting up and running the OCR processing component of VEDA. We will use a Flask application to handle OCR requests and process images using the combined OCR module.

## Prerequisites

Before we begin, ensure you have the following installed:
- Python 3.7 or later
- Flask
- filetype
- Your OCR module's dependencies (e.g., Tesseract or the Google Vision API)

## Step 1: Setting Up the Virtual Environment

First, let's create and activate a virtual environment.

```bash
# Create a virtual environment
python -m venv venv

# Activate the virtual environment
# On Windows
venv\Scripts\activate
# On Unix or MacOS
source venv/bin/activate
```


In [None]:
from flask import Flask, request, jsonify
from combined_ocr_module import process_image
import logging
import os
import tempfile
import filetype

# Configure logging
logging.basicConfig(level=logging.INFO, 
                    format='%(asctime)s [%(levelname)s] %(message)s', 
                    datefmt='%Y-%m-%d %H:%M:%S')

app = Flask(__name__)

@app.route('/ocr', methods=['POST'])
def ocr_endpoint():
    """
    Handle an OCR request: validate the uploaded image, run OCR, return JSON.

    Expects multipart form data with:
        image: the image file to process (required).
        credentials_path: path to a Google Cloud credentials file (optional).

    Returns:
        A JSON response ``{'ocr_results': ...}`` on success, or
        ``{'error': ...}`` with status 400 (missing or invalid upload,
        missing credentials file) or 500 (any other failure).
    """
    # Defined before the try block so the cleanup in ``finally`` can tell
    # whether a temporary file was ever created.
    image_path = None
    try:
        # Retrieve the uploaded image file from the request
        image_file = request.files['image']

        # Validate the file type: the content must be recognised AND be an
        # image. A bare truthiness check would also accept PDFs, archives, etc.
        kind = filetype.guess(image_file.stream)
        if kind is None or not kind.mime.startswith('image/'):
            logging.warning('Invalid image file received.')
            return jsonify({'error': 'Invalid image file'}), 400

        # Rewind in case the sniffing read advanced the stream, so the whole
        # upload is written to disk.
        image_file.stream.seek(0)

        # Save the image file to a temporary location. The upload is written
        # through the already-open handle: reopening the file by name while
        # it is still open fails on Windows.
        with tempfile.NamedTemporaryFile(delete=False) as temp_file:
            image_path = temp_file.name
            image_file.save(temp_file)
        logging.info(f'Image saved to {image_path}')

        # Retrieve credentials path from the form data
        # NOTE(security): this lets the client choose a file path on the
        # server; consider restricting it to a server-side allow-list.
        credentials_path = request.form.get('credentials_path')
        if credentials_path:
            logging.info(f'Using credentials from {credentials_path}')
        else:
            logging.warning('No credentials path provided.')

        # Process the image using the combined OCR module
        results = process_image(image_path, credentials_path)
        return jsonify({'ocr_results': results})

    except KeyError as e:
        # Raised by request.files['image'] when no file was uploaded.
        error_message = f"Missing form data: {e}"
        logging.error(error_message)
        return jsonify({'error': error_message}), 400

    except FileNotFoundError as e:
        error_message = f"Credentials file not found: {e}"
        logging.error(error_message)
        return jsonify({'error': error_message}), 400

    except Exception as e:
        error_message = f"An error occurred: {e}"
        logging.error(error_message)
        return jsonify({'error': error_message}), 500

    finally:
        # Ensure the temporary image file is deleted after processing
        if image_path is not None and os.path.exists(image_path):
            os.remove(image_path)
            logging.info(f'Temporary image file {image_path} deleted.')

if __name__ == '__main__':
    # Use environment variables for configuration.
    # Debug mode is opt-in (FLASK_DEBUG=true/1/t): the interactive debugger
    # lets anyone who can reach the server run arbitrary code, so it must
    # not be enabled by default.
    debug = os.getenv('FLASK_DEBUG', 'false').lower() in ['true', '1', 't']
    port = int(os.getenv('FLASK_PORT', 5000))
    app.run(debug=debug, port=port)


# VEDA Project - OCR Module Development

## Objective

Develop a module to process images using OCR and enhance the extracted text using OpenAI's language model.

## Steps

1. **Setup and Configuration**
    - Configure logging for debugging and monitoring.
    - Set up necessary environment variables for Google Cloud and OpenAI.

2. **Function: `process_image`**
    - Validate the image file type.
    - Load the image.
    - Perform OCR using Google Vision.
    - Enhance the text using OpenAI.
    - Return the results.

## Implementation

### combined_ocr_module.py

```python
import logging
from google.cloud import vision
import openai
import os
from PIL import Image
import io
import filetype

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

class ImageProcessingError(Exception):
    """Raised by process_image for an invalid image type or an unexpected failure."""


class OCRError(Exception):
    """Raised by process_image when OCR yields no text."""


class OpenAIError(Exception):
    """Raised by enhance_text_with_openai when text enhancement fails."""

def process_image(image_path, credentials_path=None, openai_api_key=None):
    """
    Extract text from an image with Google Cloud Vision and enhance it with OpenAI.

    Parameters:
    - image_path: str, path to the image file
    - credentials_path: str or None, path to the Google Cloud credentials file;
      when omitted, GOOGLE_APPLICATION_CREDENTIALS is left untouched
    - openai_api_key: str or None, OpenAI API key; when omitted, whatever key
      is already set on the ``openai`` module is used

    Returns:
    - str, enhanced text from the image

    Raises:
    - ImageProcessingError: the file is not a recognised image type, or an
      unexpected failure occurred (the original exception is chained)
    - OCRError: the Vision API reported an error or detected no text
    - OpenAIError: text enhancement failed
    """
    try:
        # Check if the image file type is valid
        if not filetype.is_image(image_path):
            raise ImageProcessingError("Invalid image file type.")

        # Set up Google Cloud Vision client. Both credentials are optional so
        # that callers passing only (image_path, credentials_path) keep working.
        if credentials_path:
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
        client = vision.ImageAnnotatorClient()

        # Load image
        with open(image_path, 'rb') as img_file:
            content = img_file.read()

        image = vision.Image(content=content)

        # Perform OCR using Google Cloud Vision
        response = client.text_detection(image=image)
        # The API reports per-image failures in the response body rather than
        # raising; without this check they look like "no text detected".
        if response.error.message:
            raise OCRError(f"Vision API error: {response.error.message}")
        texts = response.text_annotations
        if not texts:
            logging.warning("No text detected in the image.")
            raise OCRError("No text detected.")

        # Extract detected text (only the first annotation is used)
        detected_text = texts[0].description
        logging.info(f'Detected text: {detected_text}')

        # Enhance text using OpenAI
        if openai_api_key:
            openai.api_key = openai_api_key
        enhanced_text = enhance_text_with_openai(detected_text)

        return enhanced_text

    except ImageProcessingError as e:
        logging.error(f"An error occurred during image processing: {e}")
        raise

    except OCRError as e:
        logging.error(f"An error occurred during OCR: {e}")
        raise

    except OpenAIError as e:
        logging.error(f"An error occurred during text enhancement: {e}")
        raise

    except Exception as e:
        logging.error(f"An unknown error occurred during image processing: {e}")
        # Chain the cause so the original traceback is preserved.
        raise ImageProcessingError(str(e)) from e

def enhance_text_with_openai(text, engine="text-davinci-003", max_tokens=200, temperature=0.7):
    """
    Enhance the given text using OpenAI's language model.

    Parameters:
    - text: str, the text to enhance
    - engine: str, OpenAI engine to use (default: "text-davinci-003")
    - max_tokens: int, maximum number of tokens to generate (default: 200)
    - temperature: float, sampling temperature (default: 0.7)

    Returns:
    - str, enhanced text (first completion choice, whitespace-stripped)

    Raises:
    - OpenAIError: any failure during the request; the original exception
      is chained as the cause
    """
    try:
        response = openai.Completion.create(
            engine=engine,
            prompt=f"Enhance the following text:\n\n{text}",
            max_tokens=max_tokens,
            n=1,
            stop=None,
            temperature=temperature
        )
        enhanced_text = response.choices[0].text.strip()
        logging.info(f'Enhanced text: {enhanced_text}')

        return enhanced_text

    except Exception as e:
        logging.error(f"An error occurred during text enhancement: {e}")
        raise OpenAIError(str(e)) from e

# Example usage: runs only when this file is executed as a script.
if __name__ == "__main__":
    # Placeholder values -- replace with real paths and a real key before running.
    # NOTE(review): avoid committing a real API key; consider reading it from
    # an environment variable instead.
    image_path = "path/to/your/image.jpg"
    credentials_path = "path/to/your/credentials.json"
    openai_api_key = "your_openai_api_key"

    try:
        enhanced_text = process_image(image_path, credentials_path, openai_api_key)
        print(f"Enhanced Text: {enhanced_text}")

    # process_image raises on every failure path; report it instead of crashing.
    except Exception as e:
        print(f"An error occurred: {e}")


In [None]:
import logging
from google.cloud import vision
import openai
import os
from PIL import Image
import io
import filetype

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

class ImageProcessingError(Exception):
    """Raised by process_image for an invalid image type or an unexpected failure."""


class OCRError(Exception):
    """Raised by process_image when OCR yields no text."""


class OpenAIError(Exception):
    """Raised by enhance_text_with_openai when text enhancement fails."""

def process_image(image_path, credentials_path=None, openai_api_key=None):
    """
    Extract text from an image with Google Cloud Vision and enhance it with OpenAI.

    Parameters:
    - image_path: str, path to the image file
    - credentials_path: str or None, path to the Google Cloud credentials file;
      when omitted, GOOGLE_APPLICATION_CREDENTIALS is left untouched
    - openai_api_key: str or None, OpenAI API key; when omitted, whatever key
      is already set on the ``openai`` module is used

    Returns:
    - str, enhanced text from the image

    Raises:
    - ImageProcessingError: the file is not a recognised image type, or an
      unexpected failure occurred (the original exception is chained)
    - OCRError: the Vision API reported an error or detected no text
    - OpenAIError: text enhancement failed
    """
    try:
        # Check if the image file type is valid
        if not filetype.is_image(image_path):
            raise ImageProcessingError("Invalid image file type.")

        # Set up Google Cloud Vision client. Both credentials are optional so
        # that callers passing only (image_path, credentials_path) keep working.
        if credentials_path:
            os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
        client = vision.ImageAnnotatorClient()

        # Load image
        with open(image_path, 'rb') as img_file:
            content = img_file.read()

        image = vision.Image(content=content)

        # Perform OCR using Google Cloud Vision
        response = client.text_detection(image=image)
        # The API reports per-image failures in the response body rather than
        # raising; without this check they look like "no text detected".
        if response.error.message:
            raise OCRError(f"Vision API error: {response.error.message}")
        texts = response.text_annotations
        if not texts:
            logging.warning("No text detected in the image.")
            raise OCRError("No text detected.")

        # Extract detected text (only the first annotation is used)
        detected_text = texts[0].description
        logging.info(f'Detected text: {detected_text}')

        # Enhance text using OpenAI
        if openai_api_key:
            openai.api_key = openai_api_key
        enhanced_text = enhance_text_with_openai(detected_text)

        return enhanced_text

    except ImageProcessingError as e:
        logging.error(f"An error occurred during image processing: {e}")
        raise

    except OCRError as e:
        logging.error(f"An error occurred during OCR: {e}")
        raise

    except OpenAIError as e:
        logging.error(f"An error occurred during text enhancement: {e}")
        raise

    except Exception as e:
        logging.error(f"An unknown error occurred during image processing: {e}")
        # Chain the cause so the original traceback is preserved.
        raise ImageProcessingError(str(e)) from e

def enhance_text_with_openai(text, engine="text-davinci-003", max_tokens=200, temperature=0.7):
    """
    Enhance the given text using OpenAI's language model.

    Parameters:
    - text: str, the text to enhance
    - engine: str, OpenAI engine to use (default: "text-davinci-003")
    - max_tokens: int, maximum number of tokens to generate (default: 200)
    - temperature: float, sampling temperature (default: 0.7)

    Returns:
    - str, enhanced text (first completion choice, whitespace-stripped)

    Raises:
    - OpenAIError: any failure during the request; the original exception
      is chained as the cause
    """
    try:
        response = openai.Completion.create(
            engine=engine,
            prompt=f"Enhance the following text:\n\n{text}",
            max_tokens=max_tokens,
            n=1,
            stop=None,
            temperature=temperature
        )
        enhanced_text = response.choices[0].text.strip()
        logging.info(f'Enhanced text: {enhanced_text}')

        return enhanced_text

    except Exception as e:
        logging.error(f"An error occurred during text enhancement: {e}")
        raise OpenAIError(str(e)) from e

# Example usage: runs only when this file is executed as a script.
if __name__ == "__main__":
    # Placeholder values -- replace with real paths and a real key before running.
    # NOTE(review): avoid committing a real API key; consider reading it from
    # an environment variable instead.
    image_path = "path/to/your/image.jpg"
    credentials_path = "path/to/your/credentials.json"
    openai_api_key = "your_openai_api_key"

    try:
        enhanced_text = process_image(image_path, credentials_path, openai_api_key)
        print(f"Enhanced Text: {enhanced_text}")

    # process_image raises on every failure path; report it instead of crashing.
    except Exception as e:
        print(f"An error occurred: {e}")


# OpenAI Integration Script

This script integrates OpenAI's language model to enhance text. It includes robust error handling, logging, and initialization functions to ensure smooth and reliable operations.

## Script Overview

1. **Logging Configuration**: Sets up logging to capture and display important events and errors.
2. **Custom Exceptions**: Defines specific exceptions for handling different types of errors.
3. **Function Definitions**:
   - `initialize_openai_api(api_key)`: Initializes the OpenAI API with the provided key.
   - `enhance_text_with_openai(text, engine="text-davinci-003", max_tokens=200, temperature=0.7)`: Enhances the given text using OpenAI's language model.
4. **Main Function**: Demonstrates the usage of the functions with an example text.

## Script Code

```python
import openai
import logging
import os

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

class OpenAIError(Exception):
    """Custom exception for OpenAI errors.

    Raised by ``enhance_text_with_openai`` to wrap any failure that occurs
    during text enhancement, including its own input-validation errors.
    """

def initialize_openai_api(api_key):
    """Store ``api_key`` on the ``openai`` module and log that it was set.

    Args:
    api_key (str): The OpenAI API key to use for later requests.

    Raises:
    ValueError: If ``api_key`` is empty or None.
    """
    if api_key:
        openai.api_key = api_key
        logging.info("OpenAI API initialized with the provided key.")
        return
    raise ValueError("OpenAI API key must be provided.")

def enhance_text_with_openai(text, engine="text-davinci-003", max_tokens=200, temperature=0.7):
    """Ask an OpenAI completion model to enhance ``text``.

    Args:
    text (str): The text to enhance.
    engine (str, optional): OpenAI engine to use. Defaults to "text-davinci-003".
    max_tokens (int, optional): Maximum number of tokens for the response. Defaults to 200.
    temperature (float, optional): Sampling temperature. Defaults to 0.7.

    Returns:
    str: The first completion choice with surrounding whitespace stripped.

    Raises:
    OpenAIError: On any failure, including a non-string ``text`` or ``engine``.
    """
    def _failure(error_message):
        # Log the problem and build the exception that reports it.
        logging.error(error_message)
        return OpenAIError(error_message)

    try:
        # Validation happens inside the try block, so a bad argument is
        # reported as OpenAIError like every other failure.
        if not isinstance(text, str):
            raise ValueError("Input text must be a string.")
        if not isinstance(engine, str):
            raise ValueError("Engine name must be a string.")

        request_args = dict(
            engine=engine,
            prompt=f"Enhance the following text:\n\n{text}",
            max_tokens=max_tokens,
            n=1,
            stop=None,
            temperature=temperature,
        )
        completion = openai.Completion.create(**request_args)

        enhanced_text = completion.choices[0].text.strip()
        logging.info(f'Enhanced text: {enhanced_text}')
        return enhanced_text

    except openai.error.APIError as e:
        raise _failure(f"OpenAI API error during text enhancement: {e}")

    except openai.error.InvalidRequestError as e:
        raise _failure(f"Invalid request to OpenAI API: {e}")

    except Exception as e:
        raise _failure(f"An error occurred during text enhancement: {e}")

def main():
    """Example run: enhance a fixed sentence using the key in OPENAI_API_KEY."""
    openai_api_key = os.environ.get('OPENAI_API_KEY')
    if openai_api_key:
        try:
            initialize_openai_api(openai_api_key)
            text_to_enhance = "The quick brown fox jumps over the lazy dog."

            print("Processing text...")
            enhanced_text = enhance_text_with_openai(text_to_enhance)
            print(f"Enhanced Text:\n{enhanced_text}")
        except (ValueError, OpenAIError) as e:
            logging.error(f"Error: {e}")
    else:
        # Without a key there is nothing useful to do.
        logging.error("OpenAI API key not found in environment variables.")

if __name__ == "__main__":
    main()


In [None]:
import openai
import logging
import os

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

class OpenAIError(Exception):
    """Custom exception for OpenAI errors.

    Raised by ``enhance_text_with_openai`` to wrap any failure that occurs
    during text enhancement, including its own input-validation errors.
    """

def initialize_openai_api(api_key):
    """Store ``api_key`` on the ``openai`` module and log that it was set.

    Args:
    api_key (str): The OpenAI API key to use for later requests.

    Raises:
    ValueError: If ``api_key`` is empty or None.
    """
    if api_key:
        openai.api_key = api_key
        logging.info("OpenAI API initialized with the provided key.")
        return
    raise ValueError("OpenAI API key must be provided.")

def enhance_text_with_openai(text, engine="text-davinci-003", max_tokens=200, temperature=0.7):
    """Ask an OpenAI completion model to enhance ``text``.

    Args:
    text (str): The text to enhance.
    engine (str, optional): OpenAI engine to use. Defaults to "text-davinci-003".
    max_tokens (int, optional): Maximum number of tokens for the response. Defaults to 200.
    temperature (float, optional): Sampling temperature. Defaults to 0.7.

    Returns:
    str: The first completion choice with surrounding whitespace stripped.

    Raises:
    OpenAIError: On any failure, including a non-string ``text`` or ``engine``.
    """
    def _failure(error_message):
        # Log the problem and build the exception that reports it.
        logging.error(error_message)
        return OpenAIError(error_message)

    try:
        # Validation happens inside the try block, so a bad argument is
        # reported as OpenAIError like every other failure.
        if not isinstance(text, str):
            raise ValueError("Input text must be a string.")
        if not isinstance(engine, str):
            raise ValueError("Engine name must be a string.")

        request_args = dict(
            engine=engine,
            prompt=f"Enhance the following text:\n\n{text}",
            max_tokens=max_tokens,
            n=1,
            stop=None,
            temperature=temperature,
        )
        completion = openai.Completion.create(**request_args)

        enhanced_text = completion.choices[0].text.strip()
        logging.info(f'Enhanced text: {enhanced_text}')
        return enhanced_text

    except openai.error.APIError as e:
        raise _failure(f"OpenAI API error during text enhancement: {e}")

    except openai.error.InvalidRequestError as e:
        raise _failure(f"Invalid request to OpenAI API: {e}")

    except Exception as e:
        raise _failure(f"An error occurred during text enhancement: {e}")

def main():
    """Example run: enhance a fixed sentence using the key in OPENAI_API_KEY."""
    openai_api_key = os.environ.get('OPENAI_API_KEY')
    if openai_api_key:
        try:
            initialize_openai_api(openai_api_key)
            text_to_enhance = "The quick brown fox jumps over the lazy dog."

            print("Processing text...")
            enhanced_text = enhance_text_with_openai(text_to_enhance)
            print(f"Enhanced Text:\n{enhanced_text}")
        except (ValueError, OpenAIError) as e:
            logging.error(f"Error: {e}")
    else:
        # Without a key there is nothing useful to do.
        logging.error("OpenAI API key not found in environment variables.")

if __name__ == "__main__":
    main()


# Google Vertex AI Integration Script

This script integrates with Google Vertex AI to initialize the AI platform and make predictions using deployed models. It includes robust error handling, logging, and initialization functions.

## Script Overview

1. **Logging Configuration**: Sets up logging to capture and display important events and errors.
2. **Custom Exceptions**: Defines specific exceptions for handling different types of errors.
3. **Function Definitions**:
   - `initialize_vertex_ai(credentials_path, project_id, location="us-central1")`: Initializes Google Vertex AI with the provided credentials and project details.
   - `predict_with_vertex_ai(model_name, instances)`: Makes predictions using a deployed model in Google Vertex AI.
4. **Main Function**: Demonstrates the usage of the functions with example inputs.

## Script Code

```python
import logging
from google.cloud import aiplatform
import os
import yaml
from google.api_core.exceptions import NotFound, InvalidArgument

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

class VertexAIError(Exception):
    """Custom exception for Google Vertex AI errors.

    Raised by ``initialize_vertex_ai`` and ``predict_with_vertex_ai`` to wrap
    whatever exception the underlying call produced.
    """

def initialize_vertex_ai(credentials_path, project_id, location="us-central1"):
    """Initialize Google Vertex AI with the provided credentials and project details.

    Points GOOGLE_APPLICATION_CREDENTIALS at ``credentials_path`` and calls
    ``aiplatform.init`` for the given project and region.

    Args:
        credentials_path (str): Path to the Google Cloud credentials file.
        project_id (str): Google Cloud project ID.
        location (str, optional): Vertex AI region. Defaults to "us-central1".

    Raises:
        VertexAIError: If the credentials file does not exist or any other
            error occurs during initialization (original exception chained).
    """
    try:
        # Assigning the environment variable never touches the file, so the
        # path is checked explicitly; otherwise a wrong path would only
        # surface on a later API call and the FileNotFoundError branch below
        # would not be expected to fire.
        if not os.path.isfile(credentials_path):
            raise FileNotFoundError(credentials_path)
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
        aiplatform.init(project=project_id, location=location)
        logging.info("Google Vertex AI initialized.")
    except FileNotFoundError as e:
        error_message = f"Credentials file not found: {e}"
        logging.error(error_message)
        raise VertexAIError(error_message) from e
    except Exception as e:
        error_message = f"Failed to initialize Google Vertex AI: {e}"
        logging.error(error_message)
        raise VertexAIError(error_message) from e

def predict_with_vertex_ai(model_name, instances):
    """Request predictions from a Vertex AI endpoint.

    Args:
        model_name (str): Identifier passed directly to ``aiplatform.Endpoint``.
            NOTE(review): presumably an endpoint ID or resource name rather
            than a model name -- confirm against callers.
        instances (list): List of instances for prediction.

    Returns:
        list: The ``predictions`` attribute of the endpoint's response.

    Raises:
        VertexAIError: If the endpoint is not found, the input is rejected,
            or any other error occurs during prediction.
    """
    def _failure(error_message):
        # Log the problem and build the exception that reports it.
        logging.error(error_message)
        return VertexAIError(error_message)

    try:
        endpoint = aiplatform.Endpoint(model_name)
        response = endpoint.predict(instances=instances)
        predictions = response.predictions
        logging.info(f"Predictions: {predictions}")
        return predictions
    except NotFound as e:
        raise _failure(f"Model or endpoint not found: {e}")
    except InvalidArgument as e:
        raise _failure(f"Invalid input data: {e}")
    except Exception as e:
        raise _failure(f"Failed to make predictions with Vertex AI: {e}")

def main():
    """Main function for example usage.

    Reads ``config.yaml`` from the current directory, initializes Vertex AI
    and requests predictions for one example instance. Configuration problems
    are logged and end the run early instead of raising.
    """
    config_path = "config.yaml"
    try:
        with open(config_path, 'r') as f:
            # An empty YAML document loads as None; treat it as "no settings".
            config = yaml.safe_load(f) or {}
    except FileNotFoundError:
        logging.error(f"Configuration file not found at {config_path}")
        return
    except yaml.YAMLError as e:
        logging.error(f"Invalid YAML in configuration file {config_path}: {e}")
        return

    # A top-level list or scalar has no .get(); reject it explicitly.
    if not isinstance(config, dict):
        logging.error(f"Configuration file {config_path} must contain a mapping.")
        return

    credentials_path = config.get('credentials_path')
    project_id = config.get('project_id')
    model_name = config.get('model_name')
    location = config.get('location', "us-central1")

    if not all([credentials_path, project_id, model_name]):
        logging.error("Missing required configuration values.")
        return

    try:
        initialize_vertex_ai(credentials_path, project_id, location)

        # Example input data
        instances = [{"input": "example input data"}]

        logging.info("Making predictions...")
        predictions = predict_with_vertex_ai(model_name, instances)
        logging.info(f"Predictions: {predictions}")

    except (ValueError, VertexAIError) as e:
        logging.error(f"Error: {e}")

if __name__ == "__main__":
    main()


In [None]:
import logging
from google.cloud import aiplatform
import os
import yaml
from google.api_core.exceptions import NotFound, InvalidArgument

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

class VertexAIError(Exception):
    """Custom exception for Google Vertex AI errors.

    Raised by the helpers in this script (initialization, model upload,
    deployment and prediction) to wrap whatever exception the underlying
    call produced.
    """

def initialize_vertex_ai(credentials_path, project_id, location="us-central1"):
    """Initialize Google Vertex AI with the provided credentials and project details.

    Points GOOGLE_APPLICATION_CREDENTIALS at ``credentials_path`` and calls
    ``aiplatform.init`` for the given project and region.

    Args:
        credentials_path (str): Path to the Google Cloud credentials file.
        project_id (str): Google Cloud project ID.
        location (str, optional): Vertex AI region. Defaults to "us-central1".

    Raises:
        VertexAIError: If the credentials file does not exist or any other
            error occurs during initialization (original exception chained).
    """
    try:
        # Assigning the environment variable never touches the file, so the
        # path is checked explicitly; otherwise a wrong path would only
        # surface on a later API call and the FileNotFoundError branch below
        # would not be expected to fire.
        if not os.path.isfile(credentials_path):
            raise FileNotFoundError(credentials_path)
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
        aiplatform.init(project=project_id, location=location)
        logging.info("Google Vertex AI initialized.")
    except FileNotFoundError as e:
        error_message = f"Credentials file not found: {e}"
        logging.error(error_message)
        raise VertexAIError(error_message) from e
    except Exception as e:
        error_message = f"Failed to initialize Google Vertex AI: {e}"
        logging.error(error_message)
        raise VertexAIError(error_message) from e

def upload_model_to_vertex_ai(model_path, display_name, project_id, location="us-central1",
                              serving_container_image_uri=None):
    """Upload a model to Google Vertex AI.

    Args:
        model_path (str): Location of the model artifacts, passed to the SDK
            as ``artifact_uri``.
            NOTE(review): presumably a Cloud Storage URI -- confirm.
        display_name (str): Display name for the model in Vertex AI.
        project_id (str): Google Cloud project ID.
        location (str, optional): Vertex AI region. Defaults to "us-central1".
        serving_container_image_uri (str, optional): URI of the container
            image that serves the model. Per the SDK documentation this is
            required by ``Model.upload`` unless a local model is supplied,
            so callers should normally provide it. Defaults to None, which
            preserves the previous call exactly.

    Returns:
        model: The uploaded model.

    Raises:
        VertexAIError: If an error occurs during model upload (original
            exception chained).
    """
    try:
        model = aiplatform.Model.upload(
            display_name=display_name,
            artifact_uri=model_path,
            serving_container_image_uri=serving_container_image_uri,
            project=project_id,
            location=location,
        )
        logging.info(f"Model uploaded successfully: {model.resource_name}")
        return model
    except Exception as e:
        error_message = f"Failed to upload model to Vertex AI: {e}"
        logging.error(error_message)
        raise VertexAIError(error_message) from e

def deploy_model_to_endpoint(model, endpoint_name, project_id, location="us-central1"):
    """Create a new Vertex AI endpoint and deploy ``model`` to it.

    Args:
        model (Model): The model to deploy; its ``deploy`` method is called.
        endpoint_name (str): Display name given to the newly created endpoint.
        project_id (str): Google Cloud project ID.
        location (str, optional): Vertex AI region. Defaults to "us-central1".

    Returns:
        endpoint: The endpoint the model was deployed to.

    Raises:
        VertexAIError: If creating the endpoint or deploying the model fails.
    """
    creation_args = {
        "display_name": endpoint_name,
        "project": project_id,
        "location": location,
    }
    try:
        endpoint = aiplatform.Endpoint.create(**creation_args)
        model.deploy(endpoint=endpoint)
        logging.info(f"Model deployed to endpoint: {endpoint.resource_name}")
    except Exception as e:
        error_message = f"Failed to deploy model to endpoint: {e}"
        logging.error(error_message)
        raise VertexAIError(error_message)
    return endpoint

def predict_with_vertex_ai(endpoint, instances):
    """Request predictions from an already-deployed Vertex AI endpoint.

    Args:
        endpoint (Endpoint): The endpoint to use for prediction; its
            ``predict`` method is called with ``instances``.
        instances (list): List of instances for prediction.

    Returns:
        list: The ``predictions`` attribute of the endpoint's response.

    Raises:
        VertexAIError: If the endpoint is not found, the input is rejected,
            or any other error occurs during prediction.
    """
    def _failure(error_message):
        # Log the problem and build the exception that reports it.
        logging.error(error_message)
        return VertexAIError(error_message)

    try:
        response = endpoint.predict(instances=instances)
        predictions = response.predictions
        logging.info(f"Predictions: {predictions}")
        return predictions
    except NotFound as e:
        raise _failure(f"Model or endpoint not found: {e}")
    except InvalidArgument as e:
        raise _failure(f"Invalid input data: {e}")
    except Exception as e:
        raise _failure(f"Failed to make predictions with Vertex AI: {e}")

def main():
    """Main function for example usage.

    Reads ``config.yaml`` from the current working directory, checks that the
    required settings are present, then initializes Vertex AI, uploads and
    deploys the model and runs one example prediction. Problems are logged
    instead of being raised to the caller.
    """
    config_path = "config.yaml"
    try:
        with open(config_path, 'r', encoding='utf-8') as f:
            config = yaml.safe_load(f)
    except FileNotFoundError:
        logging.error(f"Configuration file not found at {config_path}")
        return
    except yaml.YAMLError as e:
        logging.error(f"Could not parse configuration file {config_path}: {e}")
        return

    # yaml.safe_load returns None for an empty document and may return a list
    # or scalar; only a mapping supports the .get() lookups below.
    if not isinstance(config, dict):
        logging.error("Missing required configuration values.")
        return

    credentials_path = config.get('credentials_path')
    project_id = config.get('project_id')
    model_path = config.get('model_path')
    display_name = config.get('display_name')
    endpoint_name = config.get('endpoint_name')
    location = config.get('location', "us-central1")

    if not all([credentials_path, project_id, model_path, display_name, endpoint_name]):
        logging.error("Missing required configuration values.")
        return

    try:
        initialize_vertex_ai(credentials_path, project_id, location)
        model = upload_model_to_vertex_ai(model_path, display_name, project_id, location)
        endpoint = deploy_model_to_endpoint(model, endpoint_name, project_id, location)

        # Example input data
        instances = [{"input": "example input data"}]

        logging.info("Making predictions...")
        predictions = predict_with_vertex_ai(endpoint, instances)
        logging.info(f"Predictions: {predictions}")

    except (ValueError, VertexAIError) as e:
        logging.error(f"Error: {e}")

if __name__ == "__main__":
    main()


# Models Script for VEDA

In this notebook, we will review and enhance the `models.py` script for the VEDA project. The `models.py` script defines the database models using SQLAlchemy for the VEDA application. These models represent the database tables and their relationships.

## User Model

The `User` model represents the users of the VEDA application. It includes fields for storing user information such as username, email, password hash, and timestamps for creation and updates.

```python
from flask_sqlalchemy import SQLAlchemy
from datetime import datetime
from werkzeug.security import generate_password_hash, check_password_hash

# Initialize the SQLAlchemy instance
# NOTE(review): the application factory at the top of this notebook creates its
# own `db = SQLAlchemy()` and calls `db.init_app(app)` on it. Models declared
# against this second instance would not be bound to that app - confirm that a
# single shared instance is intended.
db = SQLAlchemy()

class User(db.Model):
    """User account stored in the ``users`` table.

    Holds a unique username and email, a password hash (never the plain
    password) and creation/update timestamps.
    """
    __tablename__ = 'users'

    id = db.Column(db.Integer, primary_key=True)
    username = db.Column(db.String(150), nullable=False, unique=True)
    email = db.Column(db.String(150), unique=True, nullable=False)
    # Werkzeug's default scrypt hashes are longer than 150 characters, so the
    # column leaves headroom to avoid truncation or insert errors on databases
    # that enforce VARCHAR length.
    password_hash = db.Column(db.String(256), nullable=False)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    def set_password(self, password):
        """Hash and set the user's password."""
        self.password_hash = generate_password_hash(password)

    def check_password(self, password):
        """Return True if ``password`` matches the stored hash.

        Returns False when no hash has been set yet instead of failing inside
        ``check_password_hash``.
        """
        if not self.password_hash:
            return False
        return check_password_hash(self.password_hash, password)

    def __repr__(self):
        return f'<User {self.username}>'
```

In [None]:
from flask_sqlalchemy import SQLAlchemy
from datetime import datetime
from werkzeug.security import generate_password_hash, check_password_hash

# Initialize the SQLAlchemy instance
# NOTE(review): the application factory at the top of this notebook creates its
# own `db = SQLAlchemy()` and calls `db.init_app(app)` on it. Models declared
# against this second instance would not be bound to that app - confirm that a
# single shared instance is intended.
db = SQLAlchemy()

class User(db.Model):
    """User account stored in the ``users`` table.

    Holds a unique username and email, a password hash (never the plain
    password) and creation/update timestamps.
    """
    __tablename__ = 'users'

    id = db.Column(db.Integer, primary_key=True)
    username = db.Column(db.String(150), nullable=False, unique=True)
    email = db.Column(db.String(150), unique=True, nullable=False)
    # Werkzeug's default scrypt hashes are longer than 150 characters, so the
    # column leaves headroom to avoid truncation or insert errors on databases
    # that enforce VARCHAR length.
    password_hash = db.Column(db.String(256), nullable=False)
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)

    def set_password(self, password):
        """Hash and set the user's password."""
        self.password_hash = generate_password_hash(password)

    def check_password(self, password):
        """Return True if ``password`` matches the stored hash.

        Returns False when no hash has been set yet instead of failing inside
        ``check_password_hash``.
        """
        if not self.password_hash:
            return False
        return check_password_hash(self.password_hash, password)

    def __repr__(self):
        return f'<User {self.username}>'

class Document(db.Model):
    """Document owned by a user, stored in the ``documents`` table.

    Each row requires a title, content and an owning ``user_id``; the file
    metadata columns are optional.
    """
    __tablename__ = 'documents'
    
    id = db.Column(db.Integer, primary_key=True)
    title = db.Column(db.String(150), nullable=False)
    content = db.Column(db.Text, nullable=False)
    # Timestamps are filled by column defaults; updated_at is refreshed on update.
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
    file_type = db.Column(db.String(50), nullable=True)  # Store file type, if applicable
    file_size = db.Column(db.Integer, nullable=True)  # Store file size, if applicable
    # NOTE(review): no unique=True is declared here, so uniqueness of this
    # identifier is not enforced by the schema - confirm whether it should be.
    unique_id = db.Column(db.String(100), nullable=True)  # Store a unique identifier for the document
    user_id = db.Column(db.Integer, db.ForeignKey('users.id'), nullable=False)
    # Also exposes the reverse collection on User as `documents`.
    user = db.relationship('User', backref=db.backref('documents', lazy=True))
    
    def __repr__(self):
        """Return a debug representation containing the document title."""
        return f'<Document {self.title}>'

class OcrResult(db.Model):
    """OCR output for one document, stored in the ``ocr_results`` table.

    Requires the extracted text and the owning ``document_id``; confidence
    scores, bounding boxes and language are optional.
    """
    __tablename__ = 'ocr_results'
    
    id = db.Column(db.Integer, primary_key=True)
    document_id = db.Column(db.Integer, db.ForeignKey('documents.id'), nullable=False)
    text = db.Column(db.Text, nullable=False)
    confidence_scores = db.Column(db.JSON, nullable=True)  # Store confidence scores for the OCR results
    bounding_boxes = db.Column(db.JSON, nullable=True)  # Store bounding boxes for the detected text
    language = db.Column(db.String(50), nullable=True)  # Store the language of the detected text
    # Timestamps are filled by column defaults; updated_at is refreshed on update.
    created_at = db.Column(db.DateTime, default=datetime.utcnow)
    updated_at = db.Column(db.DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
    # Also exposes the reverse collection on Document as `ocr_results`.
    document = db.relationship('Document', backref=db.backref('ocr_results', lazy=True))
    
    def __repr__(self):
        """Return a debug representation containing the row id."""
        return f'<OcrResult {self.id}>'


## Flask Routes Integration

This notebook section covers the integration of Flask routes for VEDA, including the interaction with OpenAI and Vertex AI.

### Routes

1. **Default Route (`/`)**
    - Returns a simple message: "Hello, VEDA!"

2. **Ask OpenAI Endpoint (`/ask_openai`)**
    - Method: `POST`
    - Description: Interacts with OpenAI to get a response based on the provided prompt.
    - Request Body:
        ```json
        {
            "prompt": "Your question or prompt here"
        }
        ```
    - Response:
        ```json
        {
            "response": "OpenAI response here"
        }
        ```

3. **Predict Vertex AI Endpoint (`/predict_vertex_ai`)**
    - Method: `POST`
    - Description: Makes predictions using a deployed model on Vertex AI.
    - Request Body:
        ```json
        {
            "endpoint_id": "Your Vertex AI endpoint ID",
            "instances": ["Input data for prediction"]
        }
        ```
    - Response:
        ```json
        {
            "prediction": "Prediction results here"
        }
        ```

### Code

Below is the Flask routes integration code with enhancements for better logging, error handling, and documentation.

```python
# Flask routes integration code
# Ensure to review, comment, and apply as per the guidelines
from flask import render_template, request, jsonify
from veda_app import create_app
from veda_app.openai_integration import ask_openai
from veda_app.google_vertex_integration import init_vertex_ai, predict_with_vertex_ai
import logging
import os

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

# Initialize the Flask app and Vertex AI
# Both calls run at import time: importing this module builds the app and
# calls init_vertex_ai() with no arguments.
app = create_app()
init_vertex_ai()

@app.route('/')
def index():
    """Serve the root URL with a fixed plain-text greeting."""
    greeting = "Hello, VEDA!"
    return greeting

def handle_api_error(error_message, status_code=500):
    """Log an API failure and build the matching JSON error response.

    Args:
        error_message (str): Text logged at ERROR level and returned under the
            ``error`` key.
        status_code (int, optional): HTTP status to return. Defaults to 500.

    Returns:
        tuple: ``(response, status_code)`` as accepted by Flask views.
    """
    logging.error(error_message)
    payload = {'error': error_message}
    return jsonify(payload), status_code

@app.route('/ask_openai', methods=['POST'])
def ask_openai_endpoint():
    """Endpoint to interact with OpenAI.

    Expects a JSON object with a non-empty string ``prompt``. Returns
    ``{"response": ...}`` on success, a 400 JSON error for a malformed
    request, and a 500 JSON error if the OpenAI call fails.
    """
    try:
        # silent=True yields None for a missing or invalid JSON body instead of
        # raising, so client mistakes are reported as 400 rather than falling
        # into the generic 500 handler below.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return handle_api_error("Request body must be a JSON object.", 400)

        prompt = data.get('prompt')
        if not prompt or not isinstance(prompt, str):
            return handle_api_error("Invalid prompt provided.", 400)

        logging.info(f"Received prompt: {prompt}")
        response = ask_openai(prompt)
        logging.info(f"OpenAI response: {response}")

        return jsonify({'response': response})

    except Exception as e:
        return handle_api_error(f"An error occurred while interacting with OpenAI: {e}")

@app.route('/predict_vertex_ai', methods=['POST'])
def predict_vertex_ai_endpoint():
    """Endpoint to make predictions using Vertex AI.

    Expects a JSON object with a string ``endpoint_id`` and a non-empty list
    ``instances``. Returns ``{"prediction": ...}`` on success, a 400 JSON
    error for a malformed request, and a 500 JSON error if prediction fails.
    """
    try:
        # silent=True yields None for a missing or invalid JSON body instead of
        # raising, so client mistakes are reported as 400 rather than 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return handle_api_error("Request body must be a JSON object.", 400)

        endpoint_id = data.get('endpoint_id')
        instances = data.get('instances')

        if not endpoint_id or not isinstance(endpoint_id, str):
            return handle_api_error("Invalid endpoint ID provided.", 400)

        if not instances or not isinstance(instances, list):
            return handle_api_error("Invalid instances provided.", 400)

        logging.info(f"Endpoint ID: {endpoint_id}, Instances: {instances}")
        # NOTE(review): a string ID is passed here; confirm the imported
        # predict_with_vertex_ai accepts an ID rather than an endpoint object.
        prediction = predict_with_vertex_ai(endpoint_id, instances)
        logging.info(f"Vertex AI prediction: {prediction}")

        return jsonify({'prediction': prediction})

    except Exception as e:
        return handle_api_error(f"An error occurred while making predictions with Vertex AI: {e}")

if __name__ == '__main__':
    # Use environment variables for configuration
    # Debug mode is opt-in: it enables the interactive debugger, which must
    # not be reachable unless FLASK_DEBUG is explicitly switched on.
    debug = os.getenv('FLASK_DEBUG', 'false').lower() in ['true', '1', 't']
    port = int(os.getenv('FLASK_PORT', 5000))
    app.run(debug=debug, port=port)
```

In [None]:
from flask import render_template, request, jsonify
from veda_app import create_app
from veda_app.openai_integration import ask_openai
from veda_app.google_vertex_integration import init_vertex_ai, predict_with_vertex_ai
import logging
import os

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

# Initialize the Flask app and Vertex AI
# Both calls run at import time: importing this module builds the app and
# calls init_vertex_ai() with no arguments.
app = create_app()
init_vertex_ai()

@app.route('/')
def index():
    """Serve the root URL with a fixed plain-text greeting."""
    greeting = "Hello, VEDA!"
    return greeting

def handle_api_error(error_message, status_code=500):
    """Log an API failure and build the matching JSON error response.

    Args:
        error_message (str): Text logged at ERROR level and returned under the
            ``error`` key.
        status_code (int, optional): HTTP status to return. Defaults to 500.

    Returns:
        tuple: ``(response, status_code)`` as accepted by Flask views.
    """
    logging.error(error_message)
    payload = {'error': error_message}
    return jsonify(payload), status_code

@app.route('/ask_openai', methods=['POST'])
def ask_openai_endpoint():
    """Endpoint to interact with OpenAI.

    Expects a JSON object with a non-empty string ``prompt``. Returns
    ``{"response": ...}`` on success, a 400 JSON error for a malformed
    request, and a 500 JSON error if the OpenAI call fails.
    """
    try:
        # silent=True yields None for a missing or invalid JSON body instead of
        # raising, so client mistakes are reported as 400 rather than falling
        # into the generic 500 handler below.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return handle_api_error("Request body must be a JSON object.", 400)

        prompt = data.get('prompt')
        if not prompt or not isinstance(prompt, str):
            return handle_api_error("Invalid prompt provided.", 400)

        logging.info(f"Received prompt: {prompt}")
        response = ask_openai(prompt)
        logging.info(f"OpenAI response: {response}")

        return jsonify({'response': response})

    except Exception as e:
        return handle_api_error(f"An error occurred while interacting with OpenAI: {e}")

@app.route('/predict_vertex_ai', methods=['POST'])
def predict_vertex_ai_endpoint():
    """Endpoint to make predictions using Vertex AI.

    Expects a JSON object with a string ``endpoint_id`` and a non-empty list
    ``instances``. Returns ``{"prediction": ...}`` on success, a 400 JSON
    error for a malformed request, and a 500 JSON error if prediction fails.
    """
    try:
        # silent=True yields None for a missing or invalid JSON body instead of
        # raising, so client mistakes are reported as 400 rather than 500.
        data = request.get_json(silent=True)
        if not isinstance(data, dict):
            return handle_api_error("Request body must be a JSON object.", 400)

        endpoint_id = data.get('endpoint_id')
        instances = data.get('instances')

        if not endpoint_id or not isinstance(endpoint_id, str):
            return handle_api_error("Invalid endpoint ID provided.", 400)

        if not instances or not isinstance(instances, list):
            return handle_api_error("Invalid instances provided.", 400)

        logging.info(f"Endpoint ID: {endpoint_id}, Instances: {instances}")
        # NOTE(review): a string ID is passed here; confirm the imported
        # predict_with_vertex_ai accepts an ID rather than an endpoint object.
        prediction = predict_with_vertex_ai(endpoint_id, instances)
        logging.info(f"Vertex AI prediction: {prediction}")

        return jsonify({'prediction': prediction})

    except Exception as e:
        return handle_api_error(f"An error occurred while making predictions with Vertex AI: {e}")

if __name__ == '__main__':
    # Use environment variables for configuration
    # Debug mode is opt-in: it enables the interactive debugger, which must
    # not be reachable unless FLASK_DEBUG is explicitly switched on.
    debug = os.getenv('FLASK_DEBUG', 'false').lower() in ['true', '1', 't']
    port = int(os.getenv('FLASK_PORT', 5000))
    app.run(debug=debug, port=port)


## Configuring the Flask Application

The `config.py` script is essential for setting up the configuration of the Flask application. It handles various configuration settings such as database URIs, secret keys, and other environment-specific settings.

### Base Configuration Class
The `Config` class is the base configuration class that includes default settings. Environment-specific configurations inherit from this base class and can override its settings.

### Environment-Specific Configurations
- `DevelopmentConfig`: Configuration for the development environment. Enables debugging and sets the logging level to DEBUG.
- `TestingConfig`: Configuration for the testing environment. Uses a separate test database and sets the logging level to DEBUG.
- `ProductionConfig`: Configuration for the production environment. Disables debugging and sets the logging level to WARNING.

### Logging Configuration
The `configure_logging` function sets up logging based on the current environment's configuration. It ensures that log messages have a consistent format and appropriate log level.

### Environment Variables
Sensitive data, like secret keys and database URIs, should be sourced from environment variables. This practice enhances security and flexibility.

### Example Usage
```python
from flask import Flask
from config import init_app

# init_app() picks the configuration class from the FLASK_ENV environment
# variable, loads it into app.config and then configures logging.
app = Flask(__name__)
init_app(app)

# Additional initialization can be done here
# db.init_app(app)
# migrate.init_app(app, db)
```

In [None]:
import os
import secrets
import logging

class Config:
    """Settings shared by every environment; subclasses override as needed.

    SECRET_KEY and the database URI come from the SECRET_KEY / DATABASE_URL
    environment variables when set. Otherwise a random key is generated when
    the class is defined, and a SQLite file next to this module is used.
    """
    LOGGING_LEVEL = logging.INFO
    SQLALCHEMY_TRACK_MODIFICATIONS = False
    UPLOAD_FOLDER = os.path.join(os.path.dirname(__file__), 'uploads')
    SQLALCHEMY_DATABASE_URI = (
        os.environ.get('DATABASE_URL')
        or f"sqlite:///{os.path.join(os.path.dirname(__file__), 'app.db')}"
    )
    SECRET_KEY = os.environ.get('SECRET_KEY') or secrets.token_urlsafe(32)

class DevelopmentConfig(Config):
    """Settings for local development: debug mode on, DEBUG-level logging."""
    LOGGING_LEVEL = logging.DEBUG
    DEBUG = True

class TestingConfig(Config):
    """Settings for test runs: TESTING on, a separate ``test.db`` SQLite file."""
    LOGGING_LEVEL = logging.DEBUG
    SQLALCHEMY_DATABASE_URI = f"sqlite:///{os.path.join(os.path.dirname(__file__), 'test.db')}"
    TESTING = True

class ProductionConfig(Config):
    """Settings for production: debug mode off, WARNING-level logging."""
    LOGGING_LEVEL = logging.WARNING
    DEBUG = False
    # Add other production-specific settings here

def configure_logging():
    """Configure root logging for the active environment.

    The level comes from the ``LOGGING_LEVEL`` environment variable when set
    (a level name such as ``debug``/``DEBUG`` or a numeric value). Otherwise it
    falls back to the ``LOGGING_LEVEL`` attribute of the configuration class
    selected by ``FLASK_ENV``, so the per-environment levels actually apply.
    """
    level = os.getenv('LOGGING_LEVEL')
    if level is None:
        env = os.getenv('FLASK_ENV', 'default')
        level = config.get(env, config['default']).LOGGING_LEVEL
    elif level.strip().isdigit():
        # Environment values are strings; basicConfig needs an int here.
        level = int(level.strip())
    else:
        # basicConfig only recognises upper-case level names.
        level = level.strip().upper()

    logging.basicConfig(level=level,
                        format='%(asctime)s [%(levelname)s] %(message)s',
                        datefmt='%Y-%m-%d %H:%M:%S')

# Lookup table from environment name to its settings class; 'default' is the
# entry used when no environment is specified.
config = dict(
    development=DevelopmentConfig,
    testing=TestingConfig,
    production=ProductionConfig,
    default=DevelopmentConfig,
)

def init_app(app):
    """Initialize the Flask application with the appropriate configuration.

    Selects the configuration class named by the ``FLASK_ENV`` environment
    variable, loads it into ``app.config`` and configures logging. Unknown
    environment names fall back to the ``'default'`` entry.

    Args:
        app: The Flask application to configure.
    """
    env = os.getenv('FLASK_ENV', 'default')
    # Fall back to the default *class*. Passing the string 'default' to
    # from_object would be treated as an import path and fail.
    app.config.from_object(config.get(env, config['default']))
    configure_logging()

    # Example of additional initialization
    # db.init_app(app)
    # migrate.init_app(app, db)


# Flask Application Initialization (`__init__.py`)

This script sets up the Flask application, initializes extensions, and configures error handling and logging.

## Key Components

1. **Application Factory**:
    - `create_app` function initializes the app with configuration settings.
    - Supports multiple configuration classes for different environments.

2. **Extensions**:
    - Initializes `SQLAlchemy`, `Migrate`, and `LoginManager`.

3. **Blueprints**:
    - Registers `main` and `auth` blueprints.

4. **Error Handling**:
    - Custom error handlers for 404 and 500 HTTP errors.

5. **Logging**:
    - Configures logging to stdout or a file based on the environment.
    - Logs application startup and other significant events.

## Code Example

```python
import os
import logging
from logging.handlers import RotatingFileHandler
from flask import Flask, render_template
from config import Config
from flask_sqlalchemy import SQLAlchemy
from flask_migrate import Migrate
from flask_login import LoginManager

# Extension objects are created unbound at import time and attached to a
# concrete app inside create_app() via their init_app() methods.
db = SQLAlchemy()
migrate = Migrate()
login = LoginManager()
# Endpoint name of the login view - presumably where Flask-Login sends
# unauthenticated users; confirm against the auth blueprint.
login.login_view = 'auth.login'

def create_app(config_class=Config):
    """Application factory: build and configure a Flask app.

    Loads ``config_class``, binds the database, migration and login
    extensions, registers the ``main`` and ``auth`` blueprints, installs 404
    and 500 handlers and, outside debug mode, attaches a log handler.

    Args:
        config_class: Object passed to ``app.config.from_object``.

    Returns:
        Flask: The configured application.
    """
    app = Flask(__name__, instance_relative_config=True)
    app.config.from_object(config_class)

    # Initialize extensions
    db.init_app(app)
    migrate.init_app(app, db)
    login.init_app(app)

    # Register blueprints (imported here, after the extensions exist)
    from app.main import bp as main_bp
    app.register_blueprint(main_bp)

    from app.auth import bp as auth_bp
    app.register_blueprint(auth_bp, url_prefix='/auth')

    # Error handling
    @app.errorhandler(404)
    def page_not_found(error):
        return render_template('errors/404.html'), 404

    @app.errorhandler(500)
    def internal_server_error(error):
        db.session.rollback()  # Ensure session is rolled back to avoid any database inconsistency
        return render_template('errors/500.html'), 500

    # Logging
    if not app.debug:
        if app.config.get('LOG_TO_STDOUT'):
            stream_handler = logging.StreamHandler()
            stream_handler.setLevel(logging.INFO)
            app.logger.addHandler(stream_handler)
        else:
            # exist_ok avoids the check-then-create race when several
            # processes start at once.
            os.makedirs('logs', exist_ok=True)
            file_handler = RotatingFileHandler('logs/veda.log', maxBytes=10240, backupCount=10)
            file_handler.setFormatter(logging.Formatter(
                '%(asctime)s %(levelname)s: %(message)s [in %(pathname)s:%(lineno)d]'
            ))
            file_handler.setLevel(logging.INFO)
            app.logger.addHandler(file_handler)

        app.logger.setLevel(logging.INFO)
        app.logger.info('Veda App startup')

    return app
```

In [None]:
import os
import logging
from logging.handlers import RotatingFileHandler
from flask import Flask, render_template
from config import Config
from flask_sqlalchemy import SQLAlchemy
from flask_migrate import Migrate
from flask_login import LoginManager

# Extension objects are created unbound at import time and attached to a
# concrete app inside create_app() via their init_app() methods.
db = SQLAlchemy()
migrate = Migrate()
login = LoginManager()
# Endpoint name of the login view - presumably where Flask-Login sends
# unauthenticated users; confirm against the auth blueprint.
login.login_view = 'auth.login'

def create_app(config_class=Config):
    """Application factory: build and configure a Flask app.

    Loads ``config_class``, binds the database, migration and login
    extensions, registers the ``main`` and ``auth`` blueprints, installs 404
    and 500 handlers and, outside debug mode, attaches a log handler.

    Args:
        config_class: Object passed to ``app.config.from_object``.

    Returns:
        Flask: The configured application.
    """
    app = Flask(__name__, instance_relative_config=True)
    app.config.from_object(config_class)

    # Initialize extensions
    db.init_app(app)
    migrate.init_app(app, db)
    login.init_app(app)

    # Register blueprints (imported here, after the extensions exist)
    from app.main import bp as main_bp
    app.register_blueprint(main_bp)

    from app.auth import bp as auth_bp
    app.register_blueprint(auth_bp, url_prefix='/auth')

    # Error handling
    @app.errorhandler(404)
    def page_not_found(error):
        return render_template('errors/404.html'), 404

    @app.errorhandler(500)
    def internal_server_error(error):
        db.session.rollback()  # Ensure session is rolled back to avoid any database inconsistency
        return render_template('errors/500.html'), 500

    # Logging
    if not app.debug:
        if app.config.get('LOG_TO_STDOUT'):
            stream_handler = logging.StreamHandler()
            stream_handler.setLevel(logging.INFO)
            app.logger.addHandler(stream_handler)
        else:
            # exist_ok avoids the check-then-create race when several
            # processes start at once.
            os.makedirs('logs', exist_ok=True)
            file_handler = RotatingFileHandler('logs/veda.log', maxBytes=10240, backupCount=10)
            file_handler.setFormatter(logging.Formatter(
                '%(asctime)s %(levelname)s: %(message)s [in %(pathname)s:%(lineno)d]'
            ))
            file_handler.setLevel(logging.INFO)
            app.logger.addHandler(file_handler)

        app.logger.setLevel(logging.INFO)
        app.logger.info('Veda App startup')

    return app

# Example configuration in config.py
# Directory containing this module; anchors the default SQLite path below.
basedir = os.path.abspath(os.path.dirname(__file__))


class Config:
    """Example configuration read from environment variables with fallbacks.

    NOTE(review): the hard-coded SECRET_KEY fallback is only suitable for
    local experiments; set SECRET_KEY in the environment for real deployments.
    """
    SECRET_KEY = os.environ.get('SECRET_KEY') or 'you-will-never-guess'
    SQLALCHEMY_DATABASE_URI = os.environ.get('DATABASE_URL') or \
        'sqlite:///' + os.path.join(basedir, 'app.db')
    SQLALCHEMY_TRACK_MODIFICATIONS = False
    LOG_TO_STDOUT = os.environ.get('LOG_TO_STDOUT')

# Ensure the following directories and files exist:
# - templates/errors/404.html
# - templates/errors/500.html
# - logs directory for file logging (created by create_app if it is missing)


# Main Blueprint (`main.py`)

The `main.py` script defines the primary routes for the VEDA application. It includes routes for the homepage, about page, and contact page.

## Key Components

1. **Routes**:
    - `index`: Homepage route.
    - `about`: About page route.
    - `contact`: Contact page route with both GET and POST methods.

2. **Error Handling**:
    - Proper error handling for form submissions in the contact page route.

3. **Logging**:
    - Logging is implemented to track the rendering of pages and form submissions.

4. **Input Sanitization**:
    - User inputs are sanitized using the `bleach` library to prevent XSS attacks.

5. **Email Validation**:
    - Email format is validated using regular expressions.

6. **Email Sending**:
    - A `send_contact_email` function is included to handle the logic for sending emails.

## Code Example

```python
from flask import Blueprint, render_template, request, jsonify
import logging
import bleach
import re
from flask_mail import Mail, Message

# Blueprint collecting the routes below; attached to the app with
# app.register_blueprint(bp) in create_app().
bp = Blueprint('main', __name__)

@bp.route('/')
def index():
    """Render the homepage template and log the request."""
    logging.info("Rendering homepage.")
    page = render_template('index.html', title="Homepage")
    return page

@bp.route('/about')
def about():
    """Render the about-page template and log the request."""
    logging.info("Rendering about page.")
    page = render_template('about.html', title="About")
    return page

@bp.route('/contact', methods=['GET', 'POST'])
def contact():
    """Contact page route.

    GET renders the contact form. POST cleans and validates the ``name``,
    ``email`` and ``message`` form fields, sends the contact email and returns
    a JSON result: 400 for missing fields or an invalid address, 500 if
    processing fails.
    """
    if request.method == 'POST':
        try:
            # Strip so whitespace-only fields count as missing.
            name = bleach.clean(request.form.get('name', '')).strip()
            email = bleach.clean(request.form.get('email', '')).strip()
            message = bleach.clean(request.form.get('message', '')).strip()

            if not all([name, email, message]):
                logging.warning("Incomplete form submission.")
                return jsonify({'error': 'All fields are required.'}), 400

            # fullmatch plus \s exclusion rejects embedded whitespace and
            # line breaks, which must never reach a mail header.
            if not re.fullmatch(r"[^@\s]+@[^@\s]+\.[^@\s]+", email):
                logging.warning("Invalid email format.")
                return jsonify({'error': 'Invalid email address.'}), 400

            # Process the contact form (e.g., send email)
            send_contact_email(name, email, message)
            logging.info(f"Contact form submitted by {name} with email {email}.")
            return jsonify({'success': 'Message sent successfully.'})

        except Exception as e:
            logging.error(f"Error processing contact form: {e}")
            return jsonify({'error': 'An error occurred. Please try again later.'}), 500

    logging.info("Rendering contact page.")
    return render_template('contact.html', title="Contact")

def send_contact_email(name, email, message):
    """Send the contact-form contents as an email.

    Builds a message whose sender is the submitter's address and sends it via
    the module-level ``mail`` object. Any failure is logged and re-raised.
    """
    try:
        outgoing = Message(
            subject="Contact Form Submission",
            sender=email,
            recipients=["your-email@example.com"],  # Replace with your email
            body=f"Name: {name}\nEmail: {email}\nMessage: {message}",
        )
        mail.send(outgoing)
        logging.info("Contact email sent successfully.")
    except Exception as exc:
        logging.error(f"Error sending contact email: {exc}")
        raise

# Assuming Flask-Mail is set up in your application factory
# send_contact_email() above refers to this name; that works because the name
# is looked up when the function is called, after this assignment has run.
mail = Mail()

def create_app(config_class=Config):
    """Build a Flask app, bind the extensions and register the main blueprint.

    Outside debug mode a single INFO-level log handler is attached: a stream
    handler when ``LOG_TO_STDOUT`` is set in the config, otherwise a file
    handler writing to ``app.log``.
    """
    app = Flask(__name__, instance_relative_config=True)
    app.config.from_object(config_class)

    db.init_app(app)
    migrate.init_app(app, db)
    login.init_app(app)
    mail.init_app(app)

    app.register_blueprint(bp)

    # Logging is only configured outside debug mode.
    if app.debug:
        return app

    if app.config.get('LOG_TO_STDOUT'):
        handler = logging.StreamHandler()
    else:
        handler = logging.FileHandler('app.log')
    handler.setLevel(logging.INFO)
    app.logger.addHandler(handler)

    app.logger.setLevel(logging.INFO)
    app.logger.info('Veda App startup')
    return app
```

In [None]:
from flask import Blueprint, render_template, request, jsonify
import logging
import bleach
import re
from flask_mail import Mail, Message

# Blueprint collecting the routes below; attached to the app with
# app.register_blueprint(bp) in create_app().
bp = Blueprint('main', __name__)

@bp.route('/')
def index():
    """Render the homepage template and log the request."""
    logging.info("Rendering homepage.")
    page = render_template('index.html', title="Homepage")
    return page

@bp.route('/about')
def about():
    """Render the about-page template and log the request."""
    logging.info("Rendering about page.")
    page = render_template('about.html', title="About")
    return page

@bp.route('/contact', methods=['GET', 'POST'])
def contact():
    """Contact page route.

    GET renders the contact form. POST cleans and validates the ``name``,
    ``email`` and ``message`` form fields, sends the contact email and returns
    a JSON result: 400 for missing fields or an invalid address, 500 if
    processing fails.
    """
    if request.method == 'POST':
        try:
            # Strip so whitespace-only fields count as missing.
            name = bleach.clean(request.form.get('name', '')).strip()
            email = bleach.clean(request.form.get('email', '')).strip()
            message = bleach.clean(request.form.get('message', '')).strip()

            if not all([name, email, message]):
                logging.warning("Incomplete form submission.")
                return jsonify({'error': 'All fields are required.'}), 400

            # fullmatch plus \s exclusion rejects embedded whitespace and
            # line breaks, which must never reach a mail header.
            if not re.fullmatch(r"[^@\s]+@[^@\s]+\.[^@\s]+", email):
                logging.warning("Invalid email format.")
                return jsonify({'error': 'Invalid email address.'}), 400

            # Process the contact form (e.g., send email)
            send_contact_email(name, email, message)
            logging.info(f"Contact form submitted by {name} with email {email}.")
            return jsonify({'success': 'Message sent successfully.'})

        except Exception as e:
            logging.error(f"Error processing contact form: {e}")
            return jsonify({'error': 'An error occurred. Please try again later.'}), 500

    logging.info("Rendering contact page.")
    return render_template('contact.html', title="Contact")

def send_contact_email(name, email, message):
    """Send the contact-form contents as an email.

    Builds a message whose sender is the submitter's address and sends it via
    the module-level ``mail`` object. Any failure is logged and re-raised.
    """
    try:
        outgoing = Message(
            subject="Contact Form Submission",
            sender=email,
            recipients=["your-email@example.com"],  # Replace with your email
            body=f"Name: {name}\nEmail: {email}\nMessage: {message}",
        )
        mail.send(outgoing)
        logging.info("Contact email sent successfully.")
    except Exception as exc:
        logging.error(f"Error sending contact email: {exc}")
        raise

# Assuming Flask-Mail is set up in your application factory
# send_contact_email() above refers to this name; that works because the name
# is looked up when the function is called, after this assignment has run.
mail = Mail()

def create_app(config_class=Config):
    """Build a Flask app, bind the extensions and register the main blueprint.

    Outside debug mode a single INFO-level log handler is attached: a stream
    handler when ``LOG_TO_STDOUT`` is set in the config, otherwise a file
    handler writing to ``app.log``.
    """
    app = Flask(__name__, instance_relative_config=True)
    app.config.from_object(config_class)

    db.init_app(app)
    migrate.init_app(app, db)
    login.init_app(app)
    mail.init_app(app)

    app.register_blueprint(bp)

    # Logging is only configured outside debug mode.
    if app.debug:
        return app

    if app.config.get('LOG_TO_STDOUT'):
        handler = logging.StreamHandler()
    else:
        handler = logging.FileHandler('app.log')
    handler.setLevel(logging.INFO)
    app.logger.addHandler(handler)

    app.logger.setLevel(logging.INFO)
    app.logger.info('Veda App startup')
    return app


# OCR Script (`ocr_script.py`)

The `ocr_script.py` script processes images to extract and enhance text using Google Cloud Vision OCR and OpenAI's language model.

## Key Components

1. **Image Processing**:
    - Load and validate image files.
    - Perform OCR using Google Cloud Vision.
    - Handle errors during image processing and OCR.

2. **Text Enhancement**:
    - Enhance extracted text using OpenAI's language model.
    - Handle errors during text enhancement.

3. **Logging**:
    - Detailed logging to track the flow and potential errors.

4. **Custom Exceptions**:
    - Defined custom exceptions for better error handling.

## Code Example

```python
import logging
from google.cloud import vision
import openai
import os
from PIL import Image
import io
import filetype

# Configure logging
# Runs at import time and sets the root logger to INFO with a timestamped format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

class ImageProcessingError(Exception):
    """Raised when an image file is missing or cannot be handled."""

class OCRError(Exception):
    """Raised when the OCR step fails or detects no text."""

class OpenAIError(Exception):
    """Error type reserved for failures in the OpenAI enhancement step."""

def process_image(image_path, credentials_path, openai_api_key, engine="text-davinci-003"):
    """Process the given image to extract and enhance text using OCR and OpenAI.

    Parameters:
    - image_path: str, path to a local image file
    - credentials_path: str, path to the Google Cloud credentials file
    - openai_api_key: str, OpenAI API key
    - engine: str, OpenAI engine to use (default: "text-davinci-003")

    Returns:
    - str, enhanced text from the image

    Raises:
    - ImageProcessingError: the file is missing or is not a recognised image
    - OCRError: the Vision API reported an error or detected no text
    """
    try:
        # Set up Google Cloud Vision client
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
        client = vision.ImageAnnotatorClient()

        # Validate the file type from its magic bytes
        kind = filetype.guess(image_path)
        if kind is None or not kind.mime.startswith('image/'):
            logging.error(f"Invalid file type: {image_path}")
            raise ImageProcessingError("Invalid file type.")

        # Check that the file decodes as an image, then send its own bytes.
        # Re-encoding with format=kind.extension fails for JPEGs ('jpg' is not
        # a Pillow format name) and would recompress the image for no benefit.
        with Image.open(image_path) as image:
            image.verify()
        with open(image_path, 'rb') as image_file:
            content = image_file.read()

        # Perform OCR using Google Cloud Vision
        response = client.text_detection(image=vision.Image(content=content))
        if response.error.message:
            logging.error(f"Vision API error: {response.error.message}")
            raise OCRError(response.error.message)

        texts = response.text_annotations
        if not texts:
            logging.warning("No text detected in the image.")
            raise OCRError("No text detected.")

        # The first annotation carries the full detected text; the remaining
        # entries are individual fragments, so confidence ranking is not needed.
        extracted_text = texts[0].description

        # Enhance text using OpenAI
        openai.api_key = openai_api_key
        prompt = f"Original text: {extracted_text}\nEnhance the text:"
        response = openai.Completion.create(engine=engine, prompt=prompt, max_tokens=200, temperature=0.7)
        enhanced_text = response.choices[0].text.strip()

        logging.info(f"Enhanced text: {enhanced_text}")
        return enhanced_text

    except FileNotFoundError as e:
        logging.error(f"File not found: {e.filename}")
        raise ImageProcessingError("File not found.") from e

    except Exception as e:
        logging.error(f"An error occurred: {str(e)}")
        raise

# Example Usage
# The three values below are placeholders; replace them with a real image
# path, credentials file and API key before running this module as a script.
if __name__ == "__main__":
    image_path = "path/to/image.jpg"
    credentials_path = "path/to/credentials.json"
    openai_api_key = "YOUR_OPENAI_API_KEY"
    try:
        enhanced_text = process_image(image_path, credentials_path, openai_api_key)
        print(f"Enhanced Text: {enhanced_text}")
    except Exception as e:
        # Any failure is printed rather than re-raised.
        print(f"An error occurred: {e}")
```

In [None]:
import logging
from google.cloud import vision
import openai
import os
from PIL import Image
import io
import filetype

# Configure logging
# Runs at import time and sets the root logger to INFO with a timestamped format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s [%(levelname)s] %(message)s', datefmt='%Y-%m-%d %H:%M:%S')

class ImageProcessingError(Exception):
    """Raised when an image file is missing or cannot be handled."""

class OCRError(Exception):
    """Raised when the OCR step produces no text."""

class OpenAIError(Exception):
    """Exception type for OpenAI-related failures."""

def process_image(image_path, credentials_path, openai_api_key, engine="text-davinci-003"):
    """Extract text from an image with Google Cloud Vision and enhance it with OpenAI.

    Parameters:
    - image_path: str, path to the image file
    - credentials_path: str, path to the Google Cloud credentials file
    - openai_api_key: str, OpenAI API key
    - engine: str, OpenAI engine to use (default: "text-davinci-003")

    Returns:
    - str, enhanced text from the image

    Raises:
    - ImageProcessingError: the file is missing or its type cannot be identified
    - OCRError: Vision returned no text annotations
    - Exception: any other failure is logged and re-raised unchanged
    """
    try:
        # Set up Google Cloud Vision client
        os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = credentials_path
        client = vision.ImageAnnotatorClient()

        # Load image and validate file type
        kind = filetype.guess(image_path)
        if kind is None:
            logging.error(f"Invalid file type: {image_path}")
            raise ImageProcessingError("Invalid file type.")

        # Re-encode with Pillow's own format name. filetype reports file
        # extensions such as "jpg", which Pillow's save() does not accept as a
        # format (it expects "JPEG"). The context manager closes the file handle.
        image_bytes = io.BytesIO()
        with Image.open(image_path) as image:
            image.save(image_bytes, format=image.format or kind.extension)
        content = image_bytes.getvalue()

        # Perform OCR using Google Cloud Vision
        response = client.text_detection(image=vision.Image(content=content))
        texts = response.text_annotations
        if not texts:
            logging.warning("No text detected in the image.")
            raise OCRError("No text detected.")

        # Choose the text annotation with the highest confidence.
        # NOTE(review): confirm that confidence is populated for text
        # annotations; on ties max() returns the first annotation.
        best_text = max(texts, key=lambda text: text.confidence)
        extracted_text = best_text.description

        # Enhance text using OpenAI
        openai.api_key = openai_api_key
        prompt = f"Original text: {extracted_text}\nEnhance the text:"
        response = openai.Completion.create(engine=engine, prompt=prompt, max_tokens=200, temperature=0.7)
        enhanced_text = response.choices[0].text.strip()

        logging.info(f"Enhanced text: {enhanced_text}")
        return enhanced_text

    except FileNotFoundError as e:
        logging.error(f"File not found: {e.filename}")
        # Chain the cause so the original traceback is not lost.
        raise ImageProcessingError("File not found.") from e

    except Exception as e:
        logging.error(f"An error occurred: {str(e)}")
        raise

# Example usage: run the OCR + enhancement pipeline once on placeholder inputs.
if __name__ == "__main__":
    # Placeholder values - replace with a real image, credentials file and API key.
    image_path = "path/to/image.jpg"
    credentials_path = "path/to/credentials.json"
    openai_api_key = "YOUR_OPENAI_API_KEY"
    try:
        enhanced_text = process_image(image_path, credentials_path, openai_api_key)
        print(f"Enhanced Text: {enhanced_text}")
    except Exception as e:
        # process_image logs every failure before raising; echo it on stdout too.
        print(f"An error occurred: {e}")


# FSPL Dataset Creation Script (`create_fspl_dataset.py`)

The `create_fspl_dataset.py` script generates a dataset for Free Space Path Loss (FSPL) calculations. This dataset can be used for training machine learning models or other analysis purposes.

## Key Components

1. **Dataset Generation**:
    - Generates distances and frequencies.
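    - Calculates FSPL in decibels (dB), with distance in meters and frequency in Hz, using the formula:
      \[
      \text{FSPL} = 20 \log_{10}(\text{distance}) + 20 \log_{10}(\text{frequency}) - 147.55
      \]
    - The stored `FSPL (dB)` value additionally subtracts the transmitter and receiver antenna gains (dBi).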

2. **Logging**:
    - Detailed logging to track dataset generation and potential errors.

3. **Efficiency**:
    - Used `tqdm` outside the `Parallel` loop for efficient progress tracking.

4. **Parameter Validation**:
    - Included basic error handling and parameter validation.

## Code Example

```python
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm
import logging
import os

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Constants
SPEED_OF_LIGHT = 3e8  # Speed of light in m/s

def calculate_fspl(frequency_mhz, distance_m, tx_gain_dbi, rx_gain_dbi):
    """
    Calculate the Free Space Path Loss (FSPL) for given parameters.

    Args:
        frequency_mhz: Frequency in MHz (must be positive).
        distance_m: Distance in meters (must be positive).
        tx_gain_dbi: Transmitter gain in dBi.
        rx_gain_dbi: Receiver gain in dBi.

    Returns:
        A dictionary containing the input parameters and, under 'FSPL (dB)',
        the free-space loss in dB minus both antenna gains. 'FSPL (dB)' is
        None when the inputs are invalid or the calculation fails.
    """
    result = {
        'Frequency (MHz)': frequency_mhz,
        'Distance (m)': distance_m,
        'Tx Gain (dBi)': tx_gain_dbi,
        'Rx Gain (dBi)': rx_gain_dbi,
        'FSPL (dB)': None,
    }
    try:
        # NumPy scalars do not raise on division by zero or log10(0); they
        # produce inf/-inf/nan. Reject such inputs explicitly so bad rows are
        # logged and marked None instead of leaking into the dataset.
        if not (np.all(np.greater(frequency_mhz, 0)) and np.all(np.greater(distance_m, 0))):
            raise ValueError("frequency and distance must be positive")
        frequency_hz = frequency_mhz * 1e6  # Convert MHz to Hz
        wavelength = SPEED_OF_LIGHT / frequency_hz
        fspl = (4 * np.pi * distance_m / wavelength) ** 2
        fspl_db = 10 * np.log10(fspl)
        result['FSPL (dB)'] = fspl_db - tx_gain_dbi - rx_gain_dbi
    except Exception as e:
        logging.error(f"Error in FSPL calculation: {e}")
    return result

def generate_dataset(frequencies, distances, tx_gains, rx_gains):
    """
    Generate a dataset for FSPL calculations.

    Every combination of the four inputs is evaluated with calculate_fspl,
    in parallel across all available cores.

    Args:
        frequencies: Array of frequencies in MHz.
        distances: Array of distances in meters.
        tx_gains: Array of transmitter gains in dBi.
        rx_gains: Array of receiver gains in dBi.

    Returns:
        A pandas DataFrame containing the generated FSPL dataset, one row per
        combination, ordered with frequency varying slowest.
    """
    from itertools import product  # local import keeps this block self-contained

    total_tasks = len(frequencies) * len(distances) * len(tx_gains) * len(rx_gains)
    logging.info(f"Total calculations to perform: {total_tasks}")

    # Parallel(...) returns only once every task has finished, so a bar updated
    # while iterating its results would jump from 0 to 100% at the very end.
    # Wrapping the input iterator makes the bar advance as joblib pulls tasks
    # for dispatch.
    combinations = tqdm(
        product(frequencies, distances, tx_gains, rx_gains),
        total=total_tasks,
        desc="Generating FSPL Dataset",
    )
    data = Parallel(n_jobs=-1)(
        delayed(calculate_fspl)(freq, dist, tx_gain, rx_gain)
        for freq, dist, tx_gain, rx_gain in combinations
    )

    return pd.DataFrame(data)

# Define ranges. np.arange excludes its stop value, so each stop is set half a
# step past the intended maximum: the documented upper bound is included and
# not overshot (the previous stops produced 4000.5 MHz, 10000.5 m and 30.5 dBi).
frequencies = np.arange(1, 4000.5, 0.5)  # 1 MHz to 4 GHz with 0.5 MHz steps
distances = np.arange(3, 10000.5, 0.5)  # 3 meters to 10,000 meters with 0.5 meter steps
tx_gains = np.arange(0, 30.5, 0.5)  # 0 dBi to 30 dBi with 0.5 dBi steps
rx_gains = np.arange(0, 30.5, 0.5)  # 0 dBi to 30 dBi with 0.5 dBi steps

# NOTE(review): the full Cartesian product of these grids is on the order of
# 6e11 rows, all collected in memory by generate_dataset - confirm the step
# sizes are intended before running.
logging.info("Starting dataset generation...")
dataset = generate_dataset(frequencies, distances, tx_gains, rx_gains)
logging.info("Dataset generation completed.")

# Specify the save path (override with the FSPL_DATASET_PATH environment variable)
save_path = os.getenv('FSPL_DATASET_PATH', '/media/jmiguel-rai-control/fd67fcf7-7925-43d5-9ad0-85c2882c0795/fspl_dataset.csv.gz')

# Save to gzip-compressed CSV
dataset.to_csv(save_path, index=False, compression='gzip')
logging.info(f"Dataset saved to {save_path}")


In [None]:
import numpy as np
import pandas as pd
from joblib import Parallel, delayed
from tqdm import tqdm
import logging
import os

# Setup logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Constants
SPEED_OF_LIGHT = 3e8  # Speed of light in m/s

def calculate_fspl(frequency_mhz, distance_m, tx_gain_dbi, rx_gain_dbi):
    """
    Calculate the Free Space Path Loss (FSPL) for given parameters.

    Args:
        frequency_mhz: Frequency in MHz (must be positive).
        distance_m: Distance in meters (must be positive).
        tx_gain_dbi: Transmitter gain in dBi.
        rx_gain_dbi: Receiver gain in dBi.

    Returns:
        A dictionary containing the input parameters and, under 'FSPL (dB)',
        the free-space loss in dB minus both antenna gains. 'FSPL (dB)' is
        None when the inputs are invalid or the calculation fails.
    """
    result = {
        'Frequency (MHz)': frequency_mhz,
        'Distance (m)': distance_m,
        'Tx Gain (dBi)': tx_gain_dbi,
        'Rx Gain (dBi)': rx_gain_dbi,
        'FSPL (dB)': None,
    }
    try:
        # NumPy scalars do not raise on division by zero or log10(0); they
        # produce inf/-inf/nan. Reject such inputs explicitly so bad rows are
        # logged and marked None instead of leaking into the dataset.
        if not (np.all(np.greater(frequency_mhz, 0)) and np.all(np.greater(distance_m, 0))):
            raise ValueError("frequency and distance must be positive")
        frequency_hz = frequency_mhz * 1e6  # Convert MHz to Hz
        wavelength = SPEED_OF_LIGHT / frequency_hz
        fspl = (4 * np.pi * distance_m / wavelength) ** 2
        fspl_db = 10 * np.log10(fspl)
        result['FSPL (dB)'] = fspl_db - tx_gain_dbi - rx_gain_dbi
    except Exception as e:
        logging.error(f"Error in FSPL calculation: {e}")
    return result

def generate_dataset(frequencies, distances, tx_gains, rx_gains):
    """
    Generate a dataset for FSPL calculations.

    Every combination of the four inputs is evaluated with calculate_fspl,
    in parallel across all available cores.

    Args:
        frequencies: Array of frequencies in MHz.
        distances: Array of distances in meters.
        tx_gains: Array of transmitter gains in dBi.
        rx_gains: Array of receiver gains in dBi.

    Returns:
        A pandas DataFrame containing the generated FSPL dataset, one row per
        combination, ordered with frequency varying slowest.
    """
    from itertools import product  # local import keeps this block self-contained

    total_tasks = len(frequencies) * len(distances) * len(tx_gains) * len(rx_gains)
    logging.info(f"Total calculations to perform: {total_tasks}")

    # Parallel(...) returns only once every task has finished, so a bar updated
    # while iterating its results would jump from 0 to 100% at the very end.
    # Wrapping the input iterator makes the bar advance as joblib pulls tasks
    # for dispatch.
    combinations = tqdm(
        product(frequencies, distances, tx_gains, rx_gains),
        total=total_tasks,
        desc="Generating FSPL Dataset",
    )
    data = Parallel(n_jobs=-1)(
        delayed(calculate_fspl)(freq, dist, tx_gain, rx_gain)
        for freq, dist, tx_gain, rx_gain in combinations
    )

    return pd.DataFrame(data)

# Define ranges. np.arange excludes its stop value, so each stop is set half a
# step past the intended maximum: the documented upper bound is included and
# not overshot (the previous stops produced 4000.5 MHz, 10000.5 m and 30.5 dBi).
frequencies = np.arange(1, 4000.5, 0.5)  # 1 MHz to 4 GHz with 0.5 MHz steps
distances = np.arange(3, 10000.5, 0.5)  # 3 meters to 10,000 meters with 0.5 meter steps
tx_gains = np.arange(0, 30.5, 0.5)  # 0 dBi to 30 dBi with 0.5 dBi steps
rx_gains = np.arange(0, 30.5, 0.5)  # 0 dBi to 30 dBi with 0.5 dBi steps

# NOTE(review): the full Cartesian product of these grids is on the order of
# 6e11 rows, all collected in memory by generate_dataset - confirm the step
# sizes are intended before running.
logging.info("Starting dataset generation...")
dataset = generate_dataset(frequencies, distances, tx_gains, rx_gains)
logging.info("Dataset generation completed.")

# Specify the save path (override with the FSPL_DATASET_PATH environment variable)
save_path = os.getenv('FSPL_DATASET_PATH', '/media/jmiguel-rai-control/fd67fcf7-7925-43d5-9ad0-85c2882c0795/fspl_dataset.csv.gz')

# Save to gzip-compressed CSV
dataset.to_csv(save_path, index=False, compression='gzip')
logging.info(f"Dataset saved to {save_path}")


## `ingest_files.py` Script

### Overview

This script handles file uploads, processes the uploaded files using OCR, and saves the results to a database. It includes error handling, logging, and security measures to ensure the application's stability and security.

### Key Functions

- `allowed_file(filename)`: Checks if the uploaded file has an allowed extension.
- `save_document(filename, ocr_results, file_type, file_size, user_id)`: Saves the document and its OCR results to the database.
- `upload_file()`: Handles the file upload, processes the file using OCR, and saves the results to the database.

### Enhancements

- Added docstrings to all functions.
- Improved logging with more specific messages.
- Implemented robust error handling.
- Enhanced security measures with input sanitization and secure filename handling.
- Configured the application using environment variables for better flexibility.

### Example Usage

To run the Flask application:

```bash
export FLASK_DEBUG=true
export FLASK_PORT=5000
export GOOGLE_CLOUD_CREDENTIALS=/path/to/credentials.json
export OPENAI_API_KEY=your_openai_api_key
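# FLASK_DEBUG and FLASK_PORT are read by the script's __main__ block,
# so start the app by running the script directly.
python ingest_files.py
```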


In [None]:
import os
import logging
from flask import Flask, request, jsonify, send_from_directory
from werkzeug.utils import secure_filename
from veda_app import create_app
from veda_app.models import db, Document
from combined_ocr_module import process_image
from sqlalchemy.exc import SQLAlchemyError

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(filename)s - %(lineno)d - %(levelname)s - %(message)s')

app = create_app()

# Allowed file extensions
ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'gif', 'pdf'}

def allowed_file(filename):
    """Return True when the text after the last dot, lower-cased, is an allowed extension."""
    _, dot, extension = filename.rpartition('.')
    return dot == '.' and extension.lower() in ALLOWED_EXTENSIONS

def save_document(filename, ocr_results, file_type, file_size, user_id):
    """Store one Document row with its OCR text.

    Returns True once the row is committed. On a SQLAlchemyError the error is
    logged, the session is rolled back and False is returned.
    """
    try:
        record = Document(
            title=filename,
            content=ocr_results,
            file_type=file_type,
            file_size=file_size,
            user_id=user_id,
        )
        db.session.add(record)
        db.session.commit()
    except SQLAlchemyError as db_error:
        logging.error(f"Database error: {db_error}")
        db.session.rollback()
        return False

    logging.info(f"Document {filename} processed and saved to database.")
    return True

@app.route('/upload', methods=['POST'])
def upload_file():
    """Handle file upload and processing.

    Expects a multipart POST with a 'file' part and a 'user_id' form field.
    The file is stored under UPLOAD_FOLDER, passed to process_image, and the
    result is persisted through save_document.

    Returns:
        JSON with 'success' and 'ocr_results' on success; otherwise JSON with
        an 'error' message and status 400 (invalid request) or 500
        (processing or database failure).
    """
    if 'file' not in request.files:
        logging.error("No file part in the request.")
        return jsonify({'error': 'No file part'}), 400

    file = request.files['file']
    if file.filename == '':
        logging.error("No selected file.")
        return jsonify({'error': 'No selected file'}), 400

    if not (file and allowed_file(file.filename)):
        logging.error("File type not allowed.")
        return jsonify({'error': 'File type not allowed'}), 400

    # Reject a missing user_id before doing any work. Without this check the
    # request runs the full OCR/OpenAI pipeline and only then fails when the
    # document is saved without an owner.
    # NOTE(review): assumes Document.user_id is required - confirm against the model.
    user_id = request.form.get('user_id')
    if not user_id:
        logging.error("No user_id provided.")
        return jsonify({'error': 'No user_id provided'}), 400

    filename = secure_filename(file.filename)
    file_path = os.path.join(app.config['UPLOAD_FOLDER'], filename)
    file.save(file_path)
    logging.info(f"File {filename} saved at {file_path}")

    # Process the file
    try:
        credentials_path = app.config.get('GOOGLE_CLOUD_CREDENTIALS')
        openai_api_key = app.config.get('OPENAI_API_KEY')
        ocr_results = process_image(file_path, credentials_path, openai_api_key)

        # Save to database
        if save_document(filename, ocr_results, file.mimetype, os.path.getsize(file_path), user_id):
            return jsonify({'success': 'File uploaded and processed successfully', 'ocr_results': ocr_results})
        return jsonify({'error': 'An error occurred while saving to the database.'}), 500

    except Exception as e:
        # Request boundary: log the detail, return a generic message to the client.
        logging.error(f"Error processing file: {e}")
        return jsonify({'error': 'An error occurred while processing the file.'}), 500

if __name__ == '__main__':
    # Debug mode enables Werkzeug's interactive debugger, which allows
    # arbitrary code execution, so it must be opted into explicitly
    # (FLASK_DEBUG=true) rather than being the default.
    debug = os.getenv('FLASK_DEBUG', 'false').lower() in ['true', '1', 't']
    port = int(os.getenv('FLASK_PORT', 5000))
    app.run(debug=debug, port=port)


## `train_model.py` Script

### Overview

This script is responsible for training a machine learning model using the Free Space Path Loss (FSPL) dataset. It includes steps for loading data, preprocessing, training, evaluation, and saving the model.

### Key Functions

- `load_data(file_path)`: Loads the dataset from the specified file path.
- `preprocess_data(data)`: Preprocesses the dataset for training.
- `train_model(X_train, y_train)`: Trains a RandomForestRegressor model using the training data and performs hyperparameter tuning using GridSearchCV.
- `evaluate_model(model, X_test, y_test)`: Evaluates the model using the test data.
- `save_model(model, save_path)`: Saves the trained model to the specified path.
- `load_model(save_path)`: Loads a trained model from the specified path.

### Enhancements

- Added robust error handling for each function.
- Improved logging with detailed messages for tracking progress and errors.
- Used environment variables for configuration to enhance flexibility.
- Included data validation steps to ensure the dataset is suitable for training.
- Added docstrings and comments for better understanding and maintainability.
- Implemented GridSearchCV for hyperparameter tuning.

### Example Usage

To run the training script:

```bash
export DATASET_PATH=/path/to/fspl_dataset.csv.gz
export MODEL_SAVE_PATH=/path/to/rf_model.joblib
python train_model.py
```


In [None]:
import os
import logging
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from joblib import dump, load
from sklearn.metrics import mean_squared_error, r2_score

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(filename)s - %(lineno)d - %(levelname)s - %(message)s')

def load_data(file_path):
    """Read the CSV at *file_path* into a DataFrame.

    Any failure is logged and re-raised unchanged; a missing file gets its
    own log message.
    """
    try:
        frame = pd.read_csv(file_path)
    except FileNotFoundError:
        logging.error(f"File not found: {file_path}")
        raise
    except Exception as exc:
        logging.error(f"Error loading data: {exc}")
        raise

    logging.info(f"Data loaded from {file_path}")
    return frame

def preprocess_data(data):
    """Split the dataset into features X and the 'FSPL (dB)' target y.

    Rows whose target is missing are dropped (with a warning), because a
    regressor cannot be fitted on NaN targets.

    Raises:
        KeyError: if the 'FSPL (dB)' column is absent.
    """
    try:
        cleaned = data.dropna(subset=['FSPL (dB)'])
        dropped = len(data) - len(cleaned)
        if dropped:
            logging.warning(f"Dropped {dropped} rows with a missing 'FSPL (dB)' value.")
        X = cleaned.drop('FSPL (dB)', axis=1)
        y = cleaned['FSPL (dB)']
        logging.info("Data preprocessing completed.")
        return X, y
    except KeyError as e:
        logging.error(f"Missing key in data: {e}")
        raise
    except Exception as e:
        logging.error(f"Error preprocessing data: {e}")
        raise

def train_model(X_train, y_train):
    """Grid-search a RandomForestRegressor and return the best estimator.

    The search uses 5-fold cross-validation scored by negative mean squared
    error. Failures are logged and re-raised.
    """
    search_space = {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 20],
        'min_samples_split': [2, 5, 10],
    }
    try:
        search = GridSearchCV(
            estimator=RandomForestRegressor(random_state=42),
            param_grid=search_space,
            scoring='neg_mean_squared_error',
            cv=5,
        )
        search.fit(X_train, y_train)
        winner = search.best_estimator_
    except Exception as exc:
        logging.error(f"Error training model: {exc}")
        raise

    logging.info("Model training completed.")
    return winner

def evaluate_model(model, X_test, y_test):
    """Score *model* on the test split and return (mse, r2).

    Failures are logged and re-raised.
    """
    try:
        predicted = model.predict(X_test)
        mse = mean_squared_error(y_test, predicted)
        r2 = r2_score(y_test, predicted)
    except Exception as exc:
        logging.error(f"Error evaluating model: {exc}")
        raise

    logging.info(f"Model evaluation completed. MSE: {mse}, R-squared: {r2}")
    return mse, r2

def save_model(model, save_path):
    """Serialize *model* to *save_path* with joblib; log and re-raise on failure."""
    try:
        dump(model, save_path)
    except Exception as exc:
        logging.error(f"Error saving model: {exc}")
        raise
    logging.info(f"Model saved to {save_path}")

def load_model(save_path):
    """Deserialize and return the model stored at *save_path*.

    NOTE(review): joblib files are pickle-based, so only load files from a
    trusted source.
    """
    try:
        restored = load(save_path)
    except Exception as exc:
        logging.error(f"Error loading model: {exc}")
        raise
    logging.info(f"Model loaded from {save_path}")
    return restored

def main():
    """Run the training pipeline: load, split 80/20, tune, evaluate, save.

    The dataset and model paths come from the DATASET_PATH and
    MODEL_SAVE_PATH environment variables, with local-file defaults.
    """
    dataset_path = os.getenv('DATASET_PATH', 'fspl_dataset.csv.gz')
    model_save_path = os.getenv('MODEL_SAVE_PATH', 'rf_model.joblib')

    features, target = preprocess_data(load_data(dataset_path))
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.2, random_state=42
    )

    fitted = train_model(X_train, y_train)
    mse, r2 = evaluate_model(fitted, X_test, y_test)
    save_model(fitted, model_save_path)

    logging.info(f"Training process completed with MSE: {mse}, R-squared: {r2}")

# Run the training pipeline only when executed as a script, not on import.
if __name__ == "__main__":
    main()


## `ingest_files.py` Script

### Overview

This combined script handles both file ingestion for OCR processing and data preprocessing, model training, and evaluation for FSPL (Free Space Path Loss) prediction.

### Key Functions

- **OCR File Ingestion**:
  - `allowed_file(filename)`: Checks if a file has an allowed extension.
  - `save_document(filename, ocr_results, file_type, file_size, user_id)`: Saves the OCR results and file metadata to the database.
  - `process_file(file_path, user_id)`: Processes an individual file, including OCR and saving to the database.
  - `ingest_files(file_paths, user_id)`: Ingests multiple files using parallel processing.

- **FSPL Model Training**:
  - `load_data(file_path)`: Loads dataset from the specified file path.
  - `preprocess_data(data)`: Preprocesses the dataset for training.
  - `train_model(X_train, y_train)`: Trains the model using the training data.
  - `evaluate_model(model, X_test, y_test)`: Evaluates the model using the test data.
  - `save_model(model, save_path)`: Saves the trained model to the specified path.
  - `load_model(save_path)`: Loads the trained model from the specified path.

### Enhancements

- Added comprehensive error handling to manage various exceptions.
- Implemented detailed logging to track the progress and identify issues.
- Ensured secure handling of file paths and contents.
- Used parallel processing to handle multiple files efficiently.
- Added docstrings and comments for better understanding and maintainability.

### Example Usage

To run the script:

```bash
export GOOGLE_CLOUD_CREDENTIALS=/path/to/credentials.json
export OPENAI_API_KEY=your_openai_api_key
export FLASK_ENV=development
export DATASET_PATH=path/to/fspl_dataset.csv.gz
export MODEL_SAVE_PATH=path/to/rf_model.joblib

python ingest_files.py
```


In [None]:
import os
import logging
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from joblib import dump, load
from sklearn.metrics import mean_squared_error, r2_score
from concurrent.futures import ThreadPoolExecutor
from werkzeug.utils import secure_filename
from sqlalchemy.exc import SQLAlchemyError
from veda_app import create_app, db
from veda_app.models import Document
from combined_ocr_module import process_image

# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(filename)s - %(lineno)d - %(levelname)s - %(message)s')

app = create_app()

# Allowed file extensions
ALLOWED_EXTENSIONS = {'png', 'jpg', 'jpeg', 'gif', 'pdf'}

def allowed_file(filename):
    """Return True when the text after the last dot, lower-cased, is an allowed extension."""
    _, dot, extension = filename.rpartition('.')
    return dot == '.' and extension.lower() in ALLOWED_EXTENSIONS

def save_document(filename, ocr_results, file_type, file_size, user_id):
    """Store one processed Document row.

    Returns True once the row is committed. On a SQLAlchemyError the error is
    logged, the session is rolled back and False is returned.
    """
    try:
        record = Document(
            title=filename,
            content=ocr_results,
            file_type=file_type,
            file_size=file_size,
            user_id=user_id,
        )
        db.session.add(record)
        db.session.commit()
    except SQLAlchemyError as db_error:
        logging.error(f"Database error: {db_error}")
        db.session.rollback()
        return False

    logging.info(f"Document {filename} processed and saved to database.")
    return True

def process_file(file_path, user_id):
    """Process a single file: run OCR on it and save the result.

    Files whose sanitized name has no allowed extension are logged and
    skipped. Every exception is logged and swallowed so that one bad file
    does not abort a batch; nothing is returned.
    """
    try:
        filename = secure_filename(os.path.basename(file_path))
        if not allowed_file(filename):
            logging.error(f"File type not allowed: {filename}")
            return

        file_type = os.path.splitext(filename)[1][1:]
        file_size = os.path.getsize(file_path)

        # ingest_files runs this function on ThreadPoolExecutor worker
        # threads, which have no Flask application context. db.session
        # needs one, so push a context for the duration of the work.
        with app.app_context():
            credentials_path = app.config.get('GOOGLE_CLOUD_CREDENTIALS')
            openai_api_key = app.config.get('OPENAI_API_KEY')
            ocr_results = process_image(file_path, credentials_path, openai_api_key)

            if save_document(filename, ocr_results, file_type, file_size, user_id):
                logging.info(f"File {filename} processed successfully.")
            else:
                logging.error(f"Failed to save document {filename} to database.")
    except Exception as e:
        logging.error(f"Error processing file {file_path}: {e}")

def ingest_files(file_paths, user_id):
    """Run process_file for every path on a pool of four worker threads.

    Leaving the ``with`` block waits for all submitted work to finish.
    """
    with ThreadPoolExecutor(max_workers=4) as pool:
        for path in file_paths:
            pool.submit(process_file, path, user_id)

def load_data(file_path):
    """Read the CSV at *file_path* into a DataFrame.

    Any failure is logged and re-raised unchanged; a missing file gets its
    own log message.
    """
    try:
        frame = pd.read_csv(file_path)
    except FileNotFoundError:
        logging.error(f"File not found: {file_path}")
        raise
    except Exception as exc:
        logging.error(f"Error loading data: {exc}")
        raise

    logging.info(f"Data loaded from {file_path}")
    return frame

def preprocess_data(data):
    """Split the dataset into features X and the 'FSPL (dB)' target y.

    Rows whose target is missing are dropped (with a warning), because a
    regressor cannot be fitted on NaN targets.

    Raises:
        KeyError: if the 'FSPL (dB)' column is absent.
    """
    try:
        cleaned = data.dropna(subset=['FSPL (dB)'])
        dropped = len(data) - len(cleaned)
        if dropped:
            logging.warning(f"Dropped {dropped} rows with a missing 'FSPL (dB)' value.")
        X = cleaned.drop('FSPL (dB)', axis=1)
        y = cleaned['FSPL (dB)']
        logging.info("Data preprocessing completed.")
        return X, y
    except KeyError as e:
        logging.error(f"Missing key in data: {e}")
        raise
    except Exception as e:
        logging.error(f"Error preprocessing data: {e}")
        raise

def train_model(X_train, y_train):
    """Grid-search a RandomForestRegressor and return the best estimator.

    The search uses 5-fold cross-validation scored by negative mean squared
    error. Failures are logged and re-raised.
    """
    search_space = {
        'n_estimators': [50, 100, 200],
        'max_depth': [5, 10, 20],
        'min_samples_split': [2, 5, 10],
    }
    try:
        search = GridSearchCV(
            estimator=RandomForestRegressor(random_state=42),
            param_grid=search_space,
            scoring='neg_mean_squared_error',
            cv=5,
        )
        search.fit(X_train, y_train)
        winner = search.best_estimator_
    except Exception as exc:
        logging.error(f"Error training model: {exc}")
        raise

    logging.info("Model training completed.")
    return winner

def evaluate_model(model, X_test, y_test):
    """Score *model* on the test split and return (mse, r2).

    Failures are logged and re-raised.
    """
    try:
        predicted = model.predict(X_test)
        mse = mean_squared_error(y_test, predicted)
        r2 = r2_score(y_test, predicted)
    except Exception as exc:
        logging.error(f"Error evaluating model: {exc}")
        raise

    logging.info(f"Model evaluation completed. MSE: {mse}, R-squared: {r2}")
    return mse, r2

def save_model(model, save_path):
    """Serialize *model* to *save_path* with joblib; log and re-raise on failure."""
    try:
        dump(model, save_path)
    except Exception as exc:
        logging.error(f"Error saving model: {exc}")
        raise
    logging.info(f"Model saved to {save_path}")

def load_model(save_path):
    """Deserialize and return the model stored at *save_path*.

    NOTE(review): joblib files are pickle-based, so only load files from a
    trusted source.
    """
    try:
        restored = load(save_path)
    except Exception as exc:
        logging.error(f"Error loading model: {exc}")
        raise
    logging.info(f"Model loaded from {save_path}")
    return restored

def main():
    """Run file ingestion followed by FSPL model training.

    Configuration comes from environment variables, each falling back to the
    previous hard-coded default:
        INGEST_USER_ID         (default 'default_user_id')
        INGEST_FILE_DIRECTORY  (default '/path/to/files')
        DATASET_PATH           (default 'fspl_dataset.csv.gz')
        MODEL_SAVE_PATH        (default 'rf_model.joblib')
    """
    # Placeholder defaults are kept for compatibility; set the environment
    # variables instead of editing the source.
    user_id = os.getenv('INGEST_USER_ID', 'default_user_id')
    file_directory = os.getenv('INGEST_FILE_DIRECTORY', '/path/to/files')
    file_paths = [os.path.join(file_directory, f) for f in os.listdir(file_directory) if allowed_file(f)]

    logging.info("Starting file ingestion process...")
    ingest_files(file_paths, user_id)
    logging.info("File ingestion process completed.")

    dataset_path = os.getenv('DATASET_PATH', 'fspl_dataset.csv.gz')
    model_save_path = os.getenv('MODEL_SAVE_PATH', 'rf_model.joblib')

    data = load_data(dataset_path)
    X, y = preprocess_data(data)

    # Hold out 20% of the rows for evaluation; the seed is fixed for reproducibility.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    model = train_model(X_train, y_train)

    mse, r2 = evaluate_model(model, X_test, y_test)

    save_model(model, model_save_path)

    logging.info(f"Training process completed with MSE: {mse}, R-squared: {r2}")

# Run ingestion and training only when executed as a script, not on import.
if __name__ == "__main__":
    main()


### Summary of Completed Tasks:
Basic Infrastructure Setup:

Configured the Flask application with necessary modules (Flask, SQLAlchemy, Migrate, LoginManager).
Set up configurations (config.py) for different environments (development, testing, production).
Database Models:

Created User, Document, and OcrResult models in models.py.
Added relationships and appropriate fields for each model.
Main Application Routes:

Implemented main routes (routes.py) including endpoints for OpenAI and Google Vertex AI predictions.
Added logging, input validation, and error handling.
OCR and Image Processing:

Developed combined_ocr_module.py for processing images with OCR and enhancing text using OpenAI.
Integrated Google Cloud Vision and OpenAI APIs for OCR and text enhancement.
Training and Evaluation Scripts:

Created scripts for loading data, preprocessing, training, evaluating, and saving machine learning models.
Implemented hyperparameter tuning using GridSearchCV and saved models using joblib.
File Ingestion:

Set up file ingestion with support for concurrent processing using ThreadPoolExecutor.
Processed files, performed OCR, and saved results to the database.
Remaining Tasks:
Testing:

Unit Tests: Develop comprehensive unit tests for each function and module.
Integration Tests: Ensure that all components work together seamlessly.
Deployment:

Dockerization: Create Dockerfiles for containerizing the application.
CI/CD Pipeline: Set up continuous integration and deployment pipelines.
Cloud Deployment: Deploy the application on a cloud platform (e.g., Google Cloud, AWS).
User Authentication and Authorization:

Authentication: Implement user registration, login, and logout functionalities.
Authorization: Ensure that users have appropriate permissions to access different resources.
API Enhancements:

Additional Endpoints: Review and enhance any additional API endpoints required for VEDA’s functionality.
API Documentation: Document all API endpoints for better usability and maintenance.
Frontend Integration:

Frontend Framework: Choose a frontend framework (e.g., React, Angular) and begin integration.
UI/UX Design: Design user interfaces for interacting with VEDA.
API Integration: Connect frontend components with backend APIs.
Security Enhancements:

Input Validation: Ensure all inputs are properly validated and sanitized.
CSRF Protection: Implement CSRF protection for forms.
Rate Limiting: Implement rate limiting to prevent abuse of the API.
Performance Optimization:

Caching: Implement caching mechanisms for frequently accessed data.
Load Testing: Perform load testing to ensure the application can handle high traffic.

# Step 1: Database Design

## User Management Model

```python
from sqlalchemy import Column, Integer, String, DateTime, ForeignKey, Text
from sqlalchemy.orm import relationship
from datetime import datetime
from werkzeug.security import generate_password_hash, check_password_hash
from app import db

class User(db.Model):
    """User account with a hashed password and a one-to-many link to documents."""
    __tablename__ = 'users'
    id = Column(Integer, primary_key=True)
    username = Column(String(150), nullable=False, unique=True)
    email = Column(String(150), unique=True, nullable=False)
    # Werkzeug hashes embed the method, its parameters and the salt. With the
    # scrypt method they can exceed 150 characters, so the column is widened
    # to avoid rejection or truncation on databases that enforce VARCHAR length.
    password_hash = Column(String(255), nullable=False)
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
    documents = relationship('Document', backref='user', lazy=True)

    def set_password(self, password: str) -> None:
        """
        Hash and set the user's password.
        
        Args:
            password (str): The password to be hashed and set.
        """
        self.password_hash = generate_password_hash(password)

    def check_password(self, password: str) -> bool:
        """
        Check the user's password.
        
        Args:
            password (str): The password to be checked.
            
        Returns:
            bool: True if the password matches, False otherwise.
        """
        return check_password_hash(self.password_hash, password)

    def __repr__(self) -> str:
        return f'<User {self.username}>'


In [None]:
from sqlalchemy import Column, Integer, String, DateTime, ForeignKey, Text
from sqlalchemy.orm import relationship
from datetime import datetime
from werkzeug.security import generate_password_hash, check_password_hash
from app import db

class User(db.Model):
    """Account record stored in the ``users`` table.

    Only a Werkzeug password hash is persisted, never the plain-text
    password. Use ``set_password`` to store a password and
    ``check_password`` to verify one.
    """

    __tablename__ = 'users'
    id = Column(Integer, primary_key=True)
    username = Column(String(150), nullable=False, unique=True)
    email = Column(String(150), unique=True, nullable=False)
    # Werkzeug's default scrypt hashes are roughly 162 characters long, which
    # does not fit in a 150-character column on databases that enforce VARCHAR
    # length. 255 leaves headroom for the current and future hash formats.
    # NOTE: an already-created table needs a migration to pick up the new width.
    password_hash = Column(String(255), nullable=False)
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
    # One-to-many link to Document; the backref exposes ``Document.user``.
    documents = relationship('Document', backref='user', lazy=True)

    def set_password(self, password: str) -> None:
        """
        Hash and set the user's password.

        Args:
            password (str): The password to be hashed and set.
        """
        self.password_hash = generate_password_hash(password)

    def check_password(self, password: str) -> bool:
        """
        Check the user's password.

        Args:
            password (str): The password to be checked.

        Returns:
            bool: True if the password matches, False otherwise (including
            when no password hash has been set on this instance yet).
        """
        # A new, unsaved instance has no hash; treat that as "no match"
        # instead of letting check_password_hash fail on None.
        if not self.password_hash:
            return False
        return check_password_hash(self.password_hash, password)

    def __repr__(self) -> str:
        return f'<User {self.username}>'

class Document(db.Model):
    """Row of the ``documents`` table: titled text content owned by a user.

    ``user_id`` references ``users.id`` and is required. The file metadata
    columns (type, size, path) are optional.
    """

    __tablename__ = "documents"

    # Identity and required content.
    id = Column(Integer, primary_key=True)
    title = Column(String(150), nullable=False)
    content = Column(Text, nullable=False)

    # Timestamps: set on insert, refreshed on every update.
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, onupdate=datetime.utcnow, default=datetime.utcnow)

    # Optional file metadata; columns are nullable unless stated otherwise.
    file_type = Column(String(50))
    file_size = Column(Integer)

    # Owning user (required).
    user_id = Column(Integer, ForeignKey("users.id"), nullable=False)

    # Location of the stored file, when the document is backed by one.
    file_path = Column(String(255))

    def __repr__(self) -> str:
        """Return a short debugging representation showing the title."""
        return "<Document {}>".format(self.title)

class Formula(db.Model):
    """Row of the ``formulas`` table: a named expression with metadata.

    ``name`` and ``expression`` are required. Description, jurisdiction,
    category and tags are optional.
    """

    __tablename__ = "formulas"

    # Identity and required fields.
    id = Column(Integer, primary_key=True)
    name = Column(String(150), nullable=False)

    # Optional free-text description (nullable by default).
    description = Column(Text)

    # The formula body itself; always required.
    expression = Column(Text, nullable=False)

    # Optional classification metadata (nullable by default).
    jurisdiction = Column(String(150))
    category = Column(String(150))
    tags = Column(String(255))

    # Timestamps: set on insert, refreshed on every update.
    created_at = Column(DateTime, default=datetime.utcnow)
    updated_at = Column(DateTime, onupdate=datetime.utcnow, default=datetime.utcnow)

    def __repr__(self) -> str:
        """Return a short debugging representation showing the name."""
        return "<Formula {}>".format(self.name)


# Step 3: Database Configuration and Initialization

## Database Configuration and Initialization Code

```python
from flask import Flask, render_template
from config import config
from flask_sqlalchemy import SQLAlchemy
from flask_migrate import Migrate
from flask_login import LoginManager
import logging
import os

# Initialize extensions
# The instances are created here without an application; create_app() binds
# each of them to the app it builds through the matching init_app() call.
db = SQLAlchemy()
migrate = Migrate()
login = LoginManager()
# Endpoint set as Flask-Login's login view. The 'auth.' prefix presumably
# refers to the auth blueprint registered in create_app() -- confirm that the
# blueprint defines a 'login' view.
login.login_view = 'auth.login'

def create_app(config_name='default'):
    """Application factory: build, configure and return a Flask app.

    Args:
        config_name (str): Key looked up in the ``config`` mapping imported
            from the ``config`` module. Defaults to ``'default'``.

    Returns:
        Flask: The application with extensions initialised, logging
        configured, blueprints registered and error handlers installed.

    Raises:
        KeyError: If ``config_name`` is not a key of ``config``.
    """
    app = Flask(__name__)
    app.config.from_object(config[config_name])

    # Bind the module-level extension instances to this application.
    db.init_app(app)
    migrate.init_app(app, db)
    login.init_app(app)

    # Handlers are only attached outside debug/testing runs.
    if not app.debug and not app.testing:
        app.logger.setLevel(app.config.get('LOG_LEVEL', logging.INFO))

        if app.config.get('LOG_TO_STDOUT'):
            handler = logging.StreamHandler()
        else:
            # exist_ok avoids the check-then-create race when several
            # processes start at the same time.
            os.makedirs('logs', exist_ok=True)
            handler = logging.FileHandler('logs/veda.log')

        handler.setLevel(app.logger.level)
        # Without a formatter, records carry neither timestamp nor level.
        handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        app.logger.addHandler(handler)
        app.logger.info('VEDA startup')

    # Register blueprints (imported here, inside the factory, as in the
    # original code).
    from app.main import main_blueprint
    app.register_blueprint(main_blueprint)

    from app.auth import auth_blueprint
    app.register_blueprint(auth_blueprint, url_prefix='/auth')

    # Error handling
    @app.errorhandler(404)
    def page_not_found(error):
        return render_template('404.html'), 404

    @app.errorhandler(500)
    def internal_server_error(error):
        # A failed request may leave the session in a broken transaction;
        # roll it back so the error page and later requests start clean.
        db.session.rollback()
        return render_template('500.html'), 500

    # Additional error handlers can be added here

    return app


In [None]:
from flask import Flask, render_template
from config import config
from flask_sqlalchemy import SQLAlchemy
from flask_migrate import Migrate
from flask_login import LoginManager
import logging
import os

# Initialize extensions
# The instances are created here without an application; create_app() binds
# each of them to the app it builds through the matching init_app() call.
db = SQLAlchemy()
migrate = Migrate()
login = LoginManager()
# Endpoint set as Flask-Login's login view. The 'auth.' prefix presumably
# refers to the auth blueprint registered in create_app() -- confirm that the
# blueprint defines a 'login' view.
login.login_view = 'auth.login'

def create_app(config_name='default'):
    """Application factory: build, configure and return a Flask app.

    Args:
        config_name (str): Key looked up in the ``config`` mapping imported
            from the ``config`` module. Defaults to ``'default'``.

    Returns:
        Flask: The application with extensions initialised, logging
        configured, blueprints registered and error handlers installed.

    Raises:
        KeyError: If ``config_name`` is not a key of ``config``.
    """
    app = Flask(__name__)
    app.config.from_object(config[config_name])

    # Bind the module-level extension instances to this application.
    db.init_app(app)
    migrate.init_app(app, db)
    login.init_app(app)

    # Handlers are only attached outside debug/testing runs.
    if not app.debug and not app.testing:
        app.logger.setLevel(app.config.get('LOG_LEVEL', logging.INFO))

        if app.config.get('LOG_TO_STDOUT'):
            handler = logging.StreamHandler()
        else:
            # exist_ok avoids the check-then-create race when several
            # processes start at the same time.
            os.makedirs('logs', exist_ok=True)
            handler = logging.FileHandler('logs/veda.log')

        handler.setLevel(app.logger.level)
        # Without a formatter, records carry neither timestamp nor level.
        handler.setFormatter(
            logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
        )
        app.logger.addHandler(handler)
        app.logger.info('VEDA startup')

    # Register blueprints (imported here, inside the factory, as in the
    # original code).
    from app.main import main_blueprint
    app.register_blueprint(main_blueprint)

    from app.auth import auth_blueprint
    app.register_blueprint(auth_blueprint, url_prefix='/auth')

    # Error handling
    @app.errorhandler(404)
    def page_not_found(error):
        return render_template('404.html'), 404

    @app.errorhandler(500)
    def internal_server_error(error):
        # A failed request may leave the session in a broken transaction;
        # roll it back so the error page and later requests start clean.
        db.session.rollback()
        return render_template('500.html'), 500

    # Additional error handlers can be added here

    return app
