In [None]:
!pip install google-generativeai
!pip install colorama
!pip install google-cloud-vision

In [None]:
# !unzip /content/images_probe.zip

In [None]:
import os
import time
import shutil
import concurrent.futures  # Import this to remove errors related to threading
from Class_Google_Clould_Vision import Google_Cloud_Vision
from Class_Logger import Logger
from PIL import Image
import cv2
import google.generativeai as genai
import json
import re
import numpy as np
import tempfile
import torch
import torchvision.transforms as transforms
from model import EnhancedCNN


# Credentials path handed to Google_Cloud_Vision.set_gcv_key() for every image.
# NOTE(review): left empty here — presumably has to be filled in before a real run; confirm.
vision_path = ""
# Gemini model identifier passed to genai.GenerativeModel().
model_name = "gemini-2.0-flash-exp"
# Weights file that predict_image() loads into EnhancedCNN.
classifier_path = '/content/enhanced_cnn_best.pth'


def load_api_keys(file_path):
    """Read Gemini API keys from a text file.

    A line contributes a key when it starts with ``key`` and contains a colon;
    the key is everything after the first colon, with surrounding whitespace
    removed. Lines whose value is empty (e.g. ``key3:``) are ignored so that a
    blank key is never handed to the API client.

    Args:
        file_path: Path to the key file.

    Returns:
        list[str]: The keys in file order (possibly empty).
    """
    keys = []
    # utf-8-sig transparently drops a BOM, which would otherwise hide the first "key" line.
    with open(file_path, 'r', encoding='utf-8-sig') as f:
        for raw_line in f:
            line = raw_line.strip()
            if not line.startswith("key"):
                continue
            _, separator, value = line.partition(":")
            key = value.strip()
            if separator and key:
                keys.append(key)
    return keys


# Load the Gemini keys once at cell execution time and fail fast when none are found.
api_key_file = "/content/Api keys _ gemini_for_production.txt"
api_keys = load_api_keys(api_key_file)
if not api_keys:
    raise ValueError("No Gemini API keys found in the provided file.")
print("Modules Imported Successfully!!!!!")

# One {'Image', 'Tokens_used'} entry is appended per successful Gemini call.
token_usage_list = []  # Global list to store token usage
# Reported (when non-empty) at the end of process_images_in_folder_v2().
failed_key_info = []  # global list to track failed keys

# Load class names from the text file: one label per line, whitespace-trimmed.
# The list length determines num_classes when predict_image() builds EnhancedCNN.
with open('/content/class_names.txt', 'r') as f:
    class_names = [line.strip() for line in f.readlines()]


def predict_image(image_path, model_path, class_names, threshold=0.5):
    """Classify a single image with the EnhancedCNN checkpoint.

    The model is instantiated and its weights are loaded from ``model_path`` on
    every call, then the image is resized to 128x128, normalised and pushed
    through the network.

    Args:
        image_path: Path of the image to classify.
        model_path: Path of the state-dict file for ``EnhancedCNN``.
        class_names: Sequence of labels; its length is used as ``num_classes``.
        threshold: Minimum softmax confidence required to keep the predicted label.

    Returns:
        tuple: ``(predicted_class, predicted_label, confidence)`` where
        ``predicted_label`` is ``"Unforeseen Image"`` when the confidence is
        below ``threshold``.
    """
    # Resolve the device once so the weights, the model and the input all agree.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    model = EnhancedCNN(num_classes=len(class_names))
    model.load_state_dict(torch.load(model_path, map_location=device, weights_only=True))
    model.to(device)
    model.eval()  # inference mode

    transform = transforms.Compose([
        transforms.Resize((128, 128)),  # Match input size during training
        transforms.ToTensor(),
        transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
    ])

    # The context manager releases the file handle; convert() returns an
    # independent in-memory copy that stays valid after the file is closed.
    with Image.open(image_path) as source_image:
        rgb_image = source_image.convert('RGB')
    batch = transform(rgb_image).unsqueeze(0).to(device)  # add batch dimension

    with torch.no_grad():
        outputs = model(batch)
        probabilities = torch.nn.functional.softmax(outputs, dim=1)
        confidence, predicted = torch.max(probabilities, 1)

    predicted_class = predicted.item()
    confidence_value = confidence.item()
    predicted_label = class_names[predicted_class]

    # Low-confidence predictions are reported under a sentinel label.
    if confidence_value < threshold:
        predicted_label = "Unforeseen Image"

    return predicted_class, predicted_label, confidence_value


def _mask_api_key(api_key):
    """Return a log-safe form of an API key that exposes at most its last four characters."""
    key_text = str(api_key)
    return f"...{key_text[-4:]}" if len(key_text) > 8 else "****"


def _parse_bounding_box(bounding_box):
    """Turn a model-supplied bounding box into an ``(x1, y1, x2, y2)`` tuple of ints.

    Accepts a comma-separated string or a four-element list. Returns ``None``
    for any other type, a wrong element count, or non-numeric values.
    """
    if isinstance(bounding_box, str):
        parts = bounding_box.split(',')
    elif isinstance(bounding_box, list):
        parts = bounding_box
    else:
        return None
    if len(parts) != 4:
        return None
    try:
        return tuple(int(float(value)) for value in parts)
    except (TypeError, ValueError, OverflowError):
        return None


def generate_gemini_content(prompt, image_path, img_binary, model_name, active_api_key, logger_instance):
    """Send the prompt and image to Gemini and collect the records it returns.

    Every flat ``{...}`` object in the reply that parses as JSON and carries all
    required keys becomes one record; its bounding box (when parseable) is drawn
    on ``img_binary``. The annotated image is written to ``output_images/`` and
    the token count of the call is appended to the global ``token_usage_list``.

    Args:
        prompt: Prompt text sent together with the image.
        image_path: Path of the image sent to the model.
        img_binary: Image array that the bounding boxes are drawn onto (modified in place).
        model_name: Gemini model identifier.
        active_api_key: API key used for this call.
        logger_instance: Logger used for progress and error messages.

    Returns:
        tuple: ``(records, True)`` on success, ``(None, False)`` when anything in
        the call or the post-processing raises.
    """
    required_keys = ("Given_Name", "Surname", "Occupation", "Date", "bounding_box", "confidence")
    image_name = os.path.basename(image_path)
    # The raw key is a secret; only a masked form may reach the logs.
    masked_key = _mask_api_key(active_api_key)

    try:
        genai.configure(api_key=active_api_key)
        model = genai.GenerativeModel(model_name)

        logger_instance.info(f"Attempting Gemini API call with key: {masked_key}")
        with Image.open(image_path) as image:
            response = model.generate_content([prompt, image])
        # No fence stripping is needed: the regex below only looks at {...} spans.
        response_text = response.text

        processed_results = []
        # NOTE: the pattern matches flat objects only; an object containing a
        # nested {...} value is not matched as a whole.
        for match in re.finditer(r'{[^{}]*}', response_text):
            try:
                item = json.loads(match.group(0))
            except json.JSONDecodeError:
                logger_instance.warning("JSONDecodeError while parsing partial JSON from Gemini response")
                continue

            if not all(key in item for key in required_keys):
                continue

            box = _parse_bounding_box(item.get('bounding_box'))
            if box is None:
                # A bad box only costs the overlay; the record itself is kept.
                logger_instance.warning(
                    f"Warning: Invalid bounding box format or missing data for record:{item}, skipping bbox drawing")
            else:
                x1, y1, x2, y2 = box
                cv2.rectangle(img_binary, (x1, y1), (x2, y2), (0, 255, 0), 1)

            item['Image'] = image_name  # remember which image the record came from
            processed_results.append(item)

        # cv2.imwrite does not create directories and signals failure by returning False.
        output_dir = 'output_images'
        os.makedirs(output_dir, exist_ok=True)
        if not cv2.imwrite(os.path.join(output_dir, image_name), img_binary):
            logger_instance.warning(f"Could not write annotated image for {image_name}")

        token_usage_list.append({
            'Image': image_name,
            'Tokens_used': response.usage_metadata.total_token_count
        })
        return processed_results, True

    except Exception as e:
        logger_instance.error(f"An Error occurred during the process, with error {e} with key: {masked_key}")
        # Feed the module-level failure log that process_images_in_folder_v2() reports.
        failed_key_info.append({'Image': image_name, 'key': masked_key, 'error': str(e)})
        return None, False


def extract_table_rows_with_vision_data_v2(image_path, vision_response, img_binary, model_name, api_keys,
                                          logger_instance, key_index):
    """Build the extraction prompt for one image and run it through Gemini.

    The API key is chosen as ``api_keys[key_index % len(api_keys)]`` so that
    successive indices cycle through the available keys.

    Args:
        image_path: Path of the image being analysed (also embedded in the prompt).
        vision_response: JSON-serialisable OCR data embedded in the prompt.
        img_binary: Image array forwarded to ``generate_gemini_content``.
        model_name: Gemini model identifier.
        api_keys: Sequence of API keys to choose from.
        logger_instance: Logger used for error messages.
        key_index: Index used to pick the key (wrapped with modulo).

    Returns:
        The list of records from ``generate_gemini_content`` on success,
        ``None`` when the call fails or no API key is available.
    """
    # An empty key list would otherwise raise ZeroDivisionError in the modulo below.
    if not api_keys:
        logger_instance.error(f"No Gemini API keys available; cannot process {image_path}")
        return None

    prompt = f"""
        Analyze the following image and the provided text data:

        **Image:**
        [Image of {image_path}]

        **Text Data:**
        {json.dumps(vision_response, indent=2)}

        **Task:**
        1. Given_Name: The full first name(s) of the individual. It can be in multiple lines including multiple words as well with example.
        2. Surname: The full last name of the individual.
        3. Occupation: The full occupation or rank of the individual, including all titles and abbreviations.
        4. Date: The latest date associated with the individual. If multiple dates are present, use the most recent. If only one date, use that date.
        5. bounding_box:  A dictionary containing *estimated* bounding box coordinates for the *entire text area* of the *valid personnel record*, based on the original image dimensions, but not pixel-perfect accurate as model limitations are known. The coordinates should not include any titles, headings, or non-record text. Consider the text data from the provided list, including 'word', 'coordinates' (in x1,y1&x2,y2 format), and 'confidence' to improve accuracy and provide more detailed information.
            - x1: The leftmost pixel, using the original image dimensions, where the *first character of the valid personnel record* begins.
            - y1: The topmost pixel, using the original image dimensions, of the *highest character in the valid personnel record*.
            - x2: The rightmost pixel, using the original image dimensions, where the *last character of the valid personnel record* ends.
            - y2: The bottommost pixel, using the original image dimensions, of the *lowest character in the valid personnel record*.

        **Output:**
        Present the extracted rows in a JSON format with the following structure:
            [
            {{
                "Given_Name": "Richard Graham",
                "Surname": "Bloomfield",
                "Occupation": "Wing Commander",
                "Date": "1 Aug.19",
                "bounding_box": "x1,y1,x2,y2",
                "confidence": "0.93124869432"
            }},
            {{
                "Given_Name": "Some Given Name",
                "Surname": "Some Surname",
                "Occupation": "Some Occupation",
                "Date": "1 Jan.22",
                "bounding_box": "x1,y1,x2,y2",
                "confidence": "0.93124869432"
            }}
        ]
        """
    active_api_key = api_keys[key_index % len(api_keys)]  # Get the key based on the index and cycle if needed

    results, success = generate_gemini_content(prompt, image_path, img_binary, model_name, active_api_key,
                                               logger_instance)
    if not success:
        logger_instance.error(f"Failed to process {image_path} with current API key")
        return None
    return results


def process_image(image_path, vision_path, model_name, debugger_instance, logger_instance, vision_response, api_keys, image_index):
    """Read one image from disk and extract its records via Gemini.

    ``vision_path`` and ``debugger_instance`` are accepted but not used here.

    Returns:
        Whatever ``extract_table_rows_with_vision_data_v2`` returns, or ``None``
        (after a warning) when OpenCV cannot read the image.
    """
    pixels = cv2.imread(image_path)
    if pixels is not None:
        return extract_table_rows_with_vision_data_v2(
            image_path, vision_response, pixels, model_name, api_keys, logger_instance, image_index)

    logger_instance.warning(f"Could not read image: {image_path}")
    return None


def get_color_image(image_binary):
    """Return a 3-channel image: the input itself if it already has three
    channels, otherwise the result of a GRAY2BGR conversion."""
    dims = image_binary.shape
    already_three_channel = len(dims) == 3 and dims[2] == 3
    if not already_three_channel:
        return cv2.cvtColor(image_binary, cv2.COLOR_GRAY2BGR)
    return image_binary


def convert_thresh_hybrid(image_binary):
    """Produce an inverted adaptive-threshold image from the input.

    Pipeline: ensure 3 channels -> grayscale -> 5x5 Gaussian blur ->
    3x3 sharpening filter -> mean adaptive threshold (block 11, C 5, inverted).

    Args:
        image_binary: Image array (grayscale or 3-channel).

    Returns:
        The thresholded single-channel image.
    """
    img_bin_color = get_color_image(image_binary)
    gray = cv2.cvtColor(img_bin_color, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (5, 5), 0)
    # NOTE(review): an earlier revision also ran CLAHE (clipLimit=10.0,
    # tileGridSize=(5, 5)) on `gray` here, but its result was never used by the
    # steps below, so it was pure overhead and has been removed. If contrast
    # enhancement was intended to feed the threshold, it has to be wired in
    # explicitly — confirm the intended order with the author.
    sharpen_kernel = np.array([[-1, -1, -1], [-1, 9, -1], [-1, -1, -1]])
    sharpen = cv2.filter2D(blur, -1, sharpen_kernel)
    return cv2.adaptiveThreshold(sharpen, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY_INV, 11, 5)


def detect_contours_vertical(thresh, roi=None, kernel=20):
    """Find vertical structures in a thresholded image.

    The image is opened with a 1 x ``kernel`` rectangular element (two
    iterations) and the external contours of the result are reduced to their
    bounding rectangles.

    Args:
        thresh: Thresholded image to analyse.
        roi: Optional ``(x1, y1, x2, y2)``; when given, only rectangles that
            overlap it are kept.
        kernel: Height of the structuring element.

    Returns:
        A list of rectangles, each as four corner points
        ``[(x, y), (x + w, y), (x + w, y + h), (x, y + h)]``; ``None`` if any
        step raises (the error is printed).
    """
    def _touches_roi(x, y, w, h):
        roi_x1, roi_y1, roi_x2, roi_y2 = roi
        return (x <= roi_x2 and x + w >= roi_x1) and (y <= roi_y2 and y + h >= roi_y1)

    try:
        element = cv2.getStructuringElement(cv2.MORPH_RECT, (1, kernel))
        opened = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, element, iterations=2)
        found = cv2.findContours(opened, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
        # findContours returns 2 or 3 values depending on the OpenCV version.
        contours = found[0] if len(found) == 2 else found[1]

        rectangles = []
        for contour in contours:
            x, y, w, h = cv2.boundingRect(contour)
            if roi is not None and not _touches_roi(x, y, w, h):
                continue
            rectangles.append([(x, y), (x + w, y), (x + w, y + h), (x, y + h)])
        return rectangles
    except Exception as ex:
        print(f"Error: {ex}")
        return None


def draw_vertical_polylines(vcs, image_binary, col_threshold=10):
    """Collapse vertical contour rectangles into full-height column separators.

    Rectangles are ordered by the x of their first corner and chained into a
    column while each one lies within ``col_threshold`` pixels of the previous
    one. Every column yields one vertical line at the x of its first rectangle,
    spanning the whole image height.

    Despite the name nothing is drawn: the previous implementation rendered the
    lines onto a private copy of the image that was then discarded, so that
    work (and the image copy) has been dropped.

    Args:
        vcs: Iterable of rectangles as produced by ``detect_contours_vertical``
            (four corner points each); ``None`` or empty yields no lines.
        image_binary: Image array; only ``shape[0]`` (the height) is read, so
            grayscale and colour inputs both work.
        col_threshold: Maximum x distance between consecutive rectangles of the
            same column.

    Returns:
        list: One ``[(x, 0), (x, height)]`` pair per detected column, left to right.
    """
    if not vcs:
        return []

    height = image_binary.shape[0]

    columns = []
    # Input is sorted by x, so a rectangle can only ever join the most recent column.
    for contour in sorted(vcs, key=lambda c: c[0][0]):
        current_x = contour[0][0]
        if columns and abs(columns[-1][-1][0][0] - current_x) <= col_threshold:
            columns[-1].append(contour)
        else:
            columns.append([contour])

    vlines = []
    for column in columns:
        x = column[0][0][0]
        vlines.append([(x, 0), (x, height)])
    return vlines


def image_splitter(image_path, temp_dir, logger, debugger_instance):
    """Split an image into vertical strips at detected column separators.

    Separators come from ``convert_thresh_hybrid`` ->
    ``detect_contours_vertical`` (kernel 100) -> ``draw_vertical_polylines``.
    Each strip wider than 15% of the image is written to ``temp_dir`` as
    ``<basename>_part_<n>.jpg``. ``debugger_instance`` is accepted but unused.

    Args:
        image_path: Path of the image to split.
        temp_dir: Directory that receives the strip files.
        logger: Logger used for warnings and info messages.
        debugger_instance: Unused.

    Returns:
        tuple: ``(paths, image)`` where ``paths`` lists the strip files that
        were actually written and ``image`` is the loaded image array
        (``None`` when the image could not be read).
    """
    image_binary = cv2.imread(image_path)

    if image_binary is None:
        logger.warning(f"Could not read image: {image_path}")
        return [], None

    height, width = image_binary.shape[:2]
    thresh = convert_thresh_hybrid(image_binary)
    vcs = detect_contours_vertical(thresh=thresh, kernel=100)
    if vcs is None:
        # detect_contours_vertical() signals an internal failure with None;
        # treat it like "no separators found" instead of crashing downstream.
        logger.warning(f"Vertical segmentation error for {image_path}")
        return [], image_binary

    vlines = draw_vertical_polylines(vcs, image_binary)
    separator_xs = [vline[0][0] for vline in sorted(vlines, key=lambda vline: vline[0])]

    if not separator_xs:
        logger.warning(f"Vertical segmentation error for {image_path}")
        return [], image_binary

    # Consecutive edges (left border, separators, right border) bound the strips.
    # Strips start at row 1, as in the original segmentation.
    edges = [0] + separator_xs + [width]
    roi_coordinates = [(left, 1, right, height) for left, right in zip(edges, edges[1:])]

    temp_image_paths = []
    base_name = os.path.splitext(os.path.basename(image_path))[0]

    for i, roi in enumerate(roi_coordinates):
        x1, y1, x2, y2 = roi
        cropped_width = x2 - x1

        # Only process if width is more than 15% of total width
        if cropped_width <= (width * 0.15):
            logger.info(f"Skipping ROI with width {cropped_width} which is less than 15% of total width: {width}")
            continue

        cropped_image = image_binary[y1:y2, x1:x2]
        if cropped_image.size == 0:
            logger.warning(f"Empty Cropped Image, skipping the image for {image_path} and ROI {roi}")
            continue

        temp_path = os.path.join(temp_dir, f"{base_name}_part_{i + 1}.jpg")
        # imwrite reports failure through its return value; never hand back a
        # path to a file that was not written.
        if cv2.imwrite(temp_path, cropped_image):
            temp_image_paths.append(temp_path)
        else:
            logger.warning(f"Could not write cropped image {temp_path} for {image_path} and ROI {roi}")

    return temp_image_paths, image_binary




def process_images_in_folder_v2(image_folder, vision_path, model_name, classifier_path, api_keys,
                                 debugger_instance=True, logger_instance=None):
    """Run the full extraction pipeline over every image in a folder.

    For each ``.png``/``.jpg``/``.jpeg`` file (processed in sorted order):
    check it is readable, classify it with ``predict_image``, and — only for
    the labels ``col0`` and ``col1`` — fetch the Cloud Vision word data and
    extract records with ``process_image``. ``col1`` images are first split
    into strips with ``image_splitter``; when no strip is produced the whole
    image is processed instead. A 5 second pause follows every processed image.

    Args:
        image_folder: Folder containing the images.
        vision_path: Value passed to ``Google_Cloud_Vision.set_gcv_key``.
        model_name: Gemini model identifier.
        classifier_path: Weights file for ``predict_image``.
        api_keys: Gemini API keys; the image index selects the key.
        debugger_instance: Value passed to ``Google_Cloud_Vision.set_debug_mood``.
        logger_instance: Logger to use; a default ``Logger().get_logger()`` is
            created when omitted.

    Returns:
        list: All records extracted from the folder.
    """
    if not logger_instance:
        logger_instance = Logger().get_logger()

    all_results = []
    # os.listdir order is arbitrary; sort so runs (and key rotation) are reproducible.
    image_files = sorted(
        os.path.join(image_folder, filename)
        for filename in os.listdir(image_folder)
        if filename.lower().endswith(('.png', '.jpg', '.jpeg'))
    )

    with tempfile.TemporaryDirectory() as temp_dir:
        for image_index, image_path in enumerate(image_files):
            image_name = os.path.basename(image_path)
            print(f"Processing image: {os.path.basename(image_path)} from folder: {os.path.basename(image_folder)}")

            # Reject unreadable files before spending time on the classifier.
            img_binary = cv2.imread(image_path)
            if img_binary is None:
                logger_instance.warning(f"Could not read image: {image_path}")
                continue

            predicted_class, predicted_label, confidence = predict_image(image_path, classifier_path, class_names)
            logger_instance.info(
                f'Predicted class for {image_name} is {predicted_label} with confidence {confidence}')

            # Skip before the (billable) Vision call: its result would not be used.
            if predicted_label not in ('col0', 'col1'):
                logger_instance.warning(f"Skipping image {image_path} due to unknown classification {predicted_label}")
                continue

            gcv = Google_Cloud_Vision(logger=logger_instance)
            gcv.set_debug_mood(debugger_instance)
            gcv.set_gcv_key(vision_path)
            gcv.set_input_file(image_path)

            vision_response = gcv.google_cloud_vision_response_with_word_wise_coordinates(img_binary,
                                                                                           language_hints=['en'])
            logger_instance.info(f'Vision Done. Gemini on process {image_name}')

            if predicted_label == 'col1':
                logger_instance.info(
                    f"This {image_name} identified as {predicted_label}. So splitting in progress")
                temp_image_paths, image_binary = image_splitter(image_path, temp_dir, logger_instance,
                                                                debugger_instance)
                if len(temp_image_paths) > 0:
                    targets = temp_image_paths
                elif image_binary is not None:
                    # Splitting produced nothing usable: fall back to the whole image.
                    targets = [image_path]
                else:
                    targets = []
            else:
                logger_instance.info(
                    f"This {image_name} identified as {predicted_label}. No splitting required")
                targets = [image_path]

            for target_path in targets:
                results = process_image(target_path, vision_path, model_name, debugger_instance,
                                        logger_instance, vision_response, api_keys, image_index)
                if results:
                    all_results.extend(results)

            time.sleep(5)  # Introduce the 5 seconds delay

    if failed_key_info:
        logger_instance.warning(f"Failed keys: {failed_key_info}")

    return all_results


def transform_records(records):
    """Replace each record's ``Date`` with separate ``Day``/``Month``/``Year`` fields.

    Dates are expected as day, month name and optional year, with or without
    spaces and dots (``"1 Aug.19"``, ``"1Sept"``, ``"1 Sept 1922"``). A
    two-digit year is prefixed with ``19``; a four-digit year is kept as is.
    Anything that does not match, or a missing/empty date, yields ``None`` for
    all three parts.

    Args:
        records: Iterable of dicts, normally carrying a ``Date`` key.

    Returns:
        list[dict]: Copies of the records without ``Date`` and with ``Day``,
        ``Month`` and ``Year`` added. The input records are not modified.
    """
    # \s* already covers both the spaced and the unspaced form, so one pattern
    # is enough. The 4-digit alternative must come first, otherwise "1922"
    # would be read as the two-digit year "19".
    date_pattern = re.compile(r'(\d{1,2})\s*([A-Za-z]+)\s*(\d{4}|\d{2})?')

    def parse_date(date_str):
        if not date_str:
            return None, None, None

        # str() guards against the model returning a non-string date value.
        match = date_pattern.search(str(date_str).replace('.', ''))
        if not match:
            return None, None, None

        day, month, year = match.groups()
        if year and len(year) == 2:
            year = '19' + year
        return day, month, year or None

    transformed_records = []
    for record in records:
        transformed_record = record.copy()
        # pop() both reads and removes the original field and tolerates its absence.
        day, month, year = parse_date(transformed_record.pop('Date', None))
        transformed_record['Day'] = day
        transformed_record['Month'] = month
        transformed_record['Year'] = year
        transformed_records.append(transformed_record)

    return transformed_records


def correct_month_abbreviations(records):
    """Return copies of the records in which a ``Month`` of ``'ct'`` is
    replaced by ``'Oct'``; every other value is left untouched."""
    def _with_fixed_month(original):
        duplicate = original.copy()
        # 'ct' is the only abbreviation that gets repaired.
        if original['Month'] == 'ct':
            duplicate['Month'] = 'Oct'
        return duplicate

    return [_with_fixed_month(entry) for entry in records]


def transform_none_to_space(data):
    """Replace every ``None`` value in each dict of ``data`` with a single
    space, in place, and return the same list."""
    for entry in data:
        empty_keys = [name for name, value in entry.items() if value is None]
        for name in empty_keys:
            entry[name] = ' '
    return data


def clean_occupation_field(data):
    """Strip the person's own name out of each record's ``Occupation``.

    Every part of ``Given_Name`` (split on whitespace and dots) and the
    ``Surname`` are removed from the occupation when they occur as whole
    words. Matching whole words matters: an initial such as ``C`` must not
    eat the ``C`` in ``Wing Commander``. Leftover double spaces and
    leading/trailing spaces, commas and dots are cleaned up afterwards.

    Args:
        data: List of dicts with string ``Given_Name``, ``Surname`` and
            ``Occupation`` values. Modified in place.

    Returns:
        The same list object.
    """
    def _remove_whole_word(text, word):
        # An empty word has nothing to remove; the strip mirrors the cleanup
        # applied after every removal step.
        if word:
            pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
            text = re.sub(pattern, '', text)
        return text.strip()

    for item in data:
        given_name_parts = item['Given_Name'].strip().replace('.', ' ').split()
        surname = item['Surname'].strip()

        cleaned_occupation = item['Occupation']
        for part in given_name_parts:
            cleaned_occupation = _remove_whole_word(cleaned_occupation, part)
        cleaned_occupation = _remove_whole_word(cleaned_occupation, surname)

        # Removing a word from the middle leaves a doubled space behind.
        cleaned_occupation = re.sub(r' {2,}', ' ', cleaned_occupation)
        # Clean up any remaining artifacts
        cleaned_occupation = cleaned_occupation.strip(' ,.').strip()

        item['Occupation'] = cleaned_occupation

    return data


def process_data(input_data):
    """Normalise ``None`` values to spaces, then clean the occupation fields.

    Both steps operate on ``input_data`` in place; the same list is returned.
    """
    # Order matters: the occupation cleanup calls string methods on the values.
    return clean_occupation_field(transform_none_to_space(input_data))


def process_names_v2(input_data):
    """Separate given names and surnames that ended up in ``Given_Name``.

    Per record (a copy is returned, the input is not modified):
    - ``Given_Name`` contains a comma: the text before the first comma becomes
      the surname, the rest the given name.
    - otherwise, with a blank/``None`` ``Surname``: the last word of
      ``Given_Name`` becomes the surname (both become ``''`` if there are no words).
    - otherwise: words equal to the surname are dropped from ``Given_Name``.
    """
    def _split_names(record):
        updated = record.copy()
        full_given = record['Given_Name']

        if ',' in full_given:
            family, remainder = full_given.split(',', 1)
            updated['Surname'] = family.strip()
            updated['Given_Name'] = remainder.strip()
            return updated

        words = full_given.split()
        existing_surname = record['Surname']

        if existing_surname is not None and existing_surname.strip() != '':
            # Surname already known: just make sure it is not repeated in the given name.
            updated['Given_Name'] = ' '.join(word for word in words if word != existing_surname)
        elif words:
            updated['Surname'] = words[-1]
            updated['Given_Name'] = ' '.join(words[:-1])
        else:
            updated['Surname'] = ''
            updated['Given_Name'] = ''
        return updated

    return [_split_names(record) for record in input_data]


def add_conf_Prect(input_records):
    """Attach per-field confidence and bounding-box columns to each record.

    For every field in Given_Name, Surname, Occupation, Day, Month, Year the
    output carries ``<field>``, ``<field>_conf`` (the record's ``confidence``)
    and ``<field>_PRect`` (the record's ``bounding_box``). ``Image`` is copied
    without its file extension when that extension is ``.png``, ``.jpg`` or
    ``.jpeg`` in any letter case — the same set of files the pipeline accepts —
    rather than only a literal ``.jpg``.

    Args:
        input_records: Iterable of dicts holding the six fields plus
            ``confidence``, ``bounding_box`` and ``Image``.

    Returns:
        list[dict]: New records; the input is not modified.
    """
    fields = ('Given_Name', 'Surname', 'Occupation', 'Day', 'Month', 'Year')
    image_extensions = ('.png', '.jpg', '.jpeg')

    transformed_records = []
    for record in input_records:
        new_record = {}

        for field in fields:
            new_record[field] = record[field]
            new_record[f'{field}_conf'] = record['confidence']
            new_record[f'{field}_PRect'] = record['bounding_box']

        image_name = record['Image']
        stem, extension = os.path.splitext(image_name)
        # Leave names with an unexpected (or no) extension exactly as they are.
        new_record['Image'] = stem if extension.lower() in image_extensions else image_name

        transformed_records.append(new_record)

    return transformed_records


def add_orig_hwr(input_records):
    """Expand each base field into ``_orig``/``_hwr`` columns.

    For every base field present in a record, the value is stored under both
    ``<field>_orig`` and ``<field>_hwr``, and any existing ``<field>_conf`` /
    ``<field>_PRect`` entries are carried over. ``Image`` is copied unchanged
    when present; all other keys are dropped.
    """
    base_fields = ('Given_Name', 'Surname', 'Occupation', 'Day', 'Month', 'Year')

    def _expand(source):
        expanded = {}
        for field in base_fields:
            if field not in source:
                continue
            value = source[field]
            expanded[f"{field}_orig"] = value
            expanded[f"{field}_hwr"] = value
            # Companion columns travel with the field only if they exist.
            for companion in (f"{field}_conf", f"{field}_PRect"):
                if companion in source:
                    expanded[companion] = source[companion]
        if 'Image' in source:
            expanded['Image'] = source['Image']
        return expanded

    return [_expand(record) for record in input_records]


def add_missing_keys(data_list, header, batch_name):
    """
    Rebuild every record so that it has exactly the header's columns, in header order.

    Columns absent from a record are filled with ''. 'ImageType' is always set to
    'Graduation_List' and 'Batch' to the last path component of batch_name.

    Parameters:
    data_list (list): Records (dicts) to normalise.
    header (list): Column names defining the output order.
    batch_name (str): Batch path or name; only its basename is stored.

    Returns:
    list: New dictionaries, one per input record.
    """
    def _complete(row):
        filled = {}
        for column in header:
            filled[column] = row.get(column, '')
        filled['ImageType'] = 'Graduation_List'
        filled['Batch'] = os.path.basename(batch_name)
        return filled

    return [_complete(row) for row in data_list]


# Column layout of the pipe-delimited output, in order. add_missing_keys() fills
# any column a record does not provide with '' and overwrites 'ImageType' and
# 'Batch'. Most data columns come as an _orig / _hwr / _conf / _PRect quartet.
header = [
    'ImageType', 'Batch', 'Image', 'ZoneUserID', 'ZoneDate', 'ImageRotation',
    'Page_orig', 'Page_hwr', 'Page_conf', 'Page_PRect',
    'Occupation_orig', 'Occupation_hwr', 'Occupation_conf', 'Occupation_PRect',
    'Military Branch_orig', 'Military Branch_hwr', 'Military Branch_conf', 'Military Branch_PRect',
    'Given_Name_orig', 'Given_Name_hwr', 'Given_Name_conf', 'Given_Name_PRect',
    'Surname_orig', 'Surname_hwr', 'Surname_conf', 'Surname_PRect',
    'Rank_orig', 'Rank_hwr', 'Rank_conf', 'Rank_PRect',
    'Day_orig', 'Day_hwr', 'Day_conf', 'Day_PRect',
    'Month_orig', 'Month_hwr', 'Month_conf', 'Month_PRect',
    'Year_orig', 'Year_hwr', 'Year_conf', 'Year_PRect',
    'Organization_orig', 'Organization_hwr', 'Organization_conf', 'Organization_PRect'
]

def write_to_pipe_delimited_file(list_of_dicts, filename=None):
    """Serialise records as pipe-delimited text and optionally write it to a file.

    The first record's keys define the columns. Because the format has no
    quoting, line breaks and ``|`` characters inside a value are replaced with
    a space; otherwise a multi-line name or a stray pipe would shift or split
    the row for every downstream reader.

    Args:
        list_of_dicts: Records to write.
        filename: Optional output path (UTF-8). Nothing is written when the
            list is empty.

    Returns:
        str: Header line plus one line per record joined by ``\\n`` (no trailing
        newline); ``""`` for an empty list or when serialisation fails. A
        failure to write the file is printed and does not change the return value.
    """
    def _cell(value):
        text = str(value)
        for forbidden in ('\r\n', '\n', '\r', '|'):
            text = text.replace(forbidden, ' ')
        return text

    try:
        if not list_of_dicts:
            return ""

        headers = list(list_of_dicts[0].keys())
        lines = ['|'.join(headers)]
        for entry in list_of_dicts:
            values = []
            for key in headers:
                try:
                    values.append(_cell(entry.get(key, '')))
                except Exception as e:
                    # Best effort: a value that cannot be rendered becomes an empty cell.
                    print(f"An error occurred while processing key '{key}' for entry: {str(e)}")
                    values.append('')
            lines.append('|'.join(values))
        result = '\n'.join(lines)

        if filename:
            try:
                with open(filename, 'w', encoding='utf-8') as file:
                    file.write(result)
            except Exception as e:
                print(f"An error occurred while writing to file '{filename}': {str(e)}")
        return result
    except Exception as e:
        print(f"An error occurred in write_to_pipe_delimited_file: {str(e)}")
        return ""


def organize_images_into_folders(image_folder, images_per_folder=20):
    """
    Move the images of a folder into numbered subfolders of a fixed size.

    Images are taken in sorted filename order, so the same input always
    produces the same batches. Subfolders are created inside ``image_folder``
    and named ``<folder name>_part_<n>``.

    Args:
        image_folder (str): The path to the folder containing the images.
        images_per_folder (int, optional): The number of images to include in each subfolder.
            Must be at least 1. Defaults to 20.
    """
    if images_per_folder < 1:
        print(f"Error: images_per_folder must be at least 1, got {images_per_folder}")
        return

    if not os.path.exists(image_folder):
        print(f"Error: Image folder not found at '{image_folder}'")
        return

    # os.listdir order is arbitrary; sort for reproducible batches.
    image_files = sorted(
        f for f in os.listdir(image_folder)
        if os.path.isfile(os.path.join(image_folder, f))
        and f.lower().endswith(('.png', '.jpg', '.jpeg', '.gif', '.bmp'))
    )
    num_images = len(image_files)

    if num_images == 0:
        print(f"Error: No images found in folder '{image_folder}'")
        return

    num_folders = (num_images + images_per_folder - 1) // images_per_folder  # ceiling division

    print(f"Organizing {num_images} images into {num_folders} folders...")

    # normpath drops a trailing separator, which would make basename() return ''.
    folder_prefix = os.path.basename(os.path.normpath(image_folder))

    for folder_num in range(num_folders):
        new_folder_path = os.path.join(image_folder, f"{folder_prefix}_part_{folder_num + 1}")
        os.makedirs(new_folder_path, exist_ok=True)

        start_index = folder_num * images_per_folder
        for image_name in image_files[start_index:start_index + images_per_folder]:
            shutil.move(os.path.join(image_folder, image_name),
                        os.path.join(new_folder_path, image_name))

    print("Image organization complete!")

def main(image_base_folder):
    """Process every subfolder of ``image_base_folder`` and write one output file each.

    Each subfolder (in sorted order) is run through the extraction pipeline
    and the post-processing chain; the result is written to
    ``/content/subfolder_failed_output/<subfolder name>.txt``. Uses the
    module-level ``vision_path``, ``model_name``, ``classifier_path``,
    ``api_keys`` and ``header``.

    Args:
        image_base_folder: Folder whose immediate subfolders hold the images.
    """
    start_time = time.time()
    print("Processing Starts.........!!!!!!")

    output_folder = "/content/subfolder_failed_output"  # set output folder
    os.makedirs(output_folder, exist_ok=True)  # create the folder if it doesn't exist

    # Sorted so that repeated runs visit the subfolders in the same order.
    subfolders = sorted(f.path for f in os.scandir(image_base_folder) if f.is_dir())

    for folder in subfolders:
        print(f"Processing folder: {folder}")

        results = process_images_in_folder_v2(folder, vision_path, model_name, classifier_path, api_keys)
        transformed_records = transform_records(results)
        corrected_output = correct_month_abbreviations(transformed_records)
        processed_data = process_data(corrected_output)
        processed_names = process_names_v2(processed_data)
        added_conf_prect = add_conf_Prect(processed_names)
        added_orig_hwr = add_orig_hwr(added_conf_prect)
        added_missing_keys = add_missing_keys(added_orig_hwr, header, folder)

        # The writer creates no file for an empty record list, so do not claim it did.
        if not added_missing_keys:
            print(f"No records extracted for folder: {folder}; no output file written")
            continue

        # Create a file based on the subfolder name in the output folder
        file_name = os.path.join(output_folder, f"{os.path.basename(folder)}.txt")
        write_to_pipe_delimited_file(added_missing_keys, file_name)

        print(f"Output has been saved to: {file_name}")

    end_time = time.time()
    print("\nProcessing Ends....!!!!!!!")

    elapsed_time = end_time - start_time
    print("\n")
    print(f"It took {elapsed_time} seconds to complete!!!!!!")
    print("\n")


# Driver for this cell: batch the images, then run the pipeline on each batch.
# NOTE(review): later cells define their own main(); this call relies on the
# main(image_base_folder) defined in this same cell — run the cell as a whole.
if __name__ == "__main__":
    image_base_folder = "/content/image540"  # change this to your main image folder

    # First organize the images into subfolders (3 images per subfolder here;
    # the function's own default is 20)
    organize_images_into_folders(image_base_folder, images_per_folder=3)

    # Then process the images in subfolders; main() only looks at subdirectories
    main(image_base_folder)

In [None]:
# code to combine  pipe delimited text files
import os

def combine_text_files(input_dir, output_file):
    """
    Combines multiple pipe-delimited text files into a single output file.

    Files ending in ``.txt`` are read in sorted name order. The first non-blank
    line encountered is written as the header; from then on the first line of
    every file is treated as a repeated header and skipped. Blank lines are
    dropped and surrounding whitespace is trimmed from each line. If the output
    file itself lives in ``input_dir`` it is excluded from the inputs.

    Args:
        input_dir (str): Path to the directory containing the input files.
        output_file (str): Path to the desired output file.
    """
    output_realpath = os.path.realpath(output_file)

    with open(output_file, 'w', encoding='utf-8') as outfile:
        header_written = False  # Flag to track if the header has been written

        # Sorted for a reproducible row order; os.listdir order is arbitrary.
        for filename in sorted(os.listdir(input_dir)):
            if not filename.endswith(".txt"):  # Process only .txt files
                continue

            filepath = os.path.join(input_dir, filename)
            # Never feed the file being written back into itself.
            if os.path.realpath(filepath) == output_realpath:
                continue

            with open(filepath, 'r', encoding='utf-8') as infile:
                for i, line in enumerate(infile):
                    line = line.strip()  # remove leading/trailing whitespace
                    if not line:  # skip empty lines
                        continue

                    if not header_written:
                        outfile.write(line + "\n")
                        header_written = True  # Write the header once from first file
                    elif i > 0:
                        outfile.write(line + "\n")


def main():
    """Merge the per-folder output files into one pipe-delimited file and report its name."""
    # Replace the directory / file name below to combine a different set of files.
    output_filename = "failed_images_with_540.txt"
    combine_text_files("/content/subfolder_failed_output", output_filename)
    print(f"Files combined successfully into: {output_filename}")


main()

In [None]:
# code to transform pipe delimited text file to csv
import csv

def convert_pipe_to_csv(input_file, output_file):
    """
    Converts a pipe-delimited text file to a CSV file.

    The pipe file is read without any quote handling, since it is produced by
    plain ``'|'.join(...)``: a value that happens to start with a double quote
    must not be interpreted as the start of a quoted field (which would swallow
    the following delimiters and lines). Each value is whitespace-trimmed.

    Args:
        input_file (str): Path to the input pipe-delimited file.
        output_file (str): Path to the desired output CSV file.
    """
    # newline='' on both files, as the csv module requires.
    with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
         open(output_file, 'w', newline='', encoding='utf-8') as outfile:

        reader = csv.reader(infile, delimiter='|', skipinitialspace=True, quoting=csv.QUOTE_NONE)
        writer = csv.writer(outfile)

        for row in reader:
            # remove leading/trailing white space from row values
            writer.writerow([value.strip() for value in row])

def main():
    """Convert the combined pipe-delimited file to CSV and report the output name."""
    # Replace the two file names below to convert a different file.
    output_filename = "failed_images_with_540.csv"
    convert_pipe_to_csv("/content/failed_images_with_540.txt", output_filename)
    print(f"File converted successfully to: {output_filename}")

main()

In [10]:
# code to transform csv to pipe delimited text file

import csv

def convert_csv_to_pipe(input_file, output_file):
    """
    Converts a CSV file to a pipe-delimited text file.

    Every value is whitespace-trimmed. Rows are written by ``csv.writer`` with
    ``|`` as the delimiter and its default quoting and line terminator.

    Args:
        input_file (str): Path to the input CSV file.
        output_file (str): Path to the desired output pipe-delimited file.
    """
    # The csv module requires newline='' on its files: without it, newlines
    # embedded in quoted fields are misread and, on platforms that translate
    # '\n', every written row gains an extra '\r'.
    with open(input_file, 'r', newline='', encoding='utf-8') as infile, \
         open(output_file, 'w', newline='', encoding='utf-8') as outfile:

        reader = csv.reader(infile)
        writer = csv.writer(outfile, delimiter='|')

        for row in reader:
            # remove leading/trailing white space from row values
            writer.writerow([value.strip() for value in row])

def main():
    """Convert a CSV file back to pipe-delimited text and report the output name."""
    # NOTE(review): "failed_images.csv" is not written by any cell shown in this
    # notebook — presumably it is supplied or edited by hand; confirm.
    output_filename = "failed_images_with_540_final.txt"
    convert_csv_to_pipe("failed_images.csv", output_filename)
    print(f"File converted successfully to: {output_filename}")

if __name__ == '__main__':
    main()


File converted successfully to: failed_images_with_540_final.txt
