crop the image

In [1]:
import cv2
import numpy as np
import os

def find_green_rectangle(image_path):
    # Load the image
    image = cv2.imread(image_path)
    if image is None:
        raise ValueError(f"Image not found: {image_path}")

    # Convert the image to HSV color space
    hsv_image = cv2.cvtColor(image, cv2.COLOR_BGR2HSV)

    # Define the range for the green color in HSV
    lower_green = np.array([35, 100, 100])
    upper_green = np.array([85, 255, 255])

    # Create a mask for the green color
    mask = cv2.inRange(hsv_image, lower_green, upper_green)

    # Find contours in the mask
    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Initialize variables to store the coordinates and dimensions of the green rectangle
    x_start, y_start, width, height = None, None, None, None

    for contour in contours:
        x, y, w, h = cv2.boundingRect(contour)
        if w > 10 and h > 10:  # Ensure the rectangle is large enough
            x_start, y_start, width, height = x, y, w, h
            break

    if x_start is None:
        raise ValueError(f"No green rectangle found in {image_path}")

    return x_start, y_start, width, height

def crop_and_save_image(input_image_path, output_image_path, x_start, y_start, width, height):
    # Load the image
    image = cv2.imread(input_image_path)
    if image is None:
        raise ValueError(f"Image not found: {input_image_path}")

    # Crop the image to the detected green rectangle
    cropped_image = image[y_start:y_start+height, x_start:x_start+width]

    # Save the cropped image
    cv2.imwrite(output_image_path, cropped_image)

def process_images(label_img_dir, raw_img_dir, final_img_dir, label_file_path):
    # Create the output directory if it doesn't exist
    os.makedirs(final_img_dir, exist_ok=True)

    # Open label file to store coordinates of the green rectangles
    with open(label_file_path, 'w') as label_file:
        label_file.write("Image Name, X_Start, Y_Start, Width, Height\n")

        # Process each labeled image (green box)
        for img_name in os.listdir(label_img_dir):
            img_path = os.path.join(label_img_dir, img_name)

            try:
                # Detect green rectangle in the labeled image
                x_start, y_start, width, height = find_green_rectangle(img_path)

                # Save the detected rectangle coordinates in the label file
                label_file.write(f"{img_name}, {x_start}, {y_start}, {width}, {height}\n")

                # Find the corresponding raw image (without label)
                raw_img_path = os.path.join(raw_img_dir, img_name)

                # Define the path to save the cropped image
                output_img_path = os.path.join(final_img_dir, f"final_{img_name}")

                # Crop and save the raw image using the coordinates from the labeled image
                crop_and_save_image(raw_img_path, output_img_path, x_start, y_start, width, height)

                print(f"Cropped and saved: {output_img_path}")

            except ValueError as e:
                print(e)

# Directories
label_img = './data/label'
raw_img = './data/not_label'
final_img = './data/final_out'
label_file = os.path.join(final_img, 'label.txt')

# Process the images and save results
process_images(label_img, raw_img, final_img, label_file)


Cropped and saved: ./data/final_out/final_037.jpeg
Cropped and saved: ./data/final_out/final_060.jpeg
Cropped and saved: ./data/final_out/final_021.jpeg
Cropped and saved: ./data/final_out/final_056.jpeg
Cropped and saved: ./data/final_out/final_001.jpeg
Cropped and saved: ./data/final_out/final_017.jpeg
Cropped and saved: ./data/final_out/final_040.jpeg
Cropped and saved: ./data/final_out/final_041.jpeg
Cropped and saved: ./data/final_out/final_016.jpeg
Cropped and saved: ./data/final_out/final_057.jpeg
Cropped and saved: ./data/final_out/final_020.jpeg
Cropped and saved: ./data/final_out/final_061.jpeg
Cropped and saved: ./data/final_out/final_036.jpeg
Cropped and saved: ./data/final_out/final_050.jpeg
Cropped and saved: ./data/final_out/final_007.jpeg
Cropped and saved: ./data/final_out/final_011.jpeg
Cropped and saved: ./data/final_out/final_046.jpeg
Cropped and saved: ./data/final_out/final_031.jpeg
Cropped and saved: ./data/final_out/final_066.jpeg
Cropped and saved: ./data/final



split data

In [2]:
import os
import shutil
from sklearn.model_selection import train_test_split

# Directories
raw_img_dir = './data/not_label'
final_img_dir = './data/final_out'

# Destination directory
data_dir = './content/data'

# Train-test split ratio
TEST_SIZE = 0.2

# Create directory structure
train_class1_dir = os.path.join(data_dir, 'train/class1')
train_class2_dir = os.path.join(data_dir, 'train/class2')
val_class1_dir = os.path.join(data_dir, 'validation/class1')
val_class2_dir = os.path.join(data_dir, 'validation/class2')

os.makedirs(train_class1_dir, exist_ok=True)
os.makedirs(train_class2_dir, exist_ok=True)
os.makedirs(val_class1_dir, exist_ok=True)
os.makedirs(val_class2_dir, exist_ok=True)

# Get list of all images from the raw_img and final_img directories
raw_images = [os.path.join(raw_img_dir, f) for f in os.listdir(raw_img_dir) if f.endswith(('.jpg', '.jpeg', '.png'))]
final_images = [os.path.join(final_img_dir, f) for f in os.listdir(final_img_dir) if f.endswith(('.jpg', '.jpeg', '.png'))]

# Split into training and validation sets
train_raw, val_raw = train_test_split(raw_images, test_size=TEST_SIZE, random_state=42)
train_final, val_final = train_test_split(final_images, test_size=TEST_SIZE, random_state=42)

# Function to copy images to the target folder
def copy_images(image_list, dest_folder):
    for image_path in image_list:
        shutil.copy(image_path, dest_folder)

# Copy raw images to class1 and final biopsy images to class2
copy_images(train_raw, train_class1_dir)
copy_images(train_final, train_class2_dir)
copy_images(val_raw, val_class1_dir)
copy_images(val_final, val_class2_dir)

print("Data successfully split and saved to /content/data/train/ and /content/data/validation/")

# Validation to ensure each image in class1 has an associated image in class2
def validate_image_pairs(class1_dir, class2_dir):
    class1_images = set(os.path.splitext(f)[0] for f in os.listdir(class1_dir) if f.endswith(('.jpg', '.jpeg', '.png')))
    class2_images = set(os.path.splitext(f)[0] for f in os.listdir(class2_dir) if f.endswith(('.jpg', '.jpeg', '.png')))

    missing_in_class2 = class1_images - class2_images
    missing_in_class1 = class2_images - class1_images

    print(f"\nTotal images in {class1_dir}: {len(class1_images)}")
    print(f"Total images in {class2_dir}: {len(class2_images)}")
    print(f"Common images: {len(class1_images - missing_in_class2)}")

    if missing_in_class2:
        print(f"\nImages in {class1_dir} missing in {class2_dir}:")
        for img in missing_in_class2:
            print(f"{img}.jpg or .jpeg or .png")

    if missing_in_class1:
        print(f"\nImages in {class2_dir} missing in {class1_dir}:")
        for img in missing_in_class1:
            print(f"{img}.jpg or .jpeg or .png")

# Validate training and validation sets
print("Validating training sets...")
validate_image_pairs(train_class1_dir, train_class2_dir)
print("Validating validation sets...")
validate_image_pairs(val_class1_dir, val_class2_dir)


Data successfully split and saved to /content/data/train/ and /content/data/validation/
Validating training sets...

Total images in ./content/data/train/class1: 52
Total images in ./content/data/train/class2: 52
Common images: 0

Images in ./content/data/train/class1 missing in ./content/data/train/class2:
064.jpg or .jpeg or .png
058.jpg or .jpeg or .png
045.jpg or .jpeg or .png
011.jpg or .jpeg or .png
031.jpg or .jpeg or .png
039.jpg or .jpeg or .png
026.jpg or .jpeg or .png
016.jpg or .jpeg or .png
032.jpg or .jpeg or .png
056.jpg or .jpeg or .png
015.jpg or .jpeg or .png
020.jpg or .jpeg or .png
054.jpg or .jpeg or .png
055.jpg or .jpeg or .png
043.jpg or .jpeg or .png
021.jpg or .jpeg or .png
059.jpg or .jpeg or .png
001.jpg or .jpeg or .png
006.jpg or .jpeg or .png
009.jpg or .jpeg or .png
038.jpg or .jpeg or .png
041.jpg or .jpeg or .png
004.jpg or .jpeg or .png
049.jpg or .jpeg or .png
062.jpg or .jpeg or .png
007.jpg or .jpeg or .png
060.jpg or .jpeg or .png
042.jpg or .jpeg

count

In [5]:
import os

# Directories
data_dir = './content/data'

# Function to count images in a directory
def count_images(directory):
    return len([f for f in os.listdir(directory) if f.lower().endswith(('.jpg', '.jpeg', '.png'))])

# Count images in training and validation directories
train_class1_count = count_images(os.path.join(data_dir, 'train/class1'))
train_class2_count = count_images(os.path.join(data_dir, 'train/class2'))
val_class1_count = count_images(os.path.join(data_dir, 'validation/class1'))
val_class2_count = count_images(os.path.join(data_dir, 'validation/class2'))

print(f"Total training images for class1: {train_class1_count}")
print(f"Total training images for class2: {train_class2_count}")
print(f"Total validation images for class1: {val_class1_count}")
print(f"Total validation images for class2: {val_class2_count}")



Total training images for class1: 52
Total training images for class2: 52
Total validation images for class1: 14
Total validation images for class2: 14
