In [None]:
import os #
import pandas as pd

# Read the label table; its 'name' and 'label' columns are used further down in this cell
retina_df = pd.read_csv("/home/studio-lab-user/sage/segregated_images/labels.csv")

# Directories that find_image_path() searches, in this order, for every image name
# NOTE(review): the sub-folders 0-4 presumably correspond to the class labels — confirm
base_image_dirs = [
    '/home/studio-lab-user/sage/segregated_images/0',
    '/home/studio-lab-user/sage/segregated_images/1',
    '/home/studio-lab-user/sage/segregated_images/2',
    '/home/studio-lab-user/sage/segregated_images/3',
    '/home/studio-lab-user/sage/segregated_images/4',
]

# Create the full file path for each image and check if the image files exist (.jpeg, .jpg, .png)
def find_image_path(image_name, base_dirs=None, extensions=('.jpeg', '.jpg', '.png')):
    """Return the path of the first existing image file for ``image_name``.

    Every directory is probed with every extension, directories first, so the
    result is the first match in ``base_dirs`` order and then ``extensions`` order.

    Args:
        image_name: File name without extension.
        base_dirs: Directories to search. Defaults to the module-level
            ``base_image_dirs`` list.
        extensions: File extensions (including the leading dot) to try.

    Returns:
        The matching file path, or ``None`` when no candidate exists.
    """
    if base_dirs is None:
        # Resolved at call time so the notebook-level list can be edited and re-run
        base_dirs = base_image_dirs
    for base_dir in base_dirs:
        for ext in extensions:
            path = os.path.join(base_dir, f"{image_name}{ext}")
            # isfile (not exists): a directory that happens to be named like an image is not an image
            if os.path.isfile(path):
                return path
    return None

# Resolve every image name to a file on disk (None when nothing matches)
resolved_paths = retina_df['name'].map(find_image_path)
found_mask = resolved_paths.notnull()
retina_df['path'] = resolved_paths
retina_df['exists'] = found_mask

# Report how many of the listed images were located
print(f"{found_mask.sum()} images found out of {len(retina_df)} total.")

# Keep only the rows whose image exists, reduced to the two columns used downstream
retina_df = retina_df.loc[found_mask, ['path', 'label']]

# Persist the path/label table for the following cells
retina_df.to_csv(r"/home/studio-lab-user/sage/segregated_images/labels_pth.csv", index=False)


7552 images found out of 7552 total.


In [2]:
import pandas as pd

def remove_and_verify_records(csv_file, num_to_remove, removed_csv_file, remaining_csv_file):
    """Hold out a fixed number of rows per label and verify the resulting counts.

    The CSV is grouped by its 'label' column. From every group that has more
    than ``num_to_remove`` rows, exactly ``num_to_remove`` rows are sampled
    (fixed seed 42) and moved to the "removed" set; smaller groups are left
    untouched. Both sets are written to CSV and the per-label counts are
    printed and cross-checked.

    Args:
        csv_file: Path of the input CSV; must contain a 'label' column.
        num_to_remove: Number of rows to hold out from each sufficiently large label.
        removed_csv_file: Output path for the held-out rows.
        remaining_csv_file: Output path for the rows that were kept.

    Returns:
        None. Results are written to the two CSV files and reported on stdout.
    """
    # Load the CSV file
    df = pd.read_csv(csv_file)

    # Per-label bookkeeping used for the verification at the end
    original_counts = df['label'].value_counts()
    removed_counts = {}

    # Collect the pieces and concatenate once, instead of growing a DataFrame inside the loop
    removed_parts = []
    remaining_parts = []

    # Iterate over each 'label' group
    for level, group in df.groupby('label'):
        if len(group) > num_to_remove:
            # Randomly sample num_to_remove records to remove
            to_remove = group.sample(n=num_to_remove, random_state=42)
            removed_counts[level] = to_remove.shape[0]
            removed_parts.append(to_remove)
            # Drop the sampled records from the group
            group = group.drop(to_remove.index)
        else:
            removed_counts[level] = 0

        remaining_parts.append(group)

    # An empty slice of the input keeps the column layout, so the output files always
    # carry a header and 'label' exists even when nothing was removed or the input is empty.
    empty_like_input = df.iloc[0:0]
    removed_df = pd.concat(removed_parts) if removed_parts else empty_like_input
    remaining_df = pd.concat(remaining_parts) if remaining_parts else empty_like_input

    # Save the removed records to a new CSV file
    removed_df.to_csv(removed_csv_file, index=False)
    # Save the remaining records to a new CSV file
    remaining_df.to_csv(remaining_csv_file, index=False)

    # Calculate new counts after removal
    new_counts = remaining_df['label'].value_counts()

    # Print record counts for verification
    print("Original counts per level:")
    print(original_counts)
    print("\nRemoved counts per level:")
    print(removed_counts)
    print("\nNew counts per level:")
    print(new_counts)

    # Verify that original - removed == remaining for every label
    discrepancies = []
    for level in original_counts.index:
        expected_count = original_counts[level] - removed_counts.get(level, 0)
        actual_count = new_counts.get(level, 0)
        if expected_count != actual_count:
            discrepancies.append((level, expected_count, actual_count))

    if discrepancies:
        for level, expected, actual in discrepancies:
            print(f"Discrepancy detected for level {level}: Expected {expected}, Found {actual}")
    else:
        print("No discrepancies detected.")

    print(f"Removed records saved as '{removed_csv_file}'.")
    print(f"Remaining records saved as '{remaining_csv_file}'.")

# Split the path/label table: a fixed number of rows per class goes to the test CSV,
# everything else goes to the train CSV.
num_to_remove = 25  # Number of records to remove
csv_file = '/home/studio-lab-user/sage/segregated_images/labels_pth.csv'
remaining_csv_file = '/home/studio-lab-user/sage/segregated_images/labels_train.csv'
removed_csv_file = '/home/studio-lab-user/sage/segregated_images/labels_test.csv'

remove_and_verify_records(
    csv_file=csv_file,
    num_to_remove=num_to_remove,
    removed_csv_file=removed_csv_file,
    remaining_csv_file=remaining_csv_file,
)


Original counts per level:
label
0    4036
1    1323
3    1036
2     732
4     425
Name: count, dtype: int64

Removed counts per level:
{0: 25, 1: 25, 2: 25, 3: 25, 4: 25}

New counts per level:
label
0    4011
1    1298
3    1011
2     707
4     400
Name: count, dtype: int64
No discrepancies detected.
Removed records saved as '/home/studio-lab-user/sage/segregated_images/labels_test.csv'.
Remaining records saved as '/home/studio-lab-user/sage/segregated_images/labels_train.csv'.


In [6]:
import os
import numpy as np
from PIL import Image
import cv2
import pandas as pd
from multiprocessing import Pool
from tqdm import tqdm


# Define the input/output locations for the preprocessing step
# NOTE(review): only the *_test paths are configured here, while a later cell reads
# process_labels_train.csv — presumably this cell was re-run with the train paths; confirm.
original_csv_path = '/home/studio-lab-user/sage/segregated_images/labels_test.csv'  # CSV read below; its 'path' column lists the source images
processed_images_folder = '/home/studio-lab-user/sage/segregated_images/processed_images_test'  # Folder to save processed images
new_csv_path = '/home/studio-lab-user/sage/segregated_images/process_labels_test.csv'  # Path for the new CSV file

# Create the folder for processed images if it does not exist
os.makedirs(processed_images_folder, exist_ok=True)

# Read the original CSV file into the module-level `df` that process_images() iterates over
df = pd.read_csv(original_csv_path)

# Function to trim the image
def trim(im):
    """Crop away the dark border surrounding the image content.

    A pixel counts as foreground when its gray value exceeds 10% of the mean
    of all non-zero gray values. Rows and columns are kept when more than 2%
    of their pixels are foreground; the image is cropped to the bounding box
    of the kept rows and columns.

    Args:
        im: Image to crop; anything ``np.array`` accepts. Colour input is
            treated as RGB channel order (what PIL produces, and what the
            other helpers in this notebook assume); 2-D input is treated as
            already being grayscale.

    Returns:
        A PIL image holding the cropped region.

    Raises:
        ValueError: if the image has no non-zero pixels, or no row/column
            contains enough foreground to define a bounding box.
    """
    percentage = 0.02
    img = np.array(im)
    if img.ndim == 2:
        # Already single-channel: cvtColor would reject it, use it directly
        img_gray = img
    else:
        # The array comes from PIL, i.e. RGB order, so use the RGB (not BGR) gray conversion
        img_gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    foreground_values = img_gray[img_gray != 0]
    if foreground_values.size == 0:
        # Without this guard the mean is NaN and np.min below fails with an opaque message
        raise ValueError("Cannot trim image: it contains no non-zero pixels")

    im_bin = img_gray > 0.1 * np.mean(foreground_values)
    row_sums = np.sum(im_bin, axis=1)
    col_sums = np.sum(im_bin, axis=0)
    # A row is compared against the image width, a column against the image height
    rows = np.where(row_sums > img.shape[1] * percentage)[0]
    cols = np.where(col_sums > img.shape[0] * percentage)[0]
    if rows.size == 0 or cols.size == 0:
        raise ValueError("Cannot trim image: no row or column has enough foreground pixels")

    min_row, min_col = np.min(rows), np.min(cols)
    max_row, max_col = np.max(rows), np.max(cols)
    im_crop = img[min_row : max_row + 1, min_col : max_col + 1]
    return Image.fromarray(im_crop)

# Function to resize the image while maintaining the aspect ratio
def resize_maintain_aspect(image, desired_size):
    """Scale ``image`` to fit a square canvas and centre it there.

    The image is resized with LANCZOS resampling by the factor that maps its
    longer side to ``desired_size`` (scaled dimensions are truncated to
    integers), then pasted centred onto a new ``desired_size`` x
    ``desired_size`` "RGB" image.

    Args:
        image: PIL image to resize.
        desired_size: Side length of the square output, in pixels.

    Returns:
        The new square "RGB" PIL image.
    """
    width, height = image.size  # PIL reports size as (width, height)
    scale = float(desired_size) / max(width, height)
    scaled_w = int(width * scale)
    scaled_h = int(height * scale)

    scaled = image.resize((scaled_w, scaled_h), Image.LANCZOS)

    canvas = Image.new("RGB", (desired_size, desired_size))
    offset = ((desired_size - scaled_w) // 2, (desired_size - scaled_h) // 2)
    canvas.paste(scaled, offset)
    return canvas

# Function to apply CLAHE to the entire color image
def apply_clahe_color(image):
    """Apply CLAHE to the lightness channel of an RGB image.

    The image is converted RGB -> LAB, CLAHE (clip limit 2.0, 8x8 tile grid)
    is applied to the L channel only, and the result is converted back to RGB.

    Args:
        image: RGB image array accepted by ``cv2.cvtColor``.

    Returns:
        The contrast-enhanced RGB image array.
    """
    lightness, chroma_a, chroma_b = cv2.split(cv2.cvtColor(image, cv2.COLOR_RGB2LAB))
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    # Only the lightness channel is equalised; the two colour channels pass through unchanged
    enhanced_lab = cv2.merge((equalizer.apply(lightness), chroma_a, chroma_b))
    return cv2.cvtColor(enhanced_lab, cv2.COLOR_LAB2RGB)

# Function to process each image
def process_image(image_path, output_path, desired_size):
    """Trim, resize, contrast-enhance and save a single image.

    Pipeline: open -> convert to RGB -> trim() -> resize_maintain_aspect()
    -> apply_clahe_color() -> save to ``output_path``. Any exception is
    reported on stdout instead of being raised, so one bad image does not
    abort a batch.

    Args:
        image_path: Path of the source image.
        output_path: Path the processed image is written to.
        desired_size: Side length (pixels) of the square output image.

    Returns:
        True when the processed image was saved, False when any step failed.
    """
    try:
        # The context manager closes the file handle even when a later step fails
        with Image.open(image_path) as image:
            # Normalise grayscale / palette / CMYK / alpha inputs: the later steps assume 3-channel RGB
            rgb_image = image.convert("RGB")

        # Trim the image
        trimmed_image = trim(rgb_image)

        # Resize the image while maintaining aspect ratio
        resized_image = resize_maintain_aspect(trimmed_image, desired_size)

        # Convert the image back to a numpy array
        resized_image_np = np.array(resized_image)

        # Apply CLAHE to the entire color image
        final_image = apply_clahe_color(resized_image_np)

        # Save the processed image
        final_image_pil = Image.fromarray(final_image)
        final_image_pil.save(output_path)
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        return False
    return True

# Process each image and update the CSV file
def process_images():
    """Preprocess every image listed in ``df`` and write a CSV of the results.

    Reads the module-level ``df`` (its 'path' column), writes each processed
    image into ``processed_images_folder`` under its original file name, and
    saves a copy of ``df`` whose 'path' column points at the processed files
    to ``new_csv_path``. Rows whose processed file is missing afterwards are
    left out of the CSV. ``df`` itself is not modified, so the function can
    be re-run without feeding already-processed paths back in as input.

    Returns:
        None. Progress and a summary are printed.
    """
    desired_size = 1000  # Side length of the square output images; adjust as needed
    processed_paths = []
    for original_path in tqdm(df['path'], total=len(df)):
        image_name = os.path.basename(original_path)
        processed_path = os.path.join(processed_images_folder, image_name)

        # Process the image (failures are reported by process_image itself)
        process_image(original_path, processed_path, desired_size)

        # Use the full path of the processed image
        processed_paths.append(os.path.abspath(processed_path))

    # Work on a copy so the module-level frame keeps the original source paths
    processed_df = df.assign(path=processed_paths)

    # Keep only rows whose output file exists, so the CSV never references a missing image.
    # NOTE: a stale file from an earlier run with the same name also passes this check.
    saved_mask = processed_df['path'].map(os.path.isfile).astype(bool)
    failed_count = int((~saved_mask).sum())
    if failed_count:
        print(f"Warning: {failed_count} image(s) could not be processed and were left out of the CSV")
    processed_df = processed_df[saved_mask]

    # Save the updated DataFrame to a new CSV file
    processed_df.to_csv(new_csv_path, index=False)

    print(f"Processed images saved to {processed_images_folder}")
    print(f"Updated CSV file saved to {new_csv_path}")

# Run the processing: writes the processed images into processed_images_folder and the
# updated CSV to new_csv_path (both defined at the top of this cell)
process_images()


100%|██████████| 125/125 [00:25<00:00,  4.82it/s]

Processed images saved to /home/studio-lab-user/sage/segregated_images/processed_images_test
Updated CSV file saved to /home/studio-lab-user/sage/segregated_images/process_labels_test.csv





In [4]:
! pip install opencv-python

Collecting opencv-python
  Downloading opencv_python-4.10.0.84-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (20 kB)
Downloading opencv_python-4.10.0.84-cp37-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (62.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.5/62.5 MB[0m [31m44.0 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[?25hInstalling collected packages: opencv-python
Successfully installed opencv-python-4.10.0.84


In [10]:
import pandas as pd

# NOTE(review): process_labels_train.csv is not written by any cell visible above (the
# preprocessing cell is configured for the *_test files) — confirm how it was produced.
file_path = '/home/studio-lab-user/sage/segregated_images/process_labels_train.csv'
retina_df = pd.read_csv(file_path)

# Fraction of each class to KEEP (despite the name, this is not the fraction removed):
# a class with N rows is down-sampled to int(N * rate) rows.
class_reduction_rates = {
    0: 0.20,  # keep 20% of class 0
    1: 0.62,  # keep 62% of class 1
    2: 0.41,  # keep 41% of class 2
    3: 0.52,  # keep 52% of class 3
}

# Collect the per-class pieces and concatenate once, instead of repeatedly concatenating
# onto an initially empty DataFrame.
kept_parts = []

# Down-sample the classes that have a rate
for class_label, reduction_rate in class_reduction_rates.items():
    class_df = retina_df[retina_df['label'] == class_label]
    num_records = len(class_df)
    reduced_num_records = int(num_records * reduction_rate)

    if reduced_num_records > 0:
        class_df_reduced = class_df.sample(n=reduced_num_records, random_state=1)
    else:
        # A target of zero rows keeps the class unchanged rather than dropping it
        class_df_reduced = class_df

    kept_parts.append(class_df_reduced)

# For classes that are not specified in the reduction rates, keep all records
for class_label in retina_df['label'].unique():
    if class_label not in class_reduction_rates:
        kept_parts.append(retina_df[retina_df['label'] == class_label])

reduced_df = pd.concat(kept_parts)

# Shuffle the rows (fixed seed) and renumber the index
reduced_df = reduced_df.sample(frac=1, random_state=1).reset_index(drop=True)


level_counts = reduced_df['label'].value_counts()
print(level_counts)

label
1    804
0    802
3    525
4    400
2    289
Name: count, dtype: int64


In [11]:
# Save the reduced DataFrame to a new CSV file
# reduced_df comes from the preceding down-sampling cell.
# NOTE(review): this is a relative path, so the file lands in the kernel's working
# directory, unlike the absolute paths used by the earlier cells.
reduced_csv_file_path = 'processed_images_5_class_train.csv'
reduced_df.to_csv(reduced_csv_file_path, index=False)

In [2]:
import pandas as pd 
# Inspect the label distribution of the 4-class training CSV.
# NOTE(review): this file is written by a later cell
# (df.to_csv('processed_images_4_class_train.csv', ...)), so on a fresh
# Restart & Run All this cell only works if the file already exists, and the counts
# recorded below come from an earlier run. Consider moving this check after the save.
df = pd.read_csv('processed_images_4_class_train.csv')
df['label'].value_counts()

label
2    793
0    762
1    739
3    400
Name: count, dtype: int64

In [12]:
import pandas as pd
import numpy as np

# 5-class CSV written by the down-sampling step; it is read further down in this cell
# and its labels are remapped to four classes.
reduced_csv_file_path = 'processed_images_5_class_train.csv'
# Define the function to map old labels to new groups
def map_labels(level):
    """Collapse an original 5-class label into one of four groups.

    Mapping: 0 -> 0, 1 -> 1, 2 or 3 -> 2, 4 -> 3.

    Args:
        level: Original label value.

    Returns:
        The new group label (0-3).

    Raises:
        ValueError: if ``level`` is not one of 0, 1, 2, 3 or 4.
    """
    # (original levels, new group) pairs, checked in order
    groups = (
        ((0,), 0),
        ((1,), 1),
        ((2, 3), 2),
        ((4,), 3),
    )
    for members, new_label in groups:
        if level in members:
            return new_label
    raise ValueError("Unexpected level value")

# Alternative input kept for reference:
# reduced_csv_file_path = 'total_data_class_mod.csv'
df = pd.read_csv(reduced_csv_file_path)

# Replace the 'label' column with the grouped labels produced by map_labels
df = df.assign(label=df['label'].apply(map_labels))

label_counts = df['label'].value_counts()
print(label_counts)
# NOTE(review): counts noted from an earlier run (they differ from the output recorded below):
#803	651	 (2: 284) + (3: 508) = 792		405


label
2    814
1    804
0    802
3    400
Name: count, dtype: int64


In [13]:
# Save the remapped labels as the 4-class training CSV (relative path, so it is written
# to the kernel's working directory)
# Alternative output name kept for reference:
# updated_csv_file_path = 'total_data_class_mod.csv'
df.to_csv('processed_images_4_class_train.csv', index=False)

print("CSV file updated successfully!")

CSV file updated successfully!


In [39]:
import pandas as pd
# Re-read the saved 4-class CSV and show its label distribution.
# NOTE(review): the counts recorded below differ from those printed just before the save,
# and the execution count jumps to 39, so the file was presumably regenerated by runs not
# reflected in this cell order — re-run top to bottom to confirm.
df = pd.read_csv('processed_images_4_class_train.csv')
df['label'].value_counts()

label
1    908
2    838
0    802
3    400
Name: count, dtype: int64