In [41]:
import pandas as pd

# Count the number of samples for each class
csv_file_path = 'D:\\Dataset\\MERGED NEW\\base.csv'
df = pd.read_csv(csv_file_path)
grouped = df.groupby('image_id')['class_name'].unique()
unique_classes_df = pd.DataFrame(grouped.explode()).reset_index()

class_counts = unique_classes_df['class_name'].value_counts()

print(class_counts)

class_name
Cardiomegaly          2446
Pleural thickening    1981
Pulmonary fibrosis    1617
Pleural effusion      1185
Nodule/Mass            990
Infiltration           613
Atelectasis            366
Consolidation          353
Pneumothorax           194
Name: count, dtype: int64


In [45]:
# Count the unique image_ids
import pandas as pd


csv_file_path = 'D:\\Dataset\\MERGED NEW\\base.csv'
df = pd.read_csv(csv_file_path)

unique_image_ids_count = df['image_id'].nunique()
print("Number of unique image_ids:", unique_image_ids_count)

Number of unique image_ids: 4850


In [47]:
#perform optimized balanced sampling to ensure each class has a specified minimum and maximum number of samples,

import pandas as pd

file_path = 'D:\\Dataset\\MERGED NEW\\base.csv'
df = pd.read_csv(file_path)

# Grouping the dataset by image_id and aggregating the class names
df_grouped = df.groupby('image_id')['class_name'].apply(list).reset_index()

def optimized_balanced_sampling(df, df_grouped, min_samples=353, max_samples=2446):
    # Counting the frequency of each class
    class_counts = df['class_name'].value_counts().to_dict()
    selected_image_ids = {class_name: set() for class_name in class_counts.keys()}

    # Iterating over each class
    for class_name in class_counts:
        class_df = df_grouped[df_grouped['class_name'].apply(lambda x: class_name in x)]
        class_df = class_df.sample(frac=1).reset_index(drop=True)

        for _, row in class_df.iterrows():
            image_id = row['image_id']
            if len(selected_image_ids[class_name]) < max_samples:
                selected_image_ids[class_name].add(image_id)

            if len(selected_image_ids[class_name]) >= min_samples:
                break

    final_image_ids = set.union(*selected_image_ids.values())
    final_df = df[df['image_id'].isin(final_image_ids)]

    return final_df

optimized_sampled_df = optimized_balanced_sampling(df, df_grouped)
optimized_samples_count = optimized_sampled_df['class_name'].value_counts()

# Save the DataFrame to CSV
output_csv_path = 'D:\\Dataset\\MergedDataSet\\VER6\\balanced_data.csv'
optimized_sampled_df.to_csv(output_csv_path, index=False)

unique_image_ids_count = df['image_id'].nunique()

print("Number of unique image_ids:", unique_image_ids_count)

print("Optimized CSV saved:", output_csv_path)
print("Sample counts per class:", optimized_samples_count)

Number of unique image_ids: 4850
Optimized CSV saved: D:\Dataset\MergedDataSet\VER6\balanced_data.csv
Sample counts per class: class_name
Pleural thickening    1082
Pulmonary fibrosis     968
Cardiomegaly           828
Pleural effusion       771
Nodule/Mass            604
Infiltration           501
Atelectasis            356
Consolidation          353
Pneumothorax           194
Name: count, dtype: int64


In [48]:
import os
import shutil
import pandas as pd

# Define the paths
csv_file_path = 'D:\\Dataset\\MergedDataSet\\VER6\\balanced_data.csv'
source_folder = 'D:\\Dataset\\MergedDataSet\\512'
destination_folder = 'D:\\Dataset\\MergedDataSet\\VER6\\balanced_data'

# Create the destination folder if it doesn't exist
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)

# Read the CSV file
df = pd.read_csv(csv_file_path)

# Iterate through the image IDs in the CSV file
for image_id in df['image_id']:
    # Construct the full file name
    file_name = f"{image_id}.png"
    source_file = os.path.join(source_folder, file_name)
    destination_file = os.path.join(destination_folder, file_name)
    
    # Check if the file exists in the source folder
    if os.path.exists(source_file):
        # Copy the file to the destination folder
        shutil.copy(source_file, destination_file)
        print(f"Copied: {file_name}")
    else:
        print(f"File not found: {file_name}")

print("Script completed.")

Copied: 0005e8e3701dfb1dd93d53e2ff537b6e.png
Copied: 0005e8e3701dfb1dd93d53e2ff537b6e.png
Copied: 0005e8e3701dfb1dd93d53e2ff537b6e.png
Copied: 000d68e42b71d3eac10ccc077aba07c1.png
Copied: 000d68e42b71d3eac10ccc077aba07c1.png
Copied: 00150343289f317a0ad5629d5b7d9ef9.png
Copied: 00150343289f317a0ad5629d5b7d9ef9.png
Copied: 00150343289f317a0ad5629d5b7d9ef9.png
Copied: 00150343289f317a0ad5629d5b7d9ef9.png
Copied: 0046f681f078851293c4e710c4466058.png
Copied: 0046f681f078851293c4e710c4466058.png
Copied: 0046f681f078851293c4e710c4466058.png
Copied: 009d4c31ebf87e51c5c8c160a4bd8006.png
Copied: 009d4c31ebf87e51c5c8c160a4bd8006.png
Copied: 009d4c31ebf87e51c5c8c160a4bd8006.png
Copied: 00bcb82818ea83d6a86df241762cd7d0.png
Copied: 00bcb82818ea83d6a86df241762cd7d0.png
Copied: 00bcb82818ea83d6a86df241762cd7d0.png
Copied: 010018c93ed33ae56ed048ee54867e46.png
Copied: 010018c93ed33ae56ed048ee54867e46.png
Copied: 010018c93ed33ae56ed048ee54867e46.png
Copied: 010018c93ed33ae56ed048ee54867e46.png
Copied: 01

In [None]:
import os
import pandas as pd
from PIL import Image

# Define the paths
csv_file_path = 'D:\\Dataset\\MergedDataSet\\VER6\\balanced_data.csv'
destination_folder = 'D:\\Dataset\\MergedDataSet\\VER6\\balanced_data'
second_folder = 'D:\\Dataset\\NIH Dataset\\all_images'

# Read the CSV file
df = pd.read_csv(csv_file_path)

# Initialize a list to keep track of missing files
missing_files = []

# Iterate through the image IDs in the CSV file
for image_id in df['image_id']:
    # Construct the full file name
    file_name = f"{image_id}.png"
    destination_file = os.path.join(destination_folder, file_name)
    
    # Check if the file exists in the destination folder
    if not os.path.exists(destination_file):
        # If the file is missing, check the second folder for the .jpg version
        jpg_file_name = f"{image_id}.jpg"
        jpg_file_path = os.path.join(second_folder, jpg_file_name)
        
        if os.path.exists(jpg_file_path):
            # If the .jpg file is found, convert it to .png and resize it
            with Image.open(jpg_file_path) as img:
                img = img.resize((512, 512))
                img.save(destination_file, 'PNG')
        else:
            missing_files.append(file_name)

# Print the results
if missing_files:
    print("The following files are missing:")
    for file in missing_files:
        print(file)
else:
    print("All files are present in the destination folder.")

print("Script completed.")

In [51]:
import os
import pandas as pd

# Define the paths
csv_file_path = 'D:\\Dataset\\MergedDataSet\\VER6\\balanced_data.csv'
destination_folder = 'D:\\Dataset\\MergedDataSet\\VER6\\balanced_data'

# Read the CSV file
df = pd.read_csv(csv_file_path)

# Initialize a list to keep track of missing files
missing_files = []

# Iterate through the image IDs in the CSV file
for image_id in df['image_id']:
    # Construct the full file name
    file_name = f"{image_id}.png"
    destination_file = os.path.join(destination_folder, file_name)
    
    # Check if the file exists in the destination folder
    if not os.path.exists(destination_file):
        missing_files.append(file_name)

# Print the results
if missing_files:
    print("The following files are missing:")
    for file in missing_files:
        print(file)
else:
    print("All files are present in the destination folder.")

print("Script completed.")

All files are present in the destination folder.
Script completed.


In [52]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Define the paths
csv_file_path = 'D:\\Dataset\\MergedDataSet\\VER6\\balanced_data.csv'
train_csv_path = 'D:\\Dataset\\MergedDataSet\\VER6\\balanced_train.csv'
val_csv_path = 'D:\\Dataset\\MergedDataSet\\VER6\\balanced_val.csv'

# Read the CSV file
df = pd.read_csv(csv_file_path)

# Split the data into training and validation sets
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['class_name'], random_state=42)

# Save the training and validation sets to separate CSV files
train_df.to_csv(train_csv_path, index=False)
val_df.to_csv(val_csv_path, index=False)

print("Data split completed. Training data saved to train.csv and validation data saved to val.csv.")

Data split completed. Training data saved to train.csv and validation data saved to val.csv.


In [53]:
import pandas as pd

# Count the number of samples for each class
csv_file_path = 'D:\\Dataset\\MergedDataSet\\VER6\\balanced_train.csv'
df = pd.read_csv(csv_file_path)
grouped = df.groupby('image_id')['class_name'].unique()
unique_classes_df = pd.DataFrame(grouped.explode()).reset_index()

class_counts = unique_classes_df['class_name'].value_counts()

print(class_counts) 

import pandas as pd

# Count the number of samples for each class
csv_file_path = 'D:\\Dataset\\MergedDataSet\\VER6\\balanced_val.csv'
df = pd.read_csv(csv_file_path)
grouped = df.groupby('image_id')['class_name'].unique()
unique_classes_df = pd.DataFrame(grouped.explode()).reset_index()

class_counts = unique_classes_df['class_name'].value_counts()

print(class_counts)

class_name
Pleural thickening    866
Pulmonary fibrosis    774
Cardiomegaly          662
Pleural effusion      617
Nodule/Mass           483
Infiltration          401
Atelectasis           285
Consolidation         282
Pneumothorax          155
Name: count, dtype: int64
class_name
Pleural thickening    216
Pulmonary fibrosis    194
Cardiomegaly          166
Pleural effusion      154
Nodule/Mass           121
Infiltration          100
Atelectasis            71
Consolidation          71
Pneumothorax           39
Name: count, dtype: int64


In [54]:
import os
import shutil
import pandas as pd

# Define the paths
csv_file_path = 'D:\\Dataset\\MergedDataSet\\VER6\\balanced_train.csv'
source_folder = 'D:\\Dataset\\MergedDataSet\\VER6\\balanced_data'
destination_folder = 'D:\\Dataset\\MergedDataSet\\VER6\\balance_TRAINING'

# Create the destination folder if it doesn't exist
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)

# Read the CSV file
df = pd.read_csv(csv_file_path)

# Iterate through the image IDs in the CSV file
for image_id in df['image_id']:
    # Construct the full file name
    file_name = f"{image_id}.png"
    source_file = os.path.join(source_folder, file_name)
    destination_file = os.path.join(destination_folder, file_name)
    
    # Check if the file exists in the source folder
    if os.path.exists(source_file):
        # Copy the file to the destination folder
        shutil.copy(source_file, destination_file)
        print(f"Copied: {file_name}")
    else:
        print(f"File not found: {file_name}")

print("Script completed.")

Copied: 124ec853c372fcf7f4428b26b32be62f.png
Copied: 1d36704fc4f9a8f997128b92551bbc4d.png
Copied: 826ef40d8bd63f54f66cc620ad52c8e9.png
Copied: b8202fc93f201492ab7880cb6b4c9165.png
Copied: f2e56bf4b304000674dde626b3335ca7.png
Copied: d70837c881f01f8c51afe9616879de50.png
Copied: dc7d6c6fa1fdde25e0aa64a1f6fd594a.png
Copied: ddec878b93cd18918c2b81bca339a5e9.png
Copied: 68f7ee667e33c638abc21bcd543ad9d5.png
Copied: 54402d9750c85f0e06e3d9464f6c2363.png
Copied: d91079b185012d745fb4c1aa0e04d3b9.png
Copied: 2ceaa4c6e93b4496df1831cccc3e433a.png
Copied: 9d66132dc1a3e262df38c3ad25626a00.png
Copied: 70050ec61a0f61030b7ce001eab0fefa.png
Copied: 64a46917dfbd81f3747a1635f47f622d.png
Copied: b3d1eabc4bb5b66d92f3fae7ddb67b29.png
Copied: d7d10dd5db989860a2758763e97edc05.png
Copied: 84293339bfff0b76aadb731526ac3dd2.png
Copied: 543c0591a8706112a0e71620509a398f.png
Copied: a6541e2d7a4e09d6c1bdee83632bf781.png
Copied: 55e39b1282cde6796b82c6b5019a1fb8.png
Copied: d7210d8d2f377ab74e46acfb4a158e79.png
Copied: 78

In [55]:
import os
import shutil
import pandas as pd

# Define the paths
csv_file_path = 'D:\\Dataset\\MergedDataSet\\VER6\\balanced_val.csv'
source_folder = 'D:\\Dataset\\MergedDataSet\\VER6\\balanced_data'
destination_folder = 'D:\\Dataset\\MergedDataSet\\VER6\\balance_VALIDATION'

# Create the destination folder if it doesn't exist
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)

# Read the CSV file
df = pd.read_csv(csv_file_path)

# Iterate through the image IDs in the CSV file
for image_id in df['image_id']:
    # Construct the full file name
    file_name = f"{image_id}.png"
    source_file = os.path.join(source_folder, file_name)
    destination_file = os.path.join(destination_folder, file_name)
    
    # Check if the file exists in the source folder
    if os.path.exists(source_file):
        # Copy the file to the destination folder
        shutil.copy(source_file, destination_file)
        print(f"Copied: {file_name}")
    else:
        print(f"File not found: {file_name}")

print("Script completed.")

Copied: b489964b59d3f6f9070e45a02e469d8b.png
Copied: 3c0343764fd0e6b3edb40b15740a597e.png
Copied: a673c55470f775d286249e189896c376.png
Copied: bda862be2d20d538b558300c97b8255c.png
Copied: 0e8d3736396b615c0798033f37e4a481.png
Copied: c2a2ce443b43b82ae5bd29d06716a44e.png
Copied: da0364cb98ded8ccbdea0d70fc8fa38c.png
Copied: 51b32791c77b71fca56fc28f10ce770c.png
Copied: c341b3f8a0353bab2ec49147b97ce9d0.png
Copied: 8933f5b721da3ab96e2a6b7efa61fc24.png
Copied: bd3042360365944aa845f34f0aa70424.png
Copied: 310a5c5df24cacd7bfc923cf0ce2f310.png
Copied: 0c577dc6594dab44ebbd6964de673eaa.png
Copied: 56dac092c516acffda2ffac85f90e9cc.png
Copied: 05721adb43ab7c061733568b274c006b.png
Copied: 995533127aa50c3b1a64829055850319.png
Copied: 942df4c44fdc6ffe0111740028a25581.png
Copied: 9259faeeda26a934fb32c74b8682e15f.png
Copied: 4c68b98043a1d2f7215d0db1a761bab3.png
Copied: cf412a6f906091434c19ffd30f2df9b6.png
Copied: c67b6e145e57b4fbd4db367f114bd879.png
Copied: 36a0490889068162384a000b02d37ad4.png
Copied: 9d

In [56]:
import os
import pandas as pd

# Define the paths
csv_file_path = 'D:\\Dataset\\MergedDataSet\\VER6\\balanced_val.csv'
destination_folder = 'D:\\Dataset\\MergedDataSet\\VER6\\balance_VALIDATION'

# Read the CSV file
df = pd.read_csv(csv_file_path)

# Initialize a list to keep track of missing files
missing_files = []

# Iterate through the image IDs in the CSV file
for image_id in df['image_id']:
    # Construct the full file name
    file_name = f"{image_id}.png"
    destination_file = os.path.join(destination_folder, file_name)
    
    # Check if the file exists in the destination folder
    if not os.path.exists(destination_file):
        missing_files.append(file_name)

# Print the results
if missing_files:
    print("The following files are missing:")
    for file in missing_files:
        print(file)
else:
    print("All files are present in the destination folder.")

print("Script completed.")

All files are present in the destination folder.
Script completed.


In [57]:
import pandas as pd

#To transform into binary

# Load the CSV file
val_df = pd.read_csv('D:\\Dataset\\MergedDataSet\\VER6\\balanced_train.csv')

# List of all classes
classes = ['Cardiomegaly', 'Pleural thickening', 'Pulmonary fibrosis',
           'Pleural effusion', 'Nodule/Mass', 'Infiltration', 'Consolidation', 'Atelectasis', 'Pneumothorax']

# Initialize a list to hold the binary labels
binary_label_data = []

# Iterate through each image_id and set binary labels
for image_id in val_df['image_id'].unique():
    image_data = {'image_id': image_id}
    for class_name in classes:
        image_data[class_name] = int(class_name in val_df[val_df['image_id'] == image_id]['class_name'].values)
    binary_label_data.append(image_data)

# Convert the list of dictionaries to a DataFrame
binary_label_df = pd.DataFrame(binary_label_data)

# Save the transformed data to a new CSV file
binary_label_df.to_csv('D:\\Dataset\\MergedDataSet\\VER6\\balanced_TRAINtrans.csv' , index=False)

In [58]:
import pandas as pd

#To transform into binary

# Load the CSV file
val_df = pd.read_csv('D:\\Dataset\\MergedDataSet\\VER6\\balanced_val.csv')

# List of all classes
classes = ['Cardiomegaly', 'Pleural thickening', 'Pulmonary fibrosis',
           'Pleural effusion', 'Nodule/Mass', 'Infiltration', 'Consolidation', 'Atelectasis', 'Pneumothorax']

# Initialize a list to hold the binary labels
binary_label_data = []

# Iterate through each image_id and set binary labels
for image_id in val_df['image_id'].unique():
    image_data = {'image_id': image_id}
    for class_name in classes:
        image_data[class_name] = int(class_name in val_df[val_df['image_id'] == image_id]['class_name'].values)
    binary_label_data.append(image_data)

# Convert the list of dictionaries to a DataFrame
binary_label_df = pd.DataFrame(binary_label_data)

# Save the transformed data to a new CSV file
binary_label_df.to_csv('D:\\Dataset\\MergedDataSet\\VER6\\balanced_VALtrans.csv', index=False)

In [59]:
import pandas as pd

# Count the number of samples for each class
csv_file_path = 'D:\\Dataset\\MergedDataSet\\VER6\\balanced_val.csv'
df = pd.read_csv(csv_file_path)
grouped = df.groupby('image_id')['class_name'].unique()
unique_classes_df = pd.DataFrame(grouped.explode()).reset_index()

class_counts = unique_classes_df['class_name'].value_counts()

print(class_counts)

class_name
Pleural thickening    216
Pulmonary fibrosis    194
Cardiomegaly          166
Pleural effusion      154
Nodule/Mass           121
Infiltration          100
Atelectasis            71
Consolidation          71
Pneumothorax           39
Name: count, dtype: int64


In [37]:
import pandas as pd

# Define the paths to your CSV files
first_csv_path = 'D:\\Dataset\\MergedDataSet\\VER6\\balanced_val.csv'
second_csv_path = 'D:\\Dataset\\MERGED NEW\\BASE.csv'
updated_csv_path = 'D:\\Dataset\\MergedDataSet\\VER6\\new_balanced_val.csv'

# Read the CSV files
first_df = pd.read_csv(first_csv_path)
second_df = pd.read_csv(second_csv_path)

# Merge the dataframes based on 'image_id'
merged_df = first_df.merge(second_df, on='image_id', suffixes=('_first', '_second'))

# Update the values in the first dataframe with the values from the second dataframe
for column in ['class_name', 'x_min', 'y_min', 'x_max', 'y_max']:
    merged_df[column] = merged_df[f"{column}_second"]

# Drop the extra columns
merged_df = merged_df[first_df.columns]

# Save the updated dataframe to a new CSV file
merged_df.to_csv(updated_csv_path, index=False)

print("CSV file updated successfully.")

CSV file updated successfully.


In [38]:
import pandas as pd

# Define the paths to your CSV files
first_csv_path = 'D:\\Dataset\\MergedDataSet\\VER6\\balanced_train.csv'
second_csv_path = 'D:\\Dataset\\MERGED NEW\\BASE.csv'
updated_csv_path = 'D:\\Dataset\\MergedDataSet\\VER6\\new_balanced_train.csv'

# Read the CSV files
first_df = pd.read_csv(first_csv_path)
second_df = pd.read_csv(second_csv_path)

# Merge the dataframes based on 'image_id'
merged_df = first_df.merge(second_df, on='image_id', suffixes=('_first', '_second'))

# Update the values in the first dataframe with the values from the second dataframe
for column in ['class_name', 'x_min', 'y_min', 'x_max', 'y_max']:
    merged_df[column] = merged_df[f"{column}_second"]

# Drop the extra columns
merged_df = merged_df[first_df.columns]

# Save the updated dataframe to a new CSV file
merged_df.to_csv(updated_csv_path, index=False)

print("CSV file updated successfully.")

CSV file updated successfully.


In [39]:
import pandas as pd

# Count the number of samples for each class
csv_file_path = 'D:\\Dataset\\MergedDataSet\\VER6\\new_balanced_val.csv'
df = pd.read_csv(csv_file_path)
grouped = df.groupby('image_id')['class_name'].unique()
unique_classes_df = pd.DataFrame(grouped.explode()).reset_index()

class_counts = unique_classes_df['class_name'].value_counts()

print(class_counts) 

import pandas as pd

# Count the number of samples for each class
csv_file_path = 'D:\\Dataset\\MergedDataSet\\VER6\\new_balanced_train.csv'
df = pd.read_csv(csv_file_path)
grouped = df.groupby('image_id')['class_name'].unique()
unique_classes_df = pd.DataFrame(grouped.explode()).reset_index()

class_counts = unique_classes_df['class_name'].value_counts()

print(class_counts)

class_name
Pleural thickening    539
Pulmonary fibrosis    474
Pleural effusion      394
Cardiomegaly          344
Nodule/Mass           281
Infiltration          248
Consolidation         180
Atelectasis           130
Pneumothorax           74
Name: count, dtype: int64
class_name
Pleural thickening    1067
Pulmonary fibrosis     936
Cardiomegaly           783
Pleural effusion       739
Nodule/Mass            551
Infiltration           480
Consolidation          351
Atelectasis            307
Pneumothorax           151
Name: count, dtype: int64


In [32]:
import pandas as pd

# Remove rows with the class_name 'Pneumothorax' and 'Atelectasis'
csv_file_path = 'D:\\Dataset\\MergedDataSet\\new_balanced_val.csv'
df = pd.read_csv(csv_file_path)
filtered_df = df[~df['class_name'].isin(['Pneumothorax', 'Atelectasis', 'Consolidation'])]
output_csv_path = 'D:\\Dataset\\MergedDataSet\\new_balanced_val(1).csv'
filtered_df.to_csv(output_csv_path, index=False)

print("Filtered CSV saved to:", output_csv_path)

Filtered CSV saved to: D:\Dataset\MergedDataSet\new_balanced_val(1).csv


In [4]:
import pandas as pd

# Remove rows with the class_name 'Pneumothorax' and 'Atelectasis'
csv_file_path = 'D:\\Dataset\\MergedDataSet\\new_balanced_train.csv'
df = pd.read_csv(csv_file_path)
filtered_df = df[~df['class_name'].isin(['Pneumothorax', 'Atelectasis', 'Consolidation'])]
output_csv_path = 'D:\\Dataset\\MergedDataSet\\new_balanced_train(V5).csv'
filtered_df.to_csv(output_csv_path, index=False)

print("Filtered CSV saved to:", output_csv_path)

Filtered CSV saved to: D:\Dataset\MergedDataSet\new_balanced_train(V5).csv


In [1]:
import os
import pandas as pd

def find_missing_images_in_folders(csv_path, folder_paths, image_column="image_id", file_extension=".jpg"):
    """
    Scans a CSV file to find image IDs that do not exist in the specified folders.

    Args:
        csv_path (str): Path to the CSV file containing the image IDs.
        folder_paths (list of str): List of paths to the folders where images should be located.
        image_column (str): Name of the column in the CSV file containing the image IDs.
        file_extension (str): The file extension of the images (e.g., ".jpg").

    Prints:
        A list of missing image IDs.
    """
    # Load the CSV file
    try:
        data = pd.read_csv(csv_path)
    except Exception as e:
        print(f"Error reading CSV file: {e}")
        return

    if image_column not in data.columns:
        print(f"Error: Column '{image_column}' not found in the CSV file.")
        return

    # Extract image IDs from the CSV
    image_ids = data[image_column].tolist()

    # Check for missing images
    missing_images = []
    for image_id in image_ids:
        image_found = False
        for folder_path in folder_paths:
            image_path = os.path.join(folder_path, f"{image_id}{file_extension}")
            if os.path.isfile(image_path):
                image_found = True
                break
        if not image_found:
            missing_images.append(image_id)

    # Output missing image IDs
    if missing_images:
        print("Missing image IDs:")
        for image_id in missing_images:
            print(image_id)
    else:
        print("All images exist in the specified folders.")

if __name__ == "__main__":
    # Specify the paths and parameters
    csv_path = "D:\\Dataset\\NIH Dataset\\BBox_List_2017.csv"  # Replace with your CSV file path
    folder_paths = [
        "D:\\Dataset\\MergedDataSet\\balance_VALIDATION",  # Folder 1
        "D:\\Dataset\\MergedDataSet\\balance_TRAINING"    # Folder 2
    ]  # List of folders to check
    image_column = "image_id"  # Replace with the column name in your CSV file
    file_extension = ".png"  # Replace with your image file extension

    # Run the script
    find_missing_images_in_folders(csv_path, folder_paths, image_column, file_extension)

Missing image IDs:
00013118_008.png
00014716_007.png
00029817_009.png
00014687_001.png
00017877_001.png
00003148_004.png
00012515_002.png
00022098_006.png
00014198_000.png
00021007_000.png
00030674_000.png
00003945_004.png
00000808_002.png
00006621_004.png
00000865_006.png
00028452_001.png
00007557_026.png
00000181_061.png
00009669_003.png
00025368_014.png
00000468_033.png
00010770_000.png
00016972_019.png
00030635_001.png
00021481_014.png
00019124_045.png
00022883_002.png
00028173_016.png
00027094_003.png
00012123_001.png
00020113_017.png
00004968_003.png
00028012_001.png
00029464_006.png
00001170_046.png
00016267_000.png
00018412_001.png
00020673_005.png
00027474_005.png
00007124_008.png
00027866_002.png
00000149_006.png
00030434_000.png
00019271_030.png
00020408_037.png
00023176_010.png
00016191_017.png
00010478_012.png
00025228_005.png
00021796_000.png
00008005_004.png
00021495_005.png
00018496_006.png
00029088_023.png
00014607_007.png
00004968_004.png
00019271_064.png
00010936_011