In [1]:
import os
import pandas as pd

In [2]:
import pandas as pd

def check_image_id_column(csv_path):
    """
    Checks if the 'image_id' column exists in the given CSV file, validates its contents, 
    and counts the number of unique IDs.

    Args:
        csv_path (str): Path to the CSV file.

    Returns:
        str: Message indicating the result of the check and the number of unique IDs.
    """
    try:
        # Load the CSV file
        df = pd.read_csv(csv_path)

        # Check if 'image_id' column exists
        if 'image_id' not in df.columns:
            return "Error: The 'image_id' column is missing from the CSV file."

        # Check if 'image_id' column contains valid data
        if df['image_id'].isnull().any():
            return "Warning: The 'image_id' column contains missing values."

        # Count the number of unique IDs
        unique_ids_count = df['image_id'].nunique()

        return ("Success: The 'image_id' column is present and contains valid data. "
                f"There are {unique_ids_count} unique IDs in the column.")

    except FileNotFoundError:
        return f"Error: The file '{csv_path}' was not found."

    except pd.errors.EmptyDataError:
        return "Error: The CSV file is empty."

    except Exception as e:
        return f"An unexpected error occurred: {e}"

if __name__ == "__main__":
    # Replace with your CSV file path
    csv_path = "D:\\Dataset\\NIH NEW\\BBox_List_2017.csv"
    result = check_image_id_column(csv_path)
    print(result)

Success: The 'image_id' column is present and contains valid data. There are 880 unique IDs in the column.


In [3]:
import pandas as pd
from PIL import Image
import os

# Set paths for source folder, destination folder, and CSV file
source_folder = 'D:\\Dataset\\NIH Dataset\\all_images'
destination_folder = 'D:\\Dataset\\NIH NEW\\wt_bounding_box'
csv_file_path = 'D:\\Dataset\\NIH NEW\\BBox_List_2017.csv'  # Update with the actual CSV file path

# Create destination folder if it doesn't exist
if not os.path.exists(destination_folder):
    os.makedirs(destination_folder)

# Load image IDs from CSV file and remove .png extensions
image_ids_df = pd.read_csv(csv_file_path)
image_ids = image_ids_df['image_id'].str.replace('.png', '', regex=False)

# Define the new size for resizing
new_size = (512, 512)

# Iterate over specified image IDs
for image_id in image_ids:
    img_path = os.path.join(source_folder, f"{image_id}.jpg")
    # Check if the image exists in the source folder
    if os.path.exists(img_path):
        with Image.open(img_path) as img:
            # Resize the image
            resized_img = img.resize(new_size, Image.LANCZOS)

            # Save the resized image to the destination folder
            resized_img.save(os.path.join(destination_folder, f"{image_id}.jpg"))

print("Specified images have been resized and saved to the destination folder.")

Specified images have been resized and saved to the destination folder.


In [None]:
import os

def count_images_in_folder(folder_path):
    """
    Counts the number of image files in the specified folder.

    Args:
        folder_path (str): Path to the folder containing images.

    Returns:
        int: Number of image files in the folder.
    """
    image_extensions = {'.jpg', '.jpeg', '.png', '.bmp', '.gif'}
    image_count = 0

    try:
        with os.scandir(folder_path) as entries:
            for entry in entries:
                if entry.is_file() and os.path.splitext(entry.name)[1].lower() in image_extensions:
                    image_count += 1
    except PermissionError as e:
        print(f"Permission denied: {e}")
        return 0
    except FileNotFoundError:
        print(f"Folder not found: {folder_path}")
        return 0

    return image_count

if __name__ == "__main__":
    folder_path = "D:\\Dataset\\new\\wt_bounding_box"
    num_images = count_images_in_folder(folder_path)
    print(f"Number of images in the folder: {num_images}")

In [5]:
# Specify the directory containing the files
directory = 'D:\\Dataset\\NIH NEW\\wt_bounding_box'

# Loop through all files in the directory
for filename in os.listdir(directory):
    # Check if the file has a .jpg extension
    if filename.endswith('.jpg'):
        # Create the new filename with .png extension
        new_filename = filename.replace('.jpg', '.png')
        # Get the full path to the old and new files
        old_file = os.path.join(directory, filename)
        new_file = os.path.join(directory, new_filename)
        # Rename the file
        os.rename(old_file, new_file)

print("File extensions have been replaced.")

File extensions have been replaced.


In [6]:
import pandas as pd
csv_file_path = 'D:\\Dataset\\NIH Dataset\\BBox_List_2017.csv'
df = pd.read_csv(csv_file_path)

original_dim = 1024
new_dim = 512

# Calculate the scaling factor
scale_factor = new_dim / original_dim

# Scale the bounding box coordinates
df['Bbox [x'] = df['Bbox [x'] * scale_factor
df['y'] = df['y'] * scale_factor
df['w'] = df['w'] * scale_factor
df['h]'] = df['h]'] * scale_factor

# Save the updated DataFrame to a new CSV file
output_file_path = 'D:\\Dataset\\NIH NEW\\NEWSCALED512NIH.csv'
df.to_csv(output_file_path, index=False)

print("CSV file with scaled coordinates has been saved.")

CSV file with scaled coordinates has been saved.


In [8]:
import pandas as pd

# Read the CSV file
df = pd.read_csv('D:\\Dataset\\NIH NEW\\NEWSCALED512NIH.csv')

# Remove the .png extension from the image_id column
df['image_id'] = df['image_id'].str.replace('.png', '')

# Save the updated DataFrame to a new CSV file
df.to_csv('D:\\Dataset\\NIH NEW\\NEWSCALED512NIH_NoFileExtension.csv', index=False)

print("The .png extension has been removed from the image_id column.")

The .png extension has been removed from the image_id column.


In [9]:
import pandas as pd

# Read the CSV file
file_path = 'D:\\Dataset\\NIH NEW\\NEWSCALED512NIH_NoFileExtension.csv'
data = pd.read_csv(file_path)

# List of specific abnormalities to keep
specific_abnormalities = [
    "Atelectasis", "Cardiomegaly", "Consolidation", "Infiltration",
    "Nodule", "Mass", "Pleural_Thickening", "Pneumothorax",
    "Fibrosis", "Effusion"
]

# Filter the dataset to keep only the specified abnormalities
filtered_data = data[data['class_name'].isin(specific_abnormalities)].copy()

# Function to replace the abnormalities
def replace_abnormalities(label):
    replacements = {
        "Nodule": "Nodule/Mass",
        "Mass": "Nodule/Mass",
        "Pleural_Thickening": "Pleural thickening",
        "Fibrosis": "Pulmonary fibrosis",
        "Effusion": "Pleural effusion"
    }
    return replacements.get(label, label)

# Apply the function to the 'class_name' column using .loc
filtered_data.loc[:, 'class_name'] = filtered_data['class_name'].apply(replace_abnormalities)

# Save the updated DataFrame to a new CSV file
filtered_data.to_csv('D:\\Dataset\\NIH NEW\\Modified_NEWSCALED512NIH_NoFileExtension.csv', index=False)

print("The abnormalities have been replaced and the new CSV file has been saved.")

The abnormalities have been replaced and the new CSV file has been saved.


In [17]:
import pandas as pd

# File path
csv_file_path = 'D:\\Dataset\\NIH NEW\\Modified_NEWSCALED512NIH_NoFileExtension.csv'

# Read the CSV file
data = pd.read_csv(csv_file_path)

# Get unique image IDs
unique_image_ids = data['image_id'].unique()

# Print the number of unique image IDs
print(f"Number of unique image IDs: {len(unique_image_ids)}")

Number of unique image IDs: 692


In [10]:
import pandas as pd

# Read the CSV file
file_path = 'D:\\Dataset\\NIH NEW\\Data_Entry_2017.csv'
data = pd.read_csv(file_path)

# List of specific abnormalities to keep
specific_abnormalities = [
    "Atelectasis", "Cardiomegaly", "Consolidation", "Infiltration",
    "Nodule", "Mass", "Pleural_Thickening", "Pneumothorax",
    "Fibrosis", "Effusion"
]

# Filter the dataset to keep only the specified abnormalities
filtered_data = data[data['Finding Labels'].isin(specific_abnormalities)].copy()

# Function to replace the abnormalities
def replace_abnormalities(label):
    replacements = {
        "Nodule": "Nodule/Mass",
        "Mass": "Nodule/Mass",
        "Pleural_Thickening": "Pleural thickening",
        "Fibrosis": "Pulmonary fibrosis",
        "Effusion": "Pleural effusion"
    }
    return replacements.get(label, label)

# Apply the function to the 'Finding Labels' column using .loc
filtered_data.loc[:, 'Finding Labels'] = filtered_data['Finding Labels'].apply(replace_abnormalities)

# Save the updated DataFrame to a new CSV file
filtered_data.to_csv('D:\\Dataset\\NIH NEW\\Modified_NIH_Data_Entry_2017.csv', index=False)

print("The abnormalities have been replaced and the new CSV file has been saved.")

The abnormalities have been replaced and the new CSV file has been saved.


In [12]:
file_path = 'D:\\Dataset\\NIH NEW\\Modified_NIH_Data_Entry_2017.csv'
data = pd.read_csv(file_path)

# Count the number of "PA" in the "View Position" column
pa_count = data[data['View Position'] == 'PA'].shape[0]
print(f"Number of 'PA' view positions: {pa_count}")

# Drop rows where "View Position" is "AP"
data = data[data['View Position'] != 'AP']

data.to_csv('D:\\Dataset\\NIH NEW\\Modified_NIH_Data_Entry_2017_without_AP.csv', index=False)

print(data.head())

Number of 'PA' view positions: 16978
        Image Index Finding Labels  Follow-up #  Patient ID  Patient Age  \
0  00000001_000.png   Cardiomegaly            0           1           58   
1  00000005_006.png   Infiltration            6           5           70   
2  00000008_000.png   Cardiomegaly            0           8           69   
3  00000008_002.png    Nodule/Mass            2           8           73   
4  00000010_000.png   Infiltration            0          10           84   

  Patient Gender View Position  OriginalImage[Width  Height]  \
0              M            PA                 2682     2749   
1              F            PA                 2992     2991   
2              F            PA                 2048     2500   
3              F            PA                 2048     2500   
4              F            PA                 2992     2991   

   OriginalImagePixelSpacing[x     y]  
0                        0.143  0.143  
1                        0.143  0.143  
2

In [14]:
import shutil
import os
import pandas as pd

file_path = 'D:\\Dataset\\NIH NEW\\Modified_NIH_Data_Entry_2017_without_AP.csv'
data = pd.read_csv(file_path)

source_dir = 'D:\\Dataset\\NIH NEW\\wt_bounding_box'
dest_dir = 'D:\\Dataset\\NIH NEW\\wt_bounding_box_Filtered_without_AP'

if not os.path.exists(dest_dir):
    os.makedirs(dest_dir)

# Iterate over the image indices and copy the corresponding images
for image_index in data['Image Index']:
    source_path = os.path.join(source_dir, image_index)
    dest_path = os.path.join(dest_dir, image_index)

    # Copy the image if it exists in the source directory
    if os.path.exists(source_path):
        shutil.copy(source_path, dest_path)
        # Delete the image from the source directory after copying
        os.remove(source_path)
    else:
        print(f"Image not found: {image_index}")

print("Image copying and deletion complete.")

Image not found: 00000001_000.png
Image not found: 00000005_006.png
Image not found: 00000008_000.png
Image not found: 00000008_002.png
Image not found: 00000010_000.png
Image not found: 00000011_000.png
Image not found: 00000011_006.png
Image not found: 00000011_007.png
Image not found: 00000013_020.png
Image not found: 00000013_021.png
Image not found: 00000013_033.png
Image not found: 00000013_036.png
Image not found: 00000013_039.png
Image not found: 00000013_045.png
Image not found: 00000013_046.png
Image not found: 00000020_000.png
Image not found: 00000021_000.png
Image not found: 00000021_001.png
Image not found: 00000022_001.png
Image not found: 00000024_000.png
Image not found: 00000025_000.png
Image not found: 00000026_000.png
Image not found: 00000028_000.png
Image not found: 00000030_000.png
Image not found: 00000030_001.png
Image not found: 00000039_004.png
Image not found: 00000041_006.png
Image not found: 00000043_000.png
Image not found: 00000044_002.png
Image not foun

In [16]:
import pandas as pd

# File path
csv_file_path = 'D:\\Dataset\\NIH NEW\\Modified_NEWSCALED512NIH_NoFileExtension.csv'

# Read the CSV file
data = pd.read_csv(csv_file_path)

# Get unique image IDs
unique_image_ids = data['image_id'].unique()

# Print the number of unique image IDs
print(f"Number of unique image IDs: {len(unique_image_ids)}")

Number of unique image IDs: 692
