In [None]:
# Mount Google Drive at /content/drive; the later cells read from and write to
# paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# File Extraction

In [None]:
# Location on Drive of the VinDr-Mammo 1.0.0 archive that the extraction cell unpacks.
zip_path = '/content/drive/MyDrive/İlkay Hoca/raw_images/vindr-mammo-a-large-scale-benchmark-dataset-for-computer-aided-detection-and-diagnosis-in-full-field-digital-mammography-1.0.0.zip'

In [None]:
import subprocess

# Define extraction path
extract_to = '/content/drive/MyDrive/İlkay Hoca/extracted_images'

# Extract ZIP file.
# unzip is invoked through subprocess (argument list, no shell) so that the
# space in the Drive path needs no quoting and the exit status can be checked:
# previously "Extraction complete!" was printed even when unzip had failed.
unzip_result = subprocess.run(["unzip", "-q", zip_path, "-d", extract_to])
if unzip_result.returncode > 1:
    # Exit status 1 only signals warnings; anything higher is a real failure.
    raise RuntimeError(f"unzip failed with exit status {unzip_result.returncode}")
if unzip_result.returncode == 1:
    print("unzip finished with warnings; check the output above.")
print("Extraction complete!")


Extraction complete!


# DICOM to PNG

In [None]:
!pip install pydicom

Collecting pydicom
  Downloading pydicom-3.0.1-py3-none-any.whl.metadata (9.4 kB)
Downloading pydicom-3.0.1-py3-none-any.whl (2.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.4/2.4 MB[0m [31m21.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydicom
Successfully installed pydicom-3.0.1


In [None]:
import os
import pydicom
from PIL import Image

def convert_dicom_to_png(input_dir, output_dir):
    """
    Converts DICOM images to 8-bit PNG while maintaining the folder structure.

    Every file below ``input_dir`` whose name ends in ``.dcm`` or ``.dicom``
    (case-insensitive) is read, min-max scaled to the 0-255 range and written
    to the same relative folder under ``output_dir`` with a ``.png`` extension.
    Files that cannot be converted are reported on stdout and skipped.

    :param input_dir: Path to the root folder containing DICOM files.
    :param output_dir: Path to the root folder to save PNG files.
    """
    for root, _, files in os.walk(input_dir):
        for file in files:
            # Check for .dcm or .dicom file extensions
            if not file.lower().endswith(('.dcm', '.dicom')):
                continue

            dicom_path = os.path.join(root, file)
            try:
                # Read the DICOM file
                dicom = pydicom.dcmread(dicom_path)

                # Scale in floating point so that computing the value range
                # cannot overflow the stored integer dtype.
                pixels = dicom.pixel_array.astype('float64')
                lowest, highest = pixels.min(), pixels.max()
                if highest > lowest:
                    pixels = (pixels - lowest) / (highest - lowest) * 255
                else:
                    # Constant image: avoid the 0/0 division and emit black.
                    pixels = pixels * 0

                # MONOCHROME1 means the minimum sample value is displayed as
                # white; invert so every PNG uses "higher value = brighter".
                if getattr(dicom, 'PhotometricInterpretation', None) == 'MONOCHROME1':
                    pixels = 255 - pixels

                # Create the PNG image
                img = Image.fromarray(pixels.astype('uint8'))

                # Determine the corresponding output path
                relative_path = os.path.relpath(root, input_dir)
                output_folder = os.path.join(output_dir, relative_path)
                os.makedirs(output_folder, exist_ok=True)

                # Swap the extension regardless of its case (e.g. "X.DCM"),
                # matching the case-insensitive check above.
                output_file = os.path.join(output_folder, os.path.splitext(file)[0] + '.png')
                img.save(output_file)
                print(f"Converted: {dicom_path} -> {output_file}")

            except Exception as e:
                # Best effort: report the failure and continue with the next file.
                print(f"Failed to convert {dicom_path}: {e}")

# Input and output directories.
# input_directory is the "images" folder inside the extracted archive;
# output_directory receives the PNG files in the same sub-folder layout.
input_directory = "/content/drive/MyDrive/İlkay Hoca/extracted_images/vindr-mammo-a-large-scale-benchmark-dataset-for-computer-aided-detection-and-diagnosis-in-full-field-digital-mammography-1.0.0/images"  # Root folder that is searched for DICOM files
output_directory = "/content/drive/MyDrive/İlkay Hoca/png_images"  # Root folder for the converted PNG files

# Walk the whole tree and convert every DICOM file found (prints one line per file).
convert_dicom_to_png(input_directory, output_directory)


[1;30;43mGörüntülenen çıkış son 5000 satıra kısaltıldı.[0m
Converted: /content/drive/MyDrive/İlkay Hoca/extracted_images/vindr-mammo-a-large-scale-benchmark-dataset-for-computer-aided-detection-and-diagnosis-in-full-field-digital-mammography-1.0.0/images/5dee2905bde8aec364d404989a82e0f7/b7f9fb7a1f89f0d0e0a4a5fc4c6d0f0e.dicom -> /content/drive/MyDrive/İlkay Hoca/png_images/5dee2905bde8aec364d404989a82e0f7/b7f9fb7a1f89f0d0e0a4a5fc4c6d0f0e.png
Converted: /content/drive/MyDrive/İlkay Hoca/extracted_images/vindr-mammo-a-large-scale-benchmark-dataset-for-computer-aided-detection-and-diagnosis-in-full-field-digital-mammography-1.0.0/images/5dee2905bde8aec364d404989a82e0f7/72aa1ab4bd19ae3e00c3fb6912d6f37e.dicom -> /content/drive/MyDrive/İlkay Hoca/png_images/5dee2905bde8aec364d404989a82e0f7/72aa1ab4bd19ae3e00c3fb6912d6f37e.png
Converted: /content/drive/MyDrive/İlkay Hoca/extracted_images/vindr-mammo-a-large-scale-benchmark-dataset-for-computer-aided-detection-and-diagnosis-in-full-field

# Count PNG Files

In [None]:
import os

def count_png_files(directory):
    """
    Return how many files with a ``.png`` extension (case-insensitive) exist
    under ``directory``, including all of its subdirectories.

    :param directory: Path to the directory to search.
    :return: Total number of PNG files.
    """
    return sum(
        name.lower().endswith('.png')
        for _, _, names in os.walk(directory)
        for name in names
    )

# Directory that the DICOM-to-PNG conversion wrote into
output_directory = "/content/drive/MyDrive/İlkay Hoca/png_images"  # Root folder holding the converted PNG files

# Count PNG files recursively and report the total as a sanity check on the conversion
total_png_files = count_png_files(output_directory)
print(f"Total PNG files: {total_png_files}")

Total PNG files: 20000


# Annotation File

In [None]:
import pandas as pd

# Load the annotation table (one row per annotation, with a 'split' column and
# xmin/ymin/xmax/ymax bounding-box columns, as shown in the output below).
# NOTE(review): the CSV is read from /content rather than from Drive like the
# other inputs - presumably it is uploaded manually each session; confirm.
df = pd.read_csv("/content/train_valid_test_split.csv")
df

Unnamed: 0,index,study_id,series_id,image_id,laterality,view_position,height,width,breast_birads,breast_density,finding_categories,finding_birads,xmin,ymin,xmax,ymax,split,stratify_column,image_name
0,0,cceccf35004c6032e9755f843e374adc,4505660fef822aae3f7d0bd754ce3318,b755ced71d50184fabdfad246baa97c7,L,CC,3518,2800,BI-RADS 1,DENSITY D,No Finding,,,,,,train,BI-RADS 1_DENSITY D_No Finding,b755ced71d50184fabdfad246baa97c7_L_CC
1,1,f15c0d68384b92369cc483b67725d99b,3e73419f3327ddb7650a22c2f3946234,c9a104da2995e6c1ea7d93e703b022a4,L,MLO,3518,2800,BI-RADS 1,DENSITY D,No Finding,,,,,,train,BI-RADS 1_DENSITY D_No Finding,c9a104da2995e6c1ea7d93e703b022a4_L_MLO
2,2,bab17af07dd34f7b31656562cfc4cfc8,0b943de52b091650c3ce5e384761e4ca,29ce9df80d949b606f58e5007460f37d,L,MLO,3518,2800,BI-RADS 2,DENSITY C,No Finding,,,,,,train,BI-RADS 2_DENSITY C_No Finding,29ce9df80d949b606f58e5007460f37d_L_MLO
3,3,2acc075c29ee781d02f1c5dd8c756240,51fc50a8cbda5e566cf81348a33aef6b,65051e476f2f4168e62fea74a72bf64f,R,CC,3518,2800,BI-RADS 1,DENSITY C,No Finding,,,,,,train,BI-RADS 1_DENSITY C_No Finding,65051e476f2f4168e62fea74a72bf64f_R_CC
4,4,6a77274d2391c35e0e4fcd8dbba32088,0d531615df7a3dfa8eff231aa37de903,16888ecf21147ac8567cd0582cba841c,R,MLO,3518,2800,BI-RADS 1,DENSITY B,No Finding,,,,,,train,BI-RADS 1_DENSITY B_No Finding,16888ecf21147ac8567cd0582cba841c_R_MLO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20507,20507,5e9175554fa9d6b9fb424d7065be4f20,a07d1cc99c7611c815ea99acb324f3b6,4adfc32f03228403f518f24e9b9313cf,R,MLO,3518,2800,BI-RADS 1,DENSITY C,No Finding,,,,,,test,BI-RADS 1_DENSITY C_No Finding,4adfc32f03228403f518f24e9b9313cf_R_MLO
20508,20508,bbe97d67cc7f0d28bf9860a864af684a,3dad6ed9f9682dfbb924407f407eaf12,4835a79d7e25eebd20104e7147cbcfb4,L,CC,3518,2800,BI-RADS 2,DENSITY C,No Finding,,,,,,test,BI-RADS 2_DENSITY C_No Finding,4835a79d7e25eebd20104e7147cbcfb4_L_CC
20509,20509,bbe97d67cc7f0d28bf9860a864af684a,3dad6ed9f9682dfbb924407f407eaf12,ebb437593a3fc80277296638f0607f5b,L,MLO,3518,2800,BI-RADS 2,DENSITY C,No Finding,,,,,,test,BI-RADS 2_DENSITY C_No Finding,ebb437593a3fc80277296638f0607f5b_L_MLO
20510,20510,bbe97d67cc7f0d28bf9860a864af684a,3dad6ed9f9682dfbb924407f407eaf12,9188a01b6b648cee39fb70702a860141,R,CC,3518,2800,BI-RADS 1,DENSITY C,No Finding,,,,,,test,BI-RADS 1_DENSITY C_No Finding,9188a01b6b648cee39fb70702a860141_R_CC


# Adding Index to Annotation

In [None]:
# Add a unique 'index' column; later cells use it as the per-row file name
# (<index>.png / <index>.txt).
# The guard makes the cell safe to re-run and safe for a CSV that already
# carries an 'index' column: without it reset_index() inserts a stray
# 'level_0' column in that situation, and raises ValueError on a further run.
if 'index' not in df.columns:
    df.reset_index(inplace=True)

# Duplicate values would make rows silently overwrite each other's files.
assert df['index'].is_unique, "'index' must be unique: it names the per-row image and label files"
df
# Save the updated annotation file
#df.to_csv("updated_annotations.csv", index=False)

Unnamed: 0,index,study_id,series_id,image_id,laterality,view_position,height,width,breast_birads,breast_density,finding_categories,finding_birads,xmin,ymin,xmax,ymax,split,stratify_column
0,0,cceccf35004c6032e9755f843e374adc,4505660fef822aae3f7d0bd754ce3318,b755ced71d50184fabdfad246baa97c7,L,CC,3518,2800,BI-RADS 1,DENSITY D,No Finding,,,,,,training,BI-RADS 1_DENSITY D_No Finding
1,1,f15c0d68384b92369cc483b67725d99b,3e73419f3327ddb7650a22c2f3946234,c9a104da2995e6c1ea7d93e703b022a4,L,MLO,3518,2800,BI-RADS 1,DENSITY D,No Finding,,,,,,training,BI-RADS 1_DENSITY D_No Finding
2,2,bab17af07dd34f7b31656562cfc4cfc8,0b943de52b091650c3ce5e384761e4ca,29ce9df80d949b606f58e5007460f37d,L,MLO,3518,2800,BI-RADS 2,DENSITY C,No Finding,,,,,,training,BI-RADS 2_DENSITY C_No Finding
3,3,2acc075c29ee781d02f1c5dd8c756240,51fc50a8cbda5e566cf81348a33aef6b,65051e476f2f4168e62fea74a72bf64f,R,CC,3518,2800,BI-RADS 1,DENSITY C,No Finding,,,,,,training,BI-RADS 1_DENSITY C_No Finding
4,4,6a77274d2391c35e0e4fcd8dbba32088,0d531615df7a3dfa8eff231aa37de903,16888ecf21147ac8567cd0582cba841c,R,MLO,3518,2800,BI-RADS 1,DENSITY B,No Finding,,,,,,training,BI-RADS 1_DENSITY B_No Finding
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20507,20507,5e9175554fa9d6b9fb424d7065be4f20,a07d1cc99c7611c815ea99acb324f3b6,4adfc32f03228403f518f24e9b9313cf,R,MLO,3518,2800,BI-RADS 1,DENSITY C,No Finding,,,,,,test,BI-RADS 1_DENSITY C_No Finding
20508,20508,bbe97d67cc7f0d28bf9860a864af684a,3dad6ed9f9682dfbb924407f407eaf12,4835a79d7e25eebd20104e7147cbcfb4,L,CC,3518,2800,BI-RADS 2,DENSITY C,No Finding,,,,,,test,BI-RADS 2_DENSITY C_No Finding
20509,20509,bbe97d67cc7f0d28bf9860a864af684a,3dad6ed9f9682dfbb924407f407eaf12,ebb437593a3fc80277296638f0607f5b,L,MLO,3518,2800,BI-RADS 2,DENSITY C,No Finding,,,,,,test,BI-RADS 2_DENSITY C_No Finding
20510,20510,bbe97d67cc7f0d28bf9860a864af684a,3dad6ed9f9682dfbb924407f407eaf12,9188a01b6b648cee39fb70702a860141,R,CC,3518,2800,BI-RADS 1,DENSITY C,No Finding,,,,,,test,BI-RADS 1_DENSITY C_No Finding


In [None]:
# Build "<image_id>_<laterality>_<view_position>" for every row
separator = "_"
df['image_name'] = df['image_id'] + separator + df['laterality'] + separator + df['view_position']
df

Unnamed: 0,index,study_id,series_id,image_id,laterality,view_position,height,width,breast_birads,breast_density,finding_categories,finding_birads,xmin,ymin,xmax,ymax,split,stratify_column,image_name
0,0,cceccf35004c6032e9755f843e374adc,4505660fef822aae3f7d0bd754ce3318,b755ced71d50184fabdfad246baa97c7,L,CC,3518,2800,BI-RADS 1,DENSITY D,No Finding,,,,,,training,BI-RADS 1_DENSITY D_No Finding,b755ced71d50184fabdfad246baa97c7_L_CC
1,1,f15c0d68384b92369cc483b67725d99b,3e73419f3327ddb7650a22c2f3946234,c9a104da2995e6c1ea7d93e703b022a4,L,MLO,3518,2800,BI-RADS 1,DENSITY D,No Finding,,,,,,training,BI-RADS 1_DENSITY D_No Finding,c9a104da2995e6c1ea7d93e703b022a4_L_MLO
2,2,bab17af07dd34f7b31656562cfc4cfc8,0b943de52b091650c3ce5e384761e4ca,29ce9df80d949b606f58e5007460f37d,L,MLO,3518,2800,BI-RADS 2,DENSITY C,No Finding,,,,,,training,BI-RADS 2_DENSITY C_No Finding,29ce9df80d949b606f58e5007460f37d_L_MLO
3,3,2acc075c29ee781d02f1c5dd8c756240,51fc50a8cbda5e566cf81348a33aef6b,65051e476f2f4168e62fea74a72bf64f,R,CC,3518,2800,BI-RADS 1,DENSITY C,No Finding,,,,,,training,BI-RADS 1_DENSITY C_No Finding,65051e476f2f4168e62fea74a72bf64f_R_CC
4,4,6a77274d2391c35e0e4fcd8dbba32088,0d531615df7a3dfa8eff231aa37de903,16888ecf21147ac8567cd0582cba841c,R,MLO,3518,2800,BI-RADS 1,DENSITY B,No Finding,,,,,,training,BI-RADS 1_DENSITY B_No Finding,16888ecf21147ac8567cd0582cba841c_R_MLO
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20507,20507,5e9175554fa9d6b9fb424d7065be4f20,a07d1cc99c7611c815ea99acb324f3b6,4adfc32f03228403f518f24e9b9313cf,R,MLO,3518,2800,BI-RADS 1,DENSITY C,No Finding,,,,,,test,BI-RADS 1_DENSITY C_No Finding,4adfc32f03228403f518f24e9b9313cf_R_MLO
20508,20508,bbe97d67cc7f0d28bf9860a864af684a,3dad6ed9f9682dfbb924407f407eaf12,4835a79d7e25eebd20104e7147cbcfb4,L,CC,3518,2800,BI-RADS 2,DENSITY C,No Finding,,,,,,test,BI-RADS 2_DENSITY C_No Finding,4835a79d7e25eebd20104e7147cbcfb4_L_CC
20509,20509,bbe97d67cc7f0d28bf9860a864af684a,3dad6ed9f9682dfbb924407f407eaf12,ebb437593a3fc80277296638f0607f5b,L,MLO,3518,2800,BI-RADS 2,DENSITY C,No Finding,,,,,,test,BI-RADS 2_DENSITY C_No Finding,ebb437593a3fc80277296638f0607f5b_L_MLO
20510,20510,bbe97d67cc7f0d28bf9860a864af684a,3dad6ed9f9682dfbb924407f407eaf12,9188a01b6b648cee39fb70702a860141,R,CC,3518,2800,BI-RADS 1,DENSITY C,No Finding,,,,,,test,BI-RADS 1_DENSITY C_No Finding,9188a01b6b648cee39fb70702a860141_R_CC


# Duplicate Images Based on Bounding Box and Split to Train/Valid/Test

In [None]:
import os
import shutil

# Paths
base_image_dir = "/content/drive/MyDrive/İlkay Hoca/png_images"  # Root directory with images (study_id > image_id.png)
output_image_dir = "/content/drive/MyDrive/İlkay Hoca/yolo_images"  # Directory to store all images
base_split_dir = "/content/drive/MyDrive/İlkay Hoca/final_yolo"  # Base directory for splits

# Create output directory if it doesn't exist
os.makedirs(output_image_dir, exist_ok=True)

# Create one image directory per split value that actually occurs in the
# annotations. The previous hard-coded names ('training', 'validation') did not
# match the 'train'/'val' values in df['split'], so the destination directory
# of the copy did not exist.
splits = df['split'].unique().tolist()
for split in splits:
    os.makedirs(os.path.join(base_split_dir, 'images', split), exist_ok=True)

# Every annotation row gets its own copy of the source image, named after the
# row's unique 'index', inside the directory of the row's split.
missing_sources = []
for _, row in df.iterrows():
    study_id = row['study_id']
    image_id = row['image_id']
    unique_id = row['index']  # Unique identifier for the bounding box
    split = row['split']  # Split name exactly as stored in the annotations

    image_src = os.path.join(base_image_dir, study_id, f"{image_id}.png")
    if not os.path.exists(image_src):
        # Keep going, but remember the gap so it is reported below.
        missing_sources.append(image_src)
        continue

    # Copy straight to the final location (no intermediate copy + move on Drive).
    image_split_dst = os.path.join(base_split_dir, 'images', split, f"{unique_id}.png")
    shutil.copy(image_src, image_split_dst)

if missing_sources:
    print(f"Warning: {len(missing_sources)} source images were not found, e.g. {missing_sources[0]}")

In [None]:
# Number of annotation rows per split
df['split'].value_counts()

Unnamed: 0_level_0,count
split,Unnamed: 1_level_1
train,13132
test,4098
val,3282


# COCO Annotations

In [None]:
import os
import json
from tqdm import tqdm

# Define the mapping of finding categories to class IDs (1-based for COCO)
category_to_id = {
    'No Finding': 0,  # Special case, not included in "categories"
    'Mass': 1,
    'Suspicious Calcification': 2,
    'Architectural Distortion': 3,
    'Asymmetry': 4,
    'Focal Asymmetry': 5,
    'Global Asymmetry': 6
}

# Initialize COCO annotation structure (all splits together)
coco_annotations = {
    "images": [],
    "annotations": [],
    "categories": [
        {"id": class_id, "name": category}
        for category, class_id in category_to_id.items() if class_id != 0  # Exclude 'No Finding'
    ]
}

# Split of every image id. The per-split files are partitioned with this
# mapping; the previous filter compared the split name with the start of
# "<index>.png", which never matched, so every JSON file came out empty.
image_split = {}

# Annotation ID counter
annotation_id = 1

# Base image directory
base_image_dir = "/content/drive/MyDrive/İlkay Hoca/final_yolo/images"
coco_output_path = "/content/drive/MyDrive/İlkay Hoca/final_yolo"

# Iterate through the DataFrame to populate the COCO structure
for _, row in tqdm(df.iterrows(), total=len(df)):
    # Plain Python numbers keep the structure JSON-serialisable.
    unique_id = int(row['index'])  # Unique ID for the image
    finding_category = row['finding_categories']  # Finding category
    split = row['split']  # Split name exactly as stored in the annotations
    image_split[unique_id] = split

    # One image record per row (each row has its own copy of the image)
    image_info = {
        "id": unique_id,
        "file_name": f"{unique_id}.png",
        "width": int(row['width']),
        "height": int(row['height'])
    }
    coco_annotations["images"].append(image_info)

    # Skip adding annotations for "No Finding"
    if finding_category == 'No Finding':
        continue

    # Map the finding category to a class ID; unknown categories get no annotation
    class_id = category_to_id.get(finding_category)
    if class_id is None:
        continue

    # Get bounding box coordinates
    xmin, ymin, xmax, ymax = (float(row[key]) for key in ('xmin', 'ymin', 'xmax', 'ymax'))
    if any(value != value for value in (xmin, ymin, xmax, ymax)):
        # NaN is the only value unequal to itself; a box with a missing
        # coordinate would be written as invalid JSON, so leave it out.
        continue
    bbox_width = xmax - xmin
    bbox_height = ymax - ymin

    # Add the annotation
    annotation = {
        "id": annotation_id,
        "image_id": unique_id,
        "category_id": class_id,
        "bbox": [xmin, ymin, bbox_width, bbox_height],  # COCO uses [x_min, y_min, width, height]
        "area": bbox_width * bbox_height,
        "iscrowd": 0  # Set to 0 for regular annotations
    }
    coco_annotations["annotations"].append(annotation)
    annotation_id += 1

# Save one COCO JSON file per split value present in the annotations
for split in df['split'].unique():
    split_image_ids = {image_id for image_id, name in image_split.items() if name == split}
    split_annotations = {
        "images": [img for img in coco_annotations["images"] if img["id"] in split_image_ids],
        "annotations": [ann for ann in coco_annotations["annotations"] if ann["image_id"] in split_image_ids],
        "categories": coco_annotations["categories"]
    }
    output_path = os.path.join(coco_output_path, f"{split}_annotations.json")
    with open(output_path, 'w') as json_file:
        json.dump(split_annotations, json_file, indent=4)

print("COCO annotation files created for each split!")


100%|██████████| 20512/20512 [00:01<00:00, 17404.06it/s]


COCO annotation files created for each split!


# YOLO Annotations

In [None]:
import os
from tqdm import tqdm

# Class IDs written into the YOLO label files, keyed by finding category (0 to 6)
category_to_id = {
    name: class_id
    for class_id, name in enumerate([
        'No Finding',
        'Mass',
        'Suspicious Calcification',
        'Architectural Distortion',
        'Asymmetry',
        'Focal Asymmetry',
        'Global Asymmetry',
    ])
}

# Root folders of the YOLO label files and of the already-split images
base_yolo_label_dir = "/content/drive/MyDrive/İlkay Hoca/final_yolo/labels"
base_image_dir = "/content/drive/MyDrive/İlkay Hoca/final_yolo/images"

# One label folder and one pair of counters per split
splits = ['train', 'val', 'test']
counts = {}
for split in splits:
    os.makedirs(os.path.join(base_yolo_label_dir, split), exist_ok=True)
    counts[split] = {'labels': 0, 'images': 0}

# Write one `.txt` label file per annotation row into the folder of the row's split
for _, row in tqdm(df.iterrows(), total=len(df)):
    unique_id = row['index']  # Names both the label file and the image file
    finding_category = row['finding_categories']
    split = row['split']  # Must be one of the names in `splits`

    label_file_path = os.path.join(base_yolo_label_dir, split, f"{unique_id}.txt")

    if finding_category == 'No Finding':
        # "No Finding" rows get an empty label file
        label = ""
    elif finding_category not in category_to_id:
        # Unknown category: no label file and no counting for this row
        continue
    else:
        class_id = category_to_id[finding_category]

        # Pixel-space box corners and image size
        xmin, ymin, xmax, ymax = (row[key] for key in ('xmin', 'ymin', 'xmax', 'ymax'))
        img_width, img_height = row['width'], row['height']

        # YOLO format: box centre and size as fractions of the image size
        x_center = (xmin + xmax) / 2 / img_width
        y_center = (ymin + ymax) / 2 / img_height
        box_width = (xmax - xmin) / img_width
        box_height = (ymax - ymin) / img_height

        label = f"{class_id} {x_center:.6f} {y_center:.6f} {box_width:.6f} {box_height:.6f}"

    with open(label_file_path, 'w') as label_file:
        label_file.write(label)
    counts[split]['labels'] += 1

    # Count the row's image if it is present in the matching image folder
    image_file_path = os.path.join(base_image_dir, split, f"{unique_id}.png")
    if os.path.exists(image_file_path):
        counts[split]['images'] += 1

# Print summary of counts
print("Summary of Images and Labels per Split:")
for split in splits:
    split_counts = counts[split]
    print(f"Split: {split}")
    print(f"  Number of label files: {split_counts['labels']}")
    print(f"  Number of images: {split_counts['images']}")
    print()


100%|██████████| 20512/20512 [01:57<00:00, 174.87it/s]

Summary of Images and Labels per Split:
Split: train
  Number of label files: 13132
  Number of images: 13132

Split: val
  Number of label files: 3282
  Number of images: 3282

Split: test
  Number of label files: 4098
  Number of images: 4098






# Final Check

In [None]:
import os

# Paths to split directories
base_dir = '/content/drive/MyDrive/İlkay Hoca/final_yolo'
splits = ['train', 'val', 'test']

# Initialize counters
total_images = 0
total_labels = 0

# Becomes False as soon as any split has an image without a label or a label
# without an image. Comparing only the two totals is not enough: one unmatched
# image and one unmatched label cancel out and the totals still agree.
all_matched = True

# Check each split
for split in splits:
    image_dir = os.path.join(base_dir, 'images', split)
    label_dir = os.path.join(base_dir, 'labels', split)

    # File stems (names without extension) present in each directory
    image_files = {os.path.splitext(f)[0] for f in os.listdir(image_dir) if f.endswith('.png')}
    label_files = {os.path.splitext(f)[0] for f in os.listdir(label_dir) if f.endswith('.txt')}

    # Check matching images and labels
    missing_labels = image_files - label_files
    missing_images = label_files - image_files
    if missing_labels or missing_images:
        all_matched = False

    print(f"--- {split.upper()} SPLIT ---")
    print(f"Total images: {len(image_files)}")
    print(f"Total labels: {len(label_files)}")
    print(f"Missing labels: {len(missing_labels)} -> {missing_labels}")
    print(f"Missing images: {len(missing_images)} -> {missing_images}")
    print()

    # Update counters
    total_images += len(image_files)
    total_labels += len(label_files)

# Summary
print("=== SUMMARY ===")
print(f"Total images across all splits: {total_images}")
print(f"Total labels across all splits: {total_labels}")
if all_matched:
    print("✅ All images have matching labels!")
else:
    print("❌ Some images or labels are missing!")


--- TRAIN SPLIT ---
Total images: 13132
Total labels: 13132
Missing labels: 0 -> set()
Missing images: 0 -> set()

--- VAL SPLIT ---
Total images: 3282
Total labels: 3282
Missing labels: 0 -> set()
Missing images: 0 -> set()

--- TEST SPLIT ---
Total images: 4098
Total labels: 4098
Missing labels: 0 -> set()
Missing images: 0 -> set()

=== SUMMARY ===
Total images across all splits: 20512
Total labels across all splits: 20512
✅ All images have matching labels!


In [None]:
import os
import pandas as pd

# Paths
base_dir = '/content/drive/MyDrive/İlkay Hoca/final_yolo'

# Take the split names from the annotations themselves. With hard-coded names
# that do not occur in df['split'] (e.g. 'training' vs 'train') the subsets
# below are empty and the check reports success without testing any file.
splits = df['split'].unique().tolist()

# Check for mismatches
mismatches = []
checked_rows = 0

for split in splits:
    # Get DataFrame subset for the current split
    split_df = df[df['split'] == split]

    # Paths for images and labels
    image_dir = os.path.join(base_dir, 'images', split)
    label_dir = os.path.join(base_dir, 'labels', split)

    for _, row in split_df.iterrows():
        checked_rows += 1
        index = row['index']
        # Expected filenames
        expected_image = f"{index}.png"
        expected_label = f"{index}.txt"

        # Check if they exist
        image_exists = os.path.exists(os.path.join(image_dir, expected_image))
        label_exists = os.path.exists(os.path.join(label_dir, expected_label))

        if not (image_exists and label_exists):
            mismatches.append({
                'split': split,
                'index': index,
                'image_exists': image_exists,
                'label_exists': label_exists
            })

# Print results; the row count shows how much of the table was really verified
print(f"Checked {checked_rows} of {len(df)} annotation rows.")
if mismatches:
    print("=== MISMATCHES FOUND ===")
    for mismatch in mismatches:
        print(f"Split: {mismatch['split']}, Index: {mismatch['index']}, "
              f"Image Exists: {mismatch['image_exists']}, Label Exists: {mismatch['label_exists']}")
else:
    print("✅ All files are correctly split and matched!")


✅ All files are correctly split and matched!


# YOLO Training

In [None]:
!pip install ultralytics

Collecting ultralytics
  Downloading ultralytics-8.3.48-py3-none-any.whl.metadata (35 kB)
Collecting ultralytics-thop>=2.0.0 (from ultralytics)
  Downloading ultralytics_thop-2.0.13-py3-none-any.whl.metadata (9.4 kB)
Downloading ultralytics-8.3.48-py3-none-any.whl (898 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m898.8/898.8 kB[0m [31m29.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading ultralytics_thop-2.0.13-py3-none-any.whl (26 kB)
Installing collected packages: ultralytics-thop, ultralytics
Successfully installed ultralytics-8.3.48 ultralytics-thop-2.0.13


In [None]:
from ultralytics import YOLO

Creating new Ultralytics Settings v0.0.6 file ✅ 
View Ultralytics Settings with 'yolo settings' or at '/root/.config/Ultralytics/settings.json'
Update Settings with 'yolo settings key=value', i.e. 'yolo settings runs_dir=path/to/dir'. For help see https://docs.ultralytics.com/quickstart/#ultralytics-settings.


In [None]:
# Train a detector starting from the yolov5l6u.pt checkpoint for 20 epochs at imgsz=640.
# NOTE(review): training reads dataset_config_weights.yaml while the evaluation
# cell reads dataset_config.yaml - confirm both files describe the same splits
# and class names.
!yolo task=detect mode=train model=yolov5l6u.pt data="/content/drive/MyDrive/İlkay Hoca/dataset_config_weights.yaml" epochs=20 imgsz=640

Downloading https://github.com/ultralytics/assets/releases/download/v8.3.0/yolov5l6u.pt to 'yolov5l6u.pt'...
100% 165M/165M [00:02<00:00, 78.8MB/s]
Ultralytics 8.3.48 🚀 Python-3.10.12 torch-2.5.1+cu121 CUDA:0 (Tesla T4, 15102MiB)
[34m[1mengine/trainer: [0mtask=detect, mode=train, model=yolov5l6u.pt, data=/content/drive/MyDrive/İlkay Hoca/dataset_config_weights.yaml, epochs=20, time=None, patience=100, batch=16, imgsz=640, save=True, save_period=-1, cache=False, device=None, workers=8, project=None, name=train, exist_ok=False, pretrained=True, optimizer=auto, verbose=True, seed=0, deterministic=True, single_cls=False, rect=False, cos_lr=False, close_mosaic=10, resume=False, amp=True, fraction=1.0, profile=False, freeze=None, multi_scale=False, overlap_mask=True, mask_ratio=4, dropout=0.0, val=True, split=val, save_json=False, save_hybrid=False, conf=None, iou=0.7, max_det=300, half=False, dnn=False, plots=True, source=None, vid_stride=1, stream_buffer=False, visualize=False, augment

In [None]:
# Evaluate the best.pt weights from /content/runs/detect/train on the test split.
# NOTE(review): this uses dataset_config.yaml, whereas the training cell used
# dataset_config_weights.yaml - confirm the two configs are meant to differ.
!yolo task=detect mode=val model="/content/runs/detect/train/weights/best.pt" data="/content/drive/MyDrive/İlkay Hoca/dataset_config.yaml" split=test

Ultralytics 8.3.48 🚀 Python-3.10.12 torch-2.5.1+cu121 CUDA:0 (Tesla T4, 15102MiB)
YOLOv5l6u summary (fused): 393 layers, 85,978,476 parameters, 0 gradients, 137.0 GFLOPs
[34m[1mval: [0mScanning /content/drive/MyDrive/İlkay Hoca/final_yolo/labels/test... 4098 images, 3643 backgrounds, 0 corrupt: 100% 4098/4098 [11:07<00:00,  6.14it/s]
[34m[1mval: [0mNew cache created: /content/drive/MyDrive/İlkay Hoca/final_yolo/labels/test.cache
                 Class     Images  Instances      Box(P          R      mAP50  mAP50-95): 100% 257/257 [05:36<00:00,  1.31s/it]
                   all       4098        455      0.587      0.036     0.0274     0.0135
                  Mass        237        237      0.323       0.19      0.139     0.0699
Suspicious Calcification        115        115        0.2     0.0261     0.0149    0.00487
Architectural Distortion         24         24          0          0          0          0
             Asymmetry         20         20          1          0     