<a href="https://colab.research.google.com/github/iamshanevictor/CXRaide2.0-Model_Training/blob/main/Data_Preprocessing%20Part%202/Final_DataPreprocess.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; the dataset paths used later in this
# notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
import shutil
import pandas as pd
import xml.etree.ElementTree as ET
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

In [None]:
!pip install pillow



In [None]:
# Root of the VOC2007-style dataset tree that the rest of the notebook fills in
base_path = '/content/drive/Shareddrives/cxraide/ssd300_vgg16/VER6/VOCdevkit/VOC2007'

# Make sure the three sub-directories exist (a no-op when they are already there)
for voc_subdir in ('Annotations', 'ImageSets/Main', 'JPEGImages'):
    os.makedirs(os.path.join(base_path, voc_subdir), exist_ok=True)

# Source folders holding the training and validation images
train_folder = '/content/drive/Shareddrives/cxraide/ssd300_vgg16/VER6/balance_TRAINING'
val_folder = '/content/drive/Shareddrives/cxraide/ssd300_vgg16/VER6/balance_VALIDATION'
jpeg_images_folder = os.path.join(base_path, 'JPEGImages')

# Copy every .png into JPEGImages: training folder first, then validation.
# Both splits share one destination, so a validation file with the same name
# as a training file replaces the copy made from the training folder.
for source_folder in (train_folder, val_folder):
    for entry in os.listdir(source_folder):
        if not entry.endswith('.png'):
            continue
        shutil.copy(os.path.join(source_folder, entry), os.path.join(jpeg_images_folder, entry))

def create_annotation(image_id, width, height, objects, output_dir):
    """Write a VOC-style XML annotation file for a single image.

    The file is saved as ``<output_dir>/<image_id>.xml``. The ``path`` element
    is built from the module-level ``jpeg_images_folder``, which therefore has
    to be defined before this function is called.

    Args:
        image_id: Image name without extension; ``'.png'`` / ``'.xml'`` are
            appended to it with ``+``.
        width: Image width, stored as ``str(width)``.
        height: Image height, stored as ``str(height)``.
        objects: Iterable of mappings, each providing the keys ``class_name``,
            ``x_min``, ``y_min``, ``x_max`` and ``y_max``.
        output_dir: Directory that receives the XML file.

    Returns:
        None. The only effect is the file written to ``output_dir``.
    """
    def add_text_child(parent, tag, text):
        # Append <tag>text</tag> under parent and hand the new element back.
        child = ET.SubElement(parent, tag)
        child.text = text
        return child

    png_name = image_id + '.png'

    root = ET.Element('annotation')
    add_text_child(root, 'folder', 'VOC2007')
    add_text_child(root, 'filename', png_name)
    add_text_child(root, 'path', os.path.join(jpeg_images_folder, png_name))

    source_node = ET.SubElement(root, 'source')
    add_text_child(source_node, 'database', 'Unknown')

    # The depth is always recorded as 3, independent of the arguments.
    size_node = ET.SubElement(root, 'size')
    for tag, value in (('width', str(width)), ('height', str(height)), ('depth', '3')):
        add_text_child(size_node, tag, value)

    add_text_child(root, 'segmented', '0')

    # XML tag -> key of the object mapping holding that coordinate.
    box_fields = (('xmin', 'x_min'), ('ymin', 'y_min'), ('xmax', 'x_max'), ('ymax', 'y_max'))
    for record in objects:
        object_node = ET.SubElement(root, 'object')
        add_text_child(object_node, 'name', record['class_name'])
        add_text_child(object_node, 'pose', 'Unspecified')
        add_text_child(object_node, 'truncated', '0')
        add_text_child(object_node, 'difficult', '0')

        box_node = ET.SubElement(object_node, 'bndbox')
        for tag, key in box_fields:
            add_text_child(box_node, tag, str(record[key]))

    ET.ElementTree(root).write(os.path.join(output_dir, image_id + '.xml'))

# Read the annotation tables for the training and validation splits
train_df = pd.read_csv('/content/drive/Shareddrives/cxraide/ssd300_vgg16/VER6/balanced_train.csv')
val_df = pd.read_csv('/content/drive/Shareddrives/cxraide/ssd300_vgg16/VER6/balanced_val.csv')

# Make sure the Annotations folder is there before writing into it
annotations_folder = os.path.join(base_path, 'Annotations')
os.makedirs(annotations_folder, exist_ok=True)

# One XML file per image_id, built from all rows sharing that id; training
# rows are processed first, then validation rows. Every annotation is
# written with a fixed 300x300 size.
for split_df in (train_df, val_df):
    for image_id, group in split_df.groupby('image_id'):
        create_annotation(image_id, 300, 300, group.to_dict('records'), annotations_folder)

# Make sure the ImageSets/Main folder is there before writing the split lists
image_sets_folder = os.path.join(base_path, 'ImageSets/Main')
os.makedirs(image_sets_folder, exist_ok=True)

# Distinct image ids (no file extension) for each split
trainval_filenames = train_df['image_id'].unique()
test_filenames = val_df['image_id'].unique()

# trainval.txt lists the training ids and test.txt the validation ids,
# one id per line
for list_name, image_ids in (('trainval.txt', trainval_filenames), ('test.txt', test_filenames)):
    with open(os.path.join(image_sets_folder, list_name), 'w') as list_file:
        list_file.writelines(f"{stem}\n" for stem in image_ids)