<a href="https://colab.research.google.com/github/joelleslim/memoire/blob/main/Dental_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Fetch the three Kaggle datasets this notebook works with via kagglehub.
# Each call returns the local directory of the corresponding dataset, which
# is kept in a module-level variable named after the dataset.
# Note: this environment may lack libraries that Kaggle's own Python
# environment provides.
import kagglehub

_download = kagglehub.dataset_download

nguyenttung_dental_bitewing_x_ray_dataset_path = _download('nguyenttung/dental-bitewing-x-ray-dataset')
nguyenttung_dental_periapical_x_ray_dataset_path = _download('nguyenttung/dental-periapical-x-ray-dataset')
nguyenttung_dental_mix_4k_images_path = _download('nguyenttung/dental-mix-4k-images')

print('Data source import complete.')


In [None]:
import os
import shutil
from pathlib import Path

import cv2
import numpy as np
import matplotlib.pyplot as plt

from PIL import Image
from tqdm.notebook import tqdm
from sklearn.model_selection import train_test_split

In [None]:
def resolve_dataset_dir(kaggle_dir, downloaded_dir):
    """Pick the directory to read a dataset from.

    Args:
        kaggle_dir: Path where the dataset is mounted inside a Kaggle
            notebook (``/kaggle/input/<slug>``).
        downloaded_dir: Path returned by ``kagglehub.dataset_download`` for
            the same dataset, or ``None`` when no download was performed.

    Returns:
        ``kaggle_dir`` when it exists on disk or when there is no downloaded
        copy to fall back to; otherwise ``downloaded_dir``.
    """
    if os.path.isdir(kaggle_dir) or downloaded_dir is None:
        return kaggle_dir
    return downloaded_dir


# Root directories that are searched for images and masks.  The Kaggle mount
# points are preferred; when they are absent (e.g. the notebook is run outside
# Kaggle) the kagglehub download locations are used instead.  The download
# variables are looked up by name so this cell still works if the download
# cell was removed.
datasets = [
    resolve_dataset_dir(
        '/kaggle/input/dental-bitewing-x-ray-dataset',
        globals().get('nguyenttung_dental_bitewing_x_ray_dataset_path'),
    ),
    resolve_dataset_dir(
        '/kaggle/input/dental-mix-4k-images',
        globals().get('nguyenttung_dental_mix_4k_images_path'),
    ),
    resolve_dataset_dir(
        '/kaggle/input/dental-periapical-x-ray-dataset',
        globals().get('nguyenttung_dental_periapical_x_ray_dataset_path'),
    ),
]

In [None]:
def get_files(directory, extension):
    """Recursively collect files under ``directory`` whose name ends with ``extension``.

    The suffix comparison is case-sensitive (``str.endswith``).

    Args:
        directory: Root directory to walk.
        extension: Suffix a file name must end with, e.g. ``'.png'``.

    Returns:
        A sorted list of the matching file paths, each built by joining the
        containing directory with the file name.
    """
    matches = [
        os.path.join(parent, name)
        for parent, _subdirs, names in os.walk(directory)
        for name in names
        if name.endswith(extension)
    ]
    return sorted(matches)

In [None]:
# Gather every '.JPG' image and every '.png' annotation from all dataset
# roots.  Within each dataset the paths come back sorted from get_files.
# NOTE(review): images and annotations are later paired purely by position in
# these two lists — confirm that each dataset yields them in matching order.
images, annotations = [], []

for dataset in datasets:
    images += get_files(dataset, '.JPG')
    annotations += get_files(dataset, '.png')

print(f'Number of images: {len(images)}')
print(f'Number of annotations: {len(annotations)}')

In [None]:
def show_img_annotation(images, annotations, index, figsize=(12, 6)):
    """Display one image side by side with its annotation.

    Args:
        images: Sequence of image file paths.
        annotations: Sequence of annotation file paths; the entry at
            ``index`` is shown next to ``images[index]``.
        index: Position of the pair to display.
        figsize: Figure size forwarded to ``plt.subplots``.

    Returns:
        None.  When ``index`` is negative or out of range for either
        sequence, prints ``"Index error"`` and returns without plotting.
    """
    # Validate against the shorter sequence: a missing annotation is reported
    # the same way as a missing image, rather than raising IndexError after
    # the figure has already been created.
    n = min(len(images), len(annotations))
    if index >= n or index < 0:
        print("Index error")
        return

    _, (ax1, ax2) = plt.subplots(1, 2, figsize=figsize)

    # Left panel: the image, drawn with a gray colormap.
    img_path = images[index]
    img = plt.imread(img_path)
    ax1.imshow(img, cmap='gray')

    # Right panel: the annotation, drawn with its own colours.
    ann_path = annotations[index]
    ann = plt.imread(ann_path)
    ax2.imshow(ann)

    plt.show()

In [None]:
# Preview three image/annotation pairs at randomly drawn positions
# (positions are drawn independently, so repeats are possible).
random_positions = np.random.randint(0, len(images), 3)
for index in random_positions:
    show_img_annotation(images, annotations, index)

In [None]:
labelmap_path = '/kaggle/input/dental-periapical-x-ray-dataset/Dental-periapuical-x-ray-dataset/labelmap.txt'
label_colors = {}

# Build a mapping from label name to an (r, g, b) tuple of ints.
# Only the first two colon-separated fields of each line are read: the label
# name and a comma-separated colour.  Lines starting with '#', blank lines and
# the 'background' label are ignored.
with open(labelmap_path, 'r') as file:
    for line in file:
        is_comment = line.startswith('#')
        is_blank = line.strip() == ''
        if is_comment or is_blank:
            continue

        fields = line.strip().split(':')
        name = fields[0]
        if name != 'background':
            label_colors[name] = tuple(int(channel) for channel in fields[1].split(','))

print(f'Số lượng labels: {len(label_colors)}')

for label in label_colors:
    color = label_colors[label]
    print(label, ':', color)

In [None]:
# structure_counts = {label: 0 for label in label_colors}

# for mask_file in annotations:
#     mask = cv2.imread(mask_file)
#     mask = cv2.cvtColor(mask, cv2.COLOR_BGR2RGB)
#     for label, color in label_colors.items():

#         if np.any(np.all(mask == color, axis=-1)):
#             structure_counts[label] += 1


# print(structure_counts)

In [None]:
# total_masks = len(annotations)
# structure_percent = {label: (count/ total_masks) * 100 for label, count in structure_counts.items()}

# labels = list(structure_counts.keys())
# counts = list(structure_counts.values())
# percentages = list(structure_percent.values())

# plt.figure(figsize=(10, 6))
# plt.bar(labels, counts, color='skyblue')
# plt.title('Số lượng ảnh có chứa cấu trúc')
# plt.xlabel('Cấu trúc')
# plt.ylabel('Số lượng ảnh')
# plt.xticks(rotation=90)


# plt.figure(figsize=(10, 6))
# plt.bar(labels, percentages, color='lightgreen')
# plt.title('Phần trăm ảnh có chứa cấu trúc')
# plt.xlabel('Cấu trúc')
# plt.ylabel('Phần trăm (%)')
# plt.xticks(rotation=90)
# plt.show()

In [None]:
# Two-stage split with a fixed seed: first hold out 30% of the pairs, then
# divide that hold-out set in half into validation and test.
train_images, holdout_images, train_annotations, holdout_annotations = train_test_split(
    images, annotations, test_size=0.3, random_state=42
)
val_images, test_images, val_annotations, test_annotations = train_test_split(
    holdout_images, holdout_annotations, test_size=0.5, random_state=42
)

In [None]:
# Create <split>/images and <split>/annotations under /kaggle/working for
# every split; directories that already exist are left alone.
output_dirs = ['train', 'val', 'test']
for split_name in output_dirs:
    for subfolder in ('images', 'annotations'):
        os.makedirs(f'/kaggle/working/{split_name}/{subfolder}', exist_ok=True)

In [None]:
# Scratch check: show the name used for saved files, i.e. the last path
# component of the first image, cut at its first '.'.
img_dir = images[0]
img_dir.rsplit('/', 1)[-1].partition('.')[0]

In [None]:
def process_and_save(images, annotations, output_dir, dataset_type=None, size=416):
    """Resize image/mask pairs and store them as ``.npy`` files.

    Each image is read as grayscale, converted to float32, divided by 255,
    resized to ``size`` x ``size`` and saved as float16 under
    ``<output_dir>/images``.  Each mask is read with OpenCV's default flags,
    converted from BGR to RGB, resized with nearest-neighbour interpolation
    and saved as uint8 under ``<output_dir>/annotations``.  The two target
    sub-directories are not created here.

    Args:
        images: Image file paths.
        annotations: Mask file paths, paired with ``images`` by position.
        output_dir: Directory containing the ``images`` and ``annotations``
            sub-directories to write into.
        dataset_type: When equal to ``'train'``, intensity statistics are
            accumulated over the processed images.
        size: Side length, in pixels, of the square output.

    Returns:
        A tuple ``(sums, sums_squared)``.  For ``dataset_type == 'train'``
        these are the per-image mean intensity and the per-image mean squared
        intensity, each summed over all images; otherwise both stay ``0``.

    Raises:
        ValueError: If ``images`` and ``annotations`` differ in length.
        OSError: If OpenCV cannot read an image or a mask file.
    """
    images = list(images)
    annotations = list(annotations)
    # zip() would silently drop the unpaired tail; fail loudly instead so a
    # broken pairing cannot go unnoticed.
    if len(images) != len(annotations):
        raise ValueError(
            'images and annotations must have the same length, '
            f'got {len(images)} and {len(annotations)}'
        )

    pixel_count = size * size
    sums = 0
    sums_squared = 0
    for img_dir, mask_dir in zip(images, annotations):
        # cv2.imread signals failure by returning None rather than raising.
        img = cv2.imread(img_dir, cv2.IMREAD_GRAYSCALE)
        if img is None:
            raise OSError(f'Could not read image: {img_dir}')

        mask = cv2.imread(mask_dir)
        if mask is None:
            raise OSError(f'Could not read annotation: {mask_dir}')
        mask = cv2.cvtColor(mask, cv2.COLOR_BGR2RGB)

        img = img.astype(np.float32)
        mask = mask.astype(np.uint8)

        img = img / 255.0

        # Nearest-neighbour keeps the mask's colour codes intact (no blended
        # colours).
        # NOTE(review): the image itself is also resized with INTER_NEAREST;
        # confirm this is intended before switching filters, since that would
        # change the saved data.
        img = cv2.resize(img, (size, size), interpolation=cv2.INTER_NEAREST)
        mask = cv2.resize(mask, (size, size), interpolation=cv2.INTER_NEAREST)

        # Output name: last path component cut at its first '.'.
        # NOTE(review): files sharing that prefix overwrite each other.
        img_name = img_dir.split('/')[-1].split('.')[0]
        mask_name = mask_dir.split('/')[-1].split('.')[0]
        np.save(os.path.join(output_dir, 'images', img_name + '.npy'), img.astype(np.float16))
        np.save(os.path.join(output_dir, 'annotations', mask_name + '.npy'), mask)

        if dataset_type == 'train':
            # Statistics use the float32 image, before the float16 cast.
            sums += np.sum(img) / pixel_count
            sums_squared += np.sum(img ** 2) / pixel_count
    return sums, sums_squared

In [None]:
# Side length (pixels) used for every split.
SIZE = 416

# Preprocess the training split and keep its accumulated intensity sums.
sums, sums_squared = process_and_save(
    images=train_images,
    annotations=train_annotations,
    output_dir='/kaggle/working/train',
    dataset_type='train',
    size=SIZE,
)

In [None]:
import pickle

# Persist the two training-set accumulators, one pickle file each, in the
# current working directory.
for pickle_name, value in (('sums.pkl', sums), ('sums_squared.pkl', sums_squared)):
    with open(pickle_name, 'wb') as file:
        pickle.dump(value, file)

In [None]:
# Preprocess the validation and test splits; no statistics are accumulated
# because dataset_type is left at its default.
process_and_save(
    images=val_images,
    annotations=val_annotations,
    output_dir='/kaggle/working/val',
    size=SIZE,
)
process_and_save(
    images=test_images,
    annotations=test_annotations,
    output_dir='/kaggle/working/test',
    size=SIZE,
)

In [None]:
# shutil.make_archive('train', 'zip', '/kaggle/working/train')
# shutil.make_archive('val', 'zip', '/kaggle/working/val')
# shutil.make_archive('test', 'zip', '/kaggle/working/test')

In [None]:
# shutil.rmtree('/kaggle/working/train')
# shutil.rmtree('/kaggle/working/val')
# shutil.rmtree('/kaggle/working/test')