# Install packages

# Một số lưu ý về dataset

There are also several things you should note:
The list is arranged in pairs of films, where each pair represents the left (even filename numbers) and right mammograms (odd filename numbers) of a single patient.
The size of all the images is 1024 pixels x 1024 pixels. The images have been centered in the matrix.
When calcifications are present, centre locations and radii apply to clusters rather than individual calcifications. Coordinate system origin is the bottom-left corner.
In some cases calcifications are widely distributed throughout the image rather than concentrated at a single site. In these cases centre locations and radii are inappropriate and have been omitted.

# Import

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

import os, cv2, math
import tensorflow as tf
from shutil import copyfile
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Define image loading and display helpers

In [None]:
def plt_show(figsize=(15, 8), height=2, **kwargs):
    """Display the given images in a grid of subplots.

    Args:
        figsize: Size of the matplotlib figure, forwarded to ``plt.figure``.
        height: Number of rows in the subplot grid; the number of columns
            is derived from it and from the number of images.
        **kwargs: Images to draw, keyed by the title shown above each one.
    """
    rows = height
    cols = math.ceil(len(kwargs) / height)
    plt.figure(figsize=figsize)
    # Subplot positions are 1-based, hence ``start=1``.
    for position, (caption, picture) in enumerate(kwargs.items(), start=1):
        plt.subplot(rows, cols, position)
        plt.imshow(picture)
        plt.title(caption, color='blue', fontsize=12)

    plt.show()

# Preview a single mammogram from the dataset.
sample_path = '/kaggle/input/mias-mammography/all-mias/mdb019.pgm'
# cv2.IMREAD_ANYDEPTH is the named form of the raw flag value 2.
sample_img = cv2.imread(sample_path, cv2.IMREAD_ANYDEPTH)
plt_show(figsize=(30, 10), sample_image=sample_img)

## Load all images

In [None]:
# Directory holding the mammogram files and the extension they are stored
# with; both are combined with a REFNUM to build the path of an image.
dataset_path = "/kaggle/input/mias-mammography/all-mias/"
ext = '.pgm'

In [None]:
raw_df = pd.read_csv('/kaggle/input/mias-mammography/Info.txt', delimiter=' ')

# The dataset measures coordinates from the bottom-left corner, whereas image
# arrays are indexed from the top-left corner, so the Y axis is flipped first.
centre_x = raw_df['X']
centre_y = 1024 - raw_df['Y']
radius = raw_df['RADIUS']

# Turn every (centre, radius) annotation into the two corners of a bounding
# square: (x1, y1) is the top-left corner and (x2, y2) the bottom-right one.
df = pd.DataFrame({
    'REFNUM': raw_df['REFNUM'],
    'SEVERITY': raw_df['SEVERITY'],
    'x1': np.round(centre_x - radius),
    'y1': np.round(centre_y - radius),
    'x2': np.round(centre_x + radius),
    'y2': np.round(centre_y + radius),
})
df

## Show abnormal images with bbox

In [None]:
# Keep only the rows that carry both a severity label and an x1 coordinate,
# i.e. the annotations that can be drawn as a bounding box.
abnormal_df = df.dropna(subset=['SEVERITY', 'x1'])
abnormal_df

In [None]:
# Draw the annotated bounding box on five randomly chosen abnormal images.
samples = abnormal_df.sample(5)
samples_dict = {}
for row in samples.itertuples():
    img = cv2.imread(os.path.join(dataset_path, row.REFNUM + ext))
    left, top, right, bottom = int(row.x1), int(row.y1), int(row.x2), int(row.y2)

    img = cv2.rectangle(img, (left, top), (right, bottom), (255, 0, 0), 2)
    # The title combines the row index with the severity label.
    samples_dict[f'{row.Index} - IMG - {row.SEVERITY}'] = img

plt_show(figsize=(20, 10), **samples_dict)

# Data Distribution

In [None]:
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np

# Count how many annotated lesions fall into each severity class.
distributor = defaultdict(int)
for severity in abnormal_df.SEVERITY:
    distributor[severity] += 1

# Class labels on the x axis, number of annotations on the y axis.
distributor_x = np.array(list(distributor))
distributor_y = np.array(list(distributor.values()))

plt.bar(distributor_x, distributor_y, color="red")
plt.show()

# Transform datasets

## 1. Data augmentation

In [None]:
def read_image(refnum):
    """Load the dataset image identified by ``refnum`` in colour mode.

    Args:
        refnum: Reference number of the image, i.e. its file name without
            the extension.

    Returns:
        Whatever ``cv2.imread`` yields for ``dataset_path/<refnum><ext>``.
        NOTE(review): OpenCV reports an unreadable file by returning
        ``None`` rather than raising; callers here do not check for it.
    """
    filename = refnum + ext
    return cv2.imread(os.path.join(dataset_path, filename), cv2.IMREAD_COLOR)

In [None]:
def combine_images(img1, img2, img3, img4):
    """Tile four images into a single 2x2 mosaic.

    Layout of the result::

        img1 | img2
        -----+-----
        img3 | img4

    No resizing happens here, so the inputs must already have dimensions
    that can be concatenated.

    Returns:
        The combined image.
    """
    # Stack each column top-to-bottom, then place the columns side by side.
    left_column = cv2.vconcat([img1, img3])
    right_column = cv2.vconcat([img2, img4])
    return cv2.hconcat([left_column, right_column])

def combine_bboxs(bbox1, bbox2, bbox3, bbox4, size=1024):
    """Shift the boxes of four tiles into the coordinates of a 2x2 mosaic.

    The tiles are laid out as ``bbox1`` top-left, ``bbox2`` top-right,
    ``bbox3`` bottom-left and ``bbox4`` bottom-right, so boxes of the right
    column are moved by ``size`` along x and boxes of the bottom row by
    ``size`` along y.

    The input lists are left untouched: shifting them in place would corrupt
    the caller's data whenever the same box list is reused or passed for
    more than one tile.

    Args:
        bbox1, bbox2, bbox3, bbox4: One list of boxes per tile; each box is
            a sequence starting with ``x1, y1, x2, y2``.
        size: Side length of a single tile, in pixels.

    Returns:
        A new flat list with one ``[x1, y1, x2, y2, ...]`` list per box, in
        tile order.
    """
    # (dx, dy) offset of each tile, in the same order as the arguments.
    offsets = ((0, 0), (size, 0), (0, size), (size, size))
    combined = []
    for boxes, (dx, dy) in zip((bbox1, bbox2, bbox3, bbox4), offsets):
        for box in boxes:
            x1, y1, x2, y2 = box[:4]
            # Any trailing entries of a box are carried over unchanged.
            combined.append([x1 + dx, y1 + dy, x2 + dx, y2 + dy, *box[4:]])
    return combined

# Build a demo mosaic from four images of the dataset.
img1, img2, img3, img4 = [
    cv2.imread(image_path, cv2.IMREAD_COLOR)
    for image_path in (
        '/kaggle/input/mias-mammography/all-mias/mdb001.pgm',
        '/kaggle/input/mias-mammography/all-mias/mdb002.pgm',
        '/kaggle/input/mias-mammography/all-mias/mdb003.pgm',
        '/kaggle/input/mias-mammography/all-mias/mdb004.pgm',
    )
]
combine_image = combine_images(img1, img2, img3, img4)

plt_show(figsize=(30, 15), combine_image=combine_image)

In [None]:
# Pick four distinct abnormal images and merge them, together with their
# bounding boxes, into one mosaic for a visual check.
selected_ids = np.random.choice(abnormal_df.REFNUM.unique(), size=4, replace=False)
corner_columns = ['x1', 'y1', 'x2', 'y2']
images = [read_image(refnum) for refnum in selected_ids]
# One list of [x1, y1, x2, y2] boxes per selected image.
bboxs = [
    abnormal_df.loc[abnormal_df.REFNUM == refnum, corner_columns].to_numpy().tolist()
    for refnum in selected_ids
]

aug_image = combine_images(*images)
aug_bboxs = combine_bboxs(*bboxs)

# ``temp`` aliases ``aug_image`` (no copy is made) before the boxes are drawn.
temp = aug_image
for rect in aug_bboxs:
    left, top, right, bottom = map(int, rect)
    temp = cv2.rectangle(temp, (left, top), (right, bottom), (255, 0, 0), 2)

plt_show(figsize=(30, 15), aug_image_with_bbox=temp)

In [None]:
# Collect every abnormal image together with its bounding boxes and assign
# it to the train or the validation split.
data_dict = dict()
abnormal_refnums = abnormal_df.REFNUM.unique()
# The first 80% of the images, in order of appearance, go to training.
train_test_pivot = len(abnormal_refnums) * 0.8
for k, refnum in enumerate(abnormal_refnums):
    image = read_image(refnum)
    # All [x1, y1, x2, y2] boxes annotated for this image.
    boxs = abnormal_df.loc[
        abnormal_df.REFNUM == refnum, ['x1', 'y1', 'x2', 'y2']
    ].to_numpy().tolist()

    # ``k`` is zero-based, so the comparison must be strict: ``<=`` would put
    # one image too many in the train split whenever the pivot is a whole
    # number.
    mode = 'train' if k < train_test_pivot else 'val'
    data_dict[refnum] = dict(image=image, targets=boxs, mode=mode)

len(data_dict)

In [None]:
# Generate mosaic-augmented training samples: each one tiles four randomly
# chosen training images into a single 2x2 image.
aug_size = 400
# Only images assigned to the train split may be used as tiles; drawing from
# every abnormal image would leak validation images into the training data.
train_refnums = [
    refnum
    for refnum in abnormal_df.REFNUM.unique().tolist()
    if data_dict[refnum]['mode'] == 'train'
]
for k in range(aug_size):
    selected_ids = np.random.choice(train_refnums, 4, False)
    images = []
    bboxs = []
    for refnum in selected_ids:
        entry = data_dict[refnum]
        # Reuse the image already loaded into data_dict instead of reading
        # the file from disk again on every iteration.
        images.append(entry['image'])
        # Copy the boxes so that shifting them into mosaic coordinates can
        # never alter the targets stored for the source image.
        bboxs.append([list(box) for box in entry['targets']])

    aug_image = combine_images(images[0], images[1], images[2], images[3])
    aug_bboxs = combine_bboxs(bboxs[0], bboxs[1], bboxs[2], bboxs[3])
    aug_refnum = f'aug_{k}'
    data_dict[aug_refnum] = dict(image=aug_image, targets=aug_bboxs, mode='train')

len(data_dict)

In [None]:
# Write every sample to disk as
#   <dst_path>/<mode>/images/<refnum>.png
#   <dst_path>/<mode>/labels/<refnum>.txt
# where each label line is "class x_center y_center width height", with the
# box values normalised by the image size.
dst_path = '/kaggle/working/datasets/detect/'

for refnum, sample in tqdm(data_dict.items()):

    img = sample['image']  # (height, width, channels)
    targets = sample['targets']
    mode = sample['mode']
    img_path = os.path.join(dst_path, mode, "images", refnum + '.png')
    label_path = os.path.join(dst_path, mode, "labels", refnum + '.txt')
    os.makedirs(os.path.dirname(img_path), exist_ok=True)
    os.makedirs(os.path.dirname(label_path), exist_ok=True)
    plt.imsave(img_path, img)

    # The image size is the same for every box, so read it once per image.
    h, w = img.shape[:2]
    # Open in write mode: appending would duplicate every label line each
    # time this cell is run again.
    with open(label_path, 'w') as f:
        for a, b, c, d in targets:
            x = (a + c) / (2 * w)
            y = (b + d) / (2 * h)
            ww = abs(a - c) / w
            hh = abs(b - d) / h
            # Every annotation is written with class id 0.
            f.write(f'0 {x} {y} {ww} {hh}\n')
