# Install packages

In [None]:
!pip install seam-carving

# Một số lưu ý về dataset

There are also several things you should note:
- The list is arranged in pairs of films, where each pair represents the left (even filename numbers) and right (odd filename numbers) mammograms of a single patient.
- The size of all the images is 1024 pixels x 1024 pixels. The images have been centered in the matrix.
- When calcifications are present, centre locations and radii apply to clusters rather than individual calcifications. The coordinate system origin is the bottom-left corner.
- In some cases calcifications are widely distributed throughout the image rather than concentrated at a single site. In these cases centre locations and radii are inappropriate and have been omitted.

# Import

In [None]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seam_carving

import os, cv2, math
import tensorflow as tf
from shutil import copyfile
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Define image loading and display helpers

In [None]:
def plt_show(figsize=(15, 8), height=2, **kwargs):
    """Draw every keyword-argument image in a single figure.

    Args:
        figsize: Figure size forwarded to ``plt.figure``.
        height: Number of subplot rows; the number of columns is
            ``ceil(len(kwargs) / height)``.
        **kwargs: Mapping of subplot title -> image, drawn in the order given.
    """
    rows = height
    cols = math.ceil(len(kwargs) / rows)

    plt.figure(figsize=figsize)
    # Subplot positions are 1-based, hence start=1.
    for position, (title, image) in enumerate(kwargs.items(), start=1):
        plt.subplot(rows, cols, position)
        plt.imshow(image)
        plt.title(title, color='blue', fontsize=12)

    plt.show()

# Display one scan as a quick check that the dataset is mounted and readable.
sample_path = '/kaggle/input/mias-mammography/all-mias/mdb019.pgm'
sample_img = cv2.imread(sample_path)
if sample_img is None:
    # cv2.imread reports an unreadable/missing file by returning None
    # instead of raising, which would otherwise fail later inside imshow.
    raise FileNotFoundError(f'Could not read image: {sample_path}')
plt_show(figsize=(30, 10), sample_image=sample_img)

## Load all images

In [None]:
# Directory containing the scans, and the file extension appended to an
# image id to build each file name (see os.path.join(dataset_path, id + ext)).
dataset_path = "/kaggle/input/mias-mammography/all-mias/"
ext = '.pgm'

In [None]:
raw_df = pd.read_csv('/kaggle/input/mias-mammography/Info.txt', delimiter=' ')

# Work on an explicit copy so raw_df stays untouched and later column
# assignments do not operate on a view of it.
df = (
    raw_df.loc[:, ['REFNUM', 'SEVERITY', 'X', 'Y', 'RADIUS']]
    .copy()
    .rename(columns={"SEVERITY": "label", "REFNUM": "image_id"})
)

# Rows without a severity value are labelled NORMAL. Assign the result back
# instead of calling fillna(inplace=True) on a column selection, which is a
# chained assignment and is not guaranteed to update the frame.
df['label'] = df['label'].fillna("NORMAL")

# The dataset measures coordinates from the bottom-left corner;
# flip Y so that the origin is the top-left corner, as in image arrays.
df['Y'] = 1024 - df['Y']

# Axis-aligned square around each annotated centre: (x1, y1) is the top-left
# corner and (x2, y2) the bottom-right corner.
df['x1'] = df['X'] - df['RADIUS']
df['y1'] = df['Y'] - df['RADIUS']
df['x2'] = df['X'] + df['RADIUS']
df['y2'] = df['Y'] + df['RADIUS']

# .copy() so that columns added to df in later cells modify an independent frame.
df = df[["image_id", "label", "x1", "y1", "x2", "y2"]].copy()

# Abnormal rows that carry no box coordinates.
error_df = df.loc[(df.label != "NORMAL") & (df.x1.isnull())]
df

## Detect data errors

In [None]:
# A bare expression is only rendered when it is the last statement of a cell,
# so the preview has to be displayed explicitly. `display` is provided by
# IPython in notebook kernels without an import.
display(error_df.head())

# Load every abnormal scan that has no box coordinates so they can be inspected.
error_dict = dict()
for i, image_id in error_df.image_id.items():
    path = os.path.join(dataset_path, image_id + ext)
    img = cv2.imread(path)
    error_dict[f'{i} - IMG'] = img

plt_show(**error_dict)

## Show abnormal images with bbox

In [None]:
# Draw the annotated box on five randomly chosen abnormal scans that have coordinates.
has_box = (df.label != "NORMAL") & (df.x1.notnull())
samples = df.loc[has_box].sample(5)

samples_dict = dict()
for idx, row in samples.iterrows():
    img = cv2.imread(os.path.join(dataset_path, row['image_id'] + ext))

    # Truncate the float corners to integer pixel positions for OpenCV.
    left, top, right, bottom = (int(row[corner]) for corner in ('x1', 'y1', 'x2', 'y2'))
    img = cv2.rectangle(img, (left, top), (right, bottom), (255, 0, 0), 5)

    title = f"{idx} - IMG - {row['label']}"
    samples_dict[title] = img

plt_show(figsize=(20, 10), **samples_dict)

# Data Distribution

In [None]:
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np

# Count how many annotation rows carry each label, in order of first appearance.
distributor = defaultdict(int)
for label in df.label:
    distributor[label] += 1

distributor_x = np.array([*distributor])
distributor_y = np.array([*distributor.values()])

plt.bar(distributor_x, distributor_y, color="red")
plt.show()

# Transform datasets

## Define preprocessing function

In [None]:
def preprocessing(origin_img):
    """Build drop/keep masks on a 224x224 copy and seam-carve the original image.

    Steps:
      1. Downscale to 224x224, convert to grayscale and Otsu-threshold it.
      2. Fill every contour whose area is below ``3000 * w / 224`` into the
         inverted threshold mask, morph-close it and invert the result
         (``close``) -- presumably the large bright region to keep; verify
         against the plotted pipeline.
      3. XOR the threshold mask with ``close`` and morph-open the result to
         obtain ``drop_mask``.
      4. Resize both masks to the original size and pass them to
         ``seam_carving.resize`` as ``drop_mask`` / ``keep_mask``.
      5. Zero-pad the right edge so the output has the original width again.

    Args:
        origin_img: 3-channel image array, e.g. the result of ``cv2.imread``.

    Returns:
        dict mapping each stage name ('origin', 'thresholding',
        'filled_contour', 'morph_close', 'xor_mask', 'drop_mask',
        'drop_noise', 'out') to the image produced by that stage.

    Raises:
        ValueError: if ``origin_img`` is None (``cv2.imread`` returns None
            when a file cannot be read).
    """
    if origin_img is None:
        raise ValueError("preprocessing() received None; the image could not be read")

    mask_size = (224, 224)
    # OpenCV sizes are (width, height), the reverse of ndarray.shape.
    origin_size = (origin_img.shape[1], origin_img.shape[0])

    resized = cv2.resize(origin_img, mask_size, interpolation=cv2.INTER_AREA)
    w = resized.shape[1]
    gray = cv2.cvtColor(resized, cv2.COLOR_BGR2GRAY)

    # Run Otsu once; the two masks below are derived from the same result.
    inverted = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    thresh = 255 - inverted

    # Filter using contour area and keep only the small contours
    cnts, _ = cv2.findContours(thresh, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
    max_area = 3000 * w / 224
    cnts = [c for c in cnts if cv2.contourArea(c) < max_area]

    # drawContours paints in place, so work on a copy of the inverted mask.
    contour = inverted.copy()
    cv2.drawContours(contour, cnts, -1, (255, 255, 255), thickness=cv2.FILLED)

    # Morph close and invert image
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    close = 255 - cv2.morphologyEx(contour, cv2.MORPH_CLOSE, kernel, iterations=2)

    # Pixels set in exactly one of the two masks, cleaned with a morph open.
    xor_mask = cv2.bitwise_xor(thresh, close)
    drop_mask = cv2.morphologyEx(xor_mask, cv2.MORPH_OPEN, kernel, iterations=2)

    # Bring both masks back to the resolution of the original image.
    drop_mask_expanded = cv2.resize(drop_mask, origin_size, interpolation=cv2.INTER_AREA)
    close_expanded = cv2.resize(close, origin_size, interpolation=cv2.INTER_AREA)
    drop_noise = seam_carving.resize(origin_img, drop_mask=drop_mask_expanded, keep_mask=close_expanded)

    # Pad the right edge with zeros by however many columns the carved image lost.
    pad_right = origin_size[0] - drop_noise.shape[1]
    out = np.pad(drop_noise, [(0, 0), (0, pad_right), (0, 0)], mode='constant')

    pipeline = {
        'origin': origin_img,
        'thresholding': thresh,
        'filled_contour': contour,
        'morph_close': close,
        'xor_mask': xor_mask,
        'drop_mask': drop_mask,
        'drop_noise': drop_noise,
        'out': out
    }

    return pipeline

# Plot every preprocessing stage for five randomly chosen scans.
samples = df.sample(5)
for image_id in samples.image_id:
    sample_path = os.path.join(dataset_path, image_id + ext)
    print(sample_path)

    sample_img = cv2.imread(sample_path)
    sample_pipeline = preprocessing(sample_img)
    plt_show(figsize=(15, 8), **sample_pipeline)

## 1. YOLO format for detection

### Phase 3 - Detection on ABNORMAL

In [None]:
# Convert corner coordinates to box centre and size, each divided by the
# 1024-pixel image side. Rows without coordinates fall back to a box that
# is centred (0.5) and spans the full side (1).
df = df.assign(
    x_center=df['x1'].add(df['x2']).div(2048).fillna(0.5),
    y_center=df['y1'].add(df['y2']).div(2048).fillna(0.5),
    box_width=df['x1'].sub(df['x2']).abs().div(1024).fillna(1),
    box_height=df['y1'].sub(df['y2']).abs().div(1024).fillna(1),
)
df

In [None]:
# Split by image rather than by row: if an image has several annotation rows
# (the label files below are written per image, which suggests it can), a
# row-level split could put the same image in both train and val.
# Stratification uses the label of the first row of each image.
image_labels = df.drop_duplicates(subset='image_id')
train_ids, valid_ids = train_test_split(
    image_labels.image_id,
    test_size=0.2,
    stratify=image_labels.label,
    random_state=42,  # fixed seed so Restart & Run All reproduces the same split
)

# .copy() so that adding columns to the splits does not touch a view of df.
train_df = df[df.image_id.isin(train_ids)].copy()
valid_df = df[df.image_id.isin(valid_ids)].copy()
train_df.shape, valid_df.shape

In [None]:
# Tag each row with its split. assign() returns new frames, so this avoids
# writing a column into frames that were derived from df (which triggers
# SettingWithCopyWarning) and keeps the cell safe to re-run.
train_df = train_df.assign(split='train')
valid_df = valid_df.assign(split='val')

# Single frame for the detection phase with a fresh 0..n-1 index.
ph3_df = pd.concat([train_df, valid_df]).reset_index(drop=True)
ph3_df

In [None]:
dst_ext = '.png'


def export_detection_dataset(frame, dst_root, transform=None):
    """Write abnormal scans and their YOLO label files under ``dst_root``.

    Layout: ``<dst_root>/<split>/images/<image_id>.png`` and
    ``<dst_root>/<split>/labels/<image_id>.txt``. Every label line is
    ``0 x_center y_center box_width box_height`` (single class 0).

    NORMAL rows are filtered out up front, so they are neither read nor passed
    through ``transform``. All boxes of one image are written together with
    mode 'w', so re-running the export does not append duplicate lines.

    Args:
        frame: DataFrame with columns image_id, label, split, x_center,
            y_center, box_width and box_height.
        dst_root: Output root directory.
        transform: Optional callable applied to the loaded image before saving.

    Raises:
        FileNotFoundError: if a source image cannot be read.
    """
    box_columns = ['x_center', 'y_center', 'box_width', 'box_height']
    abnormal = frame[frame.label != 'NORMAL']
    groups = abnormal.groupby(['split', 'image_id'], sort=False)

    for (split, img_id), boxes in tqdm(groups, total=groups.ngroups):
        path = os.path.join(dataset_path, img_id + ext)
        img = cv2.imread(path)  # (1024, 1024, 3)
        if img is None:
            # cv2.imread returns None instead of raising on failure.
            raise FileNotFoundError(f'Could not read image: {path}')
        if transform is not None:
            img = transform(img)

        img_path = os.path.join(dst_root, split, "images", img_id + dst_ext)
        label_path = os.path.join(dst_root, split, "labels", img_id + '.txt')
        os.makedirs(os.path.dirname(img_path), exist_ok=True)
        os.makedirs(os.path.dirname(label_path), exist_ok=True)

        plt.imsave(img_path, img)
        with open(label_path, 'w') as f:
            for x_center, y_center, box_width, box_height in boxes[box_columns].to_numpy():
                f.write(f'{0} {x_center} {y_center} {box_width} {box_height}')
                f.write('\n')


# Raw scans.
dst_path = '/kaggle/working/datasets/detect/'
export_detection_dataset(ph3_df, dst_path)

# Preprocessed scans.
# NOTE(review): preprocessing() seam-carves the image and pads its right edge,
# so pixel columns may shift relative to boxes computed on the original scan --
# confirm the labels still line up with the processed images.
dst_path = '/kaggle/working/processed_datasets/detect/'
export_detection_dataset(ph3_df, dst_path, transform=lambda image: preprocessing(image)['out'])
    

## 2. Classification

In [None]:
# Show the annotation frame that the classification export below iterates over.
df

In [None]:
def export_classification_dataset(frame, dst_root, transform=None):
    """Save every scan as ``<dst_root>/<label>/<image_id>.png``.

    Each (image_id, label) pair is exported once, so an image that appears on
    several rows is not read and transformed repeatedly. No label file is
    written: the class is encoded by the folder name.

    Args:
        frame: DataFrame with columns image_id and label.
        dst_root: Output root directory.
        transform: Optional callable applied to the loaded image before saving.

    Raises:
        FileNotFoundError: if a source image cannot be read.
    """
    unique_images = frame.drop_duplicates(subset=['image_id', 'label'])
    pairs = zip(unique_images.image_id, unique_images.label)

    for img_id, label in tqdm(pairs, total=len(unique_images)):
        path = os.path.join(dataset_path, img_id + ext)
        img = cv2.imread(path)  # (1024, 1024, 3)
        if img is None:
            # cv2.imread returns None instead of raising on failure.
            raise FileNotFoundError(f'Could not read image: {path}')
        if transform is not None:
            img = transform(img)

        img_path = os.path.join(dst_root, label, img_id + dst_ext)
        os.makedirs(os.path.dirname(img_path), exist_ok=True)
        plt.imsave(img_path, img)


# Raw scans. The directory name is kept exactly as before so that anything
# reading this path keeps working.
dst_path = '/kaggle/working/datasets/clasify/'
export_classification_dataset(df, dst_path)

# Preprocessed scans.
dst_path = '/kaggle/working/processed_datasets/clasify/'
export_classification_dataset(df, dst_path, transform=lambda image: preprocessing(image)['out'])