In [2]:
import pandas as pd
from glob import glob
import os
import re
import cv2
import numpy as np
import math

In [5]:
# Load the resized-image index and pull its two columns out as plain lists.
df = pd.read_csv('img_resize/img_resize.csv')
paths = list(df['img_path'])
label = list(df['label'])

In [6]:
# Class balance check: the counts shown below make class 0 the minority class.
df['label'].value_counts()

label
1    85
0    29
Name: count, dtype: int64

In [7]:
# Positions of the class-0 samples, then the image paths stored at those positions.
list_idx = [pos for pos, lab in enumerate(label) if lab == 0]
path_imbalanced = [paths[pos] for pos in list_idx]
path_imbalanced

In [1]:
def aug_func(img_path, select: int):
    """Load an image from disk and return one augmented variant of it.

    Parameters
    ----------
    img_path : str
        Path handed to ``cv2.imread``.
    select : int
        Index of the augmentation to apply:

        * 0  - horizontal flip
        * 1  - rotate 90 degrees clockwise
        * 2  - additive Gaussian noise (mean 0, sigma 25)
        * 3  - brightness +50 on the first three channels
        * 4  - 5x5 Gaussian blur
        * 5  - BGR -> grayscale conversion
        * 6  - contrast x1.5
        * 7  - saturation x1.5
        * 8  - cartoon effect (edge mask over a bilateral-filtered image)
        * 9  - gamma correction with gamma = 1.5
        * 10 - blend with a random solid colour (80 % image / 20 % colour)

        Any other value returns the loaded image unchanged.

    Returns
    -------
    numpy.ndarray
        The augmented image. Note that ``select == 5`` returns the result of
        a BGR-to-gray conversion, so it does not have the same number of
        channels as the other variants.

    Raises
    ------
    FileNotFoundError
        If ``cv2.imread`` could not load ``img_path`` (it returns ``None``
        instead of raising).
    """
    img = cv2.imread(img_path)
    if img is None:
        # Fail here with the offending path instead of deep inside an OpenCV call.
        raise FileNotFoundError(f"cv2.imread could not load image: {img_path}")

    if select == 0:
        # Flip horizontally
        augmented = cv2.flip(img, 1)
    elif select == 1:
        # Rotate 90 degrees clockwise
        augmented = cv2.rotate(img, cv2.ROTATE_90_CLOCKWISE)
    elif select == 2:
        # Add Gaussian noise.
        # The noise is signed, so it is added in floating point and clipped
        # afterwards. Casting the raw samples to uint8 first would wrap the
        # negative half of the distribution into large positive values.
        noise = np.random.normal(0, 25, img.shape)
        augmented = np.clip(img.astype(np.float32) + noise, 0, 255).astype(np.uint8)
    elif select == 3:
        # Adjust brightness (cv2.add saturates instead of wrapping)
        brightness = 50
        augmented = cv2.add(img, (brightness, brightness, brightness, 0))
    elif select == 4:
        # Apply Gaussian blur
        augmented = cv2.GaussianBlur(img, (5, 5), 0)
    elif select == 5:
        # Change color space (to grayscale)
        augmented = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    elif select == 6:
        # Adjust contrast
        contrast = 1.5
        augmented = cv2.convertScaleAbs(img, alpha=contrast, beta=0)
    elif select == 7:
        # Change saturation: scale the S channel by 1.5 (+50 %).
        # Clip before writing back into the uint8 array so that strongly
        # saturated pixels stay at 255 instead of overflowing.
        hsv = cv2.cvtColor(img, cv2.COLOR_BGR2HSV)
        saturation = hsv[:, :, 1].astype(np.float32) * 1.5
        hsv[:, :, 1] = np.clip(saturation, 0, 255).astype(np.uint8)
        augmented = cv2.cvtColor(hsv, cv2.COLOR_HSV2BGR)
    elif select == 8:
        # Apply a cartoon effect: binary edge mask applied to a smoothed image
        gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        gray = cv2.medianBlur(gray, 5)
        edges = cv2.adaptiveThreshold(gray, 255, cv2.ADAPTIVE_THRESH_MEAN_C, cv2.THRESH_BINARY, 9, 9)
        color = cv2.bilateralFilter(img, 9, 300, 300)
        augmented = cv2.bitwise_and(color, color, mask=edges)
    elif select == 9:
        # Change gamma through a 256-entry lookup table
        gamma = 1.5
        invGamma = 1.0 / gamma
        table = np.array([((i / 255.0) ** invGamma) * 255 for i in np.arange(0, 256)]).astype("uint8")
        augmented = cv2.LUT(img, table)
    elif select == 10:
        # Apply a color overlay: blend the image with one random solid colour
        overlay_color = np.random.randint(0, 256, 3).tolist()
        overlay = np.full(img.shape, overlay_color, dtype=np.uint8)
        augmented = cv2.addWeighted(img, 0.8, overlay, 0.2, 0)
    else:
        # Return the original image when select is outside the supported range
        augmented = img

    return augmented

In [15]:
# Load the index that lists both the original and the augmented samples.
# NOTE(review): the "Unnamed: 0" column in the output below is presumably a row
# index written by an earlier to_csv call without index=False — confirm.
new_df = pd.read_csv('data_with_augmentation.csv')
new_df

Unnamed: 0,img_path,label
0,img/Thang12/25_220388_well08_zid99_12,1
1,img/Thang12/10_220126_well02_zid99_0,1
2,img/Thang12/11_220276A_well10_zid99_1,1
3,img/Thang12/12_220450_well03_zid99_2,1
4,img/Thang12/13_220448_well02_zid99_3,1
...,...,...
157,img/Thang4/8_230119_well08_zid99_112,1
158,img/Thang4/9_220405_well09_zid99_113,0
159,img/Thang4/9_220405_well09_zid99_113_augmentation,0
160,img/Thang4/9_220405_well09_zid99_113_augmentat...,0


In [16]:
# Column values as plain Python lists: sample paths and their class labels.
X = list(new_df['img_path'])
y = list(new_df['label'])

In [17]:
# Empty accumulators: original samples and augmented samples are collected separately.
X_origin = []
X_aug = []
y_origin = []
y_aug = []

In [18]:
# Separate augmented samples (path contains 'augmentation') from original ones.
# The accumulators are reset here so that re-running this cell rebuilds the
# lists instead of appending every sample a second time.
X_origin, X_aug, y_origin, y_aug = [], [], [], []
for path, lab in zip(X, y):
    if 'augmentation' in path:
        X_aug.append(path)
        y_aug.append(lab)
    else:
        X_origin.append(path)
        y_origin.append(lab)

In [19]:
def get_stratified_test_set(X, y, n_samples_per_class=10, random_state=None):
    """Hold out the same number of samples from class 0 and from class 1.

    Parameters
    ----------
    X : array-like
        Samples; indexed along the first axis. Lists are accepted and
        converted with ``np.asarray``.
    y : array-like
        Labels aligned with ``X``; only the values 0 and 1 are sampled.
    n_samples_per_class : int, default 10
        How many samples to draw, without replacement, from each class.
    random_state : int or None, default None
        Seed for a dedicated generator. With ``None`` the global
        ``np.random`` state is used, exactly as before, so a prior
        ``np.random.seed(...)`` call still controls the draw.

    Returns
    -------
    X_remainder, X_test, y_remainder, y_test : numpy.ndarray
        The held-out part (``*_test``) and everything else (``*_remainder``),
        each kept in the original order of ``X`` / ``y``.

    Raises
    ------
    ValueError
        If ``X`` and ``y`` differ in length, or a class has fewer than
        ``n_samples_per_class`` samples.
    """
    # Plain lists would make `y == 0` evaluate to a scalar False and break
    # boolean-mask indexing, so both inputs are normalised to arrays first.
    X = np.asarray(X)
    y = np.asarray(y)
    if len(X) != len(y):
        raise ValueError(f"X and y must have the same length, got {len(X)} and {len(y)}")

    rng = np.random if random_state is None else np.random.default_rng(random_state)

    # Draw class 0 first, then class 1 (same order as the original implementation).
    picked = []
    for cls in (0, 1):
        candidates = np.where(y == cls)[0]
        if len(candidates) < n_samples_per_class:
            raise ValueError(
                f"class {cls} has only {len(candidates)} samples, "
                f"cannot draw {n_samples_per_class} without replacement"
            )
        picked.append(rng.choice(candidates, n_samples_per_class, replace=False))
    test_indices = np.concatenate(picked)

    mask = np.zeros(len(y), dtype=bool)
    mask[test_indices] = True

    X_test, X_remainder = X[mask], X[~mask]
    y_test, y_remainder = y[mask], y[~mask]

    return X_remainder, X_test, y_remainder, y_test

In [20]:
from  sklearn.model_selection import train_test_split
import numpy as np

# get_stratified_test_set draws through the global np.random state, so seed it:
# the resulting split is copied to disk and written to CSV further down and
# must be the same on every Restart & Run All.
np.random.seed(42)

X_origin_arr, X_aug_arr, y_origin_arr, y_aug_arr = np.array(X_origin), np.array(X_aug), np.array(y_origin), np.array(y_aug)

# First hold out the test set (10 per class by default) from the original samples ...
X_remainder, X_test, y_remainder, y_test = get_stratified_test_set(X_origin_arr, y_origin_arr)
# ... then hold out 15 per class for validation; what is left becomes the training set.
X_train, X_val, y_train, y_val = get_stratified_test_set(X_remainder, y_remainder, n_samples_per_class=15)

In [21]:
# Only add augmented samples whose source image stayed in the training split.
# X_aug_arr holds every augmented sample regardless of where its original ended
# up, so concatenating all of it would put augmented copies of validation/test
# images into the training set (data leakage).
# NOTE(review): assumes an augmented path is "<original path>_augmentation..." as
# the rows of data_with_augmentation.csv suggest — confirm the naming scheme.
train_originals = set(X_train.tolist())
aug_in_train = np.array(
    [p.split('augmentation')[0].rstrip('_') in train_originals for p in X_aug_arr.tolist()],
    dtype=bool,
)
X_train_augmentation = np.concatenate((X_train, X_aug_arr[aug_in_train]))
y_train_augmentation = np.concatenate((y_train, y_aug_arr[aug_in_train]))

In [45]:
# Inspect the training paths after the augmented samples were appended.
X_train_augmentation

array(['img/Thang12/25_220388_well08_zid99_12',
       'img/Thang12/10_220126_well02_zid99_0',
       'img/Thang12/12_220450_well03_zid99_2',
       'img/Thang12/14_220431_well02_zid99_4',
       'img/Thang12/16_220484_well09_zid99_5',
       'img/Thang12/20_220381_well01_zid99_9',
       'img/Thang12/28_220457_well09_zid99_15',
       'img/Thang3/220464_well02_zid99_63',
       'img/Thang3/2_230022A_well02_zid99_34',
       'img/Thang3/11_220276A_well11_zid99_28',
       'img/Thang3/1_220467_well01_zid99_22',
       'img/Thang3/1_230033_well01_zid99_24',
       'img/Thang3/1_230064_well01_zid99_26',
       'img/Thang3/220240_well09_zid99_58',
       'img/Thang3/220375_well13_zid99_59',
       'img/Thang3/220466_well07_zid99_64',
       'img/Thang3/220481_well02_zid99_66',
       'img/Thang3/230013_well04_zid99_37',
       'img/Thang3/230027_well01_zid99_40',
       'img/Thang3/230032_well05_zid99_41',
       'img/Thang3/230043_well08_zid99_42',
       'img/Thang3/2_220243_well02_zid99

In [23]:
# Size check of the validation split (15 samples per class were requested for it).
X_val.shape

(30,)

In [25]:
# Inspect which sample paths ended up in the validation split.
X_val

array(['img/Thang12/11_220276A_well10_zid99_1',
       'img/Thang12/19_220427_well10_zid99_7',
       'img/Thang12/1_220192_well07_zid99_8',
       'img/Thang12/21_220245_well10_zid99_10',
       'img/Thang12/26_220393_well11_zid99_13',
       'img/Thang12/27_220377_well04_zid99_14',
       'img/Thang12/33_220402_well09_zid99_17',
       'img/Thang12/9_220417A_well01_zid99_19',
       'img/Thang3/10_220452_well10_zid99_27',
       'img/Thang3/1_220013_well01_zid99_29',
       'img/Thang3/1_230013_well04_zid99_23',
       'img/Thang3/1_230054_well01_zid99_25',
       'img/Thang3/220013_well01_zid99_20',
       'img/Thang3/220417A_well04_zid99_61',
       'img/Thang3/220434_well06_zid99_62',
       'img/Thang3/220474_well02_zid99_65',
       'img/Thang3/230001_well01_zid99_67',
       'img/Thang3/230008_well04_zid99_21',
       'img/Thang3/230014_well04_zid99_38',
       'img/Thang3/3_220009_well03_zid99_43',
       'img/Thang3/3_220358_well03_zid99_44',
       'img/Thang3/4_230008_well0

In [30]:
import os
import shutil

In [34]:
# NOTE(review): hardcoded absolute, machine-specific paths. Both names are
# reassigned inside the copy loops of the following cells before being read
# there, so this looks like leftover scratch from a manual test — confirm.
source = '/mnt/sdb2/DaiLe/CLDNN/img/Thang3/hi'
destination = '/mnt/sdb2/DaiLe/CLDNN/dataset/train/hi'

In [48]:
# Copy every training sample directory to dataset/train/<sample name>.
for path in X_train_augmentation:
    name = os.path.basename(path)
    source = path
    destination = f'dataset/train/{name}'
    # dirs_exist_ok=True keeps the cell re-runnable: without it copytree raises
    # FileExistsError as soon as the destination is left over from an earlier run.
    shutil.copytree(source, destination, dirs_exist_ok=True)

In [49]:
# Copy every validation sample directory to dataset/val/<sample name>.
for path in X_val:
    name = os.path.basename(path)
    source = path
    destination = f'dataset/val/{name}'
    # dirs_exist_ok=True keeps the cell re-runnable: without it copytree raises
    # FileExistsError as soon as the destination is left over from an earlier run.
    shutil.copytree(source, destination, dirs_exist_ok=True)

In [50]:
# Copy every test sample directory to dataset/test/<sample name>.
for path in X_test:
    name = os.path.basename(path)
    source = path
    destination = f'dataset/test/{name}'
    # dirs_exist_ok=True keeps the cell re-runnable: without it copytree raises
    # FileExistsError as soon as the destination is left over from an earlier run.
    shutil.copytree(source, destination, dirs_exist_ok=True)

In [51]:
# type(X_val)

In [60]:
import pandas as pd

# One single-column frame for the paths and one for the labels, per split.
X_train_df, y_train_df = (
    pd.DataFrame(X_train_augmentation, columns=['img_paths']),
    pd.DataFrame(y_train_augmentation, columns=['label']),
)
X_val_df, y_val_df = (
    pd.DataFrame(X_val, columns=['img_paths']),
    pd.DataFrame(y_val, columns=['label']),
)
X_test_df, y_test_df = (
    pd.DataFrame(X_test, columns=['img_paths']),
    pd.DataFrame(y_test, columns=['label']),
)

# Glue paths and labels side by side into one two-column frame per split.
train_df = pd.concat([X_train_df, y_train_df], axis=1)
val_df = pd.concat([X_val_df, y_val_df], axis=1)
test_df = pd.concat([X_test_df, y_test_df], axis=1)

# Persist each split without the positional index column.
for frame, csv_name in (
    (train_df, 'train_data.csv'),
    (val_df, 'val_data.csv'),
    (test_df, 'test_data.csv'),
):
    frame.to_csv(csv_name, index=False)
