# Data Transforming

In [1]:
import os
import cv2
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from pathlib import Path
from skimage.feature import local_binary_pattern, hog
from skimage.color import rgb2gray
from tqdm import tqdm
from sklearn.model_selection import train_test_split,  GridSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA

%matplotlib inline

## Xử lý ảnh

### Hàm tiền xử lý ảnh

In [2]:
def preprocess_image(image_input, size=(128, 128), to_gray=True):
    """Load (if needed), resize and optionally convert an image to grayscale.

    Parameters
    ----------
    image_input : str, os.PathLike or numpy.ndarray
        Path of an image file, or an already-decoded image array. Files are
        decoded with ``cv2.IMREAD_UNCHANGED``, so an alpha channel is kept.
        Colour arrays are assumed to be in OpenCV's BGR / BGRA channel order.
    size : tuple of int
        Target size handed straight to ``cv2.resize`` (OpenCV documents this
        argument as ``(width, height)``).
    to_gray : bool
        When True a single-channel 2-D image is returned.

    Returns
    -------
    numpy.ndarray
        The resized image, 2-D when ``to_gray`` is True.

    Raises
    ------
    ValueError
        If the file cannot be read, the input type is unsupported, the array
        is empty, or the channel count cannot be converted to grayscale.
    """
    # A path: decode the file from disk.
    if isinstance(image_input, (str, os.PathLike)):
        img = cv2.imread(os.fspath(image_input), cv2.IMREAD_UNCHANGED)
        if img is None:
            raise ValueError(f"Không đọc được ảnh từ đường dẫn: {image_input}")
    # An ndarray: use it directly.
    elif isinstance(image_input, np.ndarray):
        img = image_input
    else:
        raise ValueError("Tham số truyền vào phải là đường dẫn hoặc ảnh numpy array")

    # Fail with a readable message instead of an OpenCV assertion.
    if img.size == 0:
        raise ValueError("Ảnh rỗng, không thể xử lý")

    img = cv2.resize(img, size)

    if not to_gray:
        return img

    # Already single-channel.
    if img.ndim == 2:
        return img

    if img.ndim == 3:
        channels = img.shape[2]
        if channels == 3:
            return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
        # IMREAD_UNCHANGED keeps the alpha channel of e.g. transparent PNGs;
        # these used to be rejected as having an invalid channel count.
        if channels == 4:
            return cv2.cvtColor(img, cv2.COLOR_BGRA2GRAY)

    raise ValueError("Ảnh có số kênh không hợp lệ")


## Trích xuất vector từ ảnh

### Extract HOG feature từ ảnh

In [3]:
def extract_hog_features_opencv(image, size=(128, 128)):
    """Return the HOG descriptor of an image as a flat 1-D vector.

    ``image`` (a path or an array) is first run through ``preprocess_image``
    to obtain a resized grayscale image; a ``cv2.HOGDescriptor`` created with
    OpenCV's default parameters is then computed on it.
    """
    grayscale = preprocess_image(image, size=size, to_gray=True)
    descriptor = cv2.HOGDescriptor()
    return descriptor.compute(grayscale).flatten()

### Extract LBP feature từ ảnh

In [4]:
def extract_lbp_features_blockwise(image_path, size=(128, 128), block_size=(16, 16),
                                   P=8, R=1, method='uniform', n_bins=59):
    """Compute block-wise normalised LBP histograms and concatenate them.

    Parameters
    ----------
    image_path : str or numpy.ndarray
        Forwarded to ``preprocess_image`` (a path or a decoded image).
    size : tuple of int
        Resize target forwarded to ``preprocess_image``.
    block_size : tuple of int
        ``(block_height, block_width)`` of each histogram cell. Rows/columns
        that do not fill a whole block are ignored.
    P, R, method
        Forwarded to ``skimage.feature.local_binary_pattern``.
    n_bins : int
        Number of histogram bins per block, over the range ``[0, n_bins)``.
        Defaults to 59, the value that used to be hard-coded, so the feature
        length is unchanged.
        NOTE(review): the commented-out logic this parameter replaces used
        ``P + 2`` bins for ``method='uniform'``; with 59 bins many bins
        presumably stay empty for the defaults. Confirm before changing it,
        because it alters the feature dimension of saved vectors/models.

    Returns
    -------
    numpy.ndarray
        1-D float32 vector of length ``blocks_y * blocks_x * n_bins``.
    """
    gray = preprocess_image(image_path, size)

    lbp = local_binary_pattern(gray, P, R, method)

    # Take the dimensions from the actual image. ``size`` is handed to
    # cv2.resize (width first), so unpacking it as (h, w) swapped the axes
    # for non-square sizes and produced empty / missing blocks.
    h, w = lbp.shape[:2]
    bh, bw = block_size
    blocks_y = h // bh
    blocks_x = w // bw

    histograms = []
    for i in range(blocks_y):
        for j in range(blocks_x):
            block = lbp[i * bh:(i + 1) * bh, j * bw:(j + 1) * bw]
            hist, _ = np.histogram(block.ravel(), bins=n_bins, range=(0, n_bins))
            hist = hist.astype("float32")
            hist /= (hist.sum() + 1e-6)  # normalise; epsilon guards empty blocks
            histograms.append(hist)

    if not histograms:
        # Image smaller than one block: no features.
        return np.empty(0, dtype=np.float32)
    return np.concatenate(histograms)


### Kết hợp HOG và LBP

In [35]:
def extract_hog_lbp_features(image, size=(128, 128), block_size=(16, 16),
                             P=8, R=1, method='uniform'):
    """Concatenate the HOG and block-wise LBP feature vectors of one image.

    Parameters
    ----------
    image : str or numpy.ndarray
        Path or decoded image, passed unchanged to both extractors.
    size : tuple of int
        Resize target used by both extractors.
    block_size, P, R, method
        LBP settings forwarded to ``extract_lbp_features_blockwise``. The
        defaults equal that function's own defaults, so existing calls that
        only pass ``image``/``size`` produce the same vector as before.

    Returns
    -------
    numpy.ndarray
        1-D vector: HOG features followed by LBP features.
    """
    hog_vec = extract_hog_features_opencv(image, size=size)
    lbp_vec = extract_lbp_features_blockwise(
        image, size=size, block_size=block_size, P=P, R=R, method=method
    )
    return np.concatenate([hog_vec, lbp_vec])

## Trích xuất vector từ các ảnh trong thư mục

### HOG

In [6]:
def load_hog_features_from_folder(folder_path, label_value, size=(128, 128), max_images=None):
    """Extract HOG vectors for every readable image in a folder.

    Parameters
    ----------
    folder_path : str
        Directory whose entries are read with ``cv2.imread``.
    label_value : int
        Label assigned to every image of this folder.
    size : tuple of int
        Resize target forwarded to ``extract_hog_features_opencv``.
    max_images : int or None
        If given, only the first ``max_images`` entries (in sorted name
        order) are processed.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` as float32 with one row per processed image and ``y`` as uint8
        labels. Entries that cannot be read or processed are reported with
        ``print`` and skipped.
    """
    X_list = []
    y_list = []

    # os.listdir order is filesystem-dependent; sort so the row order (and any
    # max_images subset) is the same on every run.
    files = sorted(os.listdir(folder_path))
    if max_images is not None:
        files = files[:max_images]

    for file in tqdm(files, desc=f"Đang xử lý {folder_path}"):
        file_path = os.path.join(folder_path, file)
        img = cv2.imread(file_path)
        if img is None:
            print(f"Không đọc được ảnh: {file_path}")
            continue
        try:
            hog_vector = extract_hog_features_opencv(img, size=size)
            X_list.append(hog_vector)
            y_list.append(label_value)
        except Exception as e:
            # Best effort: report the failing file and keep going.
            print(f"Lỗi xử lý ảnh {file_path}: {e}")

    # Convert to NumPy arrays.
    X = np.array(X_list, dtype=np.float32)
    y = np.array(y_list, dtype=np.uint8)
    return X, y


### LBP 

In [7]:
def load_lbp_features_from_folder(folder_path, label_value, size=(128, 128),
                                  block_size=(16, 16), P=8, R=1, method='uniform', max_images=None):
    """Extract block-wise LBP vectors for the images of a folder.

    Parameters
    ----------
    folder_path : str
        Directory to scan; only ``.jpg`` / ``.jpeg`` / ``.png`` names
        (case-insensitive) are considered.
    label_value : int
        Label assigned to every image of this folder.
    size, block_size, P, R, method
        Forwarded to ``extract_lbp_features_blockwise``.
    max_images : int or None
        If given, only the first ``max_images`` files (in sorted name order)
        are processed; ``0`` therefore means "no files".

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` as float32 with one row per processed image and ``y`` as uint8
        labels. Files that fail are reported with ``print`` and skipped.
    """
    X_list = []
    y_list = []

    # Sort: os.listdir order is filesystem-dependent, which made the
    # max_images subset (and the row order) differ between machines/runs.
    files = sorted(
        f for f in os.listdir(folder_path)
        if f.lower().endswith(('.jpg', '.jpeg', '.png'))
    )
    # "is not None" so that max_images=0 is honoured instead of meaning "all".
    if max_images is not None:
        files = files[:max_images]

    for file in tqdm(files, desc=f"Đang xử lý {folder_path}"):
        file_path = os.path.join(folder_path, file)
        try:
            lbp_vector = extract_lbp_features_blockwise(
                file_path, size=size, block_size=block_size,
                P=P, R=R, method=method
            )
            X_list.append(lbp_vector)
            y_list.append(label_value)
        except Exception as e:
            # Best effort: report the failing file and keep going.
            print(f"Lỗi xử lý ảnh {file_path}: {e}")

    X = np.array(X_list, dtype=np.float32)
    y = np.array(y_list, dtype=np.uint8)
    return X, y

### HOG + LBP

In [63]:
def load_hog_lbp_features_from_folder(folder_path, label_value, size=(128, 128),
                                      block_size=(16, 16), P=8, R=1, method='uniform',
                                      max_images=None):
    """Extract combined HOG + LBP vectors for the images of a folder.

    Parameters
    ----------
    folder_path : str
        Directory to scan; only ``.jpg`` / ``.jpeg`` / ``.png`` names
        (case-insensitive) are considered.
    label_value : int
        Label assigned to every image of this folder.
    size : tuple of int
        Resize target used by both extractors.
    block_size, P, R, method
        LBP settings forwarded to ``extract_lbp_features_blockwise``
        (previously accepted here but never applied).
    max_images : int or None
        If given, only the first ``max_images`` files (in sorted name order)
        are processed.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` as float32 (HOG features followed by LBP features per row) and
        ``y`` as uint8 labels. Files that fail are reported and skipped.
    """
    X_list = []
    y_list = []

    # Keep image files only, in a deterministic (sorted) order.
    files = sorted(
        f for f in os.listdir(folder_path)
        if f.lower().endswith(('.jpg', '.jpeg', '.png'))
    )
    if max_images is not None:
        files = files[:max_images]

    for file in tqdm(files, desc=f"Đang xử lý {folder_path}"):
        file_path = os.path.join(folder_path, file)
        try:
            # Call the two extractors directly so the LBP settings given to
            # this loader are actually used.
            hog_vector = extract_hog_features_opencv(file_path, size=size)
            lbp_vector = extract_lbp_features_blockwise(
                file_path, size=size, block_size=block_size,
                P=P, R=R, method=method
            )
            X_list.append(np.concatenate([hog_vector, lbp_vector]))
            y_list.append(label_value)
        except Exception as e:
            # Best effort: report the failing file and keep going.
            print(f"Lỗi xử lý ảnh {file_path}: {e}")

    X = np.array(X_list, dtype=np.float32)
    y = np.array(y_list, dtype=np.uint8)
    return X, y

## Xây dựng dataframe từ vector

### Build HOG Numpy

In [64]:
def build_hog_dataset(real_folder, fake_folder, size=(128, 128), real_label=1, fake_label=0):
    """Build the HOG feature matrix and label vector from two folders.

    Parameters
    ----------
    real_folder, fake_folder : str
        Folders passed to ``load_hog_features_from_folder``.
    size : tuple of int
        Resize target forwarded to the loader.
    real_label, fake_label : int
        Labels for the two folders; the defaults (real=1, fake=0) are the
        values that used to be hard-coded.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` with the real rows first, then the fake rows, and the matching
        label vector ``y``.

    Raises
    ------
    ValueError
        If either folder yields no feature vector.
    """
    X_real, y_real = load_hog_features_from_folder(real_folder, label_value=real_label, size=size)
    X_fake, y_fake = load_hog_features_from_folder(fake_folder, label_value=fake_label, size=size)

    # An empty folder yields a 1-D empty array; stacking it either fails with
    # an opaque shape error or (when both are empty) silently returns a
    # (2, 0) matrix. Fail early with the offending folder instead.
    for folder, features in ((real_folder, X_real), (fake_folder, X_fake)):
        if len(features) == 0:
            raise ValueError(f"Không trích xuất được đặc trưng nào từ thư mục: {folder}")

    X = np.vstack([X_real, X_fake])
    y = np.concatenate([y_real, y_fake])
    return X, y

### Build LBP Numpy

In [6]:
def build_lbp_numpy(real_folder, fake_folder, size=(128, 128),
                      block_size=(16, 16), P=8, R=1, method='uniform', max_images=None,
                      real_label=1, fake_label=0):
    """Build the LBP feature matrix and label vector from two folders.

    Parameters
    ----------
    real_folder, fake_folder : str
        Folders passed to ``load_lbp_features_from_folder``.
    size, block_size, P, R, method, max_images
        Forwarded unchanged to the loader for both folders.
    real_label, fake_label : int
        Labels for the two folders; the defaults (real=1, fake=0) are the
        values that used to be hard-coded.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` with the real rows first, then the fake rows, and the matching
        label vector ``y``.

    Raises
    ------
    ValueError
        If either folder yields no feature vector.
    """
    loader_options = dict(size=size, block_size=block_size,
                          P=P, R=R, method=method, max_images=max_images)
    X_real, y_real = load_lbp_features_from_folder(real_folder, label_value=real_label,
                                                   **loader_options)
    X_fake, y_fake = load_lbp_features_from_folder(fake_folder, label_value=fake_label,
                                                   **loader_options)

    # An empty folder yields a 1-D empty array; stacking it either fails with
    # an opaque shape error or (when both are empty) silently returns a
    # (2, 0) matrix. Fail early with the offending folder instead.
    for folder, features in ((real_folder, X_real), (fake_folder, X_fake)):
        if len(features) == 0:
            raise ValueError(f"Không trích xuất được đặc trưng nào từ thư mục: {folder}")

    X = np.vstack([X_real, X_fake])
    y = np.concatenate([y_real, y_fake])
    return X, y

### Build HOG + LBP Numpy

In [4]:
def build_hog_lbp_numpy(real_folder, fake_folder, size=(128, 128), max_images=None,
                        real_label=0, fake_label=1):
    """Build the combined HOG + LBP feature matrix and labels from two folders.

    Parameters
    ----------
    real_folder, fake_folder : str
        Folders passed to ``load_hog_lbp_features_from_folder``.
    size : tuple of int
        Resize target forwarded to the loader.
    max_images : int or None
        Per-folder limit forwarded to the loader.
    real_label, fake_label : int
        Labels for the two folders. The defaults keep the values that used to
        be hard-coded here: real=0, fake=1.
        NOTE(review): ``build_hog_dataset`` and ``build_lbp_numpy`` in this
        notebook label real as 1 and fake as 0, i.e. the opposite convention;
        pass explicit labels when results must be comparable.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        ``X`` with the real rows first, then the fake rows, and the matching
        label vector ``y``.

    Raises
    ------
    ValueError
        If either folder yields no feature vector.
    """
    X_real, y_real = load_hog_lbp_features_from_folder(
        real_folder, label_value=real_label, size=size, max_images=max_images
    )
    X_fake, y_fake = load_hog_lbp_features_from_folder(
        fake_folder, label_value=fake_label, size=size, max_images=max_images
    )

    # An empty folder yields a 1-D empty array; stacking it either fails with
    # an opaque shape error or (when both are empty) silently returns a
    # (2, 0) matrix. Fail early with the offending folder instead.
    for folder, features in ((real_folder, X_real), (fake_folder, X_fake)):
        if len(features) == 0:
            raise ValueError(f"Không trích xuất được đặc trưng nào từ thư mục: {folder}")

    # Stack real rows first, then fake rows.
    X = np.vstack([X_real, X_fake])
    y = np.concatenate([y_real, y_fake])

    return X, y

## Tạo Vector

In [1]:
# Image folders for the train and test splits (real / fake classes).
real_dir = "../data/cropped/train/cropped_real"
fake_dir = "../data/cropped/train/cropped_fake"
real_test_dir = "../data/cropped/test/cropped_real"
# Was "cropped_real", which would evaluate every test run on real images only
# while labelling half of them as fake.
fake_test_dir = "../data/cropped/test/cropped_fake"

In [2]:
# Tạo thư mục nếu chưa tồn tại
# os.makedirs('../models/vector', exist_ok=True)

### HOG

In [12]:
# X, y = build_hog_dataset(real_folder="../data/cropped/train/cropped_real", 
#                          fake_folder="../data/cropped/train/cropped_fake", 
#                          size=(128, 128))

In [13]:
# X_test, y_test = build_hog_dataset(real_folder="../data/cropped/test/cropped_real", 
#                                    fake_folder="../data/cropped/test/cropped_fake", 
#                                    size=(128, 128))

In [14]:
# np.save("../models/vector/X_hog.npy", X)
# np.save("../models/vector/y_hog.npy", y)
# np.save("../models/vector/X_hog_test.npy", X_test)
# np.save("../models/vector/y_hog_test.npy", y_test)

### LBP

In [7]:
# Build the LBP feature matrix and labels from the training folders.
X_lbp, y_lbp = build_lbp_numpy(real_dir, fake_dir)

print("X_lbp shape:", X_lbp.shape)
print("y_lbp shape:", y_lbp.shape)
# Last line prints the length of each feature vector.
print("Số chiều mỗi vector:", X_lbp.shape[1])

NameError: name 'load_lbp_features_from_folder' is not defined

In [8]:
# Build the LBP feature matrix and labels from the test folders.
X_lbp_test, y_lbp_test = build_lbp_numpy(real_test_dir, fake_test_dir)

print("X_lbp test shape:", X_lbp_test.shape)
print("y_lbp test shape:", y_lbp_test.shape)
# Last line prints the length of each feature vector.
print("Số chiều mỗi vector:", X_lbp_test.shape[1])

NameError: name 'load_lbp_features_from_folder' is not defined

In [18]:
# The cell that would create this folder is commented out, so make sure it
# exists before writing; np.save does not create missing directories.
os.makedirs("../models/vector", exist_ok=True)

# Persist the LBP train/test arrays for reuse without re-extracting features.
np.save("../models/vector/X_lbp.npy", X_lbp )
np.save("../models/vector/y_lbp.npy", y_lbp )
np.save("../models/vector/X_lbp_test.npy", X_lbp_test)
np.save("../models/vector/y_lbp_test.npy", y_lbp_test)

### HOG + LBP

In [66]:
# Build the combined HOG + LBP features and labels from the training folders.
X, y = build_hog_lbp_numpy(real_dir, fake_dir)

print("X shape:", X.shape)
print("y shape:", y.shape)
# Last line prints the length of each feature vector.
print("Số chiều mỗi vector:", X.shape[1])

Đang xử lý ../data/cropped/train/cropped_real: 100%|██████████████████████████████| 9606/9606 [01:24<00:00, 113.65it/s]
Đang xử lý ../data/cropped/train/cropped_fake: 100%|██████████████████████████████| 9448/9448 [01:22<00:00, 114.98it/s]


X shape: (19054, 37796)
y shape: (19054,)
Số chiều mỗi vector: 37796


In [67]:
# Build the combined HOG + LBP features and labels from the test folders.
X_test, y_test = build_hog_lbp_numpy(real_test_dir, fake_test_dir)

# Note: the printed labels say "X"/"y" but the values are the test arrays.
print("X shape:", X_test.shape)
print("y shape:", y_test.shape)
print("Số chiều mỗi vector:", X_test.shape[1])

Đang xử lý ../data/cropped/test/cropped_real: 100%|███████████████████████████████| 2534/2534 [00:22<00:00, 114.43it/s]
Đang xử lý ../data/cropped/test/cropped_fake: 100%|███████████████████████████████| 2478/2478 [00:21<00:00, 115.90it/s]


X shape: (5012, 37796)
y shape: (5012,)
Số chiều mỗi vector: 37796


In [44]:
# Quick check of the test feature matrix dimensions.
X_test.shape

(5012, 37796)

## Training Model

In [11]:
# X = df_hog.drop("label", axis=1)  # X: toàn bộ các đặc trưng
# y = df_hog["label"]               # y: nhãn

In [13]:
# X_test_hog = df_hog_test.drop("label", axis=1)  # X: toàn bộ các đặc trưng
# y_test_hog = df_hog_test["label"]               # y: nhãn

In [26]:
# clf = SVC(kernel='linear')  # hoặc rbf
# clf.fit(X, y)
# np_clf_1 = SVC(kernel='rbf')  # hoặc rbf
# np_clf_1.fit(X, y)
# np_y_pred_1 = np_clf_1.predict(X_test)
# print(classification_report(y_test, np_y_pred_1))

              precision    recall  f1-score   support

           0       0.81      0.80      0.80      2478
           1       0.80      0.81      0.81      2534

    accuracy                           0.80      5012
   macro avg       0.80      0.80      0.80      5012
weighted avg       0.80      0.80      0.80      5012



rbf tốt hơn linear

80 > 76

In [2]:
# Load previously saved HOG feature arrays from disk.
# NOTE(review): the cells that would write these files are commented out in
# this notebook, so the .npy files must already exist — confirm.
X_hog = np.load("../models/vector/X_hog.npy")
y_hog = np.load("../models/vector/y_hog.npy")
X_hog_test = np.load("../models/vector/X_hog_test.npy")
y_hog_test = np.load("../models/vector/y_hog_test.npy")

In [68]:
# PCA to 1024 components followed by a linear SVM, trained on X / y.
pipe = Pipeline([
    ('pca', PCA(n_components=1024, svd_solver='randomized', random_state=42)),
    # Seeded like the PCA step so repeated runs give the same model.
    ('clf', LinearSVC(random_state=42))
])

# Fit on the training set
pipe.fit(X, y)

# Predict on the test set
y_pred = pipe.predict(X_test)

# Print the classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.74      0.83      0.79      2534
           1       0.81      0.70      0.75      2478

    accuracy                           0.77      5012
   macro avg       0.77      0.77      0.77      5012
weighted avg       0.77      0.77      0.77      5012



### Train với vector LBP

In [19]:
# Reload the LBP train/test arrays saved under ../models/vector.
X_lbp = np.load("../models/vector/X_lbp.npy")
y_lbp = np.load("../models/vector/y_lbp.npy")
X_lbp_test = np.load("../models/vector/X_lbp_test.npy")
y_lbp_test = np.load("../models/vector/y_lbp_test.npy")

In [24]:
# NOTE(review): the output recorded for this cell is 2-D, which does not match
# a 1-D label vector — it looks like it came from X_lbp.shape; re-run to confirm.
y_lbp.shape

(19054, 3776)

In [23]:
# clf_lbp = SVC(kernel='rbf')  # hoặc rbf
# clf_lbp.fit(X_lbp, y_lbp)
# y_pred_lbp = clf_lbp.predict(X_lbp_test)
# print(classification_report(y_lbp_test, y_pred_lbp))

              precision    recall  f1-score   support

           0       0.83      0.82      0.82      2478
           1       0.82      0.83      0.83      2534

    accuracy                           0.83      5012
   macro avg       0.83      0.83      0.83      5012
weighted avg       0.83      0.83      0.83      5012



### Áp dụng PCA 

In [39]:
# Grid-search the number of PCA components for a PCA + linear SVM pipeline
# on the LBP features (3-fold CV, F1 score).
pipe = Pipeline([
    ('pca', PCA(svd_solver='randomized', random_state=42)),
    # Seeded like the PCA step so the CV scores are repeatable.
    ('clf', LinearSVC(random_state=42))
])

param_grid = {'pca__n_components': [1024, 1500, 2000, 3000]}
grid = GridSearchCV(pipe, param_grid, cv=3, n_jobs=-1, scoring='f1')
grid.fit(X_lbp, y_lbp)
print(grid.best_params_, grid.best_score_)

{'pca__n_components': 1024} 0.8093342524654613


In [31]:
# Build the pipeline with the chosen number of PCA components
best_pipe = Pipeline([
    ('pca', PCA(n_components=1024, svd_solver='randomized', random_state=42)),
    # Seeded like the PCA step so repeated runs give the same model.
    ('clf', LinearSVC(random_state=42))
])

# Fit on the training set
best_pipe.fit(X_lbp, y_lbp)

# Predict on the test set
y_pred = best_pipe.predict(X_lbp_test)

# Print the classification report
print(classification_report(y_lbp_test, y_pred))


              precision    recall  f1-score   support

           0       0.81      0.84      0.82      2478
           1       0.83      0.80      0.82      2534

    accuracy                           0.82      5012
   macro avg       0.82      0.82      0.82      5012
weighted avg       0.82      0.82      0.82      5012



### Train LBP + HOG

In [21]:
# from joblib import dump, load

# # Tạo thư mục nếu chưa tồn tại
# os.makedirs('../models/predict_models', exist_ok=True)

# dump(clf, r'../models/predict_models/svc_model.joblib')

['../models/predict_models/svc_model.joblib']

In [27]:
# from joblib import dump, load

# # Tạo thư mục nếu chưa tồn tại
# os.makedirs('../models/predict_models', exist_ok=True)

# dump(np_clf_1, r'../models/predict_models/svc_model_numpy_rbf.joblib')

['../models/predict_models/svc_model_numpy_rbf.joblib']