In [1]:
import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from skimage.feature import hog
import mahotas

# -----------------------------
# 1. Data Loading (Binary Classification)
# -----------------------------
base_path = '../Database/'
data, labels = [], []

# Load Normal images (class = "Normal")
normal_path = os.path.join(base_path, "Normal")
for img_name in os.listdir(normal_path):
    img_path = os.path.join(normal_path, img_name)
    img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
    if img is None:
        continue
    img = cv2.resize(img, (128, 128))
    data.append(img)
    labels.append("Normal")

# Merge Lung Disease images from "Lung_Opacity" and "Viral Pneumonia" into one class ("Lung_Disease")
lung_folders = ["Lung_Opacity", "Viral Pneumonia"]
for folder in lung_folders:
    folder_path = os.path.join(base_path, folder)
    for img_name in os.listdir(folder_path):
        img_path = os.path.join(folder_path, img_name)
        img = cv2.imread(img_path, cv2.IMREAD_GRAYSCALE)
        if img is None:
            continue
        img = cv2.resize(img, (128, 128))
        data.append(img)
        labels.append("Lung_Disease")

data = np.array(data)
labels = np.array(labels)
print("Total images loaded:", data.shape[0])  # ~3472 images

# Encode labels (e.g., Normal=0, Lung_Disease=1)
label_encoder = LabelEncoder()
labels_encoded = label_encoder.fit_transform(labels)

# Split the dataset (stratified)
X_train, X_test, y_train, y_test = train_test_split(
    data, labels_encoded, test_size=0.2, random_state=42, stratify=labels_encoded
)

# -----------------------------
# 2. Use Original Images for Feature Extraction
# -----------------------------
# The paper reports best performance on original images.
X_train_proc = X_train
X_test_proc  = X_test

# -----------------------------
# 3. Feature Extraction Functions
# -----------------------------
def extract_sift_fixed(images, num_keypoints=25):
    """
    For each image, detect SIFT keypoints and descriptors,
    sort them by response (strongest first), take the top num_keypoints,
    and flatten them into a fixed-length vector.
    If an image has fewer than num_keypoints, pad with zeros.
    """
    sift = cv2.SIFT_create()
    features = []
    for img in images:
        keypoints, descriptors = sift.detectAndCompute(img, None)
        if descriptors is None or len(keypoints) == 0:
            feat = np.zeros(num_keypoints * 128)
        else:
            kp_desc = list(zip(keypoints, descriptors))
            kp_desc.sort(key=lambda x: x[0].response, reverse=True)
            selected = kp_desc[:num_keypoints]
            descs = [d for kp, d in selected]
            if len(descs) < num_keypoints:
                pad = [np.zeros(128) for _ in range(num_keypoints - len(descs))]
                descs.extend(pad)
            else:
                descs = descs[:num_keypoints]
            feat = np.hstack(descs)
        features.append(feat)
    return np.array(features)

def extract_hog_features(images):
    """
    Extract HOG features using standard parameters.
    """
    features = []
    for img in images:
        hog_feat = hog(img, orientations=9, pixels_per_cell=(8, 8),
                       cells_per_block=(2, 2), block_norm='L2-Hys', visualize=False)
        features.append(hog_feat)
    return np.array(features)

def extract_lbp_features(images):
    """
    Extract LBP features by computing a normalized histogram of LBP codes.
    We use a radius of 2, 16 points, and bins from 0 to 10.
    """
    features = []
    for img in images:
        lbp = mahotas.features.lbp(img, radius=2, points=16, ignore_zeros=False)
        hist, _ = np.histogram(lbp, bins=np.arange(0, 11), density=True)
        features.append(hist)
    return np.array(features)

# Extract features from training images
sift_train = extract_sift_fixed(X_train_proc, num_keypoints=25)
hog_train  = extract_hog_features(X_train_proc)
lbp_train  = extract_lbp_features(X_train_proc)
X_train_features = np.hstack((sift_train, hog_train, lbp_train))

# Extract features from test images
sift_test = extract_sift_fixed(X_test_proc, num_keypoints=25)
hog_test  = extract_hog_features(X_test_proc)
lbp_test  = extract_lbp_features(X_test_proc)
X_test_features = np.hstack((sift_test, hog_test, lbp_test))

# -----------------------------
# 4. Feature Scaling
# -----------------------------
scaler = StandardScaler()
X_train_features = scaler.fit_transform(X_train_features)
X_test_features = scaler.transform(X_test_features)

# -----------------------------
# 5. Model Training with Hyperparameter Tuning
# -----------------------------
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100], 'max_iter': [1000, 2000, 3000]}
grid = GridSearchCV(
    LogisticRegression(random_state=42, solver='lbfgs'),
    param_grid, cv=5, n_jobs=-1
)
grid.fit(X_train_features, y_train)
print("Best parameters:", grid.best_params_)
model = grid.best_estimator_

# -----------------------------
# 6. Evaluation
# -----------------------------
y_pred = model.predict(X_test_features)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))

# -----------------------------
# 7. Prediction on a New Image
# -----------------------------
def preprocess_and_extract(image):
    """
    Apply the same pipeline to a new image:
    extract fixed SIFT features, HOG, and LBP, then scale the vector.
    """
    sift_feat = extract_sift_fixed([image], num_keypoints=25)
    hog_feat  = extract_hog_features([image])
    lbp_feat  = extract_lbp_features([image])
    feat = np.hstack((sift_feat, hog_feat, lbp_feat))
    return scaler.transform(feat)

new_image = cv2.imread('../Sample_Test/VN1/5.jpeg', cv2.IMREAD_GRAYSCALE)
new_image = cv2.resize(new_image, (128, 128))
new_features = preprocess_and_extract(new_image)
prediction = model.predict(new_features)
print("Predicted Class:", label_encoder.inverse_transform(prediction))


Total images loaded: 3472
Best parameters: {'C': 0.001, 'max_iter': 1000}
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.92      0.92       445
           1       0.86      0.85      0.86       250

    accuracy                           0.90       695
   macro avg       0.89      0.89      0.89       695
weighted avg       0.90      0.90      0.90       695

Accuracy: 0.897841726618705
Predicted Class: ['Lung_Disease']
