In [None]:
# Version 13 – Enhanced Data Augmentation
# -----------------------------------------
# Changes compared to Version 4 Final:
# 1. Data Augmentation: For each training image, we add three augmented variants:
#    - Horizontal flip,
#    - Rotation by +10 degrees,
#    - Rotation by -10 degrees.
#    This increases the effective training set size.
# 2. The rest of the pipeline is identical to Version 4:
#    - Use original 128x128 grayscale images.
#    - Feature Extraction:
#         * SIFT Fixed: Top 23 keypoints (sorted by response) flattened.
#         * SIFT Average: Average SIFT descriptor (128-d).
#         * HOG: Standard parameters (9 orientations, 8x8 cell, 2x2 block, L2-Hys).
#         * LBP: Normalized histogram (radius=2, 16 points, bins 0–10).
#    - Features are concatenated and scaled using StandardScaler.
#    - Logistic Regression is tuned using GridSearchCV (solver lbfgs).
# 3. No PCA is applied (since PCA previously hurt performance).
# -----------------------------------------

import os
import cv2
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from skimage.feature import hog
import mahotas
import time

#########################
# Data Loading Function
#########################
def load_data(base_path='../Database/'):
    """Load the chest X-ray images as 128x128 grayscale arrays with class labels.

    Images under ``Normal`` are labelled ``"Normal"``; images under
    ``Lung_Opacity`` and ``Viral Pneumonia`` are merged into the single
    class ``"Lung_Disease"``.

    Args:
        base_path: Directory that contains the three class sub-folders.

    Returns:
        A ``(data, labels)`` tuple of NumPy arrays: ``data`` stacks the
        resized images and ``labels`` holds the matching class-name strings.

    Raises:
        OSError: Propagated from ``os.listdir`` when a class folder is
            missing or cannot be read.
    """
    # (sub-folder, class label) pairs, in the order the images are loaded.
    folder_labels = (
        ("Normal", "Normal"),
        ("Lung_Opacity", "Lung_Disease"),
        ("Viral Pneumonia", "Lung_Disease"),
    )

    data, labels = [], []
    for folder, label in folder_labels:
        folder_path = os.path.join(base_path, folder)
        # os.listdir returns entries in arbitrary order. Sorting keeps the
        # sample order stable, so a later seeded train/test split selects the
        # same images on every machine and filesystem.
        for img_name in sorted(os.listdir(folder_path)):
            img = cv2.imread(os.path.join(folder_path, img_name), cv2.IMREAD_GRAYSCALE)
            if img is None:
                # Not a readable image (imread signals failure with None).
                continue
            data.append(cv2.resize(img, (128, 128)))
            labels.append(label)
    return np.array(data), np.array(labels)

#########################
# Data Augmentation Functions
#########################
def horizontal_flip(image):
    """Return ``image`` mirrored left-to-right."""
    # A positive flip code makes cv2.flip mirror around the vertical axis.
    flip_code = 1
    return cv2.flip(image, flip_code)

def rotate_image(image, angle):
    """Rotate ``image`` by ``angle`` degrees about its centre, keeping its size.

    Args:
        image: Array whose first two dimensions are (height, width).
        angle: Rotation angle in degrees, passed to
            ``cv2.getRotationMatrix2D`` (positive is counter-clockwise
            according to the OpenCV documentation).

    Returns:
        The rotated image with the same width and height as the input.
        Pixels that fall outside the source are filled by reflecting the
        border, and values are resampled with bilinear interpolation.
    """
    height, width = image.shape[:2]
    pivot = (width // 2, height // 2)
    # Scale factor 1.0: pure rotation, no zoom.
    rotation = cv2.getRotationMatrix2D(pivot, angle, 1.0)
    return cv2.warpAffine(
        image,
        rotation,
        (width, height),
        flags=cv2.INTER_LINEAR,
        borderMode=cv2.BORDER_REFLECT,
    )

#########################
# Feature Extraction Functions
#########################
def extract_sift_fixed(images, num_keypoints=23):
    """Build a fixed-length SIFT feature vector for every image.

    For each image the descriptors of the ``num_keypoints`` keypoints with
    the highest detector response are concatenated. Images that yield fewer
    keypoints are zero-padded; images with no keypoints give an all-zero
    vector.

    Args:
        images: Iterable of images accepted by ``cv2.SIFT.detectAndCompute``.
        num_keypoints: Number of descriptors kept per image.

    Returns:
        ``np.ndarray`` with one row of ``num_keypoints * 128`` values per image.
    """
    detector = cv2.SIFT_create()
    rows = []
    for image in images:
        kps, descs = detector.detectAndCompute(image, None)
        if descs is None or len(kps) == 0:
            rows.append(np.zeros(num_keypoints * 128))
            continue

        # Strongest keypoints first; sorted() is stable, so ties keep
        # their detection order.
        ranked = sorted(zip(kps, descs), key=lambda pair: pair[0].response, reverse=True)
        strongest = [desc for _, desc in ranked[:num_keypoints]]

        # Pad with zero descriptors so every row has the same length.
        missing = num_keypoints - len(strongest)
        if missing > 0:
            strongest.extend(np.zeros(128) for _ in range(missing))
        rows.append(np.hstack(strongest))
    return np.array(rows)

def extract_sift_avg(images):
    """Compute the mean SIFT descriptor of every image.

    Args:
        images: Iterable of images accepted by ``cv2.SIFT.detectAndCompute``.

    Returns:
        ``np.ndarray`` with one 128-value row per image: the element-wise
        mean of all SIFT descriptors, or zeros when none were found.
    """
    detector = cv2.SIFT_create()
    pooled = []
    for image in images:
        _, descs = detector.detectAndCompute(image, None)
        has_descriptors = descs is not None and len(descs) > 0
        pooled.append(np.mean(descs, axis=0) if has_descriptors else np.zeros(128))
    return np.array(pooled)

def extract_hog_features(images):
    """Compute a HOG descriptor for every image.

    Uses 9 orientation bins, 8x8-pixel cells, 2x2-cell blocks and L2-Hys
    block normalisation.

    Args:
        images: Iterable of images accepted by ``skimage.feature.hog``.

    Returns:
        ``np.ndarray`` holding one HOG feature vector per image.
    """
    return np.array([
        hog(
            image,
            orientations=9,
            pixels_per_cell=(8, 8),
            cells_per_block=(2, 2),
            block_norm='L2-Hys',
            visualize=False,
        )
        for image in images
    ])

def extract_lbp_features(images):
    """Compute a 10-bin normalised LBP-based histogram for every image.

    The output of ``mahotas.features.lbp`` (radius 2, 16 points) is
    histogrammed over the integer bin edges 0..10 with ``density=True``.

    Args:
        images: Iterable of images accepted by ``mahotas.features.lbp``.

    Returns:
        ``np.ndarray`` with one 10-value histogram per image.
    """
    # NOTE(review): the mahotas documentation describes features.lbp as
    # already returning a histogram of LBP codes; confirm that taking a
    # histogram of that histogram is intended.
    bin_edges = np.arange(0, 11)
    histograms = []
    for image in images:
        codes = mahotas.features.lbp(image, radius=2, points=16, ignore_zeros=False)
        counts, _edges = np.histogram(codes, bins=bin_edges, density=True)
        histograms.append(counts)
    return np.array(histograms)

def extract_features(images, num_keypoints=23):
    """Concatenate all hand-crafted descriptors into one feature matrix.

    Column order: fixed-length SIFT, averaged SIFT, HOG, LBP.

    Args:
        images: Iterable of grayscale images.
        num_keypoints: Number of SIFT keypoints kept by ``extract_sift_fixed``.

    Returns:
        ``np.ndarray`` with one concatenated feature row per image.
    """
    feature_blocks = (
        extract_sift_fixed(images, num_keypoints=num_keypoints),
        extract_sift_avg(images),
        extract_hog_features(images),
        extract_lbp_features(images),
    )
    return np.hstack(feature_blocks)

#########################
# Experiment Pipeline
#########################
def run_experiment(augmentation=False, num_keypoints=23):
    """Train and evaluate the logistic-regression pipeline once.

    Steps: load the data, make a stratified 80/20 split, optionally augment
    the training images, extract and standardise features, tune logistic
    regression with a 5-fold grid search, and score on the held-out split.

    Args:
        augmentation: When true, every training image is accompanied by its
            horizontal flip and its +10 / -10 degree rotations.
        num_keypoints: Number of SIFT keypoints used for the fixed-length
            SIFT features.

    Returns:
        Tuple ``(accuracy, best_params, report, scaler, label_encoder)``:
        the test accuracy, the grid-search winning parameters, the text
        classification report, and the fitted scaler and label encoder.
    """
    images, class_names = load_data()
    label_encoder = LabelEncoder()
    targets = label_encoder.fit_transform(class_names)
    X_train, X_test, y_train, y_test = train_test_split(
        images, targets, test_size=0.2, random_state=42, stratify=targets
    )

    # Only the training split is augmented; the test split stays untouched.
    if augmentation:
        aug_images, aug_labels = [], []
        for img, label in zip(X_train, y_train):
            variants = (
                img,
                horizontal_flip(img),
                rotate_image(img, 10),
                rotate_image(img, -10),
            )
            aug_images.extend(variants)
            aug_labels.extend([label] * len(variants))
        X_train = np.array(aug_images)
        y_train = np.array(aug_labels)

    train_features = extract_features(X_train, num_keypoints=num_keypoints)
    test_features = extract_features(X_test, num_keypoints=num_keypoints)

    # The scaler is fitted on training features only and reused for the test set.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(train_features)
    X_test_scaled = scaler.transform(test_features)

    # NOTE(review): max_iter is an iteration cap rather than a model
    # hyper-parameter; keeping it in the grid triples the number of fits.
    # Confirm it is needed.
    param_grid = {
        'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
        'max_iter': [1000, 2000, 3000],
    }
    base_model = LogisticRegression(random_state=42, solver='lbfgs')
    grid = GridSearchCV(base_model, param_grid, cv=5, n_jobs=-1)
    grid.fit(X_train_scaled, y_train)

    y_pred = grid.best_estimator_.predict(X_test_scaled)
    acc = accuracy_score(y_test, y_pred)
    report = classification_report(y_test, y_pred, target_names=label_encoder.classes_)
    return acc, grid.best_params_, report, scaler, label_encoder

#########################
# Run Experiments
#########################
# Each keypoint budget is evaluated first without and then with augmentation.
experiments = [
    {"augmentation": use_aug, "num_keypoints": kp_count}
    for kp_count in (23, 25, 30)
    for use_aug in (False, True)
]

# Maps a human-readable experiment key to its metrics and wall-clock time.
results = {}
for exp in experiments:
    key = "Aug={}, KP={}".format(exp["augmentation"], exp["num_keypoints"])
    print(f"Running experiment: {key}")

    start = time.time()
    # The dict keys match run_experiment's keyword parameters.
    acc, best_params, report, scaler, label_encoder = run_experiment(**exp)
    duration = time.time() - start

    results[key] = dict(accuracy=acc, best_params=best_params, report=report, time=duration)
    print(f"Experiment {key}: Accuracy={acc:.4f}, Best Params={best_params}, Time={duration:.1f}s")
    print(report)
    print("-" * 80)

print("Summary of Experiments:")
for key, res in results.items():
    summary = f"{key}: Accuracy={res['accuracy']:.4f}, Best Params={res['best_params']}, Time={res['time']:.1f}s"
    print(summary)


Running experiment: Aug=False, KP=23
Experiment Aug=False, KP=23: Accuracy=0.8932, Best Params={'C': 0.001, 'max_iter': 1000}, Time=103.8s
              precision    recall  f1-score   support

Lung_Disease       0.92      0.91      0.91       418
      Normal       0.85      0.86      0.86       247

    accuracy                           0.89       665
   macro avg       0.89      0.89      0.89       665
weighted avg       0.89      0.89      0.89       665

--------------------------------------------------------------------------------
Running experiment: Aug=True, KP=23
