#### Custom CNN training (backbone for feature extraction)

In [None]:

import os, json, gc
import tensorflow as tf
from tensorflow.keras import layers as L
from tensorflow.keras import Model
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint


# ---- Runtime setup ---------------------------------------------------------
# Remove XLA_FLAGS from this process's environment if it is set.
os.environ.pop("XLA_FLAGS", None)
tf.config.set_soft_device_placement(True)
tf.config.threading.set_intra_op_parallelism_threads(2)
tf.config.threading.set_inter_op_parallelism_threads(2)

gpus = tf.config.list_physical_devices("GPU")
for g in gpus:
    try:
        tf.config.experimental.set_memory_growth(g, True)
    except Exception as exc:
        # Best effort: training can continue without memory growth, but report
        # the reason instead of hiding it.
        print(f"Could not enable memory growth on {g.name}: {exc}")

# Mixed precision is only attempted when at least one GPU is visible.
if gpus:
    try:
        tf.keras.mixed_precision.set_global_policy("mixed_float16")
        print("Mixed precision ON")
    except Exception:
        print("Mixed precision not enabled; continuing in float32.")


# ---- Paths and hyper-parameters --------------------------------------------
OUT_DIR   = r'D:\Research\Custom CNN\Without Augmented'
os.makedirs(OUT_DIR, exist_ok=True)

data_root = r'D:\Research\Custom CNN\Without Augmented\Original Image'  # class subfolders
IMG_SIZE  = (224, 224)
BATCH     = 64
VAL_SPLIT = 0.2
SEED      = 42

# Every artifact written by this cell lives under OUT_DIR.
FULL_MODEL_H5      = os.path.join(OUT_DIR, "custom_cnn_full.h5")
WEIGHTS_H5         = os.path.join(OUT_DIR, "custom_cnn.weights.h5")
FEAT_EXTRACTOR_H5  = os.path.join(OUT_DIR, "custom_cnn_feature_extractor.h5")
META_JSON          = os.path.join(OUT_DIR, "custom_cnn_meta.json")
CKPT_WEIGHTS       = os.path.join(OUT_DIR, "custom_cnn_best.weights.h5")

# Both subsets MUST be indexed with the same `shuffle` and `seed`.
# image_dataset_from_directory orders the file list first (shuffled with `seed`
# when shuffle=True, class-sorted otherwise) and only then slices off the last
# `validation_split` fraction. Using shuffle=True for training but shuffle=False
# for validation therefore splits two *different* orderings: validation becomes
# the tail of the class-sorted list (i.e. only the last class folder(s)) and can
# overlap the training files. That is consistent with the val_accuracy of 0.0
# seen in earlier runs of this cell.
train_ds = tf.keras.utils.image_dataset_from_directory(
    data_root, labels="inferred", label_mode="int",
    image_size=IMG_SIZE, batch_size=BATCH,
    validation_split=VAL_SPLIT, subset="training", seed=SEED, shuffle=True
)
val_ds = tf.keras.utils.image_dataset_from_directory(
    data_root, labels="inferred", label_mode="int",
    image_size=IMG_SIZE, batch_size=BATCH,
    validation_split=VAL_SPLIT, subset="validation", seed=SEED, shuffle=True
)

# Leakage guard: no file may appear in both subsets. `file_paths` is read via
# getattr so the check is simply skipped if the attribute is unavailable.
_shared_files = set(getattr(train_ds, "file_paths", [])) & set(getattr(val_ds, "file_paths", []))
assert not _shared_files, f"{len(_shared_files)} files appear in both train and validation subsets"

# Class names must be read before .map()/.prefetch(), which return plain datasets.
class_names = train_ds.class_names
num_classes = len(class_names)
assert val_ds.class_names == class_names, "train/validation class lists differ"

def norm(x, y):
    """Cast the image batch to float32 and divide by 255; the label passes through unchanged."""
    scaled = tf.cast(x, tf.float32) / 255.0
    return scaled, y

# Apply the /255 scaling to both subsets, then prefetch two batches ahead.
train_ds = train_ds.map(norm, num_parallel_calls=2)
train_ds = train_ds.prefetch(2)
val_ds = val_ds.map(norm, num_parallel_calls=2)
val_ds = val_ds.prefetch(2)


# Random augmentation stack; build_custom_cnn applies it directly to the model input.
aug = tf.keras.Sequential(name="aug")
aug.add(L.RandomFlip("horizontal"))
aug.add(L.RandomRotation(0.05))
aug.add(L.RandomZoom(0.1))


def conv_block(x, filters, k=3, s=1, p="same"):
    """Apply Conv2D (no bias) -> BatchNormalization -> ReLU to `x`.

    Args:
        x: input tensor.
        filters: number of convolution filters.
        k: kernel size.
        s: stride.
        p: padding mode passed to Conv2D.

    Returns:
        The tensor produced by the final ReLU.
    """
    conv = L.Conv2D(filters, k, strides=s, padding=p, use_bias=False)
    bn = L.BatchNormalization()
    relu = L.ReLU()
    return relu(bn(conv(x)))

def build_custom_cnn(input_shape=(224,224,3), n_classes=3, feature_dim=256, dropout=0.5):
    """Build the classifier and a feature extractor that shares its layers.

    Layout: module-level `aug` -> three stages of (conv_block x2 + 2x2 max-pool)
    with 32/64/128 filters -> channel gate (global average pool -> Dense(32, relu)
    -> Dense(128, sigmoid), multiplied back onto the feature map) -> global
    average pool -> Dense(feature_dim, relu) -> Dropout -> softmax.

    Args:
        input_shape: shape of one input image (H, W, C).
        n_classes: width of the softmax output.
        feature_dim: width of the `feature_dense` layer.
        dropout: dropout rate applied after `feature_dense`.

    Returns:
        (model, feat_model): the classifier, and a model from the same input to
        the `feature_dense` activations.
    """
    inputs = L.Input(shape=input_shape)
    x = aug(inputs)

    # Three convolutional stages; each halves the spatial resolution.
    for width in (32, 64, 128):
        x = conv_block(x, width)
        x = conv_block(x, width)
        x = L.MaxPooling2D(2)(x)

    # Channel gate: per-channel weights in (0, 1) computed from the pooled map.
    squeezed = L.GlobalAveragePooling2D()(x)
    gate = L.Dense(128//4, activation="relu", dtype="float32")(squeezed)
    gate = L.Dense(128, activation="sigmoid", dtype="float32")(gate)
    gate = L.Reshape((1,1,128))(gate)
    x = L.Multiply()([x, gate])

    pooled = L.GlobalAveragePooling2D(name="gap")(x)
    feat = L.Dense(feature_dim, activation="relu", name="feature_dense", dtype="float32")(pooled)
    dropped = L.Dropout(dropout)(feat)
    # The layer is named "logits" but it applies softmax, so it outputs probabilities.
    outputs = L.Dense(n_classes, activation="softmax", name="logits", dtype="float32")(dropped)

    classifier = Model(inputs, outputs, name="custom_cnn")
    extractor = Model(inputs, feat, name="custom_cnn_feature_extractor")
    return classifier, extractor

# Build the classifier together with the feature extractor that shares its layers.
model, feat_model = build_custom_cnn(
    input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3),
    n_classes=num_classes,
    feature_dim=256,
    dropout=0.5,
)

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=["accuracy"],
)


# Halve the learning rate when val_accuracy stalls, and keep the weights of the
# best epoch (by val_accuracy) on disk.
callbacks = [
    ReduceLROnPlateau(
        monitor="val_accuracy", factor=0.5, patience=3, verbose=1, min_lr=1e-6
    ),
    ModelCheckpoint(
        CKPT_WEIGHTS,
        monitor="val_accuracy",
        save_best_only=True,
        save_weights_only=True,
        verbose=1,
    ),
]

EPOCHS = 20
history = model.fit(
    train_ds,
    validation_data=val_ds,
    epochs=EPOCHS,
    callbacks=callbacks,
    verbose=1,
)

# Restore the checkpoint only when some earlier epoch beat the final one.
val_acc = history.history.get("val_accuracy", [])
best_val = max(val_acc) if len(val_acc) else None
last_val = val_acc[-1] if len(val_acc) else None
if best_val is not None and best_val > last_val and os.path.exists(CKPT_WEIGHTS):
    print(f"Loading best weights (val_acc {best_val:.4f} > last {last_val:.4f})")
    model.load_weights(CKPT_WEIGHTS)
else:
    print("Keeping final epoch weights (no improvement over last OR no checkpoint).")

# Persist the full model, its weights, and the feature extractor.
model.save(FULL_MODEL_H5)
model.save_weights(WEIGHTS_H5)
feat_model.save(FEAT_EXTRACTOR_H5)

# Metadata describing this training run.
meta = {
    "img_size": IMG_SIZE,
    "num_classes": num_classes,
    "class_names": class_names,
    "feature_dim": 256,
    "epochs_trained": int(len(history.history.get("loss", []))),
    "best_val_accuracy": float(best_val) if best_val is not None else None,
    "last_val_accuracy": float(last_val) if last_val is not None else None,
    "out_dir": OUT_DIR,
}
with open(META_JSON, "w") as f:
    json.dump(meta, f, indent=2)

print(f"Saved full model (.h5): {FULL_MODEL_H5}")
print(f"Saved weights (.weights.h5): {WEIGHTS_H5}")
print(f"Saved feature extractor (.h5): {FEAT_EXTRACTOR_H5}")
print(f"Saved meta: {META_JSON}")

# Release Python garbage and reset Keras global state.
gc.collect()
tf.keras.backend.clear_session()



Found 2195 files belonging to 6 classes.
Using 1756 files for training.
Found 2195 files belonging to 6 classes.
Using 439 files for validation.
Epoch 1/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30s/step - accuracy: 0.4000 - loss: 1.5029 
Epoch 1: val_accuracy improved from None to 0.00000, saving model to D:\Research\Custom CNN\Without Augmented\custom_cnn_best.weights.h5
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m896s[0m 32s/step - accuracy: 0.4710 - loss: 1.3375 - val_accuracy: 0.0000e+00 - val_loss: 1.8443 - learning_rate: 0.0010
Epoch 2/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38s/step - accuracy: 0.5564 - loss: 1.0974 
Epoch 2: val_accuracy did not improve from 0.00000
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1101s[0m 39s/step - accuracy: 0.5598 - loss: 1.0944 - val_accuracy: 0.0000e+00 - val_loss: 2.2533 - learning_rate: 0.0010
Epoch 3/20
[1m28/28[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s



Keeping final epoch weights (no improvement over last OR no checkpoint).




Saved full model (.h5): D:\Research\Custom CNN\Without Augmented\custom_cnn_full.h5
Saved weights (.weights.h5): D:\Research\Custom CNN\Without Augmented\custom_cnn.weights.h5
Saved feature extractor (.h5): D:\Research\Custom CNN\Without Augmented\custom_cnn_feature_extractor.h5
Saved meta: D:\Research\Custom CNN\Without Augmented\custom_cnn_meta.json






#### Feature extraction pipeline 

In [None]:
import os, json, glob, math, random
import numpy as np
import pandas as pd
from tqdm import tqdm

import tensorflow as tf
from tensorflow.keras import Model
from tensorflow.keras.layers import (
    Input, Conv2D, MaxPooling2D, BatchNormalization, Dropout,
    GlobalAveragePooling2D, Dense
)
from tensorflow.keras.applications import EfficientNetB0
from tensorflow.keras.applications.efficientnet import preprocess_input as eff_preprocess


# ---- Reproducibility -------------------------------------------------------
SEED = 42
np.random.seed(SEED)
random.seed(SEED)
tf.keras.utils.set_random_seed(SEED)
try:
    tf.config.experimental.enable_op_determinism()
except Exception as exc:
    # Best effort: extraction still works without it, but report the reason.
    print(f"Op determinism not enabled: {exc}")


# Flip the leading `True` to False to force float32 even when a GPU is present.
USE_MIXED_PRECISION = True and bool(tf.config.list_physical_devices('GPU'))
if USE_MIXED_PRECISION:
    try:
        tf.keras.mixed_precision.set_global_policy('mixed_float16')
        print("Mixed precision enabled (global_policy = 'mixed_float16').")
    except Exception:
        print("Could not enable mixed precision; proceeding in float32.")


# ---- Data and output locations ---------------------------------------------
IMG_SIZE = (224, 224)
BATCH_SIZE = 64
NUM_WORKERS = tf.data.AUTOTUNE


data_root = r'D:\Research\Custom CNN\Without Augmented\Original Image'

OUT_DIR = r'D:\Research\Custom CNN\Without Augmented'
# The CSV/JSON outputs below are written here, so make sure the folder exists.
os.makedirs(OUT_DIR, exist_ok=True)


# One class per sub-folder; sorted so that class indices are stable across runs.
class_dirs = sorted([d for d in os.listdir(data_root) if os.path.isdir(os.path.join(data_root, d))])
if not class_dirs:
    raise RuntimeError(f"No class folders found under {data_root}")

class_indices = {c: i for i, c in enumerate(class_dirs)}
index_to_class = {i: c for c, i in class_indices.items()}

# 'custom' selects the custom CNN; any other value selects EfficientNetB0.
BACKBONE = 'custom'

# The training cell writes its weights to OUT_DIR/custom_cnn.weights.h5, so the
# path is anchored there instead of depending on the kernel's working directory.
CUSTOM_WEIGHTS_PATH = os.path.join(OUT_DIR, 'custom_cnn.weights.h5')
EFF_WEIGHTS = 'imagenet'

OUT_BASENAME = 'features_256d_' + BACKBONE
CSV_PATH = os.path.join(OUT_DIR, f'{OUT_BASENAME}.csv')
PARQUET_PATH = os.path.join(OUT_DIR, f'{OUT_BASENAME}.parquet')
META_JSON = os.path.join(OUT_DIR, f'{OUT_BASENAME}_meta.json')


def build_custom_cnn(num_classes: int, input_shape=(224, 224, 3), feature_dim=256, dropout=0.5):
    """Rebuild the custom CNN with the same layer structure as the training cell.

    The weights file produced by training can only be restored into a network
    with a matching layer structure. The previous definition here was a
    different network (one biased Conv2D per stage, no second conv, no separate
    ReLU layers, no channel gate, no augmentation stack), so the trained weights
    could not be mapped onto it. This version mirrors the training-time builder
    layer for layer:

        aug (RandomFlip / RandomRotation / RandomZoom)
        -> 3 stages of [Conv2D(no bias) -> BatchNorm -> ReLU] x2 + MaxPool(2)
           with 32 / 64 / 128 filters
        -> channel gate: GAP -> Dense(32, relu) -> Dense(128, sigmoid) -> multiply
        -> GAP('gap') -> Dense(feature_dim, relu, 'feature_dense')
        -> Dropout -> Dense(num_classes, softmax, 'logits')

    The augmentation layers are part of the graph so that the structure matches;
    callers that invoke the model with `training=False` do not get random
    transforms applied.

    Args:
        num_classes: width of the softmax output.
        input_shape: shape of one input image (H, W, C).
        feature_dim: width of the `feature_dense` layer (training used 256).
        dropout: dropout rate after `feature_dense` (training used 0.5).

    Returns:
        (cls_model, feat_model): the classifier, and a model from the same input
        to the `feature_dense` activations.
    """
    kl = tf.keras.layers  # layers that are not imported by name in this cell

    def conv_bn_relu(t, filters):
        # Same block as the training cell's conv_block with its default arguments.
        t = Conv2D(filters, 3, strides=1, padding='same', use_bias=False)(t)
        t = BatchNormalization()(t)
        return kl.ReLU()(t)

    inputs = Input(shape=input_shape)
    augment = tf.keras.Sequential([
        kl.RandomFlip("horizontal"),
        kl.RandomRotation(0.05),
        kl.RandomZoom(0.1),
    ], name="aug")
    x = augment(inputs)

    for filters in (32, 64, 128):
        x = conv_bn_relu(x, filters)
        x = conv_bn_relu(x, filters)
        x = MaxPooling2D(2)(x)

    # Channel gate over the 128-channel feature map.
    squeezed = GlobalAveragePooling2D()(x)
    gate = Dense(128 // 4, activation='relu', dtype='float32')(squeezed)
    gate = Dense(128, activation='sigmoid', dtype='float32')(gate)
    x = kl.Multiply()([x, kl.Reshape((1, 1, 128))(gate)])

    x = GlobalAveragePooling2D(name='gap')(x)
    penultimate = Dense(feature_dim, activation='relu', dtype='float32', name='feature_dense')(x)
    x = Dropout(dropout)(penultimate)
    outputs = Dense(num_classes, activation='softmax', dtype='float32', name='logits')(x)

    cls_model = Model(inputs, outputs, name='custom_cnn')
    feat_model = Model(inputs, penultimate, name='custom_cnn_feature_extractor')
    return cls_model, feat_model


def build_efficientnet_feature_model(output_dim=256, input_shape=(224, 224, 3), weights='imagenet'):
    """Build an EfficientNetB0-based feature extractor.

    The headless EfficientNetB0 output is global-average-pooled and projected
    by a ReLU Dense layer of width `output_dim`.

    NOTE(review): this function loads no weights for the Dense projection, so it
    starts from its fresh initialisation — confirm that is intended.

    Args:
        output_dim: width of the `feature_dense` projection.
        input_shape: shape of one input image (H, W, C).
        weights: passed straight to EfficientNetB0's `weights` argument.

    Returns:
        A Model mapping the backbone input to the `feature_dense` activations.
    """
    backbone = EfficientNetB0(include_top=False, weights=weights, input_shape=input_shape)

    pooled = GlobalAveragePooling2D(name='gap', dtype='float32')(backbone.output)
    embedding = Dense(output_dim, activation='relu', dtype='float32', name='feature_dense')(pooled)
    return Model(backbone.input, embedding, name='efficientnet_feature_extractor')



# Choose the feature extractor and the matching pixel preprocessing.
if BACKBONE == 'custom':
    cls_model, feature_model = build_custom_cnn(num_classes=len(class_dirs), input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3))
    # feature_model shares its layers with cls_model, so loading into cls_model
    # also populates the extractor.
    if CUSTOM_WEIGHTS_PATH and os.path.exists(CUSTOM_WEIGHTS_PATH):
        try:
            # `by_name` / `skip_mismatch` are rejected for `.weights.h5` files
            # ("`by_name` only supports loading legacy '.h5' or '.hdf5' files"),
            # which previously made every load fail and left the network at its
            # random initialisation. Load strictly instead; a structural mismatch
            # is reported rather than silently skipped.
            cls_model.load_weights(CUSTOM_WEIGHTS_PATH)
            print(f"Loaded custom CNN weights from: {CUSTOM_WEIGHTS_PATH}")
        except Exception as e:
            print(f"Could not load custom weights: {e}\nProceeding with random init (features will be weak).")
    else:
        # Previously this case produced no message at all.
        print(f"Custom weights not found at: {CUSTOM_WEIGHTS_PATH}\nProceeding with random init (features will be weak).")

    def preprocess_fn(x):
        """Cast to float32 and divide by 255 (same scaling as the training cell's `norm`)."""
        return tf.cast(x, tf.float32) / 255.0
else:
    feature_model = build_efficientnet_feature_model(
        output_dim=256, input_shape=(IMG_SIZE[0], IMG_SIZE[1], 3), weights=EFF_WEIGHTS
    )
    preprocess_fn = eff_preprocess

# Collect (path, class name, class index) for every image, one class folder at a time.
IMAGE_PATTERNS = ('*.jpg', '*.jpeg', '*.png')

records = []
for cls in class_dirs:
    folder = os.path.join(data_root, cls)
    files = sorted(
        match
        for pattern in IMAGE_PATTERNS
        for match in glob.glob(os.path.join(folder, pattern))
    )
    records.extend((path, cls, class_indices[cls]) for path in files)

if not records:
    raise RuntimeError(f"No images found under {data_root} (searched *.jpg, *.jpeg, *.png).")

# Parallel lists consumed by the tf.data pipeline.
# NOTE(review): `fnames` holds base names only, so equal file names in different
# class folders are indistinguishable in this list.
paths = [path for path, _, _ in records]
labels = [idx for _, _, idx in records]
fnames = [os.path.basename(path) for path in paths]
N = len(paths)
print(f"Found {N} images across {len(class_dirs)} classes.")


def load_and_preprocess(path, label, fname):
    """Read one image file, decode it to 3 channels, resize it and preprocess it.

    Args:
        path: image file path.
        label: class index, passed through unchanged.
        fname: file name, passed through unchanged.

    Returns:
        (image, label, fname), where image has been resized to IMG_SIZE and
        passed through the module-level `preprocess_fn`.
    """
    raw = tf.io.read_file(path)

    # expand_animations=False: animated files decode to a single frame rather than a 4-D tensor.
    image = tf.image.decode_image(raw, channels=3, expand_animations=False)
    # decode_image leaves the static shape unknown; resize needs the rank.
    image.set_shape([None, None, 3])
    # NOTE(review): the training cell loads images via image_dataset_from_directory,
    # which presumably resizes without antialiasing — confirm whether antialias=True
    # here introduces a train/extract preprocessing mismatch.
    image = tf.image.resize(image, IMG_SIZE, method='bilinear', antialias=True)
    return preprocess_fn(image), label, fname

# Build the input pipeline: decode/resize in parallel, then batch and prefetch.
ds = tf.data.Dataset.from_tensor_slices((paths, labels, fnames))
ds = ds.map(load_and_preprocess, num_parallel_calls=NUM_WORKERS)
ds = ds.batch(BATCH_SIZE).prefetch(NUM_WORKERS)

all_feats = []
all_labels = []
all_fnames = []

num_batches = math.ceil(N / BATCH_SIZE)
for batch_imgs, batch_labels, batch_names in tqdm(ds, total=num_batches, desc="Extracting"):

    # training=False: inference-mode forward pass.
    feats = feature_model(batch_imgs, training=False)

    feats = tf.cast(feats, tf.float32)
    all_feats.append(feats.numpy())
    all_labels.extend(batch_labels.numpy().tolist())
    # String tensors come back as raw bytes. Decode them explicitly as UTF-8:
    # `.astype(str)` decodes as ASCII and raises on non-ASCII file names.
    all_fnames.extend(name.decode('utf-8') for name in batch_names.numpy())

features = np.vstack(all_feats)  # [N, feature_dim]
assert features.shape[0] == N, "Feature count mismatch"

# One row per image: feature columns f000..., then label information and file name.
df = pd.DataFrame(features, columns=[f'f{i:03d}' for i in range(features.shape[1])])
df['class_idx'] = all_labels
df['label'] = [index_to_class[i] for i in all_labels]
df['filename'] = all_fnames

df.to_csv(CSV_PATH, index=False)
# Parquet needs an optional engine (pyarrow / fastparquet); the CSV is the fallback.
try:
    df.to_parquet(PARQUET_PATH, index=False)
except Exception as e:
    print(f"Parquet save failed ({e}); CSV still saved.")

with open(META_JSON, 'w') as f:
    json.dump({
        'img_size': IMG_SIZE,
        'backbone': BACKBONE,
        'feature_dim': int(features.shape[1]),
        'class_indices': class_indices,
        'num_images': int(features.shape[0]),
        'data_root': data_root,
        'mixed_precision': bool(USE_MIXED_PRECISION),
    }, f, indent=2)

print(f"Saved CSV: {CSV_PATH}")
if os.path.exists(PARQUET_PATH):
    print(f"Saved Parquet: {PARQUET_PATH}")
print(f"Saved meta: {META_JSON}")


Could not load custom weights: `by_name` only supports loading legacy '.h5' or '.hdf5' files. Received: custom_cnn.weights.h5
Proceeding with random init (features will be weak).
Found 2195 images across 6 classes.


Extracting: 100%|██████████| 35/35 [01:07<00:00,  1.91s/it]


Parquet save failed (Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.); CSV still saved.
Saved CSV: D:\Research\Custom CNN\Without Augmented\features_256d_custom.csv
Saved meta: D:\Research\Custom CNN\Without Augmented\features_256d_custom_meta.json
