# 1. Library Imports and Setup

In [1]:
import os
import h5py
import cv2
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from google.colab import drive

# --- 2. Tensorflow and keras
import tensorflow as tf
from tensorflow.keras.models import Sequential, Model, load_model
from tensorflow.keras.layers import (
    Conv2D, MaxPooling2D, Flatten, Dense, Dropout,
    BatchNormalization, GlobalAveragePooling2D
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.image import ImageDataGenerator
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping

# --- 3. Architectures and preprocessing
from tensorflow.keras.applications import ResNet50, DenseNet121, EfficientNetB5
from tensorflow.keras.applications.resnet50 import preprocess_input as resnet_preprocess
from tensorflow.keras.applications.densenet import preprocess_input as densenet_preprocess
from tensorflow.keras.applications.efficientnet import preprocess_input as efficientnet_preprocess

# --- 4. Metrics etc
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, f1_score
from sklearn.utils import class_weight

print(f"TensorFlow Version: {tf.__version__}")
print("All necessary libraries imported successfully.")

TensorFlow Version: 2.19.0
All necessary libraries imported successfully.


So here we can see that the models do same errors in the same places and do not perform significantly deifferently.

# 2. Mount Google Drive

In [2]:
# CONNECT GOOGLE DRIVE
from google.colab import drive

drive.mount('/content/drive')
print("Google Drive mounted successfully.")

DRIVE_PATH = '/content/drive/MyDrive/Galaxy_Morphology_Project'
MODEL_DIR = os.path.join(DRIVE_PATH, 'models')

!mkdir -p {MODEL_DIR}
# Optional: !mkdir -p {os.path.join(DRIVE_PATH, 'logs')}

print(f"Project path defined: {DRIVE_PATH}")
print(f"Models will be stored in: {MODEL_DIR}")

Mounted at /content/drive
Google Drive mounted successfully.
Project path defined: /content/drive/MyDrive/Galaxy_Morphology_Project
Models will be stored in: /content/drive/MyDrive/Galaxy_Morphology_Project/models


In [6]:
# DATA LOADING, NORMALIZATION AND STRATIFIED SPLIT

# 1. Load data from local VM storage
try:
    with h5py.File('data/Galaxy10.h5', 'r') as f:
        images = np.array(f['images'])
        labels = np.array(f['ans']) # Using 'ans' key for labels
except Exception as e:
    print(f"Error reading file: {e}")
    exit()

# 2. Image Normalization
# Convert to float32 and scale to [0, 1] range
images = images.astype('float32') / 255.0

# 3. One-Hot Encoding for labels
NUM_CLASSES = 10
labels_ohe = to_categorical(labels, num_classes=NUM_CLASSES)

# 4. Stratified Data Split: Train (70%) / Validation (15%) / Test (15%)
# Stratification is critical due to the significant class imbalance in Galaxy10.

X_train_val, X_test, y_train_val_ohe, y_test_ohe = train_test_split(
    images, labels_ohe, test_size=0.15, random_state=42, stratify=labels
)

# Split ratio: 0.15 / 0.85 ≈ 0.1765
X_train, X_val, y_train_ohe, y_val_ohe = train_test_split(
    X_train_val, y_train_val_ohe, test_size=(0.15 / 0.85),
    random_state=42,
    stratify=np.argmax(y_train_val_ohe, axis=1)
)

print("\n--- Dataset Sizes ---")
print(f"Train Set:      {X_train.shape[0]} images")
print(f"Validation Set: {X_val.shape[0]} images")
print(f"Test Set:       {X_test.shape[0]} images")
print(f"Original Shape: {X_train.shape[1:]}")


--- Dataset Sizes ---
Train Set:      12414 images
Validation Set: 2661 images
Test Set:       2661 images
Original Shape: (256, 256, 3)


# 1. Vanilla CNN exploration

In [11]:
# CUSTOM VANILLA CNN ARCHITECTURE ---
# This was the initial experiment using a custom-built CNN architecture
# to establish a baseline without using pre-trained weights.
# Configuration used during training
INPUT_SHAPE = (256, 256, 3)
NUM_CLASSES = 10
BATCH_SIZE = 32
EPOCHS_TRAINED = 20

baseline_model = Sequential([
    # Block 1: 32 filters, BatchNormalization for faster convergence
    Conv2D(32, (3, 3), activation='relu', padding='same', input_shape=INPUT_SHAPE),
    BatchNormalization(),
    MaxPooling2D((2, 2)),
    Dropout(0.25),

    # Block 2: 64 filters
    Conv2D(64, (3, 3), activation='relu', padding='same'),
    BatchNormalization(),
    MaxPooling2D((2, 2)),
    Dropout(0.25),

    # Block 3: 128 filters
    Conv2D(128, (3, 3), activation='relu', padding='same'),
    BatchNormalization(),
    MaxPooling2D((2, 2)),
    Dropout(0.25),

    # Classification Head
    Flatten(),
    Dense(512, activation='relu'),
    Dropout(0.5),
    Dense(NUM_CLASSES, activation='softmax')
])

baseline_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

print("Custom Vanilla CNN architecture defined.")

Custom Vanilla CNN architecture defined.


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [14]:
# EVALUATING THE BASELINE MODEL

BASELINE_FILE = os.path.join(MODEL_DIR, 'baseline_checkpoint.h5')

try:
    # We load the weights into our defined architecture or load the whole model
    baseline_model = load_model(BASELINE_FILE)
    print(f"Successfully loaded baseline model from: {BASELINE_FILE}")

    y_pred_probs = baseline_model.predict(X_test, verbose=1)
    y_pred = np.argmax(y_pred_probs, axis=1)
    y_true = np.argmax(y_test_ohe, axis=1)

    print("\n--- BASELINE MODEL PERFORMANCE ---")
    print(f"Weighted F1-Score: {f1_score(y_true, y_pred, average='weighted'):.4f}")
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, digits=4))

except Exception as e:
    print(f"Error loading model: {e}")



Successfully loaded baseline model from: /content/drive/MyDrive/Galaxy_Morphology_Project/models/baseline_checkpoint.h5
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 17ms/step

--- BASELINE MODEL PERFORMANCE ---
Weighted F1-Score: 0.0952

Classification Report:
              precision    recall  f1-score   support

           0     0.0788    0.0802    0.0795       162
           1     0.2353    0.0144    0.0271       278
           2     0.1967    0.6599    0.3031       397
           3     0.1094    0.0230    0.0380       304
           4     0.0000    0.0000    0.0000        50
           5     0.0864    0.0228    0.0361       307
           6     0.0791    0.0401    0.0533       274
           7     0.1360    0.2944    0.1860       394
           8     0.0000    0.0000    0.0000       214
           9     0.1250    0.0036    0.0069       281

    accuracy                         0.1582      2661
   macro avg     0.1047    0.1139    0.0730      2661
weighted avg    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


**Low Training Accuracy (Train Accuracy ≈ 16%):**
The model performs only slightly better than random guessing (10%), even on the training data. This indicates that the model either lacks sufficient capacity or is not properly optimized to extract meaningful features from high-resolution 256×256 images.

# 2. ResNet50
## Experiment 1: Training the Classification Head

In this stage, we transition from a custom CNN to ResNet50. To preserve the pre-trained ImageNet features, we freeze the base model layers and only train a custom classification head. We also introduce Class Weights to address the significant dataset imbalance.

In [17]:
# RESNET50 HEAD ARCHITECTURE SETUP

# 1. Initialize ResNet50 with ImageNet weights (excluding top layer)
base_model = ResNet50(weights='imagenet', include_top=False, input_shape=(256, 256, 3))

# 2. Freeze the base model layers
base_model.trainable = False

# 3. Add Custom Classification Head
x = base_model.output
x = GlobalAveragePooling2D()(x)
x = Dense(512, activation='relu')(x)
x = Dropout(0.5)(x)
predictions = Dense(NUM_CLASSES, activation='softmax')(x)

model_resnet_head = Model(inputs=base_model.input, outputs=predictions)

# 4. Compile with a standard Learning Rate for the initial head training
model_resnet_head.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# 5. Class Weight Calculation
y_train_labels = np.argmax(y_train_ohe, axis=1)
weights = class_weight.compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_labels),
    y=y_train_labels
)
CLASS_WEIGHTS_DICT = dict(enumerate(weights))

print("ResNet50 head architecture defined.")
print(f"Calculated Class Weights: {CLASS_WEIGHTS_DICT}")

 ResNet50 head architecture defined.
Calculated Class Weights: {0: np.float64(1.639894319682959), 1: np.float64(0.9571318427139552), 2: np.float64(0.6706645056726094), 3: np.float64(0.8748414376321353), 4: np.float64(5.305128205128205), 5: np.float64(0.8687193841847446), 6: np.float64(0.96984375), 7: np.float64(0.6746739130434782), 8: np.float64(1.2463855421686747), 9: np.float64(0.9469107551487415)}


In [18]:
# EVALUATING

# File: resnet50_head_trained.h5 (Trained for 5 epochs)

HEAD_MODEL_PATH = os.path.join(MODEL_DIR, 'resnet50_head_trained.h5')

try:
    # Load the model state after the first 5 epochs
    model_resnet_head = load_model(HEAD_MODEL_PATH)
    print(f"Successfully loaded head-trained model from: {HEAD_MODEL_PATH}")

    # Using raw X_test (normalized to [0,1])
    y_pred_probs = model_resnet_head.predict(X_test, verbose=1)
    y_pred = np.argmax(y_pred_probs, axis=1)
    y_true = np.argmax(y_test_ohe, axis=1)

    print("\n--- RESNET50 HEAD-ONLY PERFORMANCE ---")
    print(f"Weighted F1-Score: {f1_score(y_true, y_pred, average='weighted'):.4f}")
    print("\nClassification Report (Focus on rare classes improvement):")
    print(classification_report(y_true, y_pred, digits=4))

except Exception as e:
    print(f"Error loading head model: {e}")



Successfully loaded head-trained model from: /content/drive/MyDrive/Galaxy_Morphology_Project/models/resnet50_head_trained.h5
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 100ms/step

--- RESNET50 HEAD-ONLY PERFORMANCE ---
Weighted F1-Score: 0.0192

Classification Report (Focus on rare classes improvement):
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       162
           1     0.0000    0.0000    0.0000       278
           2     0.0000    0.0000    0.0000       397
           3     0.0000    0.0000    0.0000       304
           4     0.0000    0.0000    0.0000        50
           5     0.0000    0.0000    0.0000       307
           6     0.1030    1.0000    0.1867       274
           7     0.0000    0.0000    0.0000       394
           8     0.0000    0.0000    0.0000       214
           9     0.0000    0.0000    0.0000       281

    accuracy                         0.1030      2661
   macro avg     0.010

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


## Experiment 2: Technical Insight — Fixing Input
During initial ResNet50 trials, the model failed to converge (Accuracy ~0.10).

I identified that the pre-trained weights require specific ImageNet-style normalization (centering and scaling pixel values) provided by resnet_preprocess. Simply scaling to $[0, 1]$ was insufficient, I reverted the images to $[0, 255]$ and applied the official preprocessing.

In [19]:
# CORRECTING PREPROCESSING FOR IMAGENET WEIGHTS

# We revert normalized [0, 1] data back to [0, 255] for the official preprocessor
# Using the split data from Section 1.4
X_train_val_raw = X_train_val * 255.0
X_test_raw = X_test * 255.0

# Apply ImageNet normalization (Mean subtraction and scaling)
X_train_val_processed = resnet_preprocess(X_train_val_raw)
X_test_processed = resnet_preprocess(X_test_raw)

# Re-split Train and Validation with correctly processed data
X_train, X_val, y_train_ohe, y_val_ohe = train_test_split(
    X_train_val_processed, y_train_val_ohe,
    test_size=(0.15 / 0.85),
    random_state=42,
    stratify=np.argmax(y_train_val_ohe, axis=1)
)

print("Data successfully re-processed using ImageNet standards.")
print(f"Sample pixel value after ResNet preprocessing: {X_train[0,0,0,0]:.2f}")
# (Values will now be centered around zero, not 0-1)

Data successfully re-processed using ImageNet standards.
Sample pixel value after ResNet preprocessing: -76.94



After applying the resnet_preprocess function, we re-evaluated the head-trained model. The difference was dramatic: the model immediately regained its ability to recognize features, jumping from random guessing to meaningful classification.

Evaluation with Proper Preprocessing:

In [20]:
# EVALUATION
# test the 'resnet50_head_trained.h5' model again
# but this time using the correctly preprocessed X_test_processed.

try:
    y_pred_probs = model_resnet_head.predict(X_test_processed, verbose=1)
    y_pred = np.argmax(y_pred_probs, axis=1)
    y_true = np.argmax(y_test_ohe, axis=1)

    print("\n--- RESULTS AFTER CORRECT PREPROCESSING ---")
    current_f1 = f1_score(y_true, y_pred, average='weighted')
    print(f"Weighted F1-Score: {current_f1:.4f}")
    print(f"Improvement over random guessing: +{(current_f1 - 0.1) * 100:.2f}%")

    print("\nClassification Report (Baseline Transfer Learning):")
    print(classification_report(y_true, y_pred, digits=4))

except Exception as e:
    print(f"Error during evaluation: {e}")

[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step

--- RESULTS AFTER CORRECT PREPROCESSING ---
Weighted F1-Score: 0.5534
Improvement over random guessing: +45.34%

Classification Report (Baseline Transfer Learning):
              precision    recall  f1-score   support

           0     0.1818    0.2716    0.2178       162
           1     0.4733    0.4784    0.4758       278
           2     0.7587    0.6020    0.6713       397
           3     0.5570    0.5461    0.5515       304
           4     0.5192    0.5400    0.5294        50
           5     0.5588    0.4332    0.4881       307
           6     0.5146    0.5146    0.5146       274
           7     0.4197    0.4645    0.4410       394
           8     0.7909    0.8131    0.8018       214
           9     0.7115    0.7722    0.7406       281

    accuracy                         0.5475      2661
   macro avg     0.5486    0.5436    0.5432      2661
weighted avg     0.5655    0.5475    0.5534      2661



 ## Experiment 3: Learning Rate: Fine-Tuning
  After successfully training the classification head, I unfroze the entire ResNet50 backbone. To prevent destroying the pre-trained features during this phase, I used a significantly lower learning rate ($10^{-5}$). This allows the model to subtly adjust the internal filters to better recognize specific galaxy morphologies while retaining the general knowledge from ImageNet.

In [21]:
# CONFIGURATION

# 1. Unfreeze all layers of the backbone
for layer in model_resnet_head.layers:
    layer.trainable = True

# Low LR is critical during fine-tuning to maintain stability
NEW_LEARNING_RATE = 1e-5

model_resnet_head.compile(
    optimizer=Adam(learning_rate=NEW_LEARNING_RATE),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Configuration used during this stage:
# Batch Size: 8 (reduced to prevent Out-Of-Memory errors during full backpropagation)
# Epochs: 20
print(f"ResNet50 unfrozen and re-compiled with LR={NEW_LEARNING_RATE}")

ResNet50 unfrozen and re-compiled with LR=1e-05


In [22]:
# Evaluation
# File: resnet50_final_fine_tuned.h5

FINE_TUNED_PATH = os.path.join(MODEL_DIR, 'resnet50_final_fine_tuned.h5')

try:
    # Load the model after the full 20-epoch fine-tuning session
    model_resnet_final = load_model(FINE_TUNED_PATH)
    print(f"Loaded fine-tuned ResNet50: {FINE_TUNED_PATH}")

    # Final evaluation on correctly preprocessed test data
    y_pred_probs = model_resnet_final.predict(X_test_processed, verbose=1)
    y_pred = np.argmax(y_pred_probs, axis=1)
    y_true = np.argmax(y_test_ohe, axis=1)

    print("\n--- FINAL FINE-TUNED RESNET50 PERFORMANCE ---")
    fine_tuned_f1 = f1_score(y_true, y_pred, average='weighted')
    print(f"Weighted F1-Score: {fine_tuned_f1:.4f}")

    # We document the jump in performance from Phase 1 to Phase 2
    print(f"Improvement from Fine-Tuning: +{(fine_tuned_f1 - current_f1):.4f}")

    print("\nDetailed Classification Report:")
    print(classification_report(y_true, y_pred, digits=4))

except Exception as e:
    print(f"Error loading fine-tuned model: {e}")



Loaded fine-tuned ResNet50: /content/drive/MyDrive/Galaxy_Morphology_Project/models/resnet50_final_fine_tuned.h5
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m10s[0m 68ms/step

--- FINAL FINE-TUNED RESNET50 PERFORMANCE ---
Weighted F1-Score: 0.8001
Improvement from Fine-Tuning: +0.2467

Detailed Classification Report:
              precision    recall  f1-score   support

           0     0.4839    0.4630    0.4732       162
           1     0.8776    0.7734    0.8222       278
           2     0.9093    0.9345    0.9217       397
           3     0.8889    0.8947    0.8918       304
           4     0.6667    0.7200    0.6923        50
           5     0.8253    0.7850    0.8047       307
           6     0.7069    0.7482    0.7270       274
           7     0.6800    0.6472    0.6632       394
           8     0.8869    0.9159    0.9011       214
           9     0.8476    0.9502    0.8960       281

    accuracy                         0.8016      2661
   macro avg     0

## Experiment 4: Optimization — Resolution Change & Regularization

Resolution Adjustment and Dropout Tuning (V4).

To optimize training speed and memory usage I shifted the input resolution from 256x256 to 128x128, as observed overfitting in previous runs, I implemented a more aggressive Dropout (0.4) in the classification head to improve generalization on the test set.

**Backbone:** ResNet50 (Full Fine-Tuning).

**Key change:** Added Dropout(0.4) after Global Average Pooling.

**Training:** 10 epochs, Adam (LR 1e-5), Balanced Class Weights.

In [23]:
# OPTIMIZED RESNET50 (V4) ARCHITECTURE
# input size changed to 128x128 for better efficiency
TARGET_SIZE = 128

base_model_v4 = ResNet50(weights='imagenet', include_top=False, input_shape=(TARGET_SIZE, TARGET_SIZE, 3))
base_model_v4.trainable = True # Full fine-tuning enabled

x = GlobalAveragePooling2D()(base_model_v4.output)
x = Dropout(0.4)(x) # increased dropout for better regularization
predictions = Dense(NUM_CLASSES, activation='softmax')(x)

model_resnet_v4 = Model(inputs=base_model_v4.input, outputs=predictions)

model_resnet_v4.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

print(f"Optimized ResNet50 (V4) defined for {TARGET_SIZE}x{TARGET_SIZE} input.")

Optimized ResNet50 (V4) defined for 128x128 input.


In [25]:
# EVALUATING RESNET50 dropout
# File: resnet50_best_dropout_v4.keras

RESNET_V4_PATH = os.path.join(MODEL_DIR, 'resnet50_best_dropout_v4.keras')

try:
    # load the best weights for this specific architecture
    model_resnet_v4 = load_model(RESNET_V4_PATH)
    print(f"Loaded ResNet50 V4: {RESNET_V4_PATH}")

    # resize X_test to 128x128 for this specific model
    import cv2
    X_test_128 = np.array([cv2.resize(img, (TARGET_SIZE, TARGET_SIZE)) for img in X_test_raw])
    X_test_128_processed = resnet_preprocess(X_test_128)

    y_pred_probs = model_resnet_v4.predict(X_test_128_processed, verbose=1)
    y_pred = np.argmax(y_pred_probs, axis=1)
    y_true = np.argmax(y_test_ohe, axis=1)

    print("\n--- RESNET50 V4 PERFORMANCE (128x128) ---")
    v4_f1 = f1_score(y_true, y_pred, average='weighted')
    print(f"Weighted F1-Score: {v4_f1:.4f}")
    print(classification_report(y_true, y_pred, digits=4))

except Exception as e:
    print(f"Error loading V4 model: {e}")

Loaded ResNet50 V4: /content/drive/MyDrive/Galaxy_Morphology_Project/models/resnet50_best_dropout_v4.keras
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 58ms/step

--- RESNET50 V4 PERFORMANCE (128x128) ---
Weighted F1-Score: 0.0193
              precision    recall  f1-score   support

           0     0.0000    0.0000    0.0000       162
           1     0.0000    0.0000    0.0000       278
           2     0.0000    0.0000    0.0000       397
           3     0.0000    0.0000    0.0000       304
           4     0.0000    0.0000    0.0000        50
           5     0.0000    0.0000    0.0000       307
           6     0.1032    1.0000    0.1872       274
           7     0.0000    0.0000    0.0000       394
           8     0.0000    0.0000    0.0000       214
           9     0.0000    0.0000    0.0000       281

    accuracy                         0.1030      2661
   macro avg     0.0103    0.1000    0.0187      2661
weighted avg     0.0106    0.1030    0.0193   

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [30]:
# CORRECTED EVALUATION Resnet50 dropout
try:
    print("Re-evaluating V4 with correct preprocessing...")
    # Convert values back to the 0–255 range and apply ResNet preprocessing
    X_test_v4_ready = resnet_preprocess(X_test_128 * 255.0)

    y_pred_probs = model_resnet_v4.predict(X_test_v4_ready, verbose=1)
    y_pred = np.argmax(y_pred_probs, axis=1)
    y_true = np.argmax(y_test_ohe, axis=1)

    print(f"Corrected Weighted F1-Score: {f1_score(y_true, y_pred, average='weighted'):.4f}")
except Exception as e:
    print(f"Error: {e}")


Re-evaluating V4 with correct preprocessing...
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 10ms/step
Corrected Weighted F1-Score: 0.7559



## Experiment 5: Overcoming Instability with Data Augmentation and Class Balancing

**Dynamic Augmentation:** Using ImageDataGenerator for 360-degree rotations and flips to help the model learn rotation-invariant galaxy features.

**Manual Class Weighting:** Calculating precise weights to force the model to pay attention to rare galaxy types.

**Native Keras Format:** Switched to .keras saving for better architectural stability.

**Architecture:** Standard ResNet50 with a GlobalAveragePooling2D head (no extra Dense layers to minimize parameters).

**Training Strategy:** Full model fine-tuning from the start with a very low Learning Rate ($1e-5$).

**Optimization:** 10 Epochs using the val_loss monitor to capture the most stable state.

**Data Handling:** 128x128 resolution, heavy ImageNet-style augmentation, and balanced class weights.

**Result:** This model achieved significantly better stability and was chosen for the final Ensemble.

In [27]:
# DATA PREPARATION & AUGMENTATION SETUP

# 1. Image Resizing to 128x128
# We use X_train, X_val, X_test which were defined in the first Data Loading cell
TARGET_SIZE = 128

X_train_128 = np.array([cv2.resize(img, (TARGET_SIZE, TARGET_SIZE)) for img in X_train])
X_val_128 = np.array([cv2.resize(img, (TARGET_SIZE, TARGET_SIZE)) for img in X_val])
X_test_128 = np.array([cv2.resize(img, (TARGET_SIZE, TARGET_SIZE)) for img in X_test])

# 2. Class Weight Calculation
y_train_labels = np.argmax(y_train_ohe, axis=1)
weights = class_weight.compute_class_weight('balanced', classes=np.unique(y_train_labels), y=y_train_labels)
FINAL_CLASS_WEIGHTS = {i: weights[i] for i in range(len(weights))}

# 3. Augmentation Pipeline
train_datagen = ImageDataGenerator(
    preprocessing_function=resnet_preprocess,
    rotation_range=360,
    horizontal_flip=True,
    vertical_flip=True,
    zoom_range=0.1,
    fill_mode='nearest'
)

# we mltiply by 255.0 as resnet_preprocess expects raw pixel values
train_generator = train_datagen.flow(X_train_128 * 255.0, y_train_ohe, batch_size=32)

print("Data generators and Class Weights are ready using correct variable names.")

Data generators and Class Weights are ready using correct variable names.


In [28]:
# File: resnet50_best_final_chance.keras

from tensorflow.keras.models import load_model

MODEL_PATH = os.path.join(MODEL_DIR, 'resnet50_best_final_chance.keras')

try:
    # Load the model that achieved stability
    final_resnet_model = load_model(MODEL_PATH)
    print(f"Successfully loaded: {MODEL_PATH}")

    # Process test data
    X_test_processed = resnet_preprocess(X_test_128 * 255.0)

    # Predict
    y_pred = np.argmax(final_resnet_model.predict(X_test_processed), axis=1)
    y_true = np.argmax(y_test_ohe, axis=1)

    print("\n--- RESNET50 FINAL CHANCE PERFORMANCE ---")
    f1_res = f1_score(y_true, y_pred, average='weighted')
    print(f"Weighted F1-Score: {f1_res:.4f}")
    print("\nClassification Report:")
    print(classification_report(y_true, y_pred, digits=4))

except Exception as e:
    print(f" Error: {e}")

Successfully loaded: /content/drive/MyDrive/Galaxy_Morphology_Project/models/resnet50_best_final_chance.keras
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 54ms/step

--- RESNET50 FINAL CHANCE PERFORMANCE ---
Weighted F1-Score: 0.7673

Classification Report:
              precision    recall  f1-score   support

           0     0.4633    0.5062    0.4838       162
           1     0.8217    0.7626    0.7910       278
           2     0.8278    0.9446    0.8824       397
           3     0.8386    0.8717    0.8548       304
           4     0.4500    0.9000    0.6000        50
           5     0.7958    0.7492    0.7718       307
           6     0.6571    0.7482    0.6997       274
           7     0.7210    0.5051    0.5940       394
           8     0.9038    0.8785    0.8910       214
           9     0.9044    0.8754    0.8897       281

    accuracy                         0.7693      2661
   macro avg     0.7384    0.7741    0.7458      2661
weighted avg     0.

# 3. DenseNet121

This model was used in the latest aricles of the researchers working with this data. However sometimes the F1 score was not stated in their results, so I tried conduct my own experiments.

In [15]:
NUM_CLASSES = 10

# input size 128x128
base_model_densenet = DenseNet121(
    weights='imagenet',
    include_top=False,
    input_shape=(128, 128, 3)
)

base_model_densenet.trainable = True # all layers were unfrozen for fine-tuning

# Custom Classifier Head
x = base_model_densenet.output
x = GlobalAveragePooling2D()(x)
x = Dropout(0.3)(x)
predictions = Dense(NUM_CLASSES, activation='softmax')(x)

model_densenet_final = Model(inputs=base_model_densenet.input, outputs=predictions)

# The model was compiled with the standard low Learning Rate for fine-tuning.
model_densenet_final.compile(
    optimizer=Adam(learning_rate=1e-5),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

BEST_MODEL_DENSENET_PATH = os.path.join(DRIVE_PATH, 'models', 'densenet121_best_final_v5.keras')
print("DenseNet121 Model Architecture defined.")
print(f"Final Model was saved to: {BEST_MODEL_DENSENET_PATH}")

DenseNet121 Model Architecture defined.
Final Model was saved to: /content/drive/MyDrive/Galaxy_Morphology_Project/models/densenet121_best_final_v5.keras


In [9]:
# EVALUATION DENSENET121 MODEL
try:
    DENSENET_MODEL = load_model(BEST_MODEL_DENSENET_PATH)
    print(f"Successfully loaded saved DenseNet121 model.")
except Exception as e:
    print(f"ERROR loading model. Check path: {BEST_MODEL_DENSENET_PATH}. Error: {e}")
    exit()

# We must resize it back to 128x128 for a fair comparison.
TARGET_SIZE_DENSENET = 128
X_test_densenet_input = np.array([cv2.resize(img, (TARGET_SIZE_DENSENET, TARGET_SIZE_DENSENET)) for img in X_test])
X_test_raw255_densenet = X_test_densenet_input * 255.0

# Prediction (using ResNet Preprocessor, which works for DenseNet)
X_test_processed = resnet_preprocess(X_test_raw255_densenet)
y_pred_probs = DENSENET_MODEL.predict(X_test_processed)

y_pred = np.argmax(y_pred_probs, axis=1)
y_true = np.argmax(y_test_ohe, axis=1) # Using y_test_ohe from the Data Split section

final_f1_densenet = f1_score(y_true, y_pred, average='weighted')

print("\n--- DENSENET121 EXPERIMENT EVALUATION ---")
print(f" Weighted F1-score (DenseNet121 @ 128x128): {final_f1_densenet:.4f}")
print("--- Classification Report ---")
print(classification_report(y_true, y_pred, digits=4, zero_division=0))

Successfully loaded saved DenseNet121 model.
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 136ms/step

--- DENSENET121 EXPERIMENT EVALUATION ---
 Weighted F1-score (DenseNet121 @ 128x128): 0.7507
--- Classification Report ---
              precision    recall  f1-score   support

           0     0.3641    0.4877    0.4169       162
           1     0.7887    0.8058    0.7972       278
           2     0.8300    0.9471    0.8847       397
           3     0.8835    0.7730    0.8246       304
           4     0.4752    0.9600    0.6358        50
           5     0.7240    0.7264    0.7252       307
           6     0.6931    0.7007    0.6969       274
           7     0.6870    0.4569    0.5488       394
           8     0.9061    0.9019    0.9040       214
           9     0.8964    0.8932    0.8948       281

    accuracy                         0.7520      2661
   macro avg     0.7248    0.7653    0.7329      2661
weighted avg     0.7624    0.7520    0.7507      26

#4. EfficientNetB5 - High-Resolution Training (224x224) and EfficientNetB5

To reach the next level of accuracy, I tried EfficientNetB5, a model known for its superior scaling and efficiency.

**Changes:** Increased input resolution from 128x128 to 224x224 to capture finer morphological details of galaxies.

**Preprocessing:** Switched to efficientnet_preprocess (critical for this architecture).

**Training:** Full fine-tuning for 15 epochs with a very low learning rate ($1e-5$).

**Data Strategy:** Applied heavy augmentation and re-calculated balanced class weights for the 224x224 dataset.

In [31]:
# EFFICIENTNETB5 SETUP & RE-SIZING

TARGET_SIZE = 224

# Re-sizing to the optimal resolution for B5
# Assuming X_train, X_val, X_test are our base [0, 1] arrays
X_train_224 = np.array([cv2.resize(img, (TARGET_SIZE, TARGET_SIZE)) for img in X_train])
X_val_224 = np.array([cv2.resize(img, (TARGET_SIZE, TARGET_SIZE)) for img in X_val])
X_test_224 = np.array([cv2.resize(img, (TARGET_SIZE, TARGET_SIZE)) for img in X_test])

# Architecture matching the V6 experiment
base_model_effnet = EfficientNetB5(weights='imagenet', include_top=False, input_shape=(TARGET_SIZE, TARGET_SIZE, 3))
base_model_effnet.trainable = True

x = GlobalAveragePooling2D()(base_model_effnet.output)
predictions = Dense(10, activation='softmax')(x)

model_effnet_v6 = Model(inputs=base_model_effnet.input, outputs=predictions)
model_effnet_v6.compile(optimizer=Adam(learning_rate=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])

print(f"EfficientNetB5 architecture prepared for {TARGET_SIZE}x{TARGET_SIZE} inputs.")

Downloading data from https://storage.googleapis.com/keras-applications/efficientnetb5_notop.h5
[1m115263384/115263384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 0us/step
EfficientNetB5 architecture prepared for 224x224 inputs.


In [33]:
# EVALUATING THE V6 RECORD MODEL

# File: efficientnetb5_best_final_v6.keras

V6_PATH = os.path.join(MODEL_DIR, 'efficientnetb5_best_final_v6.keras')

try:
    # Load the best state of B5
    model_effnet_v6 = load_model(V6_PATH)
    print(f"Successfully loaded EfficientNetB5 V6: {V6_PATH}")

    # predict using the specialized EfficientNet preprocessor on [0, 255] data
    X_test_v6_ready = efficientnet_preprocess(X_test_224 * 255.0)

    y_pred_probs = model_effnet_v6.predict(X_test_v6_ready, verbose=1)
    y_pred = np.argmax(y_pred_probs, axis=1)
    y_true = np.argmax(y_test_ohe, axis=1)

    print("\n--- EFFICIENTNETB5 V6 PERFORMANCE (224x224) ---")
    v6_f1 = f1_score(y_true, y_pred, average='weighted')
    print(f"Record Weighted F1-Score: {v6_f1:.4f}")
    print(classification_report(y_true, y_pred, digits=4))

except Exception as e:
    print(f"Error loading V6: {e}")

Successfully loaded EfficientNetB5 V6: /content/drive/MyDrive/Galaxy_Morphology_Project/models/efficientnetb5_best_final_v6.keras
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 178ms/step

--- EFFICIENTNETB5 V6 PERFORMANCE (224x224) ---
Record Weighted F1-Score: 0.7990
              precision    recall  f1-score   support

           0     0.4162    0.5062    0.4568       162
           1     0.8496    0.8129    0.8309       278
           2     0.9062    0.9244    0.9152       397
           3     0.9307    0.8388    0.8824       304
           4     0.6104    0.9400    0.7402        50
           5     0.7969    0.8436    0.8196       307
           6     0.6676    0.8358    0.7423       274
           7     0.7674    0.5025    0.6074       394
           8     0.8943    0.9486    0.9206       214
           9     0.9170    0.9431    0.9298       281

    accuracy                         0.8008      2661
   macro avg     0.7756    0.8096    0.7845      2661
weighted

# 6. The DLR Simulation (V7)

Then I decided to do some "polishing" of the last experiment, so I re-compiled the model with an even lower Learning Rate      
($3 \times 10^{-6}$).

**Logic:** This acts as a manual Learning Rate Decay. It allows the model to adjust the weights of the classification head more precisely without distorting the well-trained feature extractors in the base EfficientNet layers. 5 final epochs to capture the absolute minimum validation loss.

In [34]:
#  EVALUATING THE DLR-REFINED MODEL


# Hyperparameters:
# - Base LR: 3e-6 (Ultra-low for precision)
# - Epochs: 5 (Extra)
# - File: efficientnetb5_dlr_final_v7.keras

V7_PATH = os.path.join(MODEL_DIR, 'efficientnetb5_dlr_final_v7.keras')

try:
    model_effnet_v7 = load_model(V7_PATH)
    print(f"Successfully loaded DLR-refined EfficientNetB5: {V7_PATH}")

    # Predict (using the same 224x224 processed test set from Step 5.1)
    # X_test_v6_ready (from previous cell) is already 224x224 and preprocessed
    y_pred_probs = model_effnet_v7.predict(X_test_v6_ready, verbose=1)
    y_pred = np.argmax(y_pred_probs, axis=1)
    y_true = np.argmax(y_test_ohe, axis=1)

    print("\n--- EFFICIENTNETB5 V7 (DLR) PERFORMANCE ---")
    v7_f1 = f1_score(y_true, y_pred, average='weighted')
    print(f"DLR Final Weighted F1-Score: {v7_f1:.4f}")

    # Comparison with V6
    improvement = v7_f1 - v6_f1
    print(f"Delta from refinement: {improvement:+.4f}")

    print("\nDetailed classification of morphologies (DLR Optimized):")
    print(classification_report(y_true, y_pred, digits=4))

except Exception as e:
    print(f"Error loading V7: {e}")

Successfully loaded DLR-refined EfficientNetB5: /content/drive/MyDrive/Galaxy_Morphology_Project/models/efficientnetb5_dlr_final_v7.keras
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 178ms/step

--- EFFICIENTNETB5 V7 (DLR) PERFORMANCE ---
DLR Final Weighted F1-Score: 0.8057
Delta from refinement: +0.0067

Detailed classification of morphologies (DLR Optimized):
              precision    recall  f1-score   support

           0     0.4159    0.5494    0.4734       162
           1     0.8667    0.8417    0.8540       278
           2     0.9089    0.9295    0.9191       397
           3     0.9066    0.8618    0.8836       304
           4     0.6765    0.9200    0.7797        50
           5     0.8377    0.8241    0.8309       307
           6     0.6705    0.8540    0.7512       274
           7     0.7846    0.4898    0.6031       394
           8     0.9009    0.9346    0.9174       214
           9     0.9119    0.9573    0.9340       281

    accuracy        

# 7. Ensemble Learning


Then I thought since models have different "blind spots", by combining predictions from our best ResNet50 and EfficientNetB5, I can cancel out individual errors and boost overall stability.

#### 2-Way Weighted Ensemble: ResNet50 (128x128) + EfficientNetB5 (224x224).

In [42]:
#  RESNET50 + EFFNETB5

RESNET_PATH = os.path.join(MODEL_DIR, 'resnet50_best_final_chance.keras')
EFFNET_PATH = os.path.join(MODEL_DIR, 'efficientnetb5_dlr_final_v7.keras')

model_resnet = load_model(RESNET_PATH)
model_effnet = load_model(EFFNET_PATH)

# 2. preparing data for ResNet50 (128x128)
X_test_128 = np.array([cv2.resize(img, (128, 128)) for img in X_test])
prob_resnet = model_resnet.predict(resnet_preprocess(X_test_128 * 255.0), verbose=1)

# 3. for EfficientNetB5 (224x224)
X_test_224 = np.array([cv2.resize(img, (224, 224)) for img in X_test])
prob_effnet = model_effnet.predict(efficientnet_preprocess(X_test_224 * 255.0), verbose=1)

# 4. weights
ENSEMBLE_WEIGHTS = [0.3, 0.7] # 30% ResNet, 70% EffNet
prob_ensemble = (prob_resnet * ENSEMBLE_WEIGHTS[0]) + (prob_effnet * ENSEMBLE_WEIGHTS[1])

y_pred_ensemble = np.argmax(prob_ensemble, axis=1)
y_true = np.argmax(y_test_ohe, axis=1)

final_f1 = f1_score(y_true, y_pred_ensemble, average='weighted')

print(f"\n FINAL ENSEMBLE RESULT")
print(f"Weighted F1-Score: {final_f1:.4f}")
print(f"Improvement over Single Best Model: +{(final_f1 - 0.7990)*100:.2f}%")

print("\n Detailed Report:")
print(classification_report(y_true, y_pred_ensemble, digits=4))

[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 54ms/step
[1m84/84[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 179ms/step

 FINAL ENSEMBLE RESULT
Weighted F1-Score: 0.8151
Improvement over Single Best Model: +1.61%

 Detailed Report:
              precision    recall  f1-score   support

           0     0.4604    0.5741    0.5110       162
           1     0.8613    0.8489    0.8551       278
           2     0.8964    0.9370    0.9163       397
           3     0.9122    0.8882    0.9000       304
           4     0.6912    0.9400    0.7966        50
           5     0.8498    0.8111    0.8300       307
           6     0.6890    0.8650    0.7670       274
           7     0.7915    0.5203    0.6279       394
           8     0.9041    0.9252    0.9145       214
           9     0.9210    0.9537    0.9371       281

    accuracy                         0.8174      2661
   macro avg     0.7977    0.8263    0.8055      2661
weighted avg     0.8251    0.8174 

# Conclusion:

Reaching the target accuracy of 0.85 would likely require moving to hybrid Transformer-based models (e.g., CvT), as shown in recent research. However, this transition would require much larger computational resources, access to large-scale pretraining datasets, and more complex training strategies.