<a href="https://colab.research.google.com/github/la2015-hw/Group_10/blob/main/CNN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import h5py
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, models
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical
import matplotlib.pyplot as plt


In [None]:
# Run configuration for the notebook.
FILE_PATH = "processed_sdss_balanced.h5"  # HDF5 file opened by the loading cell
EPOCHS = 10  # number of passes handed to model.fit
MAX_SAMPLES = 47_600  # upper bound on the number of rows read from the file

In [None]:
def _decode_labels(raw):
    """Return *raw* as a 1-D NumPy array of ``str`` labels.

    Byte strings are decoded as UTF-8 element by element; anything else is
    passed through ``str()``. Doing this per element (instead of branching
    on the array dtype) treats fixed-length ``S`` arrays and object arrays
    holding ``bytes`` the same way.
    """
    return np.array(
        [
            item.decode("utf-8") if isinstance(item, bytes) else str(item)
            for item in raw
        ],
        dtype=str,  # keeps a string dtype even when there are zero rows
    )


# Read at most MAX_SAMPLES rows of images and their class labels.
with h5py.File(FILE_PATH, "r") as f:
    total = f["images"].shape[0]
    n = min(total, MAX_SAMPLES)
    print(f"Dataset contains {total} samples, loading {n} samples (limit={MAX_SAMPLES})")

    # Slicing the datasets copies the selected rows into memory, so the
    # file can be closed before any further processing.
    X = f["images"][:n]
    y_raw = f["broad_class"][:n]

# NOTE(review): assumes "broad_class" is one label per image (1-D) — confirm.
y = _decode_labels(y_raw)

print("Data loaded:", X.shape, y.shape)

In [None]:
# Map string labels to integer ids, then expand the ids to one-hot rows
# (the form expected by the categorical cross-entropy loss used for training).
encoder = LabelEncoder()
y_encoded = encoder.fit_transform(y)
y_cat = to_categorical(y_encoded)

num_classes = y_cat.shape[1]  # one one-hot column per class
print("Number of classes:", num_classes)
print("Classes:", encoder.classes_)

In [None]:
# 80/20 split, stratified on the integer labels so both parts keep the
# same class proportions; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y_cat, test_size=0.2, random_state=42, stratify=y_encoded
)

# Normalize images using the training maximum only, so that no statistic
# of the test images enters the preprocessing.
train_max = X_train.max() if X_train.size else 1.0
if train_max == 0:
    # All-zero images: avoid dividing by zero.
    train_max = 1.0

# Pin the divisor to float32: a float64 scalar divisor can promote the
# result to float64 (NumPy 2 promotion rules), undoing the cast below.
scale = np.float32(train_max)

# astype() returns a fresh copy, so dividing in place leaves X untouched
# while avoiding a second full-size temporary array.
X_train = X_train.astype("float32")
X_train /= scale
X_test = X_test.astype("float32")
X_test /= scale

In [None]:
# Small CNN: two conv/max-pool stages followed by a dense classifier head.
model = models.Sequential([
    # Take the per-sample shape from the data instead of hard-coding
    # (107, 107, 5), so the cell keeps working if the image size or the
    # number of bands changes. An explicit Input layer is used in place of
    # the `input_shape=` layer argument.
    layers.Input(shape=X_train.shape[1:]),
    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    layers.Conv2D(64, (3, 3), activation='relu'),
    layers.MaxPooling2D((2, 2)),

    layers.Flatten(),
    layers.Dense(128, activation='relu'),
    # One softmax unit per one-hot column of the labels.
    layers.Dense(y_cat.shape[1], activation='softmax')
])
model.compile(optimizer='adam',
              loss='categorical_crossentropy',
              metrics=['accuracy'])

# NOTE(review): the test split doubles as validation data here, so the
# per-epoch val_* metrics are computed on the same samples that are
# evaluated afterwards. Consider holding out part of the training split
# for validation instead.
history = model.fit(
    X_train, y_train,
    epochs=EPOCHS,
    validation_data=(X_test, y_test)
)

In [None]:
# Score the trained network on the held-out split.
test_loss, test_acc = model.evaluate(X_test, y_test, verbose=1)
print("\nTest accuracy:", test_acc)

# Per-sample class probabilities, reduced to predicted / true class ids.
predictions = model.predict(X_test)
pred_classes = predictions.argmax(axis=1)
true_classes = y_test.argmax(axis=1)

# Show the first few samples (at most five) with their labels decoded
# back to the original class names.
class_names = encoder.classes_
for true_idx, pred_idx, probs in zip(true_classes[:5], pred_classes[:5], predictions[:5]):
    print(f"\nTrue class: {class_names[true_idx]}")
    print(f"Predicted class: {class_names[pred_idx]}")
    print(f"Prediction confidence: {np.max(probs)*100:.2f}%")

# Persist the trained model to disk.
model.save("galaxy_classifier_gz2class.h5")
print("\nModel saved as galaxy_classifier_gz2class.h5")