<a href="https://colab.research.google.com/github/la2015-hw/Group_10/blob/main/extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



---


**Please Read Before Running following Code Cells**


Get Kaggle API key
1. Navigate to Kaggle
2. Navigate to the Settings page (ensure you have a Kaggle account)
3. Scroll to API and click "Create New Token"
4. A JSON file (`kaggle.json`) downloads to your local machine

Set Up Kaggle API
1. Navigate to Google colab
2. Click on files and click upload to session storage
3. Once uploaded run the following commands in terminal

```
mkdir -p ~/.kaggle
cp -f /content/kaggle.json ~/.kaggle/kaggle.json
chmod 600 ~/.kaggle/kaggle.json
```

**If that does not work try this**

```
mkdir -p ~/.kaggle
cp -f kaggle.json ~/.kaggle/kaggle.json
chmod 600 ~/.kaggle/kaggle.json
```
4. API key has been set up and now you can read and write.

**Reminder - Setup is necessary after each Session**



---





In [None]:
!pip install kaggle --quiet

In [None]:
import kagglehub

# Download the latest version of the SDSS image dataset through kagglehub.
# `path` holds whatever dataset_download returns and is printed below as the
# location of the dataset files.
path = kagglehub.dataset_download("mustansireranpurwala/sdss-image-dataset")

print("Path to dataset files:", path)



---


**Image data Manipulation**

In [None]:
import h5py
import numpy as np
import pandas as pd

# Input HDF5 file holding the SDSS images and their metadata.
# NOTE(review): this is an absolute path on a local machine and is not derived
# from the `path` returned by kagglehub in the earlier cell — confirm it exists
# in the environment where this cell runs.
file_path = '/home/mte/sdss_w_specz_train.h5'
# Galaxy Zoo 2 CSV (relative path) that decides which SDSS rows are kept.
gz2_path = 'zoo2MainSpecz.csv'
# HDF5 file the processing loop writes its results to.
output_path = 'processed_sdss_only.h5'
# Number of input rows read from the HDF5 file per loop iteration.
chunk_size = 25000

# --- HDF5 helper functions ---
def create_ds(f, name, dtype, str_len=None):
    """Create an empty, growable, gzip-compressed 1-D dataset in ``f``.

    The dataset starts with shape ``(0,)`` and ``maxshape=(None,)`` so rows
    can be appended later by resizing it.

    Args:
        f: Open HDF5 file or group exposing ``create_dataset``.
        name: Name of the dataset to create.
        dtype: Element dtype. The special value ``'S'`` requests a
            fixed-length byte string whose width is given by ``str_len``.
        str_len: Width of each string element; required when
            ``dtype == 'S'`` and ignored otherwise.

    Returns:
        The object returned by ``f.create_dataset``.

    Raises:
        ValueError: If ``dtype == 'S'`` and ``str_len`` is not provided.
    """
    if dtype == 'S':
        # Without a width the dtype string would become the invalid 'SNone';
        # fail early with a message that names the dataset instead.
        if str_len is None:
            raise ValueError(
                f"str_len is required to create string dataset {name!r}"
            )
        dtype = f'S{str_len}'
    return f.create_dataset(name, shape=(0,), maxshape=(None,), dtype=dtype, compression='gzip')

def append_chunk(ds, data):
    """Grow ``ds`` along its first axis and write ``data`` into the new rows.

    Args:
        ds: Resizable dataset exposing ``shape``, ``resize`` and slice
            assignment.
        data: Array whose first-axis length is the number of rows to append.
    """
    first_new_row = ds.shape[0]
    trailing_dims = ds.shape[1:]
    # Only the first axis grows; every other dimension keeps its size.
    ds.resize((first_new_row + data.shape[0], *trailing_dims))
    ds[first_new_row:] = data


# --- Magnitude & color functions ---
def flux_to_magnitude(flux, zero_point=22.5):
    """Convert a flux (scalar or array) to a magnitude.

    Fluxes below ``1e-10`` are raised to ``1e-10`` first so the logarithm is
    always taken of a positive number.

    Args:
        flux: Flux value or array of flux values.
        zero_point: Magnitude assigned to a flux of 1. Defaults to 22.5.
            NOTE(review): presumably the flux is in nanomaggies, for which
            22.5 is the usual zero point — confirm against the dataset.

    Returns:
        ``zero_point - 2.5 * log10(flux)`` evaluated on the floored flux.
    """
    floored_flux = np.maximum(flux, 1e-10)
    return zero_point - 2.5 * np.log10(floored_flux)

def extract_magnitudes_from_image(image_5band):
    """Return one magnitude per band, from the summed pixel values of a band.

    For each of the first five indices along the third axis, every pixel of
    that band is summed and the total is passed to ``flux_to_magnitude``.

    Args:
        image_5band: Image indexable as ``[row, col, band]`` with at least
            five bands. Assumes the band order is u, g, r, i, z — TODO confirm
            against the dataset documentation.

    Returns:
        A list of five magnitudes, in band-index order.
    """
    return [
        flux_to_magnitude(np.sum(image_5band[:, :, band]))
        for band in range(5)
    ]

def calculate_colors(u, g, r, i, z, e_bv):
    """Subtract a per-band multiple of ``e_bv`` and derive adjacent-band colors.

    Each magnitude is corrected as ``mag - coeff * e_bv``; the colors are the
    differences between corrected magnitudes of neighbouring bands.

    Args:
        u, g, r, i, z: Magnitudes per band (scalars or arrays).
        e_bv: Value(s) the per-band coefficients are multiplied by.
            NOTE(review): the coefficients look like SDSS extinction-to-E(B-V)
            ratios — confirm the source of these numbers.

    Returns:
        Dict with the corrected magnitudes (``u_mag`` … ``z_mag``) followed by
        the colors ``u_minus_g``, ``g_minus_r``, ``r_minus_i``, ``i_minus_z``.
    """
    magnitudes = (u, g, r, i, z)
    coefficients = (5.155, 3.793, 2.751, 2.086, 1.479)
    u_c, g_c, r_c, i_c, z_c = (
        mag - coeff * e_bv for mag, coeff in zip(magnitudes, coefficients)
    )

    colors = {
        'u_mag': u_c,
        'g_mag': g_c,
        'r_mag': r_c,
        'i_mag': i_c,
        'z_mag': z_c,
    }
    colors.update({
        'u_minus_g': u_c - g_c,
        'g_minus_r': g_c - r_c,
        'r_minus_i': r_c - i_c,
        'i_minus_z': i_c - z_c,
    })
    return colors

def classify_galaxy_death(u_minus_g, g_minus_r):
    """Label each (u-g, g-r) pair as 'DEAD', 'ALIVE' or 'TRANSITIONAL'.

    Rules, checked in this order for each pair:
      * 'DEAD' when u-g > 1.5 and g-r > 0.8;
      * 'ALIVE' when u-g < 1.0 and g-r < 0.6;
      * 'TRANSITIONAL' for everything else.

    Args:
        u_minus_g: Iterable of u-g colors.
        g_minus_r: Iterable of g-r colors, paired positionally with
            ``u_minus_g`` (pairing stops at the shorter input).

    Returns:
        A list of label strings, one per pair.
    """
    def label_for(ug, gr):
        if ug > 1.5 and gr > 0.8:
            return 'DEAD'
        if ug < 1.0 and gr < 0.6:
            return 'ALIVE'
        return 'TRANSITIONAL'

    return [label_for(ug, gr) for ug, gr in zip(u_minus_g, g_minus_r)]

# --- Load Galaxy Zoo 2 dataset ---
# Read specobjid as text so pandas never parses it as float64 (which it does
# for a numeric column that contains blanks). float64 cannot hold every int64
# value exactly, and a rounded ID would silently fail to match in the loop.
gz2 = pd.read_csv(gz2_path, dtype={'specobjid': str})
# Keep only rows whose specobjid parses as a number. `.copy()` makes the
# filtered frame independent, so the column assignment below does not write
# into a slice of the original frame (avoids SettingWithCopyWarning).
gz2 = gz2[pd.to_numeric(gz2['specobjid'], errors='coerce').notna()].copy()
# Every remaining value is numeric, so this conversion cannot introduce NaN.
gz2['specobjid'] = pd.to_numeric(gz2['specobjid']).astype(np.int64)
# Set of IDs for constant-time membership tests in the chunk loop.
gz2_specids = set(gz2['specobjid'].values)

# --- Process SDSS in chunks, keeping only rows whose specObjID is in Galaxy Zoo 2 ---
# Reads the input HDF5 file chunk by chunk, keeps the rows whose specObjID is in
# gz2_specids, derives magnitudes, colors and a death label for each kept image,
# and appends everything to growable datasets in the output file.
# No Galaxy Zoo 2 columns are copied: the CSV only decides which rows survive.
with h5py.File(file_path, 'r') as f_in, h5py.File(output_path, 'w') as f_out:
    # Total number of rows, taken from the first axis of the 'images' dataset.
    N = f_in['images'].shape[0]
    print(f"Processing {N} galaxies in chunks of {chunk_size}...")

    # Create output datasets. All start empty and grow through append_chunk.
    # 'images' is created directly because create_ds only builds 1-D datasets;
    # its per-image layout is fixed at (107, 107, 5).
    images_ds = f_out.create_dataset('images', shape=(0, 107, 107, 5), maxshape=(None, 107, 107, 5), dtype='float32', compression='gzip')
    # NOTE(review): ObjID is stored as 20-byte strings — confirm no input ID is longer.
    objid_ds = create_ds(f_out, 'ObjID', 'S', 20)
    ra_ds = create_ds(f_out, 'ra', 'f8')
    dec_ds = create_ds(f_out, 'dec', 'f8')
    e_bv_ds = create_ds(f_out, 'e_bv', 'f4')
    spec_ds = create_ds(f_out, 'specObjID', 'i8')
    red_ds = create_ds(f_out, 'specz_redshift', 'f4')
    red_err_ds = create_ds(f_out, 'specz_redshift_err', 'f4')
    # Corrected magnitudes and colors, named after the keys of calculate_colors.
    u_ds = create_ds(f_out, 'u_mag', 'f4')
    g_ds = create_ds(f_out, 'g_mag', 'f4')
    r_ds = create_ds(f_out, 'r_mag', 'f4')
    i_ds = create_ds(f_out, 'i_mag', 'f4')
    z_ds = create_ds(f_out, 'z_mag', 'f4')
    ug_ds = create_ds(f_out, 'u_minus_g', 'f4')
    gr_ds = create_ds(f_out, 'g_minus_r', 'f4')
    ri_ds = create_ds(f_out, 'r_minus_i', 'f4')
    iz_ds = create_ds(f_out, 'i_minus_z', 'f4')
    # Labels from classify_galaxy_death, stored as 15-byte strings.
    death_ds = create_ds(f_out, 'death_status', 'S', 15)

    for start in range(0, N, chunk_size):
        # The last chunk may be shorter than chunk_size.
        end = min(start + chunk_size, N)
        print(f"Chunk {start}-{end-1}")

        # Load chunk
        img_chunk = f_in['images'][start:end]
        # If axis 1 has length 5 and axis 2 has length 107, move axis 1 to the
        # end so the chunk matches the (107, 107, 5) layout of the output.
        # Assumes that shape means a channels-first input — TODO confirm.
        if img_chunk.shape[1] == 5 and img_chunk.shape[2] == 107:
            img_chunk = np.transpose(img_chunk, (0, 2, 3, 1))

        # specObjID values are decoded from bytes and converted to Python ints.
        spec_chunk = np.array([int(x.decode('utf-8')) for x in f_in['specObjID'][start:end]])

        # Filter only galaxies present in Galaxy Zoo 2
        mask = np.array([s in gz2_specids for s in spec_chunk])
        # Nothing matched in this chunk: skip it without writing anything.
        if np.sum(mask) == 0:
            continue

        # Read the same [start:end] range of every column and keep the masked rows.
        img_chunk = img_chunk[mask]
        obj_chunk = f_in['ObjID'][start:end][mask]
        ra_chunk = f_in['ra'][start:end][mask]
        dec_chunk = f_in['dec'][start:end][mask]
        e_bv_chunk = f_in['e_bv'][start:end][mask]
        spec_chunk = spec_chunk[mask]
        red_chunk = f_in['specz_redshift'][start:end][mask]
        red_err_chunk = f_in['specz_redshift_err'][start:end][mask]

        # Magnitudes: one row of five values per image; the transpose splits
        # them into one array per band.
        # Assumes the band order is u, g, r, i, z — TODO confirm.
        chunk_mags = np.array([extract_magnitudes_from_image(img) for img in img_chunk])
        u_c, g_c, r_c, i_c, z_c = chunk_mags.T

        # Colors & death labels
        colors = calculate_colors(u_c, g_c, r_c, i_c, z_c, e_bv_chunk)
        deaths = classify_galaxy_death(colors['u_minus_g'], colors['g_minus_r'])

        # Append to datasets
        append_chunk(images_ds, img_chunk)
        # ObjID values are decoded and re-encoded as UTF-8 bytes before writing.
        append_chunk(objid_ds, np.array([x.decode('utf-8').encode('utf-8') for x in obj_chunk]))
        # ra/dec are converted to float to match their 'f8' output datasets.
        append_chunk(ra_ds, ra_chunk.astype(float))
        append_chunk(dec_ds, dec_chunk.astype(float))
        append_chunk(e_bv_ds, e_bv_chunk)
        append_chunk(spec_ds, spec_chunk)
        append_chunk(red_ds, red_chunk)
        append_chunk(red_err_ds, red_err_chunk)
        append_chunk(u_ds, colors['u_mag'])
        append_chunk(g_ds, colors['g_mag'])
        append_chunk(r_ds, colors['r_mag'])
        append_chunk(i_ds, colors['i_mag'])
        append_chunk(z_ds, colors['z_mag'])
        append_chunk(ug_ds, colors['u_minus_g'])
        append_chunk(gr_ds, colors['g_minus_r'])
        append_chunk(ri_ds, colors['r_minus_i'])
        append_chunk(iz_ds, colors['i_minus_z'])
        # Labels are Python strings; encode them to bytes for the string dataset.
        append_chunk(death_ds, np.array([d.encode('utf-8') for d in deaths]))

# Printed after the `with` block, i.e. once both files have been closed.
print(f"Saved matched SDSS images and metadata to {output_path}")



---


**To push data back to kaggle**

1. Run the following code to create a folder to be pushed back which contains a metadata file

In [None]:
import json
import os

# Staging folder for the Kaggle upload; created if it does not exist yet.
dataset_folder = "/content/processed_dataset"
os.makedirs(dataset_folder, exist_ok=True)

# Metadata written next to the data files: title, dataset id and license.
metadata = {
    "title": "SDSS Image Dataset - Processed",
    "id": "mustansireranpurwala/sdss-image-dataset",
    "licenses": [{"name": "CC0-1.0"}],
}

# Serialize first, then write the finished text in a single call.
metadata_text = json.dumps(metadata, indent=4)
with open(f"{dataset_folder}/dataset-metadata.json", "w") as f:
    f.write(metadata_text)

print("dataset-metadata.json created in", dataset_folder)

2. Open Terminal and run the following commands

```
cp -r /root/.cache/kagglehub/datasets/mustansireranpurwala/sdss-image-dataset/versions/1/sdss_w_specz_valid.h5 /content/processed_dataset/
```
This command copies the dataset file from the kagglehub cache under `/root` into the `processed_dataset` folder
```
kaggle datasets version -p /content/processed_dataset -m "Test push from Colab"
```
This command initiates the push to kaggle

3. The processed dataset has successfully been pushed