<a href="https://colab.research.google.com/github/la2015-hw/Group_10/blob/main/extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>



---


**Please Read Before Running following Code Cells**


Get Kaggle API key
1. Navigate to Kaggle
2. Navigate to the Settings page (ensure you have a Kaggle account)
3. Scroll to API and click "Create New Token"
4. A JSON file (`kaggle.json`) downloads to your local machine

Set Up Kaggle API
1. Navigate to Google colab
2. Click on Files and upload `kaggle.json` to session storage
3. Once uploaded run the following commands in terminal

```
mkdir -p ~/.kaggle
cp -f /content/kaggle.json ~/.kaggle/kaggle.json
chmod 600 ~/.kaggle/kaggle.json
```

**If that does not work try this**

```
mkdir -p ~/.kaggle
cp -f kaggle.json ~/.kaggle/kaggle.json
chmod 600 ~/.kaggle/kaggle.json
```
4. API key has been set up and now you can read and write.

**Reminder - Setup is necessary after each Session**



---





In [None]:
!pip install kaggle --quiet

In [None]:
import kagglehub

# Fetch the latest version of the dataset through kagglehub and report
# where the files were placed.
dataset_handle = "mustansireranpurwala/sdss-image-dataset"
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)



---


**Image data Manipulation**

In [None]:
import h5py
import numpy as np
import pandas as pd

# Input HDF5 file opened by the processing loop.
# NOTE(review): this is an absolute local path and does not use the `path`
# returned by the kagglehub download cell — confirm it exists in your session.
file_path = '/home/mte/sdss_w_specz_train.h5'
# Number of rows read from the HDF5 file per loop iteration.
chunk_size = 10000
# CSV file that receives the per-chunk results.
output_csv = 'sdss_galaxies_with_colors.csv'

def flux_to_magnitude(flux, zero_point=22.5):
    """Convert a flux (scalar or array) to a magnitude.

    The flux is floored at 1e-10 before taking the logarithm so that zero or
    negative values do not produce ``-inf``/``nan``.

    Args:
        flux: Flux value(s); anything ``np.maximum`` accepts.
        zero_point: Magnitude assigned to a flux of 1. Defaults to 22.5
            (presumably the SDSS nanomaggy convention — TODO confirm).

    Returns:
        ``zero_point - 2.5 * log10(max(flux, 1e-10))``.
    """
    floored_flux = np.maximum(flux, 1e-10)
    return zero_point - 2.5 * np.log10(floored_flux)

def extract_magnitudes_from_image(image_5band):
    """Return one magnitude per band for a single multi-band image.

    For each of the five band planes (indexed on the last axis; the original
    code labels them u, g, r, i, z) all pixel values are summed and the total
    is converted with ``flux_to_magnitude``.

    Args:
        image_5band: Array indexed as ``[row, col, band]`` with at least
            five bands on the last axis.

    Returns:
        A list of five magnitudes in band order.
    """
    return [
        flux_to_magnitude(np.sum(image_5band[:, :, band]))
        for band in range(5)  # u, g, r, i, z
    ]

def calculate_colors(u, g, r, i, z, e_bv):
    """Apply an extinction correction and derive adjacent-band colours.

    Each magnitude has ``coefficient * e_bv`` subtracted from it, using a
    fixed coefficient per band (presumably A_band / E(B-V) ratios for the
    SDSS filters — TODO confirm the source of these values).

    Args:
        u, g, r, i, z: Magnitudes per band (scalars or arrays).
        e_bv: Reddening value(s) matching the magnitudes.

    Returns:
        A dict with the corrected magnitudes under ``'<band>_mag'`` followed
        by the colour differences ``'u_minus_g'``, ``'g_minus_r'``,
        ``'r_minus_i'`` and ``'i_minus_z'``, each computed from the
        corrected magnitudes.
    """
    band_table = (
        ('u', u, 5.155),
        ('g', g, 3.793),
        ('r', r, 2.751),
        ('i', i, 2.086),
        ('z', z, 1.479),
    )

    # Corrected magnitude per band: observed magnitude minus extinction.
    corrected = {
        band: observed - coefficient * e_bv
        for band, observed, coefficient in band_table
    }

    result = {f'{band}_mag': value for band, value in corrected.items()}
    # Colours are differences between neighbouring bands (bluer minus redder).
    for bluer, redder in zip('ugri', 'griz'):
        result[f'{bluer}_minus_{redder}'] = corrected[bluer] - corrected[redder]
    return result

def classify_galaxy_death(u_minus_g, g_minus_r):
    """Label each galaxy from its (u-g, g-r) colours.

    Rules, applied per element in order:
      * ``'DEAD'`` when u-g > 1.5 and g-r > 0.8
      * ``'ALIVE'`` when u-g < 1.0 and g-r < 0.6
      * ``'TRANSITIONAL'`` otherwise

    Args:
        u_minus_g: Iterable of u-g colours.
        g_minus_r: Iterable of g-r colours, paired element-wise with
            ``u_minus_g``.

    Returns:
        A NumPy array of label strings, one per input pair.
    """

    def _label(ug, gr):
        # Both comparisons are strict, so boundary values fall through.
        if ug > 1.5 and gr > 0.8:
            return 'DEAD'
        if ug < 1.0 and gr < 0.6:
            return 'ALIVE'
        return 'TRANSITIONAL'

    return np.array([_label(ug, gr) for ug, gr in zip(u_minus_g, g_minus_r)])


# Read the HDF5 file in slices of `chunk_size` rows, derive magnitudes,
# colours and a status label for each row, and write the result to `output_csv`.
with h5py.File(file_path, 'r') as f:
    total_rows = f['ObjID'].shape[0]
    print(f"Total rows: {total_rows}")

    first_chunk = True
    for start in range(0, total_rows, chunk_size):
        end = min(start + chunk_size, total_rows)
        print(f"Processing rows {start} to {end}")

        # Slice every dataset with the same [start:end] window so rows stay aligned.
        objid = f['ObjID'][start:end]
        ra = f['ra'][start:end]
        dec = f['dec'][start:end]
        e_bv = f['e_bv'][start:end]
        spec_objid = f['specObjID'][start:end]
        redshift = f['specz_redshift'][start:end]
        redshift_err = f['specz_redshift_err'][start:end]
        images = f['images'][start:end]

        # When axis 1 holds the 5 bands, move it to the end so that
        # extract_magnitudes_from_image can index the band on the last axis.
        if images.shape[1] == 5 and images.shape[2] == 107:
            images = np.transpose(images, (0, 2, 3, 1))

        all_magnitudes = np.array([extract_magnitudes_from_image(img) for img in images])
        # Transpose (rows, 5) -> (5, rows) to unpack one array per band.
        u_mag, g_mag, r_mag, i_mag, z_mag = all_magnitudes.T

        colors_dict = calculate_colors(u_mag, g_mag, r_mag, i_mag, z_mag, e_bv)

        death_labels = classify_galaxy_death(colors_dict['u_minus_g'], colors_dict['g_minus_r'])

        # ObjID, ra, dec and specObjID are decoded element-wise from UTF-8
        # bytes before being converted to their final types.
        df = pd.DataFrame({
            'ObjID': [x.decode('utf-8') for x in objid],
            'ra': [float(x.decode('utf-8')) for x in ra],
            'dec': [float(x.decode('utf-8')) for x in dec],
            'e_bv': e_bv,
            'specObjID': [int(x.decode('utf-8')) for x in spec_objid],
            'specz_redshift': redshift,
            'specz_redshift_err': redshift_err,
            'u_mag': colors_dict['u_mag'],
            'g_mag': colors_dict['g_mag'],
            'r_mag': colors_dict['r_mag'],
            'i_mag': colors_dict['i_mag'],
            'z_mag': colors_dict['z_mag'],
            'u_minus_g': colors_dict['u_minus_g'],
            'g_minus_r': colors_dict['g_minus_r'],
            'r_minus_i': colors_dict['r_minus_i'],
            'i_minus_z': colors_dict['i_minus_z'],
            'death_status': death_labels
        })

        # The first chunk overwrites any CSV left by an earlier run and writes
        # the header; later chunks append. Appending unconditionally would
        # duplicate rows and insert a second header line when the cell is re-run.
        df.to_csv(
            output_csv,
            mode='w' if first_chunk else 'a',
            index=False,
            header=first_chunk,
        )
        first_chunk = False

print("Finished processing SDSS file. Data saved to CSV.")



# Cross-match the processed SDSS rows with the Galaxy Zoo 2 table on the
# spectroscopic object ID and save the matched rows.
print("Loading Galaxy Zoo 2 data...")
gz2 = pd.read_csv('zoo2MainSpecz.csv')

# Keep only rows whose specobjid parses as a number. `.copy()` detaches the
# filtered frame from the original so the column assignment below does not
# raise SettingWithCopyWarning.
gz2 = gz2[pd.to_numeric(gz2['specobjid'], errors='coerce').notna()].copy()
gz2['specobjid'] = gz2['specobjid'].astype(np.int64)

# Reload the CSV written by the processing loop and give both join keys
# the same integer dtype.
df = pd.read_csv(output_csv)
df['specObjID'] = df['specObjID'].astype(np.int64)


# Inner join: only galaxies present in both tables are kept.
df_matched = df.merge(gz2, left_on='specObjID', right_on='specobjid', how='inner')
print(f"Successfully matched {len(df_matched)} galaxies with Galaxy Zoo 2")


df_matched.to_csv('sdss_gz2_matched.csv', index=False)

print("All processed data saved!")



---


**To push data back to kaggle**

1. Run the following code to create a folder to be pushed back which contains a metadata file

In [None]:
import json
import os

# Folder that holds the files to upload together with the metadata file
# written below; created if it does not exist yet.
dataset_folder = "/content/processed_dataset"
os.makedirs(dataset_folder, exist_ok=True)

# Content of dataset-metadata.json: title, dataset id and license list.
metadata = {
    "title": "SDSS Image Dataset - Processed",
    "id": "mustansireranpurwala/sdss-image-dataset",
    "licenses": [{"name": "CC0-1.0"}],
}

metadata_path = os.path.join(dataset_folder, "dataset-metadata.json")
with open(metadata_path, "w") as metadata_file:
    metadata_file.write(json.dumps(metadata, indent=4))

print("dataset-metadata.json created in", dataset_folder)

2. Open Terminal and run the following commands

```
cp -r /root/.cache/kagglehub/datasets/mustansireranpurwala/sdss-image-dataset/versions/1/sdss_w_specz_valid.h5 /content/processed_dataset/
```
This command copies the dataset file from the kagglehub cache into the `processed_dataset` folder
```
kaggle datasets version -p /content/processed_dataset -m "Test push from Colab"
```
This command initiates the push to kaggle

3. The processed dataset has successfully been pushed