In [10]:
import os
import csv
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin

# Base URL and epochs to scan
# Root listing of the archive; each entry of epoch_dirs is joined onto it in main().
top_url = "https://archive-new.nrao.edu/vlass/quicklook/"
# Epoch subdirectories (relative to top_url), walked by main() in this order.
epoch_dirs = ["VLASS1.1v2/", "VLASS1.2v2/", "VLASS2.1/", "VLASS2.2/", "VLASS3.1/", "VLASS3.2/"]

# Patterns
# fits_pattern: an href ending in ".tt0.subim.fits" (case-insensitive).
fits_pattern = re.compile(r".*\.tt0\.subim\.fits$", re.IGNORECASE)
# target_pattern: "J"/"j", six digits, a sign, six digits -- e.g. "J050228-363000".
target_pattern = re.compile(r"[Jj]\d{6}[+-]\d{6}")

# Output file
# CSV catalog written by main(); also the default input of the later cells' functions.
output_csv = "vlass_catalog.csv"


def load_processed(csv_file):
    """Return the set of 'Link' values already stored in *csv_file*.

    Used to skip already-cataloged target folders when a crawl is resumed.
    A missing file yields an empty set.
    """
    if not os.path.exists(csv_file):
        return set()

    with open(csv_file, newline='') as handle:
        # DictReader takes the column names from the file's first line
        return {record['Link'] for record in csv.DictReader(handle)}


def list_tiles(epoch_url, timeout=30):
    """Return the tile subdirectory hrefs listed on an epoch index page.

    Args:
        epoch_url: URL of the epoch directory listing.
        timeout: Seconds passed to ``requests.get``; without it a stalled
            connection would block the crawl indefinitely.

    Returns:
        The href of every link that ends with '/' and starts with 'T',
        in page order.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status (via ``raise_for_status``).
    """
    print(f"[PARSE] Getting tiles in {epoch_url}")
    resp = requests.get(epoch_url, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    hrefs = (a['href'] for a in soup.find_all('a', href=True))
    return [href for href in hrefs
            if href.endswith('/') and href.startswith('T')]


def list_targets(tile_url, timeout=30):
    """Return the target subdirectory hrefs listed on a tile index page.

    Args:
        tile_url: URL of the tile directory listing.
        timeout: Seconds passed to ``requests.get``; without it a stalled
            connection would block the crawl indefinitely.

    Returns:
        The href of every link that ends with '/' and whose lowercased
        form starts with 'vlass', in page order.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status (via ``raise_for_status``).
    """
    print(f"[PARSE] Getting targets in {tile_url}")
    resp = requests.get(tile_url, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')
    hrefs = (a['href'] for a in soup.find_all('a', href=True))
    return [href for href in hrefs
            if href.endswith('/') and href.lower().startswith('vlass')]


def process_target(epoch, tile, target_dir, tile_url, writer, processed, timeout=30):
    """Catalog one target folder if it contains a matching FITS link.

    The folder is fetched unless its URL is already in *processed*. When
    the listing has at least one href matching ``fits_pattern``, a single
    row ``[folder_url, epoch, tile, target_name]`` is written and the URL
    is added to *processed*. Folders without a matching link are neither
    written nor marked as processed.

    Args:
        epoch: Epoch label stored in the row.
        tile: Tile label stored in the row.
        target_dir: Href of the target folder, relative to *tile_url*.
        tile_url: URL of the parent tile listing.
        writer: ``csv.writer`` receiving the row.
        processed: Set of folder URLs already cataloged; updated in place.
        timeout: Seconds passed to ``requests.get``; without it a stalled
            connection would block the crawl indefinitely.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status (via ``raise_for_status``).
    """
    target_url = urljoin(tile_url, target_dir)
    if target_url in processed:
        print(f"[SKIP] Already processed {target_url}")
        return

    print(f"[PARSE] Opening {target_url}")
    resp = requests.get(target_url, timeout=timeout)
    resp.raise_for_status()
    soup = BeautifulSoup(resp.text, 'html.parser')

    for a in soup.find_all('a', href=True):
        href = a['href']
        if not fits_pattern.match(href):
            continue
        # One row per folder: the target name is taken from the first
        # matching file name, and is left empty if it has no J-name.
        match = target_pattern.search(href)
        target_name = match.group(0) if match else ''
        writer.writerow([target_url, epoch, tile, target_name])
        print(f"[FOUND] {target_url} | Epoch {epoch} | Tile {tile} | Target {target_name}")
        processed.add(target_url)
        break


def main():
    """Crawl every epoch -> tile -> target folder and record hits in the catalog CSV.

    If the catalog already holds rows, new rows are appended and known
    folders are skipped; otherwise the file is created with a header row.
    """
    processed = load_processed(output_csv)
    resuming = bool(processed)

    with open(output_csv, 'a' if resuming else 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        if not resuming:
            # Fresh file: the header must precede the first data row
            writer.writerow(['Link', 'Epoch', 'Tile', 'Target'])

        for epoch_dir in epoch_dirs:
            epoch_url = urljoin(top_url, epoch_dir)
            # "VLASS1.1v2/" -> "1.1v2"
            epoch = epoch_dir.replace('VLASS', '').strip('/')
            for tile_dir in list_tiles(epoch_url):
                tile_url = urljoin(epoch_url, tile_dir)
                tile = tile_dir.rstrip('/')
                for target_dir in list_targets(tile_url):
                    process_target(epoch, tile, target_dir, tile_url, writer, processed)

    print(f"[COMPLETE] Created {output_csv}")


# Run the crawl when this code is executed as the main module
if __name__ == '__main__':
    main()

[PARSE] Opening https://archive-new.nrao.edu/vlass/quicklook/VLASS1.1v2/T01t11/VLASS1.1.ql.T01t11.J050228-363000.10.2048.v2/
[FOUND] https://archive-new.nrao.edu/vlass/quicklook/VLASS1.1v2/T01t11/VLASS1.1.ql.T01t11.J050228-363000.10.2048.v2/ | Epoch 1.1v2 | Tile T01t11 | Target J050228-363000
[PARSE] Opening https://archive-new.nrao.edu/vlass/quicklook/VLASS1.1v2/T01t11/VLASS1.1.ql.T01t11.J050230-373000.10.2048.v2/
[FOUND] https://archive-new.nrao.edu/vlass/quicklook/VLASS1.1v2/T01t11/VLASS1.1.ql.T01t11.J050230-373000.10.2048.v2/ | Epoch 1.1v2 | Tile T01t11 | Target J050230-373000
[PARSE] Opening https://archive-new.nrao.edu/vlass/quicklook/VLASS1.1v2/T01t11/VLASS1.1.ql.T01t11.J050232-383000.10.2048.v2/
[FOUND] https://archive-new.nrao.edu/vlass/quicklook/VLASS1.1v2/T01t11/VLASS1.1.ql.T01t11.J050232-383000.10.2048.v2/ | Epoch 1.1v2 | Tile T01t11 | Target J050232-383000
[PARSE] Opening https://archive-new.nrao.edu/vlass/quicklook/VLASS1.1v2/T01t11/VLASS1.1.ql.T01t11.J050234-393000.10.20

KeyboardInterrupt: 

In [None]:
import os

# Output directory
# Default `outdir` of download_fits() and default `fits_folder` of update_catalog().
outdir = "vlass_fits"

def _fetch_to_file(file_url, dest, timeout):
    """Stream *file_url* to *dest*, exposing *dest* only once it is complete."""
    # Write to a sibling ".part" file first: an interrupted transfer must not
    # leave a truncated file at `dest`, because existing files are skipped on
    # the next run. A stale ".part" is simply overwritten on retry.
    part = dest + '.part'
    # The context manager releases the streamed connection even on error.
    with requests.get(file_url, stream=True, timeout=timeout) as file_resp:
        file_resp.raise_for_status()
        with open(part, 'wb') as out_f:
            for chunk in file_resp.iter_content(chunk_size=8192):
                out_f.write(chunk)
    os.replace(part, dest)


def download_fits(csv_file=output_csv, outdir=outdir, timeout=60):
    """Download all .tt0.subim.fits files linked from the cataloged folders.

    For each row of *csv_file*, the folder listing at ``row['Link']`` is
    fetched and every href matching ``fits_pattern`` is saved into
    *outdir* under its base name. Files that already exist are skipped.

    Args:
        csv_file: Catalog CSV with a 'Link' column.
        outdir: Destination directory; created if missing.
        timeout: Seconds passed to every ``requests.get`` call, so a
            stalled connection cannot hang the run indefinitely.

    Raises:
        requests.RequestException: on connection failure, timeout, or an
            HTTP error status (via ``raise_for_status``).
    """
    # Ensure directory exists
    os.makedirs(outdir, exist_ok=True)

    with open(csv_file, newline='') as f:
        for row in csv.DictReader(f):
            folder_url = row['Link']
            print(f"[SCAN] {folder_url}")
            resp = requests.get(folder_url, timeout=timeout)
            resp.raise_for_status()
            soup = BeautifulSoup(resp.text, 'html.parser')

            # Find and download matching FITS files
            for a in soup.find_all('a', href=True):
                href = a['href']
                if not fits_pattern.match(href):
                    continue

                file_url = urljoin(folder_url, href)
                filename = os.path.basename(href)
                dest = os.path.join(outdir, filename)

                if os.path.exists(dest):
                    print(f"[SKIP] {filename}")
                    continue

                print(f"[GET] {file_url}")
                _fetch_to_file(file_url, dest, timeout)
                print(f"[SAVE] {dest}")

def main_download() -> None:
    # Download FITS using the catalog.
    # Calls download_fits() with its defaults (output_csv and outdir).
    download_fits()

# Start the download when this code is executed as the main module
if __name__ == '__main__':
    main_download()

In [None]:
from astropy.io import fits
from astropy.coordinates import SkyCoord
from astropy.wcs import WCS
import astropy.units as u
import numpy as np
import os
import csv

def _match_fits_file(row, fits_files):
    """Pick the downloaded FITS file name belonging to a catalog row, or None."""
    # Prefer a file whose name starts with the row's archive folder name, so
    # rows for the same target in different epochs do not all resolve to the
    # same file.
    # NOTE(review): assumes downloaded files are named after their folder
    # (e.g. "VLASS1.1.ql.T01t11.J050228-363000.10.2048.v2...") -- confirm.
    folder = (row.get('Link') or '').rstrip('/').rsplit('/', 1)[-1]
    if folder:
        for fn in fits_files:
            if fn.startswith(folder):
                return fn

    # Fall back to a target-name substring match. An empty target is
    # skipped because '' is a substring of every file name.
    target = row.get('Target') or ''
    if target:
        for fn in fits_files:
            if target in fn:
                return fn
    return None


def update_catalog(csv_file=output_csv, fits_folder=outdir):
    """Add 'Date' and 'Time' columns to the catalog from each FITS DATE-OBS.

    For every row, the matching ``.tt0.subim.fits`` file in *fits_folder*
    is located and its ``DATE-OBS`` header value is split at 'T' into a
    date and a time. Rows with no matching or unreadable file get empty
    values. The CSV is rewritten only after every row has been processed,
    via a temporary file that replaces the original, so an interruption
    cannot truncate the catalog.

    Args:
        csv_file: Catalog CSV to update in place.
        fits_folder: Directory holding the downloaded FITS files; if it
            does not exist an error is printed and nothing is changed.

    Raises:
        OSError: if *csv_file* cannot be opened or replaced.
    """
    # Verify FITS directory exists
    if not os.path.isdir(fits_folder):
        print(f"[ERROR] Not found: {fits_folder}")
        return

    # Read existing CSV into memory
    with open(csv_file, newline='') as f_in:
        reader = csv.DictReader(f_in)
        # fieldnames is None for an empty file
        existing_cols = list(reader.fieldnames or [])
        rows = list(reader)
    # Only append missing columns
    fieldnames = existing_cols + [col for col in ('Date', 'Time')
                                  if col not in existing_cols]

    # List the folder once; sorted so the chosen match is deterministic
    fits_files = sorted(fn for fn in os.listdir(fits_folder)
                        if fn.endswith('.tt0.subim.fits'))

    for row in rows:
        target = row.get('Target', '')
        obs_date = obs_time = ''
        filename = _match_fits_file(row, fits_files)

        if filename is None:
            print(f"[ERROR] No file found for {target}")
        else:
            filepath = os.path.join(fits_folder, filename)
            try:
                # Only the header is needed; the image data is never loaded
                date_obs = fits.getheader(filepath).get('DATE-OBS', '')
            except Exception as e:
                # Best effort: an unreadable file leaves this row's
                # date and time empty instead of aborting the update
                print(f"[ERROR] Cannot read {filepath}: {e}")
            else:
                # "2019-04-12T01:02:03" -> date, time; a date-only value
                # keeps the date and leaves the time empty
                obs_date, _, obs_time = str(date_obs or '').partition('T')
                # Log tag spelling kept as originally emitted
                print(f"[SUCESS] {target} | Date={obs_date} | Time={obs_time}")

        row['Date'] = obs_date
        row['Time'] = obs_time

    # Write to a temporary file and swap it in, so the original catalog
    # stays intact until the new one is complete
    tmp_file = csv_file + '.tmp'
    with open(tmp_file, 'w', newline='') as f_out:
        writer = csv.DictWriter(f_out, fieldnames=fieldnames)
        writer.writeheader()
        writer.writerows(rows)
    os.replace(tmp_file, csv_file)

    print(f"[COMPLETE] Updated {csv_file}")


def main_update() -> None:
    # Extract header info.
    # Calls update_catalog() with its defaults (output_csv and outdir).
    update_catalog()

# Start the catalog update when this code is executed as the main module
if __name__ == '__main__':
    main_update()