# Data Processing - Satellite and In Situ

In order to analyze and compare in situ field data with ECOSTRESS and EMIT satellite data, you have to prepare your data. These steps will take you from post-field collection to having ready-to-analyze field data. 

## Part 1 Convert .sed files to .csv

Visualize your data as a csv by converting your .sed files. These files may be in a different form depending on the type of instrument that you will be using. A Spectral Evolution instrument was used for the making of this tutorial. 

In [None]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.cluster import KMeans

def read_sed_file(filepath, use_column=2):
    """Parse one ``.sed`` text file into header metadata and a single spectrum.

    The header is scanned for ``Latitude:``, ``Longitude:`` and ``GPS Time:``
    lines until a line starting with ``Data:`` is reached. The line right
    after ``Data:`` is skipped (treated as the column header) and every
    following line is read as whitespace-separated values.

    Args:
        filepath: Path of the ``.sed`` file to read.
        use_column: 0-based index of the whitespace-separated column whose
            value is collected for every data row (column 0 is always read
            as the wavelength).

    Returns:
        A ``(metadata, wavelengths, radiances)`` tuple. ``metadata`` is a dict
        that may hold ``'latitude'``, ``'longitude'`` (floats, or ``None``
        when the file says ``n/a`` or leaves the field blank) and
        ``'gps_time'`` (string). ``wavelengths`` and ``radiances`` are lists
        of floats of equal length.

    Raises:
        ValueError: If the file has no ``Data:`` line, or a coordinate value
            is present but not numeric.
    """

    def _parse_coordinate(raw):
        # A blank field or "n/a" both mean the instrument had no GPS fix.
        text = raw.strip()
        if not text or text.lower() == 'n/a':
            return None
        return float(text)

    metadata = {}
    wavelengths = []
    radiances = []

    with open(filepath, 'r') as f:
        lines = f.readlines()

    # Extract metadata; enumerate gives the position of the 'Data:' line
    # directly instead of searching the list a second time.
    data_start_idx = None
    for idx, line in enumerate(lines):
        if line.startswith('Latitude:'):
            metadata['latitude'] = _parse_coordinate(line.split(':')[1])
        elif line.startswith('Longitude:'):
            metadata['longitude'] = _parse_coordinate(line.split(':')[1])
        elif line.startswith('GPS Time:'):
            # Split only once: the time value itself contains colons.
            metadata['gps_time'] = line.split(':', 1)[1].strip()
        elif line.strip().startswith('Data:'):
            # +2 skips the 'Data:' line and the column-header line below it.
            data_start_idx = idx + 2
            break

    if data_start_idx is None:
        raise ValueError(f"No 'Data:' section found in {filepath}")

    for line in lines[data_start_idx:]:
        parts = line.split()
        if not parts or len(parts) <= use_column:
            continue
        # Parse both numbers before storing either one, so a row with a bad
        # value column cannot leave an orphan wavelength behind.
        try:
            wavelength = float(parts[0])
            value = float(parts[use_column])
        except ValueError:
            continue
        wavelengths.append(wavelength)
        radiances.append(value)

    return metadata, wavelengths, radiances

def process_sed_directory(directory, output_dir=None, use_column=2):
    """Read every ``.sed`` file in a folder into one spectra table plus metadata.

    Args:
        directory: Folder that is searched (non-recursively) for files whose
            name ends in ``.sed`` (case-insensitive).
        output_dir: Optional folder. When given, it is created if needed and
            ``spectra_data.csv`` and ``spectra_metadata.csv`` are written there.
        use_column: Passed through to ``read_sed_file`` to select the value
            column of each file.

    Returns:
        ``(df, meta_df)``. ``df`` has one column per file (named after the
        file without its extension) and is indexed by ``'wavelength'``.
        ``meta_df`` has one row per file with ``filename``, ``latitude``,
        ``longitude`` and ``gps_time``.

    Raises:
        ValueError: If the folder holds no ``.sed`` files, or a file's
            wavelength list differs from that of the first file read.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the CSV
    # column order reproducible, which matters to any later step that
    # selects columns by position.
    sed_files = sorted(f for f in os.listdir(directory) if f.lower().endswith('.sed'))
    if not sed_files:
        raise ValueError(f"No .sed files found in {directory}.")

    spectra_dict = {}
    wavelengths_master = None
    meta_records = []

    for filename in sed_files:
        filepath = os.path.join(directory, filename)
        metadata, wavelengths, radiances = read_sed_file(filepath, use_column=use_column)
        basename = os.path.splitext(filename)[0]

        # The first file defines the wavelength grid; all others must match
        # it exactly so the spectra can share a single index.
        if wavelengths_master is None:
            wavelengths_master = wavelengths
        elif wavelengths != wavelengths_master:
            raise ValueError(f"Wavelength mismatch in file: {filename}")

        spectra_dict[basename] = radiances

        meta_records.append({
            'filename': basename,
            'latitude': metadata.get('latitude'),
            'longitude': metadata.get('longitude'),
            'gps_time': metadata.get('gps_time'),
        })

    df = pd.DataFrame(spectra_dict, index=wavelengths_master)
    df.index.name = 'wavelength'
    meta_df = pd.DataFrame(meta_records)

    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        df.to_csv(os.path.join(output_dir, 'spectra_data.csv'))
        meta_df.to_csv(os.path.join(output_dir, 'spectra_metadata.csv'), index=False)

    return df, meta_df

def extract_features(spectra_df):
    """Summarise every spectrum (column) of ``spectra_df`` with five numbers.

    Args:
        spectra_df: DataFrame with one spectrum per column; the index labels
            are reported back as ``peak_wavelength``.

    Returns:
        DataFrame indexed by the column names of ``spectra_df`` with columns:
        ``mean``; ``max``; ``peak_wavelength`` (index label of the maximum);
        ``slope`` (last value divided by first value, 0 if the first value
        is 0); ``flatness`` (standard deviation divided by mean, 0 if the
        mean is 0).
    """
    feature_names = ['mean', 'max', 'peak_wavelength', 'slope', 'flatness']
    rows = []

    for name in spectra_df.columns:
        series = spectra_df[name]
        first_value = series.iloc[0]

        # Both ratios fall back to 0 rather than dividing by zero.
        if first_value != 0:
            end_to_start = series.iloc[-1] / first_value
        else:
            end_to_start = 0

        if np.mean(series) != 0:
            relative_spread = np.std(series) / np.mean(series)
        else:
            relative_spread = 0

        rows.append([
            np.mean(series),
            np.max(series),
            series.idxmax(),
            end_to_start,
            relative_spread,
        ])

    return pd.DataFrame(rows, index=spectra_df.columns, columns=feature_names)

def cluster_spectra(features_df, n_clusters=3):
    """Group spectra with KMeans and store the labels in a ``'cluster'`` column.

    Args:
        features_df: DataFrame of per-spectrum features, one row per spectrum.
            It is modified in place: a ``'cluster'`` column is added or
            overwritten.
        n_clusters: Number of clusters passed to ``KMeans``.

    Returns:
        The same ``features_df`` object, now carrying the ``'cluster'`` column.
    """
    # If this frame was clustered before (e.g. the notebook cell was re-run),
    # leave the old labels out so they are not fed back in as a feature.
    feature_matrix = features_df.drop(columns=['cluster'], errors='ignore')

    # Fixed random_state keeps the cluster assignment reproducible.
    kmeans = KMeans(n_clusters=n_clusters, random_state=42, n_init='auto')
    features_df['cluster'] = kmeans.fit_predict(feature_matrix)
    return features_df

def plot_clusters(spectra_df, cluster_labels, output_dir=None):
    """Plot every spectrum on one axis, coloured by its cluster label.

    Args:
        spectra_df: DataFrame with one spectrum per column; the index is used
            as the x axis.
        cluster_labels: One integer label per column of ``spectra_df``, in
            column order. Any array-like works (list, ndarray, Series).
        output_dir: Optional folder. When given, it is created if needed and
            the figure is saved there as ``clustered_spectra.png``.

    The figure is closed before returning, so nothing is shown inline.
    """
    # Work on a plain array so labels are always matched to columns by
    # position, even when a pandas Series with a non-integer index is passed.
    labels = np.asarray(cluster_labels)
    unique_clusters = np.unique(labels)

    # pyplot.get_cmap replaces matplotlib.cm.get_cmap, which newer
    # matplotlib releases no longer provide.
    colors = plt.get_cmap('tab10', len(unique_clusters))
    fig, ax = plt.subplots(figsize=(12, 6))

    for col, cluster_id in zip(spectra_df.columns, labels):
        ax.plot(spectra_df.index, spectra_df[col], color=colors(cluster_id), alpha=0.5)

    ax.set_xlabel('Wavelength (nm)')
    ax.set_ylabel('Radiance (W/m²/sr/nm)')
    ax.set_title('Spectra colored by cluster')
    ax.grid(True)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        fig.savefig(os.path.join(output_dir, 'clustered_spectra.png'), dpi=300)
    plt.close(fig)

def plot_cluster_means(spectra_df, cluster_labels, output_dir=None):
    """Plot the mean spectrum of each cluster on one axis.

    Args:
        spectra_df: DataFrame with one spectrum per column; the index is used
            as the x axis.
        cluster_labels: One integer label per column of ``spectra_df``, in
            column order. Any array-like works (list, ndarray, Series).
        output_dir: Optional folder. When given, it is created if needed and
            the figure is saved there as ``cluster_means.png``.

    The figure is closed before returning, so nothing is shown inline.
    """
    # A plain array makes the equality test below a positional boolean mask
    # regardless of what container the caller passed in.
    labels = np.asarray(cluster_labels)
    unique_clusters = np.unique(labels)

    # pyplot.get_cmap replaces matplotlib.cm.get_cmap, which newer
    # matplotlib releases no longer provide.
    colors = plt.get_cmap('tab10', len(unique_clusters))
    fig, ax = plt.subplots(figsize=(12, 6))

    for cluster_id in unique_clusters:
        member_positions = np.flatnonzero(labels == cluster_id)
        spectra_in_cluster = spectra_df.iloc[:, member_positions]
        # Average across the member columns at every wavelength.
        mean_spectrum = spectra_in_cluster.mean(axis=1)
        ax.plot(spectra_df.index, mean_spectrum, label=f'Cluster {cluster_id}', color=colors(cluster_id))

    ax.set_xlabel('Wavelength (nm)')
    ax.set_ylabel('Mean Radiance (W/m²/sr/nm)')
    ax.set_title('Cluster Mean Spectra')
    ax.legend()
    ax.grid(True)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
        fig.savefig(os.path.join(output_dir, 'cluster_means.png'), dpi=300)
    plt.close(fig)

############################################
# Process Sed:
# Convert every .sed file in `directory` and write spectra_data.csv and
# spectra_metadata.csv into `output_dir`. Both paths below are placeholders.
directory = ".../sedData" # change to the folder that holds your .sed files
output_dir = '.../sedDataOutputs' # change to the folder where the CSVs should be written
spectra_df, metadata_df = process_sed_directory(directory, output_dir=output_dir)

## Part 2: Normalize Your Data

Normalize the data you collected using the code below. The code assumes that the first 6 columns of your csv are left un-normalized: the first column is "wavelength", and the next five columns (indices 1-5) are the reference measurements used to calibrate the data.

In [None]:
"""
Normalize sample spectra by the row-wise mean of the reference columns.

Column layout expected in the input csv:
  - index 0      : "wavelength" (kept as is)
  - index 1 to 5 : reference measurements (kept as is, averaged per row)
  - index 6 on   : sample measurements (each divided by the reference average)
"""

import pandas as pd

# Step 1: Load your data
df = pd.read_csv(".../Input.csv")  # Replace with processed csv filename

# Step 2: Keep only the rows whose wavelength lies within 400–900 nm
wavelength_in_range = df["wavelength"].between(400, 900)
df_filtered = df[wavelength_in_range]

# Step 3: Set aside the columns that are passed through unchanged
# (wavelength plus the five reference columns, i.e. index 0 to 5)
preserved = df_filtered.iloc[:, :6]

# Step 4: Average the reference columns (index 1 to 5) for every row
reference_columns = df_filtered.iloc[:, 1:6]
row_avg = reference_columns.mean(axis=1)

# Step 5: Divide every sample column (index 6 onward) by that row average
cols_to_normalize = df_filtered.columns[6:]
normalized = df_filtered[cols_to_normalize].div(row_avg, axis=0)


# Step 6: Order the normalized columns by the number after the last "_"
def _numeric_suffix(column_name):
    return float(column_name.split('_')[-1])


sorted_cols = sorted(normalized.columns, key=_numeric_suffix)
normalized = normalized[sorted_cols]

# Step 7: Put the untouched columns and the normalized columns side by side
final_df = pd.concat([preserved, normalized], axis=1)

# Step 8: Export to CSV (optional)
final_df.to_csv(".../Output.csv", index=False)  # change filepath to export normalized data.

# Preview output
print(final_df.head())


## Part 3: Pull Coordinates from Photos

If needed, you can pull quadrat coordinates from the photos taken in the field to get accurate coordinates.

In [None]:
import exifread
import os
import pandas as pd

def get_decimal_from_dms(dms, ref):
    """Convert a degrees/minutes/seconds EXIF value to signed decimal degrees.

    Args:
        dms: Object whose ``values`` holds three ratios (degrees, minutes,
            seconds), each exposing ``num`` and ``den``.
        ref: Hemisphere letter; ``'S'`` and ``'W'`` make the result negative.

    Returns:
        The coordinate as a float in decimal degrees.
    """
    deg_ratio, min_ratio, sec_ratio = dms.values[0], dms.values[1], dms.values[2]

    degrees = deg_ratio.num / deg_ratio.den
    minutes = min_ratio.num / min_ratio.den
    seconds = sec_ratio.num / sec_ratio.den

    magnitude = degrees + (minutes / 60.0) + (seconds / 3600.0)

    # Southern and western hemispheres are negative in decimal notation.
    return -magnitude if ref in ('S', 'W') else magnitude

def extract_gps(image_path):
    """Read the GPS position stored in an image's EXIF data.

    Args:
        image_path: Path of the image file to inspect.

    Returns:
        ``(latitude, longitude)`` in signed decimal degrees, or
        ``(None, None)`` when any of the four GPS tags needed for the
        conversion is missing.
    """
    # All four tags are read below, so all four must be present; checking
    # only the coordinate tags would raise KeyError on a photo that lacks
    # the hemisphere references.
    required_tags = (
        'GPS GPSLatitude',
        'GPS GPSLatitudeRef',
        'GPS GPSLongitude',
        'GPS GPSLongitudeRef',
    )

    with open(image_path, 'rb') as f:
        tags = exifread.process_file(f, details=False)

    if not all(tag in tags for tag in required_tags):
        return None, None

    lat = get_decimal_from_dms(tags['GPS GPSLatitude'], tags['GPS GPSLatitudeRef'].printable)
    lon = get_decimal_from_dms(tags['GPS GPSLongitude'], tags['GPS GPSLongitudeRef'].printable)
    return lat, lon

def process_images(folder_path, output_csv='photo_locations.csv'):
    """Collect the GPS position of every photo in a folder into a CSV file.

    Args:
        folder_path: Folder searched (non-recursively) for files ending in
            ``.jpg``, ``.jpeg`` or ``.heic`` (case-insensitive).
        output_csv: Path of the CSV file to write. Defaults to
            ``photo_locations.csv`` in the current working directory.

    Photos for which ``extract_gps`` finds no position are left out. The CSV
    has the columns ``filename``, ``latitude`` and ``longitude``.
    """
    image_suffixes = ('.jpg', '.jpeg', '.heic')
    data = []

    # Sorted so the rows come out in the same order on every run.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(image_suffixes):
            continue
        filepath = os.path.join(folder_path, filename)
        lat, lon = extract_gps(filepath)
        if lat is not None and lon is not None:
            data.append({'filename': filename, 'latitude': lat, 'longitude': lon})

    # Explicit columns keep the header row in the CSV even when no photo
    # carried GPS data.
    df = pd.DataFrame(data, columns=['filename', 'latitude', 'longitude'])
    df.to_csv(output_csv, index=False)
    print(f"Extracted {len(df)} locations and saved to {output_csv}")

# Example usage: the path below is a placeholder; point it at the folder that
# holds the quadrat photos for one site.
process_images('.../quadrat_images_') #folder with images for site.




## Part 4: Finding and Downloading ECOSTRESS and EMIT Data

Follow the code below to download concurrent ECOSTRESS and EMIT data. You can choose to have the concurrent granules automatically downloaded to a file or manually download them through the .txt file.

The code below is based on VITALS Tutorial 1, but was revised to search for three regions of interest rather than just one. 

Based on the type and location of the data you want to collect, you can choose whether to get the "Tiled" or "Swath" land surface temperature and emissivity. 

The data that will be downloaded can be altered in the "Filter by file types" section of each code block. 

Tiled data is best suited for large, regularly gridded datasets where you want to efficiently manage and process individual sections of the overall data. Swath data, by contrast, represents data collected along a specific path or track, often with varying resolution and coverage.

In [None]:
#Code to extract Tiled Data
import os
import pandas as pd
import geopandas as gpd
import earthaccess

def process_site(site_name, polygon_path, out_dir, start_date="2025-05-01", end_date="2025-06-30", save_csv=False):
    """Search tiled ECOSTRESS and EMIT granules over a site and list their URLs.

    For a given site:
    - Loads the polygon and takes its bounding box
    - Searches ECOSTRESS and EMIT granules inside that box and date range
    - Filters the granule links to the relevant file types
    - Saves the resulting URLs to ``<site_name>_granules.txt`` in ``out_dir``
      (and, when ``save_csv`` is true, to ``<site_name>_granules.csv`` as well)

    Args:
        site_name: Label used in the printed summary and the output file names.
        polygon_path: Path of a vector file readable by ``geopandas.read_file``.
        out_dir: Folder for the output files; created if it does not exist.
        start_date: First day of the search window.
        end_date: Last day of the search window.
        save_csv: When true, also write the URL list as a one-column CSV and
            print a confirmation line. Off by default.

    Returns:
        ``(eco_granules, emit_granules)``: the search results split by
        collection short name.
    """
    # Load polygon
    poly = gpd.read_file(polygon_path)
    minx, miny, maxx, maxy = poly.total_bounds

    # Set up temporal range and concept IDs
    date_range = (start_date, end_date)
    concept_ids = [
        "C2408750690-LPCLOUD",  # EMIT L2A Reflectance
        "C2076090826-LPCLOUD"   # ECOSTRESS L2T LST/Emissivity Tiled
    ]

    # Search for granules in bounding box
    print(f" Date range searched: {start_date} to {end_date}")

    results = earthaccess.search_data(
        concept_id=concept_ids,
        bounding_box=(minx, miny, maxx, maxy),
        temporal=date_range,
        cloud_hosted=True
    )

    # Split ECOSTRESS and EMIT
    emit_granules = [g for g in results if g["umm"]["CollectionReference"]["ShortName"] == "EMITL2ARFL"]
    eco_granules  = [g for g in results if g["umm"]["CollectionReference"]["ShortName"] == "ECO_L2T_LSTE"]

    # Filter by file types (str.endswith accepts a tuple of suffixes)
    desired_ecostress_assets = ('_LST.tif', '_QC.tif', '_cloud.tif')
    desired_emit_extensions = ('.nc',)

    filtered_urls = []

    for g in eco_granules:
        for url in g.data_links(access="https"):
            if url.endswith(desired_ecostress_assets):
                filtered_urls.append(url)

    for g in emit_granules:
        for url in g.data_links(access="https"):
            if url.endswith(desired_emit_extensions):
                filtered_urls.append(url)

    # Make output folder if needed
    os.makedirs(out_dir, exist_ok=True)

    # Save to .txt, one URL per line
    txt_path = os.path.join(out_dir, f"{site_name}_granules.txt")
    with open(txt_path, "w") as f:
        for url in filtered_urls:
            f.write(url + "\n")

    print(f"🔍 {site_name} search:")
    print(f"   ECOSTRESS granules found: {len(eco_granules)}")
    print(f"   EMIT granules found:     {len(emit_granules)}")
    print(f"   Total granules found:     {len(emit_granules + eco_granules)}")

    # Save to .csv only on request
    if save_csv:
        df = pd.DataFrame({'url': filtered_urls})
        df.to_csv(os.path.join(out_dir, f"{site_name}_granules.csv"), index=False)
        print(f"✅ {site_name}: Found {len(filtered_urls)} granule files. Saved to {out_dir}")

    return eco_granules, emit_granules





In [None]:
#Code for Swath Data
import os
import pandas as pd
import geopandas as gpd
import earthaccess

def process_site(site_name, polygon_path, out_dir, start_date="2025-04-01", end_date="2025-05-31"):
    """Search swath ECOSTRESS and EMIT granules over a site and list their URLs.

    Steps:
    - read the polygon file and take its bounding box
    - search both collections inside that box and date range
    - keep only the links with the wanted file extensions
    - write the URLs to ``<site_name>_granules.txt`` and
      ``<site_name>swath_granules.csv`` inside ``out_dir``

    Args:
        site_name: Label used in the printed summary and the output file names.
        polygon_path: Path of a vector file readable by ``geopandas.read_file``.
        out_dir: Folder for the output files; created if it does not exist.
        start_date: First day of the search window.
        end_date: Last day of the search window.

    Returns:
        ``(eco_granules, emit_granules)``: the search results split by the
        prefix of their collection short name.
    """
    def short_name(granule):
        return granule["umm"]["CollectionReference"]["ShortName"]

    def matching_links(granules, suffixes):
        # Every https link of every granule that ends in one of the suffixes.
        return [
            link
            for granule in granules
            for link in granule.data_links(access="https")
            if link.endswith(suffixes)
        ]

    # Bounding box of the site polygon
    site_polygon = gpd.read_file(polygon_path)
    minx, miny, maxx, maxy = site_polygon.total_bounds

    # Search window and collections
    date_range = (start_date, end_date)
    concept_ids = [
        "C2408750690-LPCLOUD",   # EMIT L2A Reflectance (swath)
        "C2076114664-LPCLOUD"    # ECOSTRESS L2 Swath LSTE
    ]

    print(f" Date range searched: {start_date} to {end_date}")

    swath_results = earthaccess.search_data(
        concept_id=concept_ids,
        bounding_box=(minx, miny, maxx, maxy),
        temporal=date_range,
        cloud_hosted=True
    )

    print(f"Total SWATH granules found: {len(swath_results)}")

    # Separate the two missions by collection short name
    emit_granules = [g for g in swath_results if short_name(g).startswith("EMIT")]
    eco_granules = [g for g in swath_results if short_name(g).startswith("ECO_L2")]

    # Wanted file types; ECOSTRESS links come first in the output list
    desired_swath_ecostress_extensions = ('.h5', '.nc', '.hdf5')
    desired_swath_emit_extensions = ('.nc',)

    filtered_urls = matching_links(eco_granules, desired_swath_ecostress_extensions)
    filtered_urls += matching_links(emit_granules, desired_swath_emit_extensions)

    os.makedirs(out_dir, exist_ok=True)

    # Plain-text list, one URL per line
    txt_path = os.path.join(out_dir, f"{site_name}_granules.txt")
    with open(txt_path, "w") as f:
        f.writelines(url + "\n" for url in filtered_urls)

    print(f"🔍 {site_name} search:")
    print(f"   ECOSTRESS granules found: {len(eco_granules)}")
    print(f"   EMIT granules found:     {len(emit_granules)}")
    print(f"   Total granules found:     {len(emit_granules) + len(eco_granules)}")

    # Same list as a one-column CSV
    csv_path = os.path.join(out_dir, f"{site_name}swath_granules.csv")
    pd.DataFrame({'url': filtered_urls}).to_csv(csv_path, index=False)

    print(f"✅ {site_name}: Found {len(filtered_urls)} granule files. Saved to {out_dir}")

    return eco_granules, emit_granules



In [None]:
# Update these with your actual file paths. These calls work with either
# process_site function above: whichever definition cell you ran last (tiled
# or swath) is the one that gets called, and therefore the type of data searched.
# Start and end dates are not passed here, so that function's defaults apply.
process_site(
    site_name="LCDM",
    polygon_path="/Users/kylamonique/Desktop/JPLFiles/SpectralEvolution/GIS/Waypoints/LCDMQuadrat.geojson",
    out_dir="/Users/kylamonique/Desktop/JPLFiles/data/LCDM"
)

process_site(
    site_name="SD",
    polygon_path="/Users/kylamonique/Desktop/JPLFiles/SpectralEvolution/GIS/Waypoints/SDQuadrat.geojson",
    out_dir="/Users/kylamonique/Desktop/JPLFiles/data/SD"
)

process_site(
    site_name="PV",
    polygon_path="/Users/kylamonique/Desktop/JPLFiles/SpectralEvolution/GIS/Waypoints/PVQuadrat.geojson",
    out_dir="/Users/kylamonique/Desktop/JPLFiles/data/PV"
)


 Date range searched: 2025-05-01 to 2025-06-30
🔍 SD search:
   ECOSTRESS granules found: 30
   EMIT granules found:     1
   Total granules found:     31
✅ SD: Found 93 granule files. Saved to /Users/kylamonique/Desktop/JPLFiles/data/SD


([Collection: {'ShortName': 'ECO_L2T_LSTE', 'Version': '002'}
  Spatial coverage: {'HorizontalSpatialDomain': {'Geometry': {'BoundingRectangles': [{'WestBoundingCoordinate': -118.07619650467669, 'EastBoundingCoordinate': -116.89542756500555, 'NorthBoundingCoordinate': 33.439530979212954, 'SouthBoundingCoordinate': 32.444992583931025}]}}}
  Temporal coverage: {'RangeDateTime': {'BeginningDateTime': '2025-05-04T13:37:49.647Z', 'EndingDateTime': '2025-05-04T13:38:41.617Z'}}
  Size(MB): 0.8
  Data: ['https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/ECO_L2T_LSTE.002/ECOv002_L2T_LSTE_38704_013_11SMS_20250504T133749_0713_01/ECOv002_L2T_LSTE_38704_013_11SMS_20250504T133749_0713_01_water.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/ECO_L2T_LSTE.002/ECOv002_L2T_LSTE_38704_013_11SMS_20250504T133749_0713_01/ECOv002_L2T_LSTE_38704_013_11SMS_20250504T133749_0713_01_cloud.tif', 'https://data.lpdaac.earthdatacloud.nasa.gov/lp-prod-protected/ECO_L2T_LSTE.002/ECOv002

# Optional: Photo Conversion (HEIC to JPG)

If you want to analyze the images you have collected and compare them to your in situ spectra, it is ideal to have them processed as JPG or JPEG files. If your images are in HEIC format, use the code below to convert them.

In [None]:
import os
from PIL import Image
import pillow_heif

# Register HEIF opener so that Image.open can read .heic files
pillow_heif.register_heif_opener()

# Set paths
source_folder = "/Users/kylamonique/Desktop/JPLFiles/SpectralEvolution/FieldData/PV0504/Quadrat Images" #Folder for HEIC inputs
destination_folder = "/Users/kylamonique/Desktop/JPLFiles/SpectralEvolution/FieldData/PV0504/Quadrat Images/JPG" #Folder for JPG outputs
os.makedirs(destination_folder, exist_ok=True)

# Convert all HEIC files
for filename in os.listdir(source_folder):
    if not filename.lower().endswith(".heic"):
        continue

    heic_path = os.path.join(source_folder, filename)
    jpg_filename = os.path.splitext(filename)[0] + ".jpg"
    jpg_path = os.path.join(destination_folder, jpg_filename)

    # One unreadable photo should not stop the batch, so failures are
    # reported and the loop moves on to the next file.
    try:
        # The context manager closes the source file once it is converted.
        with Image.open(heic_path) as image:
            # Carry the EXIF block over when there is one; otherwise the
            # JPG copy loses the GPS position recorded in the photo.
            save_kwargs = {}
            exif_bytes = image.info.get("exif")
            if exif_bytes:
                save_kwargs["exif"] = exif_bytes

            # JPEG cannot store an alpha channel, so convert to RGB first.
            image.convert("RGB").save(jpg_path, "JPEG", **save_kwargs)
        print(f"Converted: {filename} -> {jpg_filename}")
    except Exception as e:
        print(f"Failed to convert {filename}: {e}")
