In [29]:
#csv summarizer
import pandas as pd

# Function to analyze CSV file
def analyze_csv(file_path):
    """Print every column's dtype and the row/column counts of a CSV file.

    Args:
        file_path: Path of the CSV file; handed unchanged to ``pd.read_csv``.

    Returns:
        None. The summary is written to standard output only.
    """
    table = pd.read_csv(file_path)
    n_rows, n_cols = table.shape

    # One row per CSV column: its name and the dtype pandas inferred for it.
    schema = table.dtypes.rename_axis("Column Name").reset_index(name="Data Type")

    print("Column Names and Data Types:")
    print(schema.to_string(index=False))
    print("\nTotal Rows:", n_rows)
    print("Total Columns:", n_cols)

# Example usage: summarise one CSV file.
# The path is relative, so it is resolved against the kernel's current working
# directory; pd.read_csv raises if the file is not found there.
file_path = "Dataset/raw_cleaned/LarchCN_256x256_2022_monthly_patch_0_0_cleaned.csv"  # Change to your CSV file path
analyze_csv(file_path)


Column Names and Data Types:
Column Name Data Type
        AOT     int64
         B1     int64
        B11     int64
        B12     int64
         B2     int64
         B3     int64
         B4     int64
         B5     int64
         B6     int64
         B7     int64
         B8     int64
        B8A     int64
         B9     int64
        EVI   float64
       NDVI   float64
       NDWI   float64
      TCI_B     int64
      TCI_G     int64
      TCI_R     int64
        WVP     int64
 image_date    object
  longitude   float64
   latitude   float64

Total Rows: 788480
Total Columns: 23


In [26]:
import pandas as pd

# Load the raw ground-truth table
df = pd.read_csv('Dataset/groundtruth_raw/Original Groundtruth/CQ_HBY_GroundTruth_Cleaned0213_Cat7.csv')

# Remove the 'reference' column if it exists.
# The dropped name must match the name tested in the guard; dropping a
# different label (previously 'ref') raises KeyError whenever the guard is true.
if 'reference' in df.columns:
    df = df.drop(columns=['reference'])

# Rename columns to the names used by the rest of the pipeline:
# 'forest' -> 'Class', 'lon' -> 'longitude', 'lat' -> 'latitude'.
# rename() silently ignores keys that are not present.
rename_map = {
    'forest': 'Class',
    'lon': 'longitude',
    'lat': 'latitude'
}
df = df.rename(columns=rename_map)

# Ensure 'Class' is a string
if 'Class' in df.columns:
    df['Class'] = df['Class'].astype(str)

# Convert 'latitude' and 'longitude' columns to numeric (if they exist);
# values that cannot be parsed become NaN because of errors='coerce'.
if 'latitude' in df.columns:
    df['latitude'] = pd.to_numeric(df['latitude'], errors='coerce')
if 'longitude' in df.columns:
    df['longitude'] = pd.to_numeric(df['longitude'], errors='coerce')

# Save the cleaned CSV to a new file (written to the current working directory)
df.to_csv('cleaned.csv', index=False)


In [27]:
import pandas as pd

# Read the cleaned ground-truth table and tally how many rows carry each label.
df = pd.read_csv('Dataset/groundtruth_raw/Original Groundtruth/0213-blind-cleaned.csv')
class_counts = df['Class'].value_counts()

# value_counts() lists the labels from most to least frequent.
print("Unique Class Types and their counts:")
print(class_counts)


Unique Class Types and their counts:
Class
larch_JP          1340
deci_broad        1212
ever_coni          819
mix_coni_broad     789
larch_CN           229
shrubland          199
ever_broad          39
Name: count, dtype: int64


GEE code for larch_CN
/***************************************
 * GEE Script: Sample a 2×2 patch around
 * "larch_CN" ground‐truth points
 * with a small buffer + earliest monthly
 * S2 images, exporting to a single CSV.
 ***************************************/

// -------------------- 1. Load Ground-Truth Asset --------------------
// Asset ID of the uploaded ground-truth table; update it to your own asset.
// Example columns: ["Class","longitude","latitude"]
var groundTruthAsset = "projects/ee-my-josh-ai/assets/cleaned";
var allPoints = ee.FeatureCollection(groundTruthAsset);
print("DEBUG: # of allPoints:", allPoints.size());

// -------------------- 2. Filter to larch_CN Only --------------------
var larchCN = allPoints.filter(ee.Filter.eq("Class", "larch_CN"));
print("DEBUG: # of larch_CN points:", larchCN.size());

// Show the selected points on the map.
Map.addLayer(larchCN, {color: "red"}, "larch_CN Points");

// -------------------- 3. Buffer Each larch_CN Point (2×2 patch) --------------------
// Each point becomes the bounding rectangle of its 10 m buffer (~20×20 m).
// maxError is passed to both geometry operations to avoid geometry errors.
var bufferDist = 10; // 10 m
var larchBuffered = larchCN.map(function(point) {
  var disc = point.geometry().buffer({distance: bufferDist, maxError: 1});
  var patch = disc.bounds({maxError: 1});
  return point.setGeometry(patch);
});
Map.addLayer(larchBuffered, {color: "blue"}, "Buffered larch_CN patches");

// -------------------- 4. Define Date Range & S2 Collection --------------------
// filterDate() includes the start date but excludes the end date, so the
// range has to end on 2023-01-01 for images acquired on 2022-12-31 to be kept.
var startDate = "2022-01-01";
var endDate   = "2023-01-01";

// Sentinel-2 surface reflectance, restricted to the study year, to tiles that
// touch the buffered patches, and to scenes with < 20 % cloudy pixels.
var s2 = ee.ImageCollection("COPERNICUS/S2_SR_HARMONIZED")
            .filterDate(startDate, endDate)
            .filterBounds(larchBuffered.geometry())
            .filter(ee.Filter.lt("CLOUDY_PIXEL_PERCENTAGE", 20));

// (A) Optional cloud mask:
function maskClouds(image) {
  // Keep only the pixels whose QA60 cloud bit (10) and cirrus bit (11) are both 0.
  var qa60 = image.select("QA60");
  var noCloud = qa60.bitwiseAnd(1 << 10).eq(0);
  var noCirrus = qa60.bitwiseAnd(1 << 11).eq(0);
  return image.updateMask(noCloud.and(noCirrus));
}

// (B) Compute NDVI, EVI, NDWI:
function addIndices(image) {
  // Adds NDVI, EVI and NDWI bands to a Sentinel-2 SR image and returns it.

  // NDVI and NDWI are pure band ratios, so the band scaling cancels out.
  var ndvi = image.normalizedDifference(["B8", "B4"]).rename("NDVI");

  // EVI is NOT scale-invariant: its "+1" term assumes reflectance in 0-1.
  // S2_SR_HARMONIZED stores reflectance multiplied by 10000, so the bands are
  // rescaled first; without this the constant is negligible and EVI is wrong.
  var reflectanceScale = 10000;
  var evi = image.expression(
    "2.5*((NIR-RED)/(NIR+6*RED-7.5*BLUE+1))", {
      "NIR": image.select("B8").divide(reflectanceScale),
      "RED": image.select("B4").divide(reflectanceScale),
      "BLUE": image.select("B2").divide(reflectanceScale)
    }
  ).rename("EVI");

  var ndwi = image.expression(
    "(GREEN - NIR)/(GREEN + NIR)", {
      "GREEN": image.select("B3"),
      "NIR": image.select("B8")
    }
  ).rename("NDWI");

  return image.addBands([ndvi, evi, ndwi]);
}

// (C) Select final bands:
function selectBands(image) {
  // Output band order: raw S2 bands, true-colour bands, then derived indices.
  var wanted = [].concat(
    ["B1","B2","B3","B4","B5","B6","B7","B8","B8A",
     "B9","B11","B12","AOT","WVP"],
    ["TCI_R","TCI_G","TCI_B"],
    ["NDVI","EVI","NDWI"]
  );
  return image.select(wanted);
}

// Per image: mask clouds -> add indices -> keep the final bands -> fill the
// masked pixels with the sentinel value -999.
var s2Processed = s2.map(function(image) {
  var prepared = selectBands(addIndices(maskClouds(image)));
  return prepared.unmask(-999);
});

// -------------------- 5. Sample Earliest Monthly Image for Each Point --------------------
var months = ee.List.sequence(1, 12);

// One FeatureCollection per month (points × that month's earliest image);
// the list is merged into a single collection afterwards.
var allSamples = months.map(function(m) {
  // Earliest image of calendar month m.
  var inMonth = ee.Filter.calendarRange(m, m, "month");
  var monthlyImages = s2Processed.filter(inMonth).sort("system:time_start");
  var firstImage = ee.Image(monthlyImages.first());

  // Sample every buffered patch and tag each sample with the date and month.
  var tagged = firstImage.sampleRegions({
    collection: larchBuffered,    // each geometry = small 2×2 patch
    scale: 10,
    projection: "EPSG:4326",
    geometries: true // preserve lat/lon geometry
  }).map(function(sample) {
    return sample.set({
      "image_date": firstImage.date().format("YYYY-MM-dd"),
      "month": m
    });
  });

  // Months without any image contribute an empty collection.
  var hasImage = monthlyImages.size().gt(0);
  return ee.FeatureCollection(
    ee.Algorithms.If(hasImage, tagged, ee.FeatureCollection([]))
  );
});

// Flatten into one FeatureCollection:
// allSamples is a list with one FeatureCollection per month; wrapping it in a
// FeatureCollection and calling flatten() merges them into a single table.
var combinedSamples = ee.FeatureCollection(allSamples).flatten();
print("DEBUG: Combined samples count:", combinedSamples.size());

// -------------------- 6. Export to Drive --------------------
// Single CSV for all "larch_CN" points across 12 months:
// `description` names the export task; `folder` is the target Drive folder.
Export.table.toDrive({
  collection: combinedSamples,
  description: "larchCN_smallPatch_timeSeries",
  fileFormat: "CSV",
  folder: "my_groundtruth_exports"
});

// This only confirms the script ran; the export itself is started from the Tasks tab.
print("Script finished. Check Tasks tab for your export.");


Cleaning the time-series data, specifically extracting longitude/latitude from the `.geo` column


In [10]:
import pandas as pd
import re
import os
import glob
import csv
from tqdm import tqdm


# -------------------------------------------------------------------
# DO NOT MODIFY THE FOLLOWING clean_csv FUNCTION (including its comments)
# -------------------------------------------------------------------
def clean_csv(input_path):
    """Clean one raw time-series CSV and write ``<name>_cleaned.csv`` next to it.

    Steps: read ``input_path``; parse a coordinate pair out of the ``.geo``
    column into ``longitude`` / ``latitude``; drop ``system:index``, ``month``
    and ``.geo``; coerce the 20 feature columns to numeric; forward-fill
    missing feature values; save the result.

    Args:
        input_path: Path of the raw CSV. It must contain ``.geo``,
            ``system:index``, ``month`` and every feature column listed in
            the body, otherwise pandas raises ``KeyError``.

    Returns:
        The path of the cleaned CSV that was written.
    """
    # Read the CSV file
    df = pd.read_csv(input_path)

    # Extract latitude and longitude from .geo column using regex
    def extract_coordinates(geo_str):
        # NOTE(review): the pattern only matches numbers that contain a decimal
        # point; integer-valued or exponent-notation coordinates give (None, None).
        match = re.search(r'\[([-+]?\d*\.\d+),\s*([-+]?\d*\.\d+)\]', geo_str)
        if match:
            return float(match.group(1)), float(match.group(2))
        return None, None

    # The first captured number becomes longitude, the second latitude
    # (assumes .geo holds a [lon, lat] pair — TODO confirm against the export).
    df['longitude'], df['latitude'] = zip(*df['.geo'].apply(extract_coordinates))

    # Drop unnecessary columns
    df = df.drop(columns=['system:index', 'month', '.geo'])

    # Ensure all feature columns are numeric
    feature_cols = [
        "AOT", "B1", "B11", "B12", "B2", "B3", "B4", "B5",
        "B6", "B7", "B8", "B8A", "B9", "EVI", "NDVI", "NDWI",
        "TCI_B", "TCI_G", "TCI_R", "WVP"
    ]
    for col in feature_cols:
        df[col] = pd.to_numeric(df[col], errors='coerce')

    # Handle missing values (forward fill)
    # NOTE(review): ffill runs down the whole frame without grouping, so a gap
    # is filled from the previous row regardless of which location or date that
    # row belongs to; gaps in the first row(s) remain NaN.
    df[feature_cols] = df[feature_cols].ffill()

    # Save the cleaned CSV
    output_path = os.path.splitext(input_path)[0] + "_cleaned.csv"
    df.to_csv(output_path, index=False)
    print(f"Cleaned CSV saved to: {output_path}")
    return output_path

# -------------------------------------------------------------------
# Function to Detect Non-Numeric Columns
# -------------------------------------------------------------------
def detect_non_numeric_columns(csv_file):
    """
    Report the columns of csv_file that contain values which are not numbers.

    Every column is converted with pd.to_numeric(errors='coerce'); an entry is
    counted as problematic when it was present in the file but turned into NaN
    by the conversion. Columns without such entries are omitted.

    Returns a dict mapping column name -> number of problematic entries
    (or -> the error message, if the conversion itself raised).
    """
    frame = pd.read_csv(csv_file, engine='python', on_bad_lines='skip', quoting=csv.QUOTE_NONE)
    report = {}
    for name, values in frame.items():
        try:
            as_numbers = pd.to_numeric(values, errors='coerce')
            # NaN after coercion but not NaN before -> the text was not numeric.
            n_bad = (as_numbers.isna() & values.notna()).sum()
        except Exception as err:
            report[name] = str(err)
            continue
        if n_bad > 0:
            report[name] = n_bad
    return report

# -------------------------------------------------------------------
# Main Processing: Iterate Through All CSV Files in the Google Drive Folder,
# Clean Each One, and Detect Non-Numeric Columns.
# -------------------------------------------------------------------
folder_path = "Dataset/groundtruth_raw/Raw_time_series"  # Update if necessary.
# Every CSV currently in the folder (the list is fixed before the loop starts,
# so files written during the loop are not picked up in this run).
csv_files = glob.glob(os.path.join(folder_path, "*.csv"))
print(f"Found {len(csv_files)} CSV files in {folder_path}")

for csv_file in tqdm(csv_files, desc="Processing CSV files"):
    # Files that already end with _cleaned.csv are inspected as they are;
    # everything else is cleaned first.
    cleaned_path = csv_file if csv_file.endswith("_cleaned.csv") else clean_csv(csv_file)

    print(f"\nDetecting non-numeric columns in: {cleaned_path}")
    problems = detect_non_numeric_columns(cleaned_path)
    if not problems:
        print("All columns are numeric (or properly handled).")
        continue
    print("Columns with non-numeric issues:")
    for col, count in problems.items():
        print(f"  {col}: {count}")

Found 5 CSV files in Dataset/groundtruth_raw/Raw_time_series


Processing CSV files:   0%|          | 0/5 [00:00<?, ?it/s]

Cleaned CSV saved to: Dataset/groundtruth_raw/Raw_time_series\deci_broad_timeSeries_raw_cleaned.csv

Detecting non-numeric columns in: Dataset/groundtruth_raw/Raw_time_series\deci_broad_timeSeries_raw_cleaned.csv


Processing CSV files:  20%|██        | 1/5 [00:00<00:02,  1.73it/s]

Columns with non-numeric issues:
  Class: 21341
  image_date: 21341
Cleaned CSV saved to: Dataset/groundtruth_raw/Raw_time_series\ever_coni_timeSeries_raw_cleaned.csv

Detecting non-numeric columns in: Dataset/groundtruth_raw/Raw_time_series\ever_coni_timeSeries_raw_cleaned.csv


Processing CSV files:  40%|████      | 2/5 [00:01<00:02,  1.25it/s]

Columns with non-numeric issues:
  Class: 37067
  image_date: 37067
Cleaned CSV saved to: Dataset/groundtruth_raw/Raw_time_series\larchCN_timeSeries_raw_cleaned.csv

Detecting non-numeric columns in: Dataset/groundtruth_raw/Raw_time_series\larchCN_timeSeries_raw_cleaned.csv


Processing CSV files:  60%|██████    | 3/5 [00:01<00:01,  1.82it/s]

Columns with non-numeric issues:
  Class: 10340
  image_date: 10340
Cleaned CSV saved to: Dataset/groundtruth_raw/Raw_time_series\larchJP_timeSeries_raw_cleaned.csv

Detecting non-numeric columns in: Dataset/groundtruth_raw/Raw_time_series\larchJP_timeSeries_raw_cleaned.csv


Processing CSV files:  80%|████████  | 4/5 [00:03<00:00,  1.08it/s]

Columns with non-numeric issues:
  Class: 61542
  image_date: 61542
Cleaned CSV saved to: Dataset/groundtruth_raw/Raw_time_series\shrubland_timeSeries_raw_cleaned.csv

Detecting non-numeric columns in: Dataset/groundtruth_raw/Raw_time_series\shrubland_timeSeries_raw_cleaned.csv


Processing CSV files: 100%|██████████| 5/5 [00:03<00:00,  1.43it/s]

Columns with non-numeric issues:
  Class: 9020
  image_date: 9020





Concatenate the cleaned per-class files into a single CSV


In [13]:
import pandas as pd
import os

def concatenate_groundtruth_files(csv_files=None, output_path=None):
    """Stack several cleaned time-series CSVs into one CSV file.

    Args:
        csv_files: Iterable of CSV paths to concatenate, in order. When
            omitted, the five per-class cleaned files of this project are used.
        output_path: Where the combined CSV is written. When omitted, the
            project's ``groundtruth_cleaned.csv`` location is used.

    Returns:
        The combined ``pandas.DataFrame`` that was written to ``output_path``.
    """
    # 1. List of input CSV file paths (project defaults unless supplied)
    if csv_files is None:
        csv_files = [
            r"C:\Users\jmm267\Downloads\Binbin\Dataset\groundtruth_raw\time_series_cleaned\deci_broad_timeSeries_raw_cleaned.csv",
            r"C:\Users\jmm267\Downloads\Binbin\Dataset\groundtruth_raw\time_series_cleaned\ever_coni_timeSeries_raw_cleaned.csv",
            r"C:\Users\jmm267\Downloads\Binbin\Dataset\groundtruth_raw\time_series_cleaned\larchCN_timeSeries_raw_cleaned.csv",
            r"C:\Users\jmm267\Downloads\Binbin\Dataset\groundtruth_raw\time_series_cleaned\larchJP_timeSeries_raw_cleaned.csv",
            r"C:\Users\jmm267\Downloads\Binbin\Dataset\groundtruth_raw\time_series_cleaned\shrubland_timeSeries_raw_cleaned.csv"
        ]

    # 2. Output CSV file path (project default unless supplied)
    if output_path is None:
        output_path = r"C:\Users\jmm267\Downloads\Binbin\Dataset\groundtruth_raw\Raw_time_series\groundtruth_cleaned.csv"

    all_dfs = []

    # 3. Read each CSV and store in a list.
    # on_bad_lines='skip' drops malformed rows without raising, so compare the
    # printed shapes with the expected row counts.
    for path in csv_files:
        print(f"Reading: {path}")
        df = pd.read_csv(path, engine='python', on_bad_lines='skip')
        print(f"  Shape: {df.shape}, Columns: {df.columns.to_list()}")
        all_dfs.append(df)

    # 4. Concatenate all DataFrames into one (fresh 0..n-1 index)
    combined_df = pd.concat(all_dfs, ignore_index=True)
    print(f"\nCombined DataFrame shape: {combined_df.shape}")
    print("Combined DataFrame columns:", combined_df.columns.to_list())

    # 5. Save the combined DataFrame to a single CSV file
    combined_df.to_csv(output_path, index=False)
    print(f"Combined CSV saved to: {output_path}")
    return combined_df

# Run the concatenation only when this code is executed directly
# (not when it is imported as a module).
if __name__ == "__main__":
    concatenate_groundtruth_files()


Reading: C:\Users\jmm267\Downloads\Binbin\Dataset\groundtruth_raw\time_series_cleaned\deci_broad_timeSeries_raw_cleaned.csv
  Shape: (21341, 24), Columns: ['AOT', 'B1', 'B11', 'B12', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B9', 'Class', 'EVI', 'NDVI', 'NDWI', 'TCI_B', 'TCI_G', 'TCI_R', 'WVP', 'image_date', 'longitude', 'latitude']
Reading: C:\Users\jmm267\Downloads\Binbin\Dataset\groundtruth_raw\time_series_cleaned\ever_coni_timeSeries_raw_cleaned.csv
  Shape: (37067, 24), Columns: ['AOT', 'B1', 'B11', 'B12', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B9', 'Class', 'EVI', 'NDVI', 'NDWI', 'TCI_B', 'TCI_G', 'TCI_R', 'WVP', 'image_date', 'longitude', 'latitude']
Reading: C:\Users\jmm267\Downloads\Binbin\Dataset\groundtruth_raw\time_series_cleaned\larchCN_timeSeries_raw_cleaned.csv
  Shape: (10340, 24), Columns: ['AOT', 'B1', 'B11', 'B12', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B9', 'Class', 'EVI', 'NDVI', 'NDWI', 'TCI_B', 'TCI_G', 'TCI_R', 'WVP', 'image_date', 

Check the quality of the combined CSV: look for errors or wrongful imputations

In [14]:
import pandas as pd
import numpy as np

def check_groundtruth_cleaned(csv_path):
    """Print a sanity report for the combined ground-truth CSV.

    The report covers the frame's shape and dtypes, which columns are
    numeric, how the columns compare with the expected schema, and which
    class labels occur.

    Args:
        csv_path: Path of the CSV to inspect.

    Returns:
        None. Everything is written to standard output.
    """
    print(f"Reading CSV from: {csv_path}")
    # on_bad_lines='skip' drops malformed rows silently; the shape printed
    # below is therefore the number of rows that could be parsed.
    df = pd.read_csv(csv_path, engine='python', on_bad_lines='skip')

    # 1. Print basic info
    print("\n--- Basic Info ---")
    print(f"DataFrame shape: {df.shape}")
    print(f"Columns: {df.columns.tolist()}")
    # info() prints dtypes & non-null counts itself and returns None, so it
    # must not be wrapped in print() (that would emit a stray "None" line).
    df.info()

    # 2. Identify numeric vs. non-numeric columns
    print("\n--- Numeric vs Non-Numeric Columns ---")
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            print(f"Column '{col}' is numeric.")
        else:
            print(f"Column '{col}' is non-numeric.")

    # 3. Compare the columns with the expected time-series schema
    expected_cols = {
        "AOT","B1","B11","B12","B2","B3","B4","B5","B6","B7","B8","B8A","B9",
        "EVI","NDVI","NDWI","TCI_B","TCI_G","TCI_R","WVP","image_date","longitude","latitude","Class"
    }
    found_cols = set(df.columns)
    missing = expected_cols - found_cols
    extra = found_cols - expected_cols

    print("\n--- Column Comparison ---")
    if missing:
        print(f"Missing columns (expected but not found): {missing}")
    else:
        print("No missing columns from the expected set.")
    if extra:
        print(f"Extra columns (found but not in expected set): {extra}")
    else:
        print("No extra columns beyond the expected set.")

    # 4. Check the 'Class' column if present
    if "Class" in df.columns:
        unique_classes = df["Class"].unique()
        print("\n--- Unique Classes ---")
        print(f"Found {len(unique_classes)} unique class(es): {unique_classes}")
    else:
        print("\nNo 'Class' column found in the DataFrame.")

    print("\n--- Done Checking Groundtruth Cleaned CSV ---")

# Run the check only when this code is executed directly.
if __name__ == "__main__":
    # Same path that concatenate_groundtruth_files() writes by default.
    csv_path = r"C:\Users\jmm267\Downloads\Binbin\Dataset\groundtruth_raw\Raw_time_series\groundtruth_cleaned.csv"
    check_groundtruth_cleaned(csv_path)


Reading CSV from: C:\Users\jmm267\Downloads\Binbin\Dataset\groundtruth_raw\Raw_time_series\groundtruth_cleaned.csv

--- Basic Info ---
DataFrame shape: (139310, 24)
Columns: ['AOT', 'B1', 'B11', 'B12', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B9', 'Class', 'EVI', 'NDVI', 'NDWI', 'TCI_B', 'TCI_G', 'TCI_R', 'WVP', 'image_date', 'longitude', 'latitude']
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 139310 entries, 0 to 139309
Data columns (total 24 columns):
 #   Column      Non-Null Count   Dtype  
---  ------      --------------   -----  
 0   AOT         139310 non-null  int64  
 1   B1          139310 non-null  int64  
 2   B11         139310 non-null  int64  
 3   B12         139310 non-null  int64  
 4   B2          139310 non-null  int64  
 5   B3          139310 non-null  int64  
 6   B4          139310 non-null  int64  
 7   B5          139310 non-null  int64  
 8   B6          139310 non-null  int64  
 9   B7          139310 non-null  int64  
 10  B8          139310

Transformer code adapted to our ground-truth data

In [15]:
import os
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm

# -------------------------------
# 1. Set the Device (CUDA)
# -------------------------------
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print("Using device:", device)
if device.type == 'cuda':
    # Only query the device name when a GPU was actually selected.
    print("GPU Name:", torch.cuda.get_device_name())

# -------------------------------
# 2. Dataset Class for Groundtruth CSV
# -------------------------------
class GroundtruthDataset(Dataset):
    """
    Time-series dataset built from a groundtruth CSV with columns:
      [AOT, B1, B11, B12, B2, B3, B4, B5, B6, B7, B8, B8A, B9,
       EVI, NDVI, NDWI, TCI_B, TCI_G, TCI_R, WVP,
       Class, image_date, longitude, latitude]

    Rows are grouped by an 'id' column; when the CSV has none, it is derived
    from (longitude, latitude) rounded to 6 decimals. Each group, ordered by
    'image_date', becomes one sample of shape (desired_length, n_features):
    short series are padded by repeating their last row, long ones are cut.

    For every sample the dataset also keeps the first row's
    (latitude, longitude) and 'Class' for later reference; these are *not*
    part of the Transformer input.
    """

    def __init__(self, csv_path, desired_length=12, feature_cols=None):
        print(f"\nReading CSV file: {csv_path}")
        self.df = pd.read_csv(csv_path, parse_dates=["image_date"])

        # Derive the grouping key from the coordinates when no 'id' is given.
        if 'id' not in self.df.columns:
            if not {'longitude', 'latitude'}.issubset(self.df.columns):
                raise ValueError("CSV must contain 'longitude' and 'latitude' columns.")
            lon_key = self.df['longitude'].round(6).astype(str)
            lat_key = self.df['latitude'].round(6).astype(str)
            self.df['id'] = lon_key + "_" + lat_key

        # Default: the 20 spectral / index features fed to the Transformer.
        self.feature_cols = feature_cols if feature_cols is not None else [
            "AOT","B1","B11","B12","B2","B3","B4","B5",
            "B6","B7","B8","B8A","B9","EVI","NDVI","NDWI",
            "TCI_B","TCI_G","TCI_R","WVP"
        ]
        self.desired_length = desired_length

        self.samples = []   # list of (tensor of shape (desired_length, D), dummy label)
        self.coords = []    # list of (latitude, longitude)
        self.classes = []   # list of class labels (str)

        # sort=False keeps the groups in order of first appearance in the CSV.
        for _, rows in self.df.groupby('id', sort=False):
            ordered = rows.sort_values('image_date')
            series = self._fit_length(
                ordered[self.feature_cols].to_numpy(dtype=float), desired_length
            )
            ts_tensor = torch.tensor(series, dtype=torch.float)

            # Reference metadata comes from the earliest row of the group.
            earliest = ordered.iloc[0]
            lat_0 = earliest['latitude']
            lon_0 = earliest['longitude']
            class_0 = str(earliest['Class'])

            # The label is a dummy 0 (unsupervised usage).
            self.samples.append((ts_tensor, 0))
            self.coords.append((lat_0, lon_0))
            self.classes.append(class_0)

        print(f"Constructed {len(self.samples)} time series samples from {csv_path}")

    @staticmethod
    def _fit_length(arr, length):
        """Return `arr` (T, D) cut to, or padded with its last row up to, `length` rows."""
        n_steps = arr.shape[0]
        if n_steps > length:
            return arr[:length]
        if n_steps < length:
            filler = np.repeat(arr[-1:], length - n_steps, axis=0)
            return np.concatenate([arr, filler], axis=0)
        return arr

    def __len__(self):
        return len(self.samples)

    def __getitem__(self, idx):
        series, dummy_label = self.samples[idx]
        return (
            series,
            torch.tensor(dummy_label, dtype=torch.long),
            self.coords[idx],
            self.classes[idx],
        )

# -------------------------------
# 3. Transformer Components
# -------------------------------
class SimplePosEncoding(nn.Module):
    """
    A simple positional encoding that adds the offset
    sin((t+1) * (i+1) / 30) to element (t, i) of every sequence, where t is
    the time step and i the feature index. Not the standard approach,
    but works for demonstration.

    The offsets are added out of place: the input tensor is left untouched
    and a new tensor is returned, so encoding the same tensor twice does not
    accumulate offsets.
    """
    def forward(self, x):
        """Return `x` plus the positional offsets.

        Args:
            x: Tensor of shape (B, T, D).

        Returns:
            A new tensor with the same shape, dtype and device as `x`.
        """
        _, T, D = x.shape
        # Build the whole (T, D) offset table at once instead of T*D scalar
        # writes. It is computed in float64 on the CPU and then cast, which
        # matches adding Python-float sines to each element.
        steps = torch.arange(1, T + 1, dtype=torch.float64)
        feats = torch.arange(1, D + 1, dtype=torch.float64)
        offsets = torch.sin(torch.outer(steps, feats) / 30.0)
        # Broadcasts over the batch dimension.
        return x + offsets.to(device=x.device, dtype=x.dtype)

class MultiHeadSelfAttention(nn.Module):
    """Scaled dot-product self-attention over the time axis with `num_heads` heads.

    Input and output both have shape (B, T, d_model); `d_model` must be
    divisible by `num_heads`.
    """

    def __init__(self, d_model, num_heads=1):
        super().__init__()
        assert d_model % num_heads == 0, "d_model must be divisible by num_heads"
        self.num_heads = num_heads
        self.head_dim = d_model // num_heads
        # Query / key / value projections followed by the output projection.
        self.q_linear = nn.Linear(d_model, d_model)
        self.k_linear = nn.Linear(d_model, d_model)
        self.v_linear = nn.Linear(d_model, d_model)
        self.out_linear = nn.Linear(d_model, d_model)

    def _split_heads(self, projected):
        """(B, T, d_model) -> (B, num_heads, T, head_dim)."""
        batch, steps, _ = projected.shape
        return projected.view(batch, steps, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, x):
        # x: (B, T, D)
        batch, steps, width = x.shape
        queries = self._split_heads(self.q_linear(x))
        keys = self._split_heads(self.k_linear(x))
        values = self._split_heads(self.v_linear(x))

        # Attention weights over the time steps: (B, num_heads, T, T)
        scale = math.sqrt(self.head_dim)
        weights = ((queries @ keys.transpose(-2, -1)) / scale).softmax(dim=-1)
        mixed = weights @ values  # (B, num_heads, T, head_dim)

        # Put the heads back side by side and project.
        merged = mixed.transpose(1, 2).contiguous().view(batch, steps, width)
        return self.out_linear(merged)

class TransformerBlock(nn.Module):
    """One post-norm encoder block: self-attention and a two-layer ReLU
    feed-forward network, each wrapped in a residual connection followed by
    LayerNorm. Input and output have shape (B, T, d_model).
    """

    def __init__(self, d_model, num_heads=1, ff_hidden=64):
        super().__init__()
        self.attn = MultiHeadSelfAttention(d_model, num_heads)
        self.norm1 = nn.LayerNorm(d_model)
        self.ff = nn.Sequential(
            nn.Linear(d_model, ff_hidden),
            nn.ReLU(),
            nn.Linear(ff_hidden, d_model),
        )
        self.norm2 = nn.LayerNorm(d_model)

    def forward(self, x):
        # x: (B, T, D)
        attended = self.norm1(x + self.attn(x))              # residual + norm around attention
        return self.norm2(attended + self.ff(attended))      # residual + norm around feed-forward

class MiniTransformerClassifier(nn.Module):
    """
    A minimal Transformer classifier:
      1. simple positional encoding,
      2. `num_layers` blocks of (multi-head self-attention + feed-forward),
      3. mean-pooling across the time dimension,
      4. a linear layer producing `n_classes` logits (2 by default, dummy usage).
    """

    def __init__(self, d_model, num_heads=1, ff_hidden=64, num_layers=2, n_classes=2):
        super().__init__()
        self.posenc = SimplePosEncoding()
        blocks = [TransformerBlock(d_model, num_heads, ff_hidden) for _ in range(num_layers)]
        self.layers = nn.ModuleList(blocks)
        self.classifier = nn.Linear(d_model, n_classes)

    def forward(self, x, return_latent=False):
        """Return the logits, plus the pooled latent vector when `return_latent` is True.

        x has shape (B, T, D); logits have shape (B, n_classes) and the
        latent vector has shape (B, d_model).
        """
        hidden = self.posenc(x)
        for block in self.layers:
            hidden = block(hidden)
        pooled = hidden.mean(dim=1)  # average over the time dimension
        logits = self.classifier(pooled)
        return (logits, pooled) if return_latent else logits

# -------------------------------
# 4. Training & Latent Extraction
# -------------------------------
def train_transformer_on_groundtruth(csv_path, epochs=50, save_folder="."):
    """
    Train a MiniTransformerClassifier on one groundtruth CSV, then save the
    model weights and a CSV of per-sample latent vectors.

    NOTE(review): every sample is trained against the constant dummy label 0,
    so the loss collapses to ~0 and "Dummy Acc" only measures how often the
    model predicts class 0. The real classes held by the dataset are not used
    as a training signal here; they are only copied into the output CSV.

    Args:
        csv_path:    path of the groundtruth CSV handed to GroundtruthDataset.
        epochs:      number of passes over the training split.
        save_folder: directory in which the "transformer_output_groundtruth"
                     sub-folder is created.

    Raises:
        ValueError: if the dataset built from csv_path contains no samples.

    Side effects:
        Writes "<base>_transformer.pth" and "<base>_latent_features.csv" into
        <save_folder>/transformer_output_groundtruth, where <base> is the CSV
        file name without its extension.
    """
    # Single source of truth for the sequence length requested from the dataset
    seq_len = 12

    print(f"\n--- Training on groundtruth CSV: {csv_path} ---")
    dataset = GroundtruthDataset(csv_path, desired_length=seq_len)
    ds_len = len(dataset)
    print("Dataset contains", ds_len, "samples.")
    if ds_len == 0:
        # Fail early with a clear message instead of an obscure error later on
        raise ValueError(f"No samples were constructed from {csv_path}; nothing to train on.")
    if ds_len < 2:
        print("Not enough samples to split. Using entire dataset for both train & test.")
        train_ds = dataset
        test_ds = dataset
    else:
        train_size = int(0.8 * ds_len)
        test_size = ds_len - train_size
        train_ds, test_ds = random_split(dataset, [train_size, test_size])

    train_loader = DataLoader(train_ds, batch_size=8, shuffle=True)
    test_loader = DataLoader(test_ds, batch_size=8, shuffle=False)

    # The model width equals the number of input feature columns
    d_model = len(dataset.feature_cols)
    print(f"Using d_model={d_model}, seq_len={seq_len}. Training for {epochs} epochs.")
    model = MiniTransformerClassifier(d_model=d_model, num_heads=1, ff_hidden=64, num_layers=2, n_classes=2)
    model.to(device)
    print("Model is on device:", next(model.parameters()).device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    print("Starting training...")
    for epoch in range(1, epochs+1):
        model.train()
        total_loss = 0.0
        for X, _, _, _ in tqdm(train_loader, desc=f"Epoch {epoch}/{epochs} training", leave=False):
            X = X.to(device)
            optimizer.zero_grad()
            logits = model(X)
            # Dummy label = 0 for every sample (see NOTE in the docstring)
            labels = torch.zeros(X.size(0), dtype=torch.long, device=device)
            loss = criterion(logits, labels)
            loss.backward()
            nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            optimizer.step()
            # Weight by batch size so the epoch average is per-sample
            total_loss += loss.item() * X.size(0)
        avg_loss = total_loss / len(train_ds)

        # Evaluate
        model.eval()
        test_loss = 0.0
        correct, total = 0, 0
        with torch.no_grad():
            for X, _, _, _ in tqdm(test_loader, desc=f"Epoch {epoch}/{epochs} testing", leave=False):
                X = X.to(device)
                logits = model(X)
                labels = torch.zeros(X.size(0), dtype=torch.long, device=device)
                loss = criterion(logits, labels)
                test_loss += loss.item() * X.size(0)
                preds = torch.argmax(logits, dim=1)
                correct += (preds == 0).sum().item()
                total += X.size(0)
        avg_test_loss = test_loss / len(test_ds)
        test_acc = correct/total if total>0 else 1.0
        print(f"Epoch {epoch}/{epochs}, Train Loss={avg_loss:.4f}, Test Loss={avg_test_loss:.4f}, Dummy Acc={test_acc:.2f}")

    # Output folder for weights & latents
    output_folder = os.path.join(save_folder, "transformer_output_groundtruth")
    os.makedirs(output_folder, exist_ok=True)

    # Save model weights. splitext strips only the final extension, so a file
    # name such as "gt.v2.csv" keeps its "gt.v2" stem.
    base_name = os.path.splitext(os.path.basename(csv_path))[0]
    save_path = os.path.join(output_folder, base_name + "_transformer.pth")
    torch.save(model.state_dict(), save_path)
    print("Model weights saved to:", save_path)

    # Extract latents for the entire dataset (train + test), in dataset order
    # so that rows line up with dataset.coords / dataset.classes
    model.eval()
    all_latents = []
    all_coords = dataset.coords
    all_classes = dataset.classes

    with torch.no_grad():
        loader = DataLoader(dataset, batch_size=16, shuffle=False)
        for X, _, _, _ in tqdm(loader, desc="Extracting latents", leave=True):
            X = X.to(device)
            _, latent = model(X, return_latent=True)
            all_latents.append(latent.cpu().numpy())
    all_latents = np.concatenate(all_latents, axis=0)
    print("Extracted latent shape:", all_latents.shape)

    # Build the final DataFrame: one integer-named column per latent
    # dimension (0..d_model-1), plus latitude / longitude / Class
    latent_df = pd.DataFrame(all_latents)
    lat_list = [coord[0] for coord in all_coords]
    lon_list = [coord[1] for coord in all_coords]

    latent_df["latitude"] = lat_list
    latent_df["longitude"] = lon_list
    latent_df["Class"] = all_classes

    # Save final CSV
    latent_csv_path = os.path.join(output_folder, base_name + "_latent_features.csv")
    latent_df.to_csv(latent_csv_path, index=False)
    print("Latent features with coordinates + Class saved to:", latent_csv_path)


# -------------------------------
# 5. Main Execution (Single CSV)
# -------------------------------
if __name__ == "__main__":
    # Example usage: train on a single groundtruth CSV for 50 epochs and write
    # the outputs below the current working directory.
    # NOTE(review): absolute, machine-specific Windows path; adjust before
    # running on another machine.
    csv_path = r"C:\Users\jmm267\Downloads\Binbin\Dataset\groundtruth_raw\groundtruth_cleaned_final.csv"
    train_transformer_on_groundtruth(csv_path, epochs=50, save_folder=".")


Using device: cuda
GPU Name: NVIDIA RTX 2000 Ada Generation

--- Training on groundtruth CSV: C:\Users\jmm267\Downloads\Binbin\Dataset\groundtruth_raw\groundtruth_cleaned_final.csv ---

Reading CSV file: C:\Users\jmm267\Downloads\Binbin\Dataset\groundtruth_raw\groundtruth_cleaned_final.csv
Constructed 16464 time series samples from C:\Users\jmm267\Downloads\Binbin\Dataset\groundtruth_raw\groundtruth_cleaned_final.csv
Dataset contains 16464 samples.
Using d_model=20, seq_len=12. Training for 50 epochs.
Model is on device: cuda:0
Starting training...


                                                                         

Epoch 1/50, Train Loss=0.0053, Test Loss=0.0000, Dummy Acc=1.00


                                                                        

Epoch 2/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                        

Epoch 3/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                        

Epoch 4/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                        

Epoch 5/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                        

Epoch 6/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                        

Epoch 7/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                        

Epoch 8/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                        

Epoch 9/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 10/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 11/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 12/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 13/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 14/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 15/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 16/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 17/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 18/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 19/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 20/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 21/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 22/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 23/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 24/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 25/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 26/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 27/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 28/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 29/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 30/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 31/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 32/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 33/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 34/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 35/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 36/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 37/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 38/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 39/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 40/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 41/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 42/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 43/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 44/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 45/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 46/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 47/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 48/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 49/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00


                                                                         

Epoch 50/50, Train Loss=0.0000, Test Loss=0.0000, Dummy Acc=1.00
Model weights saved to: .\transformer_output_groundtruth\groundtruth_cleaned_final_transformer.pth


Extracting latents: 100%|██████████| 1029/1029 [00:13<00:00, 75.48it/s]


Extracted latent shape: (16464, 20)
Latent features with coordinates + Class saved to: .\transformer_output_groundtruth\groundtruth_cleaned_final_latent_features.csv


Check the CSV quality of both the groundtruth latent features and the transformer (DEC-clustered) output

In [16]:
import pandas as pd
import os

# CSV files to inspect: the groundtruth latent features and the three
# DEC-clustered parts
file_paths = [
    r"C:\Users\jmm267\Downloads\Binbin\transformer_output_groundtruth\groundtruth_cleaned_final_latent_features.csv",
    r"C:\Users\jmm267\Downloads\Binbin\Dataset\transformer_output_latlong\DEC_clustered\combined_DEC_clustered_part1.csv",
    r"C:\Users\jmm267\Downloads\Binbin\Dataset\transformer_output_latlong\DEC_clustered\combined_DEC_clustered_part2.csv",
    r"C:\Users\jmm267\Downloads\Binbin\Dataset\transformer_output_latlong\DEC_clustered\combined_DEC_clustered_part3.csv"
]

# Running row count over every file that could be read
total_rows = 0

for file in file_paths:
    # Guard clause: report a missing file and move on to the next one
    if not os.path.exists(file):
        print(f"File not found: {file}")
        continue

    df = pd.read_csv(file)
    num_rows = len(df)

    # Per-file report: path, column dtypes, row count
    print(f"File: {file}")
    print("Columns and data types:")
    print(df.dtypes)
    print(f"Number of rows: {num_rows}")
    print("-" * 60)

    total_rows += num_rows

print(f"Total combined rows across all files: {total_rows}")


File: C:\Users\jmm267\Downloads\Binbin\transformer_output_groundtruth\groundtruth_cleaned_final_latent_features.csv
Columns and data types:
0            float64
1            float64
2            float64
3            float64
4            float64
5            float64
6            float64
7            float64
8            float64
9            float64
10           float64
11           float64
12           float64
13           float64
14           float64
15           float64
16           float64
17           float64
18           float64
19           float64
latitude     float64
longitude    float64
Class         object
dtype: object
Number of rows: 16464
------------------------------------------------------------
File: C:\Users\jmm267\Downloads\Binbin\Dataset\transformer_output_latlong\DEC_clustered\combined_DEC_clustered_part1.csv
Columns and data types:
0                float64
1                float64
2                float64
3                float64
4                float64
5         

Running the classifier

In [17]:
import os
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder

def main():
    """
    Train a RandomForest on the groundtruth latent features and use it to
    label the DEC-clustered latent features.

    Steps:
      1. read the groundtruth latent-feature CSV (columns "0".."19" + Class)
      2. encode Class and make a stratified 80/20 train/test split
      3. fit a RandomForestClassifier and print its test accuracy
      4. read and concatenate the three DEC-clustered CSV parts
      5. predict per-class probabilities and the most likely class
      6. write the DEC rows plus "prob_<class>" and "predicted_class"
         columns to "combined_DEC_clustered_with_classProbs.csv"

    Raises:
        KeyError: if an expected column is missing from either input.
    """
    ###############################################################################
    # 1. Load Groundtruth Latent Features
    ###############################################################################
    groundtruth_csv = r"C:\Users\jmm267\Downloads\Binbin\transformer_output_groundtruth\groundtruth_cleaned_final_latent_features.csv"
    print(f"Reading groundtruth CSV from: {groundtruth_csv}")
    df_gt = pd.read_csv(groundtruth_csv)
    print("Groundtruth shape:", df_gt.shape)
    print("Groundtruth columns:", df_gt.columns.tolist())

    # 20 latent features are columns 0..19
    feature_cols = [str(i) for i in range(20)]
    # The class label is in 'Class'
    # The lat/lon columns are 'latitude','longitude', but not used for training
    missing_gt = [c for c in feature_cols + ["Class"] if c not in df_gt.columns]
    if missing_gt:
        raise KeyError(f"Groundtruth CSV is missing expected columns: {missing_gt}")

    ###############################################################################
    # 2. Encode the Class & Prepare Train/Test
    ###############################################################################
    label_encoder = LabelEncoder()
    df_gt['class_label'] = label_encoder.fit_transform(df_gt['Class'])
    print("Classes found:", list(label_encoder.classes_))

    X = df_gt[feature_cols].values  # shape (N, 20)
    y = df_gt['class_label'].values
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42, stratify=y
    )
    print(f"Train size: {X_train.shape[0]}, Test size: {X_test.shape[0]}")

    ###############################################################################
    # 3. Train a RandomForest Classifier
    ###############################################################################
    print("Training RandomForestClassifier on groundtruth data...")
    clf = RandomForestClassifier(
        n_estimators=200,
        max_depth=None,
        random_state=42,
        n_jobs=-1
    )
    clf.fit(X_train, y_train)

    # Evaluate quickly on test set
    test_acc = clf.score(X_test, y_test)
    print(f"Test accuracy on groundtruth: {test_acc:.3f}")

    ###############################################################################
    # 4. Load DEC-Clustered Parts & Concatenate
    ###############################################################################
    dec_folder = r"C:\Users\jmm267\Downloads\Binbin\Dataset\transformer_output_latlong\DEC_clustered"
    part_files = [
        "combined_DEC_clustered_part1.csv",
        "combined_DEC_clustered_part2.csv",
        "combined_DEC_clustered_part3.csv"
    ]
    all_dec_dfs = []
    for f in part_files:
        path = os.path.join(dec_folder, f)
        print(f"Reading DEC part: {path}")
        df_part = pd.read_csv(path)
        all_dec_dfs.append(df_part)
    # ignore_index gives a fresh 0..M-1 index, which the axis=1 concat in
    # step 6 relies on to line rows up
    df_dec = pd.concat(all_dec_dfs, ignore_index=True)
    print("DEC dataset shape (combined):", df_dec.shape)
    print("DEC columns:", df_dec.columns.tolist())

    # The DEC dataset has 20 columns named "0..19", plus 'latitude','longitude','cluster_label'.
    missing_dec = [c for c in feature_cols if c not in df_dec.columns]
    if missing_dec:
        raise KeyError(f"DEC data is missing expected feature columns: {missing_dec}")

    ###############################################################################
    # 5. Predict Class Probabilities for the DEC Data
    ###############################################################################
    # We'll apply the same 20 columns as features:
    X_dec = df_dec[feature_cols].values  # shape (M, 20)

    print("Predicting class probabilities on DEC data...")
    # predict_proba returns shape (M, #classes seen during fit)
    y_proba = clf.predict_proba(X_dec)

    # The columns of predict_proba follow clf.classes_ (the encoded labels
    # present in the training split), not necessarily every class known to
    # the label encoder. Mapping through clf.classes_ keeps probabilities,
    # column names and predicted labels aligned even if a class is absent
    # from the training split.
    fitted_labels = clf.classes_
    y_pred = fitted_labels[np.argmax(y_proba, axis=1)]  # shape (M,)

    # Create columns for each class probability
    # E.g. "prob_deci_broad", "prob_ever_coni", ...
    class_names = list(label_encoder.inverse_transform(fitted_labels))
    prob_cols = [f"prob_{cn}" for cn in class_names]

    df_probs = pd.DataFrame(y_proba, columns=prob_cols)
    df_pred_class = pd.Series(label_encoder.inverse_transform(y_pred), name="predicted_class")

    ###############################################################################
    # 6. Merge Probabilities with DEC Data & Save
    ###############################################################################
    # We can either store them in a single big DataFrame or chunk them again.
    # For simplicity, we'll store them in one DataFrame in memory:
    result_df = pd.concat([df_dec, df_probs, df_pred_class], axis=1)

    # The result has columns:
    #   0..19, latitude, longitude, cluster_label,
    #   prob_deci_broad, prob_ever_coni, prob_larch_CN, prob_larch_JP, prob_shrubland,
    #   predicted_class
    print("Final result shape:", result_df.shape)
    print("Sample rows:\n", result_df.head(5))

    # Save to a single CSV. (2.2 million rows can be large but typically feasible.)
    # If memory is an issue, consider chunked approach or partitioned saving.
    output_csv = os.path.join(dec_folder, "combined_DEC_clustered_with_classProbs.csv")
    print(f"Saving final DEC with class probabilities to: {output_csv}")
    result_df.to_csv(output_csv, index=False)
    print("Done.")

if __name__ == "__main__":
    main()


Reading groundtruth CSV from: C:\Users\jmm267\Downloads\Binbin\transformer_output_groundtruth\groundtruth_cleaned_final_latent_features.csv
Groundtruth shape: (16464, 23)
Groundtruth columns: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', 'latitude', 'longitude', 'Class']
Classes found: ['deci_broad', 'ever_coni', 'larch_CN', 'larch_JP', 'shrubland']
Train size: 13171, Test size: 3293
Training RandomForestClassifier on groundtruth data...
Test accuracy on groundtruth: 0.950
Reading DEC part: C:\Users\jmm267\Downloads\Binbin\Dataset\transformer_output_latlong\DEC_clustered\combined_DEC_clustered_part1.csv
Reading DEC part: C:\Users\jmm267\Downloads\Binbin\Dataset\transformer_output_latlong\DEC_clustered\combined_DEC_clustered_part2.csv
Reading DEC part: C:\Users\jmm267\Downloads\Binbin\Dataset\transformer_output_latlong\DEC_clustered\combined_DEC_clustered_part3.csv
DEC dataset shape (combined): (2205952, 23)
DEC columns: [

Train a new (neural-network) classifier.

In [23]:
import os
import pandas as pd
import numpy as np

# For progress bars
from tqdm import tqdm
try:
    from tqdm.keras import TqdmCallback
except ImportError:
    print("Please install tqdm >= 4.36.0 with `pip install tqdm` to use TqdmCallback for Keras.")
    raise

# For neural network classification
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split

##############################################
# 0. Configure GPU if available (CUDA usage) #
##############################################
print("=== Checking for GPU (CUDA) availability ===")
gpus = tf.config.list_physical_devices('GPU')
if not gpus:
    print("No GPU found. Running on CPU.\n")
else:
    try:
        # Ask TensorFlow to grow GPU memory on demand for every visible GPU
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        print(f"GPU(s) found: {[gpu.name for gpu in gpus]}")
        print("Successfully set GPU memory growth. TensorFlow will use GPU by default.\n")
    except RuntimeError as err:
        # Report the failure and carry on instead of aborting the cell
        print("Could not set GPU memory growth:", err, "\n")


#############################################
# 1. Load and inspect the groundtruth data  #
#############################################
groundtruth_path = r"C:\Users\jmm267\Downloads\Binbin\Dataset\groundtruth_raw\groundtruth_cleaned_final_latent_features.csv"

print("=== Loading Groundtruth Data ===")
print(f"Reading groundtruth data from:\n  {groundtruth_path}")
df_groundtruth = pd.read_csv(groundtruth_path)
print(f"Loaded groundtruth data with shape: {df_groundtruth.shape}")
print("First few rows:\n", df_groundtruth.head(), "\n")

# Separate features (X) and labels (y): the first 20 columns are the latent
# features, 'Class' holds the label.
X = df_groundtruth.iloc[:, 0:20].values
y = df_groundtruth['Class'].values

# Encode class labels to numeric
print("Encoding class labels...")
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train/Validation split is done BEFORE scaling: fitting the scaler on all
# rows would let validation statistics leak into training. The split depends
# only on the row count, the labels and random_state, so the same rows land in
# each split as when scaling first.
print("Splitting data into train/validation sets...")
X_train_raw, X_val_raw, y_train, y_val = train_test_split(
    X, y_encoded,
    test_size=0.2,
    random_state=42,
    stratify=y_encoded
)

# Scale the latent features: fit on the training split only, then apply the
# same transform to the validation split
print("Scaling features...")
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_val = scaler.transform(X_val_raw)
# Full scaled matrix, kept under its previous name for any cell that reads it
X_scaled = scaler.transform(X)
print("Train set shape:", X_train.shape, "Val set shape:", X_val.shape, "\n")

##############################################
# 2. Build and train a neural network model  #
##############################################
print("=== Building and Training the Model ===")
# Number of training epochs, used by both the log message and model.fit
n_epochs = 100

# Small MLP: two hidden ReLU layers with dropout, softmax over the classes.
# The input shape is declared with an explicit Input object; passing
# `input_shape=` to the first Dense layer makes recent Keras versions warn.
model = Sequential([
    tf.keras.Input(shape=(X_train.shape[1],)),
    Dense(128, activation='relu'),
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(len(np.unique(y_encoded)), activation='softmax')
])

print("Compiling model...")
# Labels are integer-encoded, hence the sparse categorical loss
model.compile(
    optimizer=Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

print(f"Starting training for {n_epochs} epochs...\n")
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=n_epochs,
    batch_size=256,
    verbose=0,  # Turn off default Keras logging
    callbacks=[TqdmCallback(verbose=1)]  # Use TQDM progress bar
)

# Metrics of the last epoch (not necessarily the best epoch)
train_acc = history.history['accuracy'][-1]
val_acc = history.history['val_accuracy'][-1]
print("\nTraining completed!")
print(f"Final Training Accuracy:   {train_acc:.4f}")
print(f"Final Validation Accuracy: {val_acc:.4f}\n")

################################################
# 3. Load and combine the DEC-clustered files  #
################################################
# Paths of the three DEC-clustered CSV parts.
# NOTE(review): absolute, machine-specific Windows paths.
dec_part1_path = r"C:\Users\jmm267\Downloads\Binbin\Dataset\transformer_output_latlong\DEC_clustered\combined_DEC_clustered_part1.csv"
dec_part2_path = r"C:\Users\jmm267\Downloads\Binbin\Dataset\transformer_output_latlong\DEC_clustered\combined_DEC_clustered_part2.csv"
dec_part3_path = r"C:\Users\jmm267\Downloads\Binbin\Dataset\transformer_output_latlong\DEC_clustered\combined_DEC_clustered_part3.csv"

dec_files = [dec_part1_path, dec_part2_path, dec_part3_path]

print("=== Loading and Concatenating DEC Data ===")
# Read each part into its own DataFrame, showing a progress bar over the files
df_list = []
for file_path in tqdm(dec_files, desc="Reading DEC CSVs"):
    df_part = pd.read_csv(file_path)
    df_list.append(df_part)

# Stack the parts row-wise; ignore_index replaces the per-file indices with a
# single fresh 0..N-1 index
df_dec_combined = pd.concat(df_list, axis=0, ignore_index=True)
print(f"All DEC files loaded and concatenated. Combined shape: {df_dec_combined.shape}")
print("First few rows of the combined DEC data:\n", df_dec_combined.head(), "\n")

##############################################
# 4. Apply the trained classifier to DEC data
##############################################
print("=== Predicting Classes for DEC Data ===")
# Select the DEC features by the NAMES of the groundtruth feature columns
# (its first 20 columns) rather than by position, so a different column order
# in the DEC files cannot silently feed mismatched features to the model.
feature_names = list(df_groundtruth.columns[:20])
missing_features = [c for c in feature_names if c not in df_dec_combined.columns]
if missing_features:
    raise KeyError(f"DEC data is missing feature columns: {missing_features}")
X_dec = df_dec_combined[feature_names].values

# Scale them using the same scaler that was fitted for training
X_dec_scaled = scaler.transform(X_dec)

print("Generating predictions...")
# A large batch keeps the number of predict steps manageable for the
# multi-million-row DEC table (the Keras default is 32 rows per step)
y_dec_pred_probs = model.predict(X_dec_scaled, batch_size=4096)
y_dec_pred_classes = np.argmax(y_dec_pred_probs, axis=1)

# Convert numeric predictions back to original class names
dec_pred_labels = label_encoder.inverse_transform(y_dec_pred_classes)
df_dec_combined['predicted_class'] = dec_pred_labels

# Save to CSV
output_path = r"C:\Users\jmm267\Downloads\Binbin\Dataset\transformer_output_latlong\DEC_clustered\DEC_with_predictions.csv"
print(f"Saving predictions to:\n  {output_path}")
df_dec_combined.to_csv(output_path, index=False)
print("Done!")


=== Checking for GPU (CUDA) availability ===
No GPU found. Running on CPU.

=== Loading Groundtruth Data ===
Reading groundtruth data from:
  C:\Users\jmm267\Downloads\Binbin\Dataset\groundtruth_raw\groundtruth_cleaned_final_latent_features.csv
Loaded groundtruth data with shape: (16464, 23)
First few rows:
           0         1         2         3         4         5         6  \
0  0.303286  0.850955  0.057265 -1.085985  0.244422  1.693576  2.825499   
1  0.303553  0.855530  0.056125 -1.088880  0.244643  1.693512  2.832759   
2  0.302698  0.856721  0.055590 -1.087381  0.242954  1.690936  2.831756   
3  0.295183  0.854956  0.058925 -1.088144  0.250835  1.689031  2.831896   
4  0.296416  0.864984  0.057353 -1.086355  0.250660  1.688048  2.831185   

          7         8         9  ...        13         14        15        16  \
0 -2.474173  1.149435  1.074792  ... -0.097846 -11.351932 -0.426686  0.514707   
1 -2.475309  1.146067  1.072967  ... -0.097587 -11.348213 -0.428438  0.516408

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


0epoch [00:00, ?epoch/s]

0batch [00:00, ?batch/s]


Training completed!
Final Training Accuracy:   0.9579
Final Validation Accuracy: 0.9602

=== Loading and Concatenating DEC Data ===


Reading DEC CSVs: 100%|██████████| 3/3 [00:03<00:00,  1.05s/it]


All DEC files loaded and concatenated. Combined shape: (2205952, 23)
First few rows of the combined DEC data:
           0         1         2         3          4         5         6  \
0  0.155357  0.286328  0.393404  0.614512 -13.277342 -0.116932  0.244814   
1  0.151448  0.289241  0.390746  0.617897 -13.285116 -0.115736  0.244864   
2  0.151448  0.289241  0.390746  0.617897 -13.285116 -0.115736  0.244864   
3  0.154539  0.290430  0.393342  0.620422 -13.284363 -0.124646  0.240445   
4  0.153474  0.291959  0.391579  0.622427 -13.276670 -0.125364  0.236160   

          7         8         9  ...        13        14        15        16  \
0  0.948458  1.034857 -0.511455  ...  0.885650 -0.656209  3.106227 -0.126195   
1  0.946321  1.030107 -0.509817  ...  0.884141 -0.656548  3.106382 -0.126204   
2  0.946321  1.030107 -0.509817  ...  0.884141 -0.656548  3.106382 -0.126204   
3  0.945105  1.030176 -0.505772  ...  0.882250 -0.655979  3.104670 -0.125328   
4  0.947631  1.035075 -0.506821 

Compare the predicted class with the DEC cluster label for each row of C:\Users\jmm267\Downloads\Binbin\Dataset\transformer_output_latlong\DEC_clustered\DEC_with_predictions.csv

In [24]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    confusion_matrix,
    adjusted_rand_score,
    adjusted_mutual_info_score,
    homogeneity_score,
    completeness_score,
    v_measure_score
)

# 1) Load the CSV containing DEC cluster labels + predicted_class
#    (the file written by the neural-network prediction cell).
file_path = r"C:\Users\jmm267\Downloads\Binbin\Dataset\transformer_output_latlong\DEC_clustered\DEC_with_predictions.csv"
df = pd.read_csv(file_path)

print("=== DEC_with_predictions.csv loaded ===")
print("Columns:", df.columns.tolist())
print("Sample rows:\n", df.head(), "\n")

# 2) Build a contingency table (cross-tab) of cluster_label vs predicted_class.
#    Rows are clusters, columns are predicted classes, cells are row counts,
#    so it shows how each cluster is distributed over the predicted classes.
contingency = pd.crosstab(df['cluster_label'], df['predicted_class'])
print("=== Contingency Table: cluster_label vs. predicted_class ===")
print(contingency)
print()

# 3) Convert predicted_class from strings to integer IDs for clustering metrics.
#    NOTE: "y_true" is the classifier's prediction, not real groundtruth; the
#    scores below measure agreement between two labelings, not accuracy.
label_enc = LabelEncoder()
y_true = label_enc.fit_transform(df['predicted_class'].values)  # reference labeling (classifier output)
y_cluster = df['cluster_label'].values  # unsupervised cluster assignments

# 4) Some popular cluster vs. label metrics:

# (a) Confusion Matrix
#    confusion_matrix expects both arrays to use ONE shared label set: row i and
#    column j refer to the same integer value. Encoded class ids and DEC cluster
#    ids are unrelated numberings, so the diagonal has no special meaning here;
#    read this as a count table and prefer the contingency table above.
cm = confusion_matrix(y_true, y_cluster)
print("=== Confusion Matrix (Predicted Class vs. Cluster Label) ===")
print(cm)
print()

# (b) Adjusted Rand Index (ARI)
#    Close to 0 for random labelings, 1 when the two partitions are identical
#    (up to renaming of labels); it can be negative for partitions that agree
#    less than chance.
ari = adjusted_rand_score(y_true, y_cluster)
print(f"Adjusted Rand Index: {ari:.4f}")

# (c) Adjusted Mutual Information (AMI)
#    Adjusted for chance: about 0 for random labelings, 1 for identical partitions.
ami = adjusted_mutual_info_score(y_true, y_cluster)
print(f"Adjusted Mutual Info Score: {ami:.4f}")

# (d) Homogeneity & Completeness
#    Homogeneity: 1 => each cluster contains only members of a single class.
#    Completeness: 1 => each class is entirely contained within a single cluster.
h = homogeneity_score(y_true, y_cluster)
c = completeness_score(y_true, y_cluster)
print(f"Homogeneity Score: {h:.4f}")
print(f"Completeness Score: {c:.4f}")

# (e) V-measure
#    Harmonic mean of homogeneity & completeness. 1 => perfect.
v = v_measure_score(y_true, y_cluster)
print(f"V-Measure: {v:.4f}")

print("\n=== Analysis Complete ===")


=== DEC_with_predictions.csv loaded ===
Columns: ['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16', '17', '18', '19', 'latitude', 'longitude', 'cluster_label', 'predicted_class']
Sample rows:
           0         1         2         3          4         5         6  \
0  0.155357  0.286328  0.393404  0.614512 -13.277342 -0.116932  0.244814   
1  0.151448  0.289241  0.390746  0.617897 -13.285116 -0.115736  0.244864   
2  0.151448  0.289241  0.390746  0.617897 -13.285116 -0.115736  0.244864   
3  0.154539  0.290430  0.393342  0.620422 -13.284363 -0.124646  0.240445   
4  0.153474  0.291959  0.391579  0.622427 -13.276670 -0.125364  0.236160   

          7         8         9  ...        14        15        16        17  \
0  0.948458  1.034857 -0.511455  ... -0.656209  3.106227 -0.126195 -1.456508   
1  0.946321  1.030107 -0.509817  ... -0.656548  3.106382 -0.126204 -1.458185   
2  0.946321  1.030107 -0.509817  ... -0.656548  3.106382 -0.126204 -

In short, the DEC clusters do not map cleanly onto the (supervised) predicted classes—they are largely mixed. That’s not unexpected in an unsupervised approach; a single cluster may contain multiple species, and a single species may appear across several clusters. The low ARI and AMI confirm that there isn’t a strong one‐to‐one correspondence between the cluster labels and the final predicted classes.

Manual trend analysis

In [25]:
from sklearn.metrics import confusion_matrix, classification_report

# After training, predictions on the validation set:
# NOTE(review): `model`, `X_val`, `y_val` and `np` are not defined or imported in this
# cell; they must already exist in the kernel from earlier cells. Confirm this cell
# still works after Restart Kernel -> Run All.
val_preds = model.predict(X_val)  # shape (val_samples, n_classes)
# Reduce each row of per-class scores to the index of its highest-scoring class.
val_pred_classes = np.argmax(val_preds, axis=1)

# scikit-learn convention: rows are true labels, columns are predicted labels.
cm = confusion_matrix(y_val, val_pred_classes)
print("Confusion Matrix:\n", cm)

# Per-class precision / recall / F1 / support plus the overall averages.
print("Classification Report:\n", classification_report(y_val, val_pred_classes))


[1m103/103[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step
Confusion Matrix:
 [[ 989    6    1    7    0]
 [   2  733    0   21    0]
 [   3    1  161   42    0]
 [  11   20   14 1102    0]
 [   0    1    0    2  177]]
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.99      0.99      1003
           1       0.96      0.97      0.97       756
           2       0.91      0.78      0.84       207
           3       0.94      0.96      0.95      1147
           4       1.00      0.98      0.99       180

    accuracy                           0.96      3293
   macro avg       0.96      0.94      0.95      3293
weighted avg       0.96      0.96      0.96      3293



Simple LSTM classifier

In [30]:
import os
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, random_split
from tqdm import tqdm

from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report

# Select the compute device once for the whole cell: CUDA when usable, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device("cuda")
    print("Using device:", device)
    print("GPU Name:", torch.cuda.get_device_name())
else:
    device = torch.device("cpu")
    print("Using device:", device)

##############################################################################
# 1) Dataset Class for 12-Month Sequences
##############################################################################
class GroundtruthTimeSeriesDataset(Dataset):
    """
    Labelled per-location time-series dataset built from a long-format CSV.

    The CSV holds repeated rows over months for each (latitude, longitude),
    e.g. 12 rows for 12 months. Rows are grouped per location, sorted by
    'image_date', and stacked into one sample of shape
    (desired_length, number_of_features). The label is the 'Class' of the
    earliest row of the group (assuming each lat/lon is a single species).

    A location with fewer than `desired_length` rows is padded by repeating its
    last row; one with more is truncated to the first `desired_length` rows.

    Args:
        csv_path: path to the CSV. It must contain 'image_date', 'latitude',
            'longitude', 'Class' and every column listed in `feature_cols`.
        feature_cols: iterable of column names used as per-time-step features.
            Defaults to the 20 band / index columns listed below.
        desired_length: number of time steps every sample is forced to.

    Attributes:
        feature_cols: list of feature column names, in tensor column order.
        label_encoder: fitted LabelEncoder mapping 'Class' values -> integer ids.
        desired_length: the sequence length of every sample.
        samples: list of (float tensor of shape (desired_length, D), int label).

    Raises:
        ValueError: if the CSV has no 'Class' column.
        KeyError: if 'latitude' / 'longitude' or any feature column is missing.
    """

    def __init__(self, csv_path, feature_cols=None, desired_length=12):
        print(f"Reading CSV file: {csv_path}")
        df = pd.read_csv(csv_path, parse_dates=['image_date'])

        if feature_cols is None:
            # Default feature set: raw bands plus derived indices.
            self.feature_cols = [
                "AOT","B1","B11","B12","B2","B3","B4","B5",
                "B6","B7","B8","B8A","B9","EVI","NDVI","NDWI",
                "TCI_B","TCI_G","TCI_R","WVP"
            ]
        else:
            # Copy into a list so pandas treats it as a column selection and
            # later mutation of the caller's object cannot affect this dataset.
            self.feature_cols = list(feature_cols)

        # Group key: exact string form of (latitude, longitude). No rounding is
        # applied, so coordinates must be identical across a location's rows.
        df["lat_lon_id"] = df["latitude"].astype(str) + "_" + df["longitude"].astype(str)

        if "Class" not in df.columns:
            raise ValueError("CSV must have 'Class' column for supervised training.")

        # Fail before the grouping loop with the offending names, instead of an
        # opaque pandas "not in index" error on the first group.
        missing = [col for col in self.feature_cols if col not in df.columns]
        if missing:
            raise KeyError(f"Feature columns missing from {csv_path}: {missing}")

        # Build label encoder
        self.label_encoder = LabelEncoder()
        self.label_encoder.fit(df["Class"].unique())
        # Encode every row once up front; calling transform() per group repeats
        # the encoder's input validation for each location.
        label_ids = pd.Series(self.label_encoder.transform(df["Class"]), index=df.index)

        self.desired_length = desired_length
        self.samples = []  # each is a (time_series_tensor, label_int)

        for _, group_data in df.groupby("lat_lon_id", sort=False):
            group_data = group_data.sort_values("image_date")
            arr = group_data[self.feature_cols].to_numpy(dtype=float)  # shape (T, D)
            arr = self._fit_length(arr, desired_length)

            seq_tensor = torch.tensor(arr, dtype=torch.float)  # (desired_length, D)
            # Label: from the first (earliest) row's 'Class'
            label_id = int(label_ids.at[group_data.index[0]])

            self.samples.append((seq_tensor, label_id))

        print(f"Constructed {len(self.samples)} time-series samples from {csv_path}.")

    @staticmethod
    def _fit_length(arr, desired_length):
        """Pad `arr` (T, D) with copies of its last row, or truncate it, to `desired_length` rows."""
        n_steps = arr.shape[0]
        if n_steps < desired_length:
            pad = np.tile(arr[-1:], (desired_length - n_steps, 1))
            return np.vstack([arr, pad])
        return arr[:desired_length]

    def __len__(self):
        """Number of per-location samples."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return (sequence tensor, integer label) for sample `idx`."""
        seq, label = self.samples[idx]
        return seq, label


##############################################################################
# 2) Neural Network Model: LSTM (or GRU) to handle 12×D sequences
##############################################################################
class LSTMClassifier(nn.Module):
    """
    LSTM-based sequence classifier.

    Consumes batches of shape (B, T, D) (``batch_first=True``), runs them
    through a unidirectional ``nn.LSTM`` and maps the top layer's output at the
    last time step to class logits with a single linear layer.

    Args:
        input_dim: number of features per time step (D).
        hidden_dim: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output classes (length of the logits vector).
        drop_prob: dropout probability between stacked LSTM layers. It is only
            passed to ``nn.LSTM`` when ``num_layers > 1``; PyTorch applies no
            dropout after the last layer and warns if a non-zero value is
            given for a single-layer LSTM.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, num_classes, drop_prob=0.2):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers

        # You can also use nn.GRU(...) if you prefer a GRU
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            # Inter-layer dropout is meaningless for one layer; pass 0 to avoid
            # PyTorch's "non-zero dropout expects num_layers greater than 1" warning.
            dropout=drop_prob if num_layers > 1 else 0.0,
            bidirectional=False
        )
        # final classification layer
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return logits of shape (B, num_classes) for input `x` of shape (B, T, D)."""
        # With no initial state given, nn.LSTM starts from zeros on x's device and
        # dtype, so no hand-built h0/c0 is needed (those were always float32 and
        # were allocated on the CPU before being copied over).
        out, _ = self.lstm(x)  # out: (B, T, hidden_dim)

        # Top layer's output at the last time step (equals hn[-1] for a
        # unidirectional LSTM).
        last_out = out[:, -1, :]  # shape (B, hidden_dim)
        return self.fc(last_out)  # shape (B, num_classes)


##############################################################################
# 3) Main Training Pipeline
##############################################################################
def train_lstm_classifier(
    csv_path,
    epochs=100,
    batch_size=32,
    hidden_dim=128,
    num_layers=1,
    drop_prob=0.2,
    save_path="lstm_classifier.pth",
    seed=None
):
    """
    Train an LSTMClassifier on the ground-truth time-series CSV and report
    validation metrics.

    Steps: build a GroundtruthTimeSeriesDataset, split it 80/20 at random into
    train/validation, train with Adam (lr=1e-3) and cross-entropy for `epochs`
    epochs, save the final weights to `save_path`, then print a confusion
    matrix and classification report for the validation split.

    Uses the notebook-level `device` for the model and every batch.

    Args:
        csv_path: path to the labelled CSV.
        epochs: number of training epochs. With 0 the untrained model is still
            evaluated once so the final report can be produced.
        batch_size: mini-batch size for both loaders.
        hidden_dim: LSTM hidden size.
        num_layers: number of stacked LSTM layers.
        drop_prob: dropout probability passed to the classifier.
        save_path: where the final ``state_dict`` is written.
        seed: optional int. When given, ``torch.manual_seed(seed)`` is called
            first so the split, weight initialisation and shuffling are
            repeatable. ``None`` (default) leaves the global RNG untouched.

    Returns:
        (model, dataset): the trained LSTMClassifier and the full dataset
        (whose ``label_encoder`` maps class ids back to names).

    Raises:
        ValueError: if the dataset is too small to give both a non-empty
            training split and a non-empty validation split.
    """
    if seed is not None:
        torch.manual_seed(seed)

    # 1) Create Dataset
    dataset = GroundtruthTimeSeriesDataset(csv_path)

    # 2) Train/Val Split
    ds_len = len(dataset)
    train_size = int(0.8 * ds_len)
    val_size = ds_len - train_size
    if train_size == 0 or val_size == 0:
        # An empty split would otherwise fail later with an unrelated-looking
        # error (empty sampler / division by zero).
        raise ValueError(
            f"Need at least 2 time-series samples for an 80/20 split, got {ds_len}."
        )
    train_ds, val_ds = random_split(dataset, [train_size, val_size])

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False)

    num_classes = len(dataset.label_encoder.classes_)
    input_dim = len(dataset.feature_cols)

    print(f"Dataset has {ds_len} time-series samples, {num_classes} distinct classes.")
    print(f"Using LSTM with input_dim={input_dim}, hidden_dim={hidden_dim}, num_classes={num_classes}.")
    print(f"Training for {epochs} epochs...")

    # 3) Build Model
    model = LSTMClassifier(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        num_classes=num_classes,
        drop_prob=drop_prob
    ).to(device)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=1e-3)

    # Validation predictions / labels of the most recent evaluation pass.
    all_val_preds, all_val_labels = None, None

    # 4) Training Loop
    for epoch in range(1, epochs+1):
        # ---- train ----
        model.train()
        total_loss = 0.0
        correct, total = 0, 0
        for X, y in tqdm(train_loader, desc=f"Epoch {epoch}/{epochs} (train)", leave=False):
            X = X.to(device)            # shape (B, T, input_dim)
            y = y.to(device).long()     # shape (B,); CrossEntropyLoss needs int64 targets
            optimizer.zero_grad()
            logits = model(X)           # shape (B, num_classes)
            loss = criterion(logits, y)
            loss.backward()
            optimizer.step()
            # criterion returns the batch mean; weight by batch size so the
            # epoch average is per sample even when the last batch is smaller.
            total_loss += loss.item() * X.size(0)
            preds = torch.argmax(logits, dim=1)
            correct += (preds == y).sum().item()
            total += X.size(0)
        train_acc = correct / total
        avg_loss = total_loss / total

        # ---- validate ----
        avg_val_loss, val_acc, all_val_preds, all_val_labels = _evaluate_classifier(
            model, val_loader, criterion, desc=f"Epoch {epoch}/{epochs} (val)"
        )
        print(f"[Epoch {epoch}/{epochs}] Train Loss={avg_loss:.4f}, Train Acc={train_acc:.2f}, "
              f"Val Loss={avg_val_loss:.4f}, Val Acc={val_acc:.2f}")

    if all_val_preds is None:
        # epochs < 1: no validation pass has run yet; evaluate once so the
        # report below has data instead of failing on an undefined name.
        _, _, all_val_preds, all_val_labels = _evaluate_classifier(
            model, val_loader, criterion, desc="Final (val)"
        )

    # 5) Save the model
    torch.save(model.state_dict(), save_path)
    print(f"\nModel saved to {save_path}.")

    # 6) Final Confusion Matrix on val set
    # Pin the label set so row/column i always corresponds to class id i, even
    # if some class has no sample in the validation split.
    cm = confusion_matrix(all_val_labels, all_val_preds, labels=list(range(num_classes)))
    print("\nConfusion Matrix (on validation set):")
    print(cm)

    # Classification Report
    print("\nClassification Report:")
    # Convert numeric labels back to strings
    inv_class_map = {i: c for i, c in enumerate(dataset.label_encoder.classes_)}
    all_val_preds_str = [inv_class_map[p] for p in all_val_preds]
    all_val_labels_str = [inv_class_map[l] for l in all_val_labels]
    print(classification_report(all_val_labels_str, all_val_preds_str))

    return model, dataset


def _evaluate_classifier(model, loader, criterion, desc=None):
    """Score `model` on `loader` without gradients.

    Returns (mean loss per sample, accuracy, predicted class ids, true class ids).
    """
    model.eval()
    loss_sum = 0.0
    n_correct, n_seen = 0, 0
    preds_all, labels_all = [], []
    with torch.no_grad():
        for Xv, yv in tqdm(loader, desc=desc, leave=False):
            Xv = Xv.to(device)
            yv = yv.to(device).long()
            logits_v = model(Xv)
            loss_sum += criterion(logits_v, yv).item() * Xv.size(0)
            preds_v = torch.argmax(logits_v, dim=1)
            n_correct += (preds_v == yv).sum().item()
            n_seen += Xv.size(0)
            preds_all.extend(preds_v.cpu().numpy())
            labels_all.extend(yv.cpu().numpy())
    return loss_sum / n_seen, n_correct / n_seen, preds_all, labels_all


##############################################################################
# 4) Example Usage
##############################################################################
# Example run: train the LSTM classifier on the cleaned ground-truth time series.
# NOTE(review): inside a notebook kernel `__name__` is normally "__main__", so this
# guard is expected to run every time the cell is executed.
if __name__ == "__main__":
    # NOTE(review): machine-specific absolute Windows path; other cells in this
    # notebook use paths relative to the working directory ("Dataset/...").
    groundtruth_csv = r"C:\Users\jmm267\Downloads\Binbin\Dataset\groundtruth_raw\time_series_cleaned\groundtruth_cleaned_final.csv"
    # NOTE(review): this rebinds the notebook-level name `model`, which the earlier
    # evaluation cell also uses via `model.predict(X_val)`; re-running that cell
    # after this one would pick up this object instead.
    model, ds = train_lstm_classifier(
        csv_path=groundtruth_csv,
        epochs=100,
        batch_size=32,
        hidden_dim=128,
        num_layers=2,  # overrides the function default of 1
        drop_prob=0.2,
        save_path="lstm_classifier_model.pth"
    )


Using device: cuda
GPU Name: NVIDIA RTX 2000 Ada Generation
Reading CSV file: C:\Users\jmm267\Downloads\Binbin\Dataset\groundtruth_raw\time_series_cleaned\groundtruth_cleaned_final.csv
Constructed 16464 time-series samples from C:\Users\jmm267\Downloads\Binbin\Dataset\groundtruth_raw\time_series_cleaned\groundtruth_cleaned_final.csv.
Dataset has 16464 time-series samples, 5 distinct classes.
Using LSTM with input_dim=20, hidden_dim=128, num_classes=5.
Training for 100 epochs...


                                                                       

[Epoch 1/100] Train Loss=0.4724, Train Acc=0.83, Val Loss=0.2206, Val Acc=0.93


                                                                       

[Epoch 2/100] Train Loss=0.3468, Train Acc=0.88, Val Loss=0.2372, Val Acc=0.92


                                                                       

[Epoch 3/100] Train Loss=0.2778, Train Acc=0.90, Val Loss=0.1838, Val Acc=0.94


                                                                       

[Epoch 4/100] Train Loss=0.2478, Train Acc=0.91, Val Loss=0.1461, Val Acc=0.95


                                                                       

[Epoch 5/100] Train Loss=0.2362, Train Acc=0.92, Val Loss=0.1371, Val Acc=0.96


                                                                       

[Epoch 6/100] Train Loss=0.1798, Train Acc=0.94, Val Loss=0.1140, Val Acc=0.96


                                                                       

[Epoch 7/100] Train Loss=0.1867, Train Acc=0.93, Val Loss=0.1258, Val Acc=0.95


                                                                       

[Epoch 8/100] Train Loss=0.1880, Train Acc=0.93, Val Loss=0.1003, Val Acc=0.97


                                                                       

[Epoch 9/100] Train Loss=0.1691, Train Acc=0.94, Val Loss=0.0983, Val Acc=0.97


                                                                        

[Epoch 10/100] Train Loss=0.1589, Train Acc=0.94, Val Loss=0.0925, Val Acc=0.97


                                                                        

[Epoch 11/100] Train Loss=0.1480, Train Acc=0.95, Val Loss=0.0820, Val Acc=0.97


                                                                        

[Epoch 12/100] Train Loss=0.1719, Train Acc=0.94, Val Loss=0.1572, Val Acc=0.94


                                                                        

[Epoch 13/100] Train Loss=0.1534, Train Acc=0.94, Val Loss=0.0835, Val Acc=0.97


                                                                        

[Epoch 14/100] Train Loss=0.1475, Train Acc=0.95, Val Loss=0.0957, Val Acc=0.97


                                                                        

[Epoch 15/100] Train Loss=0.1553, Train Acc=0.94, Val Loss=0.0782, Val Acc=0.97


                                                                        

[Epoch 16/100] Train Loss=0.1424, Train Acc=0.95, Val Loss=0.0782, Val Acc=0.97


                                                                        

[Epoch 17/100] Train Loss=0.1159, Train Acc=0.96, Val Loss=0.0876, Val Acc=0.97


                                                                        

[Epoch 18/100] Train Loss=0.1364, Train Acc=0.95, Val Loss=0.0751, Val Acc=0.97


                                                                        

[Epoch 19/100] Train Loss=0.1204, Train Acc=0.96, Val Loss=0.0696, Val Acc=0.98


                                                                        

[Epoch 20/100] Train Loss=0.1315, Train Acc=0.95, Val Loss=0.1193, Val Acc=0.96


                                                                        

[Epoch 21/100] Train Loss=0.1486, Train Acc=0.95, Val Loss=0.0800, Val Acc=0.97


                                                                        

[Epoch 22/100] Train Loss=0.1339, Train Acc=0.95, Val Loss=0.0997, Val Acc=0.97


                                                                        

[Epoch 23/100] Train Loss=0.1141, Train Acc=0.96, Val Loss=0.0677, Val Acc=0.98


                                                                        

[Epoch 24/100] Train Loss=0.1135, Train Acc=0.96, Val Loss=0.0753, Val Acc=0.97


                                                                        

[Epoch 25/100] Train Loss=0.1235, Train Acc=0.96, Val Loss=0.0697, Val Acc=0.98


                                                                        

[Epoch 26/100] Train Loss=0.1284, Train Acc=0.95, Val Loss=0.0779, Val Acc=0.97


                                                                        

[Epoch 27/100] Train Loss=0.1074, Train Acc=0.96, Val Loss=0.0616, Val Acc=0.98


                                                                        

[Epoch 28/100] Train Loss=0.1081, Train Acc=0.96, Val Loss=0.0693, Val Acc=0.97


                                                                        

[Epoch 29/100] Train Loss=0.1006, Train Acc=0.96, Val Loss=0.0670, Val Acc=0.98


                                                                        

[Epoch 30/100] Train Loss=0.1169, Train Acc=0.96, Val Loss=0.0763, Val Acc=0.98


                                                                        

[Epoch 31/100] Train Loss=0.1415, Train Acc=0.95, Val Loss=0.0761, Val Acc=0.97


                                                                        

[Epoch 32/100] Train Loss=0.1266, Train Acc=0.95, Val Loss=0.0686, Val Acc=0.97


                                                                        

[Epoch 33/100] Train Loss=0.1097, Train Acc=0.96, Val Loss=0.0710, Val Acc=0.98


                                                                        

[Epoch 34/100] Train Loss=0.0985, Train Acc=0.97, Val Loss=0.0707, Val Acc=0.98


                                                                        

[Epoch 35/100] Train Loss=0.1040, Train Acc=0.96, Val Loss=0.0867, Val Acc=0.97


                                                                        

[Epoch 36/100] Train Loss=0.1094, Train Acc=0.96, Val Loss=0.0695, Val Acc=0.97


                                                                        

[Epoch 37/100] Train Loss=0.1192, Train Acc=0.96, Val Loss=0.0677, Val Acc=0.98


                                                                        

[Epoch 38/100] Train Loss=0.1122, Train Acc=0.96, Val Loss=0.0581, Val Acc=0.98


                                                                        

[Epoch 39/100] Train Loss=0.1112, Train Acc=0.96, Val Loss=0.0644, Val Acc=0.98


                                                                        

[Epoch 40/100] Train Loss=0.1023, Train Acc=0.96, Val Loss=0.0790, Val Acc=0.97


                                                                        

[Epoch 41/100] Train Loss=0.1063, Train Acc=0.96, Val Loss=0.0672, Val Acc=0.98


                                                                        

[Epoch 42/100] Train Loss=0.1142, Train Acc=0.96, Val Loss=0.0609, Val Acc=0.98


                                                                        

[Epoch 43/100] Train Loss=0.1023, Train Acc=0.96, Val Loss=0.0742, Val Acc=0.97


                                                                        

[Epoch 44/100] Train Loss=0.1192, Train Acc=0.96, Val Loss=0.0750, Val Acc=0.97


                                                                        

[Epoch 45/100] Train Loss=0.1229, Train Acc=0.95, Val Loss=0.0571, Val Acc=0.98


                                                                        

[Epoch 46/100] Train Loss=0.1063, Train Acc=0.96, Val Loss=0.0742, Val Acc=0.97


                                                                        

[Epoch 47/100] Train Loss=0.1035, Train Acc=0.96, Val Loss=0.0658, Val Acc=0.98


                                                                        

[Epoch 48/100] Train Loss=0.0951, Train Acc=0.96, Val Loss=0.0601, Val Acc=0.98


                                                                        

[Epoch 49/100] Train Loss=0.1231, Train Acc=0.96, Val Loss=0.0744, Val Acc=0.97


                                                                        

[Epoch 50/100] Train Loss=0.1051, Train Acc=0.96, Val Loss=0.0629, Val Acc=0.98


                                                                        

[Epoch 51/100] Train Loss=0.1009, Train Acc=0.97, Val Loss=0.0663, Val Acc=0.98


                                                                        

[Epoch 52/100] Train Loss=0.1201, Train Acc=0.96, Val Loss=0.0735, Val Acc=0.97


                                                                        

[Epoch 53/100] Train Loss=0.1124, Train Acc=0.96, Val Loss=0.0598, Val Acc=0.98


                                                                        

[Epoch 54/100] Train Loss=0.1112, Train Acc=0.96, Val Loss=0.0528, Val Acc=0.98


                                                                        

[Epoch 55/100] Train Loss=0.1083, Train Acc=0.96, Val Loss=0.0626, Val Acc=0.98


                                                                        

[Epoch 56/100] Train Loss=0.0866, Train Acc=0.97, Val Loss=0.0681, Val Acc=0.97


                                                                        

[Epoch 57/100] Train Loss=0.1007, Train Acc=0.96, Val Loss=0.0616, Val Acc=0.98


                                                                        

[Epoch 58/100] Train Loss=0.1076, Train Acc=0.96, Val Loss=0.0649, Val Acc=0.98


                                                                        

[Epoch 59/100] Train Loss=0.1057, Train Acc=0.96, Val Loss=0.0599, Val Acc=0.98


                                                                        

[Epoch 60/100] Train Loss=0.0896, Train Acc=0.97, Val Loss=0.0606, Val Acc=0.98


                                                                        

[Epoch 61/100] Train Loss=0.0979, Train Acc=0.96, Val Loss=0.0566, Val Acc=0.98


                                                                        

[Epoch 62/100] Train Loss=0.0882, Train Acc=0.97, Val Loss=0.0496, Val Acc=0.98


                                                                        

[Epoch 63/100] Train Loss=0.0879, Train Acc=0.97, Val Loss=0.0556, Val Acc=0.98


                                                                        

[Epoch 64/100] Train Loss=0.0852, Train Acc=0.97, Val Loss=0.0758, Val Acc=0.97


                                                                        

[Epoch 65/100] Train Loss=0.0979, Train Acc=0.96, Val Loss=0.0571, Val Acc=0.98


                                                                        

[Epoch 66/100] Train Loss=0.1094, Train Acc=0.96, Val Loss=0.0660, Val Acc=0.97


                                                                        

[Epoch 67/100] Train Loss=0.0949, Train Acc=0.96, Val Loss=0.0522, Val Acc=0.98


                                                                        

[Epoch 68/100] Train Loss=0.0891, Train Acc=0.97, Val Loss=0.0490, Val Acc=0.98


                                                                        

[Epoch 69/100] Train Loss=0.1019, Train Acc=0.96, Val Loss=0.0518, Val Acc=0.98


                                                                        

[Epoch 70/100] Train Loss=0.0917, Train Acc=0.97, Val Loss=0.0498, Val Acc=0.98


                                                                        

[Epoch 71/100] Train Loss=0.1010, Train Acc=0.96, Val Loss=0.0581, Val Acc=0.98


                                                                        

[Epoch 72/100] Train Loss=0.0921, Train Acc=0.97, Val Loss=0.0627, Val Acc=0.98


                                                                        

[Epoch 73/100] Train Loss=0.0895, Train Acc=0.97, Val Loss=0.0517, Val Acc=0.98


                                                                        

[Epoch 74/100] Train Loss=0.0864, Train Acc=0.97, Val Loss=0.0524, Val Acc=0.98


                                                                        

[Epoch 75/100] Train Loss=0.1083, Train Acc=0.96, Val Loss=0.0679, Val Acc=0.98


                                                                        

[Epoch 76/100] Train Loss=0.0860, Train Acc=0.97, Val Loss=0.0540, Val Acc=0.98


                                                                        

[Epoch 77/100] Train Loss=0.0798, Train Acc=0.97, Val Loss=0.0508, Val Acc=0.98


                                                                        

[Epoch 78/100] Train Loss=0.0707, Train Acc=0.97, Val Loss=0.0511, Val Acc=0.98


                                                                        

[Epoch 79/100] Train Loss=0.0722, Train Acc=0.97, Val Loss=0.0509, Val Acc=0.98


                                                                        

[Epoch 80/100] Train Loss=0.0713, Train Acc=0.97, Val Loss=0.0573, Val Acc=0.98


                                                                        

[Epoch 81/100] Train Loss=0.0898, Train Acc=0.97, Val Loss=0.0501, Val Acc=0.98


                                                                        

[Epoch 82/100] Train Loss=0.0779, Train Acc=0.97, Val Loss=0.0585, Val Acc=0.98


                                                                        

[Epoch 83/100] Train Loss=0.1023, Train Acc=0.96, Val Loss=0.0765, Val Acc=0.97


                                                                        

[Epoch 84/100] Train Loss=0.0868, Train Acc=0.97, Val Loss=0.0572, Val Acc=0.98


                                                                        

[Epoch 85/100] Train Loss=0.0791, Train Acc=0.97, Val Loss=0.0700, Val Acc=0.98


                                                                        

[Epoch 86/100] Train Loss=0.0840, Train Acc=0.97, Val Loss=0.0633, Val Acc=0.98


                                                                        

[Epoch 87/100] Train Loss=0.0960, Train Acc=0.97, Val Loss=0.0443, Val Acc=0.98


                                                                        

[Epoch 88/100] Train Loss=0.0950, Train Acc=0.97, Val Loss=0.0518, Val Acc=0.98


                                                                        

[Epoch 89/100] Train Loss=0.0780, Train Acc=0.97, Val Loss=0.0500, Val Acc=0.98


                                                                        

[Epoch 90/100] Train Loss=0.0765, Train Acc=0.97, Val Loss=0.0510, Val Acc=0.98


                                                                        

[Epoch 91/100] Train Loss=0.0853, Train Acc=0.97, Val Loss=0.0472, Val Acc=0.98


                                                                        

[Epoch 92/100] Train Loss=0.0939, Train Acc=0.96, Val Loss=0.0570, Val Acc=0.98


                                                                        

[Epoch 93/100] Train Loss=0.1036, Train Acc=0.96, Val Loss=0.0474, Val Acc=0.98


                                                                        

[Epoch 94/100] Train Loss=0.0915, Train Acc=0.97, Val Loss=0.0578, Val Acc=0.98


                                                                        

[Epoch 95/100] Train Loss=0.0808, Train Acc=0.97, Val Loss=0.0587, Val Acc=0.98


                                                                        

[Epoch 96/100] Train Loss=0.0831, Train Acc=0.97, Val Loss=0.0635, Val Acc=0.98


                                                                        

[Epoch 97/100] Train Loss=0.0997, Train Acc=0.96, Val Loss=0.0600, Val Acc=0.98


                                                                        

[Epoch 98/100] Train Loss=0.0875, Train Acc=0.97, Val Loss=0.0517, Val Acc=0.98


                                                                        

[Epoch 99/100] Train Loss=0.0788, Train Acc=0.97, Val Loss=0.0570, Val Acc=0.98


                                                                         

[Epoch 100/100] Train Loss=0.0835, Train Acc=0.97, Val Loss=0.0514, Val Acc=0.98

Model saved to lstm_classifier_model.pth.

Confusion Matrix (on validation set):
[[ 965    0    0    0    0]
 [   5  791    0   10    2]
 [   0    0  172   20    0]
 [   4    3   13 1139    2]
 [   0    1    0    0  166]]

Classification Report:
              precision    recall  f1-score   support

  deci_broad       0.99      1.00      1.00       965
   ever_coni       0.99      0.98      0.99       808
    larch_CN       0.93      0.90      0.91       192
    larch_JP       0.97      0.98      0.98      1161
   shrubland       0.98      0.99      0.99       167

    accuracy                           0.98      3293
   macro avg       0.97      0.97      0.97      3293
weighted avg       0.98      0.98      0.98      3293





[[ 965    0    0    0    0]   -> deci_broad
 [   5  791    0   10    2]   -> ever_coni
 [   0    0  172   20    0]   -> larch_CN
 [   4    3   13 1139    2]   -> larch_JP
 [   0    1    0    0  166]]  -> shrubland

Class 0 (deci_broad) has 965 samples and all 965 are correct → 100% recall. Nine samples from other classes (5 ever_coni, 4 larch_JP) are predicted as deci_broad, so precision is ~99%.
Class 1 (ever_coni) has 808 samples: 791 correct → ~98% recall for that class.
Class 2 (larch_CN) has 192 samples: 172 correct, 20 predicted as larch_JP. ~90% recall, some confusion with class 3.
Class 3 (larch_JP) has 1161 samples, 1139 correct. Also near 98% recall.
Class 4 (shrubland) with 167 samples: 166 correct → 99% recall.

Testing the LSTM model on the unseen data.

In [None]:
import os
import glob
import math
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm

# ---------------------------------------------------------------------
# 0. Check CUDA / GPU availability
# ---------------------------------------------------------------------
# Pick the device first, then report which one the rest of the cell will use.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("=== Checking for GPU (CUDA) availability ===")
if device.type != 'cuda':
    print("No GPU found. Running on CPU.")
else:
    print(f"GPU is available! Using device: {device}")
    print(f"GPU Name: {torch.cuda.get_device_name()}")

# ---------------------------------------------------------------------
# 1. LSTM Model Architecture
# ---------------------------------------------------------------------
class LSTMClassifier(nn.Module):
    """
    LSTM sequence classifier used to load the trained weights for inference.

    Takes input of shape (B, T, D) (``batch_first=True``) and returns logits of
    shape (B, num_classes) computed from the top LSTM layer's output at the
    last time step. The submodule names (`lstm`, `fc`) determine the
    ``state_dict`` keys and must stay as they are for saved checkpoints to load.

    Args:
        input_dim: number of features per time step (D).
        hidden_dim: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        num_classes: number of output classes.
        drop_prob: dropout probability between stacked LSTM layers; only passed
            to ``nn.LSTM`` when ``num_layers > 1``.
    """

    def __init__(self, input_dim, hidden_dim, num_layers, num_classes, drop_prob=0.2):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.lstm = nn.LSTM(
            input_size=input_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            # A single-layer LSTM has no inter-layer dropout; pass 0 so PyTorch
            # does not warn about a non-zero value with num_layers=1.
            dropout=drop_prob if num_layers > 1 else 0.0,
            bidirectional=False
        )
        self.fc = nn.Linear(hidden_dim, num_classes)

    def forward(self, x):
        """Return logits of shape (B, num_classes) for input `x` of shape (B, T, D)."""
        # nn.LSTM defaults the initial hidden/cell state to zeros on x's device
        # and dtype, so no explicit h0/c0 tensors are required.
        out, _ = self.lstm(x)
        # last time-step output
        last_out = out[:, -1, :]  # shape (B, hidden_dim)
        return self.fc(last_out)

# ---------------------------------------------------------------------
# 2. BlindPatchDataset for unlabeled CSV
# ---------------------------------------------------------------------
class BlindPatchDataset(Dataset):
    """
    Unlabeled per-pixel time series built from a single CSV file.

    The CSV must provide `image_date`, `latitude`, `longitude` and every column in
    `feature_cols` (when omitted, the 20 band/index columns listed in `__init__`).
    Rows are grouped per pixel (latitude/longitude rounded to 6 decimals), ordered
    by `image_date`, and brought to exactly `desired_length` time steps: shorter
    series repeat their last row, longer ones keep only the earliest rows.
    Indexing yields (seq_tensor, (lat, lon)).
    """

    def __init__(self, csv_path, feature_cols=None, desired_length=12):
        print(f"\n--- Reading CSV file: {csv_path}")
        self.df = pd.read_csv(csv_path, parse_dates=["image_date"])
        print(f"[DEBUG] Shape of read DataFrame: {self.df.shape}")

        self.feature_cols = feature_cols if feature_cols is not None else [
            "AOT","B1","B11","B12","B2","B3","B4","B5",
            "B6","B7","B8","B8A","B9","EVI","NDVI","NDWI",
            "TCI_B","TCI_G","TCI_R","WVP"
        ]

        # One string key per pixel, e.g. "<lat>_<lon>", used for grouping.
        lat_key = self.df["latitude"].round(6).astype(str)
        lon_key = self.df["longitude"].round(6).astype(str)
        self.df["lat_lon_id"] = lat_key + "_" + lon_key

        self.desired_length = desired_length
        self.samples = []
        self.latlons = []

        # sort=False keeps pixels in order of first appearance in the file.
        for _, pixel_rows in self.df.groupby("lat_lon_id", sort=False):
            ordered = pixel_rows.sort_values("image_date")
            series = ordered[self.feature_cols].to_numpy(dtype=float)  # (T, D)
            n_steps, _ = series.shape

            shortfall = desired_length - n_steps
            if shortfall > 0:
                # Too short: repeat the final observation until the length fits.
                series = np.vstack([series, np.tile(series[-1:], (shortfall, 1))])
            elif shortfall < 0:
                # Too long: keep the earliest `desired_length` observations.
                series = series[:desired_length]

            first_row = ordered.iloc[0]
            self.samples.append(torch.tensor(series, dtype=torch.float))
            self.latlons.append((first_row["latitude"], first_row["longitude"]))

        print(f"[DEBUG] Constructed {len(self.samples)} time-series samples.")

    def __len__(self):
        """Number of pixel time series."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return (seq_tensor, (lat, lon)) for the pixel at position `idx`."""
        sequence = self.samples[idx]
        coords = self.latlons[idx]
        return sequence, coords

# ---------------------------------------------------------------------
# 2.5 Custom Collate Function
# ---------------------------------------------------------------------
def collate_blind_patches(batch):
    """
    Collate (seq_tensor, (lat, lon)) items into one batch.

    Returns (seq_batch, coords): the sequence tensors stacked along a new leading
    batch dimension, and the coordinates as a plain list of (lat, lon) tuples in
    the same order as the input items.
    """
    items = list(batch)

    # Re-pack coordinates explicitly so each entry is a 2-tuple.
    coords = [(lat, lon) for _, (lat, lon) in items]

    # (B, T, D): one row per item, in input order.
    seq_batch = torch.stack([seq for seq, _ in items], dim=0)

    return seq_batch, coords

# ---------------------------------------------------------------------
# 3. Inference Function
# ---------------------------------------------------------------------
def infer_on_blind_data(
    folder_path,                   # path to raw_cleaned folder
    model_path="lstm_classifier_model.pth",
    output_folder="inference_results",
    batch_size=32,
    hidden_dim=128,
    num_layers=2,
    drop_prob=0.2,
    feature_cols=None,
    label_mapping=None
):
    """
    Run the saved LSTM classifier over every '*cleaned.csv' file in a folder and
    write one '<name>_with_predictions.csv' per input file.

    - folder_path: Directory that contains *cleaned.csv files
    - model_path: Path to .pth model weights (a state_dict for LSTMClassifier)
    - output_folder: Where we store the results (created if missing)
    - batch_size, hidden_dim, num_layers, drop_prob: must match your training config
    - feature_cols: same features used in training; when None the dataset's
      default 20 columns are used
    - label_mapping: class names indexed by predicted class id. Defaults to the
      five names previously hard-coded here. The number of model outputs is taken
      from its length, so the two cannot drift apart.

    Each output CSV has columns latitude, longitude, predicted_class.
    A failure on one file is reported and processing continues with the next file.
    Returns None.
    """
    print("\n=== Starting Inference on Blind Data ===")
    # glob's ordering depends on the file system; sort for a reproducible run order.
    csv_files = sorted(glob.glob(os.path.join(folder_path, "*cleaned.csv")))
    print(f"Found {len(csv_files)} files in {folder_path} that match '*cleaned.csv'")

    print(f"Model path: {model_path}")
    print(f"Output folder: {output_folder}")
    print("Feature cols:", feature_cols)
    print("--------------------------------------------------")

    # Label mapping from training; the class count is derived from it.
    if label_mapping is None:
        label_mapping = ["deci_broad", "ever_coni", "larch_CN", "larch_JP", "shrubland"]
    else:
        label_mapping = list(label_mapping)
    num_classes = len(label_mapping)
    input_dim = len(feature_cols) if feature_cols else 20

    # Rebuild the same LSTM
    print(f"[INFO] Building LSTM model: input_dim={input_dim}, hidden_dim={hidden_dim}, num_classes={num_classes}")
    model = LSTMClassifier(
        input_dim=input_dim,
        hidden_dim=hidden_dim,
        num_layers=num_layers,
        num_classes=num_classes,
        drop_prob=drop_prob
    )
    print(f"[INFO] Loading state_dict from: {model_path}")
    # NOTE(security): torch.load unpickles the file; only load checkpoints that
    # come from a trusted source.
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.to(device)
    model.eval()

    # Ensure output folder
    os.makedirs(output_folder, exist_ok=True)

    for csv_path in tqdm(csv_files, desc="CSV Files", unit="file"):
        print(f"\n--- Inference on file: {csv_path}")
        try:
            ds = BlindPatchDataset(csv_path, feature_cols=feature_cols)
            # shuffle=False keeps predictions aligned with the dataset order;
            # the custom collate keeps coordinates as a list of (lat, lon).
            dl = DataLoader(ds, batch_size=batch_size, shuffle=False, collate_fn=collate_blind_patches)

            print(f"[DEBUG] Dataset has {len(ds)} samples. Beginning inference...")

            all_preds = []
            all_coords = []

            with torch.no_grad():
                for seq_batch, coords_batch in tqdm(dl, desc="Predicting Batches", leave=False):
                    # seq_batch: shape (B, 12, input_dim)
                    # coords_batch: list of length B, each is (lat, lon)
                    seq_batch = seq_batch.to(device)
                    logits = model(seq_batch)
                    all_preds.extend(torch.argmax(logits, dim=1).cpu().tolist())
                    all_coords.extend(coords_batch)

            if len(all_preds) != len(all_coords):
                raise ValueError(f"[ERROR] length mismatch: preds={len(all_preds)}, coords={len(all_coords)}")

            pred_classes = [label_mapping[p] for p in all_preds]

            # Build results DataFrame
            print(f"[DEBUG] Building DataFrame for {len(pred_classes)} predictions...")
            results_df = pd.DataFrame({
                "latitude": [c[0] for c in all_coords],
                "longitude": [c[1] for c in all_coords],
                "predicted_class": pred_classes
            })
            print(results_df.head(5))

            base_name = os.path.splitext(os.path.basename(csv_path))[0]
            out_csv = os.path.join(output_folder, base_name + "_with_predictions.csv")
            results_df.to_csv(out_csv, index=False)
            print(f"Results saved to: {out_csv}")

        except Exception as e:
            # Best-effort batch run: report the failing file and move on.
            print(f"[ERROR] Failed on file {csv_path}\nReason: {e}")

# ---------------------------------------------------------------------
# 4. Example usage
# ---------------------------------------------------------------------
# Example run: classify every *cleaned.csv in one folder with the saved LSTM weights.
if __name__ == "__main__":
    # NOTE(review): absolute, machine-specific Windows path — point this at the
    # folder holding your *cleaned.csv files before running on another machine.
    folder_path = r"C:\Users\jmm267\Downloads\Binbin\Dataset\raw_cleaned"
    # Feature columns handed to the dataset, in this exact order.
    # NOTE(review): presumably must match the columns/order used at training time — confirm.
    features_used = [
        "AOT","B1","B11","B12","B2","B3","B4","B5",
        "B6","B7","B8","B8A","B9","EVI","NDVI","NDWI",
        "TCI_B","TCI_G","TCI_R","WVP"
    ]

    # hidden_dim, num_layers and drop_prob are used to rebuild the network before
    # the saved weights are loaded, so they must match the training configuration.
    infer_on_blind_data(
        folder_path=folder_path,
        model_path="lstm_classifier_model.pth",
        output_folder="larchCN_inference_results",
        batch_size=32,
        hidden_dim=128,
        num_layers=2,
        drop_prob=0.2,
        feature_cols=features_used
    )
