# Biebrza Pixel Time-Series Classification with PyTorch

This notebook loads the exported **biannual pixel time-series** dataset from Google Drive
and trains two neural models to classify vegetation trajectory categories:

- A **1D Convolutional Network** (Conv1D)
- An **LSTM**

It then compares their performance.

Assumptions:
- The input is a table exported from Google Earth Engine, named
  `biebrza_biannual_pixel_series.csv`, stored in the folder `GEE_Biebrza` in your Drive.
- Each row is one pixel, with columns:
  - `NDMI_YYYY_YYYY`, `NBR_YYYY_YYYY`, `NIR_YYYY_YYYY` for multiple 2-year windows
  - `traj_simpl` (original category label)
  - `numark` (square id)
  - `pixel_id` (unique pixel id)
  - `cat_id` (original numeric label, can be ignored here)


In [1]:
!pip install -q torch torchvision torchaudio scikit-learn

import os
import numpy as np
import pandas as pd
from collections import Counter

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from sklearn.utils import shuffle as sk_shuffle
from sklearn.preprocessing import LabelEncoder


## 1. Load dataset from Google Drive

In [None]:
# Mount Google Drive when running in Colab. Outside Colab (e.g. a local Jupyter
# kernel) the google.colab package is not installed, so the mount is skipped.
try:
    from google.colab import drive
except ImportError:
    drive = None

if drive is not None:
    drive.mount('/content/drive')

# Path to your exported CSV (adjust if needed). Setting the environment variable
# BIEBRZA_PIXEL_SERIES_CSV overrides the default Drive location, so a local copy
# can be used without editing this cell.
data_path = os.environ.get(
    'BIEBRZA_PIXEL_SERIES_CSV',
    '/content/drive/MyDrive/GEE_Biebrza/biebrza_biannual_pixel_series.csv',
)

# Fail early with an actionable message instead of a bare pandas traceback.
if not os.path.isfile(data_path):
    raise FileNotFoundError(
        f'Pixel time-series CSV not found at {data_path!r}. '
        'Adjust data_path or set the BIEBRZA_PIXEL_SERIES_CSV environment variable.'
    )

df = pd.read_csv(data_path)
print('Data shape:', df.shape)
print('Columns:', df.columns.tolist()[:20], '...')


FileNotFoundError: [Errno 2] No such file or directory: 'E:\\git_projects\\biebrza-shrub-encroachment-analysis\\output\\kopec_and_slawik_2020_pixel_series\\biebrza_biannual_pixel_series.csv'

Quick look at the main label columns and one row:


In [None]:
# Show the first rows of the label / square-id columns, then every distinct
# original trajectory label present in the table.
label_preview = df[['traj_simpl', 'numark']].head()
print(label_preview)

distinct_labels = df['traj_simpl'].unique()
print('\nUnique traj_simpl values:', distinct_labels)


## 2. Re-label categories and subsample pixels

Steps:
1. Merge `wetland_to_trees` and `wetland_to_shrubs` → new class `wetland_to_woody`.
2. Keep only these classes:
   - `wetland_to_woody` (all pixels)
   - `shrubs_to_trees` (all pixels)
   - `stable_wetland` (max 5000 pixels)
   - `stable_trees` (max 5000 pixels)
   - `stable_shrubs` (max 5000 pixels)
3. Drop all other categories.
4. Compute **class weights** based on the full dataset size (before subsampling)
   for use in the loss function.


In [None]:
# Constants for the subsampling step (2.4).
MAX_STABLE_PIXELS = 5000   # cap applied to each of the three stable_* classes
SUBSAMPLE_SEED = 42        # seed for per-class sampling and for the final shuffle

# 2.1 Create unified class string column

# Copy the original label, replacing both wetland_to_* trajectories with the
# merged class wetland_to_woody.
is_wetland_to_woody = df['traj_simpl'].isin(['wetland_to_shrubs', 'wetland_to_trees'])
df['class_str'] = df['traj_simpl'].mask(is_wetland_to_woody, 'wetland_to_woody')

print('New unique class_str values:', df['class_str'].unique())

# 2.2 Keep only desired classes
target_classes = ['wetland_to_woody', 'shrubs_to_trees', 'stable_wetland', 'stable_trees', 'stable_shrubs']

df = df[df['class_str'].isin(target_classes)].copy()
print('After filtering, shape:', df.shape)
print('Class counts (all pixels, before subsampling):')
print(df['class_str'].value_counts())

# 2.3 Compute class weights based on FULL filtered dataset size
class_counts_full = df['class_str'].value_counts().to_dict()
num_classes = len(target_classes)
total_samples_full = len(df)

# A target class with no pixels falls back to a count of 1 below (which yields a
# very large weight); make that situation visible instead of silent.
missing_classes = [cls for cls in target_classes if cls not in class_counts_full]
if missing_classes:
    print('WARNING: no pixels found for classes:', missing_classes)

# Inverse-frequency style weight: total / (num_classes * count_of_class)
class_weights_dict = {
    cls: total_samples_full / (num_classes * class_counts_full.get(cls, 1))
    for cls in target_classes
}

print('\nClass weights (for loss):')
for k, v in class_weights_dict.items():
    print(f'  {k}: {v:.3f}')

# 2.4 Subsample per class (where requested)
samples_per_class = {
    'wetland_to_woody': None,   # all
    'shrubs_to_trees': None,   # all
    'stable_wetland': MAX_STABLE_PIXELS,
    'stable_trees': MAX_STABLE_PIXELS,
    'stable_shrubs': MAX_STABLE_PIXELS,
}

dfs = []

for cls in target_classes:
    df_cls = df[df['class_str'] == cls]
    n = len(df_cls)
    max_n = samples_per_class[cls]
    if max_n is not None and n > max_n:
        # More pixels than the cap: draw a reproducible random subset.
        dfs.append(df_cls.sample(n=max_n, random_state=SUBSAMPLE_SEED))
        print(f'Class {cls}: downsampled from {n} to {max_n} pixels')
    else:
        dfs.append(df_cls)
        print(f'Class {cls}: using ALL {n} pixels')

# Concatenate the per-class frames and shuffle the rows reproducibly.
df_balanced = pd.concat(dfs, ignore_index=True)
df_balanced = sk_shuffle(df_balanced, random_state=SUBSAMPLE_SEED)

print('\nAfter balancing, per-class counts:')
print(df_balanced['class_str'].value_counts())


## 3. Train/Val/Test split by square (`numark`) and class

Requirements:
- 60% train, 20% validation, 20% test.
- Stratified by **class_str**.
- All pixels from a given `numark` (square id) must go to the same split.

Implementation:
- For each class, collect unique `numark` values.
- Within each class, split its `numark`s into train/val/test sets by the 60/20/20 rule.
- Union over classes to get global sets of `numark` for each split.
  (Assumes each `numark` belongs predominantly to one class, as is typical for MPC squares.)


In [None]:
# 3.1 Stratified group split by class_str and numark

# Assign every square to exactly ONE class: its most frequent class_str.
# Collecting squares separately per class would place a square that contains
# pixels of several classes into more than one split, leaking pixels of the
# same square between train/val/test.
# (Series.mode() returns its values sorted, so ties are broken deterministically.)
square_class = (
    df_balanced.dropna(subset=['numark'])
    .groupby('numark')['class_str']
    .agg(lambda labels: labels.mode().iloc[0])
)

train_squares = set()
val_squares = set()
test_squares = set()

rng = np.random.default_rng(123)

for cls in target_classes:
    # Squares whose dominant class is `cls` (groupby returns them in sorted key order).
    squares = list(square_class.index[square_class == cls])
    rng.shuffle(squares)

    n = len(squares)
    n_train = int(0.6 * n)
    n_val = int(0.2 * n)
    # rest to test

    train_s = squares[:n_train]
    val_s = squares[n_train:n_train + n_val]
    test_s = squares[n_train + n_val:]

    train_squares.update(train_s)
    val_squares.update(val_s)
    test_squares.update(test_s)

# Each square was assigned once, so the three sets must not overlap.
assert (
    train_squares.isdisjoint(val_squares)
    and train_squares.isdisjoint(test_squares)
    and val_squares.isdisjoint(test_squares)
), 'A square (numark) was assigned to more than one split.'

print('Unique train squares:', len(train_squares))
print('Unique val squares  :', len(val_squares))
print('Unique test squares :', len(test_squares))

# 3.2 Build the actual splits
# Rows whose numark is missing match none of the sets and are left out of all splits.
is_train = df_balanced['numark'].isin(train_squares)
is_val = df_balanced['numark'].isin(val_squares)
is_test = df_balanced['numark'].isin(test_squares)

df_train = df_balanced[is_train].copy()
df_val = df_balanced[is_val].copy()
df_test = df_balanced[is_test].copy()

print('\nSplit sizes (rows):')
print('Train:', df_train.shape[0])
print('Val  :', df_val.shape[0])
print('Test :', df_test.shape[0])

print('\nPer-class counts in Train:')
print(df_train['class_str'].value_counts())
print('\nPer-class counts in Val:')
print(df_val['class_str'].value_counts())
print('\nPer-class counts in Test:')
print(df_test['class_str'].value_counts())


## 4. Build time-series tensors for PyTorch

We:
- Identify all NDMI/NBR/NIR band columns.
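- Sort them by name; this groups the columns by band (`NBR_*`, `NDMI_*`, `NIR_*`) and keeps each band's 2-year windows in chronological order.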
- Reshape features into `[N, T, C]` with `C=3` (NDMI, NBR, NIR).
- Encode `class_str` to integer labels 0..(num_classes-1).


In [None]:
# 4.1 Identify time-series columns (NDMI/NBR/NIR)

# Sorting is lexicographic, so the list is band-major: every NBR_* column first,
# then every NDMI_*, then every NIR_*.
band_prefixes = ('NDMI_', 'NBR_', 'NIR_')
ts_cols = sorted(col for col in df_balanced.columns if col.startswith(band_prefixes))

print('Number of time-series columns:', len(ts_cols))
print('First 9 time-series cols:', ts_cols[:9])

# 4.2 Determine sequence length (T) and feature channels (C=3)
T, leftover_cols = divmod(len(ts_cols), 3)
assert leftover_cols == 0, 'Expected number of time-series columns to be multiple of 3.'

print(f'Sequence length T = {T}, channels C = 3')

# 4.3 Label encoding

le = LabelEncoder()
le.fit(target_classes)  # ensure consistent class order

# Add the integer label column to each split frame in place.
for split_df in (df_train, df_val, df_test):
    split_df['label_idx'] = le.transform(split_df['class_str'])

print('Label mapping:')
for idx, cls in enumerate(le.classes_):
    print(f'  {cls} -> {idx}')

num_classes = len(le.classes_)

# 4.4 Convert class_weights_dict (string->float) into tensor aligned with label_idx
class_weights_list = [class_weights_dict[cls] for cls in le.classes_]

class_weights = torch.tensor(class_weights_list, dtype=torch.float32)
print('\nClass weights tensor:', class_weights)


## 5. PyTorch Dataset and DataLoaders


In [None]:
class PixelTimeSeriesDataset(Dataset):
    """Per-pixel multi-band time series with integer class labels.

    Columns are grouped by band using their names rather than their position
    in ``ts_cols``. A plain ``reshape(N, T, 3)`` is only correct when columns
    are interleaved time-major (band values of window 1, then window 2, ...);
    a name-sorted list is band-major (all ``NBR_*``, then ``NDMI_*``, ...), and
    reshaping it directly mixes time steps and channels.

    Args:
        df: DataFrame holding the time-series columns and the label column.
        ts_cols: Names of the time-series columns, formatted ``<BAND>_<window>``
            (e.g. ``NDMI_1984_1985``). The order of this list does not matter.
        label_col: Name of the column holding integer class labels.

    Attributes:
        X: float32 array of shape ``[N, T, C]``; channels follow the
            alphabetical order of the band prefixes, time steps follow the
            sorted window suffixes.
        y: int64 array of shape ``[N]``.

    Raises:
        ValueError: If ``ts_cols`` is empty or the bands do not all cover the
            same set of windows.
    """

    def __init__(self, df, ts_cols, label_col):
        # Group columns by band prefix (the text before the first underscore).
        columns_by_band = {}
        for col in ts_cols:
            band, _, window = col.partition('_')
            columns_by_band.setdefault(band, []).append((window, col))

        if not columns_by_band:
            raise ValueError('ts_cols must contain at least one time-series column.')

        band_windows = []
        band_arrays = []
        for band in sorted(columns_by_band):
            # Sorting by the window suffix puts each band's columns in time order.
            entries = sorted(columns_by_band[band])
            band_windows.append([window for window, _ in entries])
            band_cols = [col for _, col in entries]
            band_arrays.append(df[band_cols].values.astype(np.float32))  # [N, T]

        # Every band must describe the same windows, otherwise a time step
        # would combine values taken from different periods.
        if any(windows != band_windows[0] for windows in band_windows[1:]):
            raise ValueError('All bands must cover the same time windows.')

        # Stack the per-band [N, T] arrays along a new last axis -> [N, T, C].
        self.X = np.stack(band_arrays, axis=-1)
        self.y = df[label_col].values.astype(np.int64)

    def __len__(self):
        return len(self.y)

    def __getitem__(self, idx):
        return self.X[idx], self.y[idx]


# One dataset per split, all built from the same columns and label column.
train_ds, val_ds, test_ds = (
    PixelTimeSeriesDataset(split_frame, ts_cols, 'label_idx')
    for split_frame in (df_train, df_val, df_test)
)

batch_size = 256

# Only the training loader shuffles; validation and test keep row order.
train_loader = DataLoader(train_ds, shuffle=True, batch_size=batch_size)
val_loader = DataLoader(val_ds, shuffle=False, batch_size=batch_size)
test_loader = DataLoader(test_ds, shuffle=False, batch_size=batch_size)

# Cell output: number of samples in (train, val, test).
len(train_ds), len(val_ds), len(test_ds)


## 6. Define models: 1D-Conv and LSTM


In [None]:
# Prefer the GPU when CUDA is available, otherwise run on the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print('Using device:', device)

class Conv1DClassifier(nn.Module):
    """Two 1D convolutions over time, global average pooling, linear head.

    Args:
        seq_len: Sequence length; stored on the module but not used by any layer.
        num_classes: Number of output logits.

    Input is ``[B, T, 3]``; output is ``[B, num_classes]`` logits.
    """

    def __init__(self, seq_len, num_classes):
        super().__init__()
        self.seq_len = seq_len
        # kernel_size=3 with padding=1 keeps the time length unchanged.
        self.conv1 = nn.Conv1d(3, 32, kernel_size=3, padding=1)
        self.conv2 = nn.Conv1d(32, 64, kernel_size=3, padding=1)
        self.relu = nn.ReLU()
        self.pool = nn.AdaptiveAvgPool1d(1)  # averages over the whole time axis
        self.fc = nn.Linear(64, num_classes)

    def forward(self, x):
        # Conv1d expects channels before time: [B, T, C] -> [B, C, T]
        hidden = x.transpose(1, 2)
        for conv in (self.conv1, self.conv2):
            hidden = self.relu(conv(hidden))
        # [B, 64, 1] -> [B, 64]
        pooled = self.pool(hidden).flatten(start_dim=1)
        return self.fc(pooled)


class LSTMClassifier(nn.Module):
    """Unidirectional LSTM whose final hidden state feeds a linear classifier.

    Args:
        input_size: Number of features per time step.
        hidden_size: LSTM hidden dimension.
        num_layers: Number of stacked LSTM layers.
        num_classes: Number of output logits.

    Input is ``[B, T, input_size]`` (batch first); output is ``[B, num_classes]``.
    """

    def __init__(self, input_size, hidden_size, num_layers, num_classes):
        super().__init__()
        lstm_options = dict(
            input_size=input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            bidirectional=False,
        )
        self.lstm = nn.LSTM(**lstm_options)
        self.fc = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        # Only the final hidden states are needed: [num_layers, B, H]
        _, (final_hidden, _) = self.lstm(x)
        top_layer_hidden = final_hidden[-1]  # last layer -> [B, H]
        return self.fc(top_layer_hidden)


## 7. Training and evaluation utilities


In [None]:
def train_one_epoch(model, loader, criterion, optimizer, device):
    """Run one optimisation pass over ``loader``.

    Puts the model in training mode and performs one optimizer step per batch.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the batch losses averaged with
        batch-size weighting, and the fraction of samples whose argmax
        prediction matched the label. Both are measured on the forward pass
        made before each optimizer step.
    """
    model.train()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    for inputs, targets in loader:
        inputs, targets = inputs.to(device), targets.to(device)
        batch_n = inputs.size(0)

        optimizer.zero_grad()
        outputs = model(inputs)
        batch_loss = criterion(outputs, targets)
        batch_loss.backward()
        optimizer.step()

        # Weight the batch loss by batch size so a smaller last batch counts less.
        loss_sum += batch_loss.item() * batch_n
        n_correct += (outputs.argmax(dim=1) == targets).sum().item()
        n_seen += batch_n

    return loss_sum / n_seen, n_correct / n_seen


def evaluate(model, loader, criterion, device):
    """Compute loss and accuracy of ``model`` over ``loader`` without gradients.

    Puts the model in eval mode; no parameters are updated.

    Returns:
        Tuple ``(mean_loss, accuracy)``: the batch losses averaged with
        batch-size weighting, and the fraction of samples whose argmax
        prediction matched the label.
    """
    model.eval()
    loss_sum, n_correct, n_seen = 0.0, 0, 0

    with torch.no_grad():
        for inputs, targets in loader:
            inputs, targets = inputs.to(device), targets.to(device)
            batch_n = inputs.size(0)

            outputs = model(inputs)
            batch_loss = criterion(outputs, targets)

            loss_sum += batch_loss.item() * batch_n
            n_correct += (outputs.argmax(dim=1) == targets).sum().item()
            n_seen += batch_n

    return loss_sum / n_seen, n_correct / n_seen


## 8. Train Conv1D model


In [None]:
# Seed torch so weight initialisation and batch shuffling repeat when this cell
# is re-run (GPU kernels may still introduce small run-to-run differences).
torch.manual_seed(42)

seq_len = T
conv_model = Conv1DClassifier(seq_len=seq_len, num_classes=num_classes).to(device)

criterion = nn.CrossEntropyLoss(weight=class_weights.to(device))
optimizer = torch.optim.Adam(conv_model.parameters(), lr=1e-3)

num_epochs = 20

best_val_acc = 0.0
best_state = None

for epoch in range(1, num_epochs + 1):
    train_loss, train_acc = train_one_epoch(conv_model, train_loader, criterion, optimizer, device)
    val_loss, val_acc = evaluate(conv_model, val_loader, criterion, device)

    if val_acc > best_val_acc:
        best_val_acc = val_acc
        # state_dict() returns references to the live parameter tensors, which
        # the optimizer keeps updating in place. Clone them so the snapshot
        # really is the best epoch and not whatever the last epoch produced.
        best_state = {
            name: tensor.detach().clone()
            for name, tensor in conv_model.state_dict().items()
        }

    print(f'Epoch {epoch:02d}: '
          f'train_loss={train_loss:.4f}, train_acc={train_acc:.3f}, '
          f'val_loss={val_loss:.4f}, val_acc={val_acc:.3f}')

print('\nBest Conv1D val_acc:', best_val_acc)

# Restore the best-validation weights before the final test evaluation.
if best_state is not None:
    conv_model.load_state_dict(best_state)

test_loss_conv, test_acc_conv = evaluate(conv_model, test_loader, criterion, device)
print(f'Conv1D TEST: loss={test_loss_conv:.4f}, acc={test_acc_conv:.3f}')


## 9. Train LSTM model


In [None]:
# Seed torch so weight initialisation and batch shuffling repeat when this cell
# is re-run (GPU kernels may still introduce small run-to-run differences).
torch.manual_seed(42)

lstm_model = LSTMClassifier(input_size=3, hidden_size=64, num_layers=1, num_classes=num_classes).to(device)

criterion_lstm = nn.CrossEntropyLoss(weight=class_weights.to(device))
optimizer_lstm = torch.optim.Adam(lstm_model.parameters(), lr=1e-3)

best_val_acc_lstm = 0.0
best_state_lstm = None

for epoch in range(1, num_epochs + 1):
    train_loss, train_acc = train_one_epoch(lstm_model, train_loader, criterion_lstm, optimizer_lstm, device)
    val_loss, val_acc = evaluate(lstm_model, val_loader, criterion_lstm, device)

    if val_acc > best_val_acc_lstm:
        best_val_acc_lstm = val_acc
        # state_dict() returns references to the live parameter tensors, which
        # the optimizer keeps updating in place. Clone them so the snapshot
        # really is the best epoch and not whatever the last epoch produced.
        best_state_lstm = {
            name: tensor.detach().clone()
            for name, tensor in lstm_model.state_dict().items()
        }

    print(f'Epoch {epoch:02d}: '
          f'train_loss={train_loss:.4f}, train_acc={train_acc:.3f}, '
          f'val_loss={val_loss:.4f}, val_acc={val_acc:.3f}')

print('\nBest LSTM val_acc:', best_val_acc_lstm)

# Restore the best-validation weights before the final test evaluation.
if best_state_lstm is not None:
    lstm_model.load_state_dict(best_state_lstm)

test_loss_lstm, test_acc_lstm = evaluate(lstm_model, test_loader, criterion_lstm, device)
print(f'LSTM TEST: loss={test_loss_lstm:.4f}, acc={test_acc_lstm:.3f}')


## 10. Final comparison


In [None]:
# Collect the report line by line, then emit it with a single print call.
report_lines = [
    '\n=== Final Test Results ===',
    f'Conv1D  - test_acc: {test_acc_conv:.3f}',
    f'LSTM    - test_acc: {test_acc_lstm:.3f}',
    '\nClass mapping (label_idx -> class_str):',
]
# le.classes_ is indexed by label_idx, so enumerate yields the mapping.
report_lines.extend(f'  {idx}: {cls}' for idx, cls in enumerate(le.classes_))
print('\n'.join(report_lines))
