In [1]:
!pip install rasterio



In [2]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import os
import rasterio
from tqdm.notebook import tqdm
import datetime
import pandas as pd

# Set random seed for reproducibility
SEED = 42
np.random.seed(SEED)

In [None]:
DATA_PATH = 'datasets'

train_dataset = pd.read_csv(os.path.join(DATA_PATH, 'TrainDataset.csv'))
test_dataset = pd.read_csv(os.path.join(DATA_PATH, 'TestDataset.csv'))

train_images_path = os.path.join(DATA_PATH, 'train_test_images/train/')
test_images_path = os.path.join(DATA_PATH, 'train_test_images/test/')

In [4]:
# Lets drop null tifpaths
train_dataset = train_dataset.dropna(subset=['tifPath'])
test_dataset = test_dataset.dropna(subset=['tifPath'])

print(f"Train dataset shape: {train_dataset.shape}")
print(f"Test dataset shape: {test_dataset.shape}")

Train dataset shape: (7433, 6)
Test dataset shape: (2201, 4)


In [7]:
# Replace the image paths
train_dataset['tifPath'] = train_dataset['tifPath'].apply(lambda x: os.path.join(train_images_path, os.path.basename(x)))
test_dataset['tifPath'] = test_dataset['tifPath'].apply(lambda x: os.path.join(test_images_path, os.path.basename(x)))

# View train data
train_dataset.head()

Unnamed: 0,ID,year,month,tifPath,Target,class
0,ID_h14T0B_Jan,2024,Jan,datasets\train_test_images/train/s2_Rubber_ID_...,Rubber,3
1,ID_KbyKOr_Jan,2024,Jan,datasets\train_test_images/train/s2_Rubber_ID_...,Rubber,3
2,ID_t4Tmmn_Jan,2024,Jan,datasets\train_test_images/train/s2_Rubber_ID_...,Rubber,3
3,ID_yipWoC_Jan,2024,Jan,datasets\train_test_images/train/s2_Rubber_ID_...,Rubber,3
4,ID_XKiksa_Jan,2024,Jan,datasets\train_test_images/train/s2_Rubber_ID_...,Rubber,3


In [8]:
# View test data
test_dataset.head()

Unnamed: 0,ID,year,month,tifPath
0,ID_731818_Jan,2024,Jan,datasets\train_test_images/test/s2_Unknown_ID_...
1,ID_790093_Jan,2024,Jan,datasets\train_test_images/test/s2_Unknown_ID_...
2,ID_931033_Jan,2024,Jan,datasets\train_test_images/test/s2_Unknown_ID_...
3,ID_079024_Jan,2024,Jan,datasets\train_test_images/test/s2_Unknown_ID_...
4,ID_691532_Jan,2024,Jan,datasets\train_test_images/test/s2_Unknown_ID_...


In [9]:
print(f"Train dataset shape: {train_dataset.shape}")
print(f"Test dataset shape: {test_dataset.shape}")

Train dataset shape: (7433, 6)
Test dataset shape: (2201, 4)


In [10]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from functools import partial

def process_statistics(row, all_bands, is_train=False):
    tif_path = row["tifPath"]
    if pd.isna(tif_path) or not os.path.exists(tif_path):
        print(f"Missing or invalid path: {tif_path}")
        return None

    try:
        with rasterio.open(tif_path) as src:
            band_stats = {}
            for i in range(1, src.count + 1):
                if i-1 < len(all_bands):
                    band_name = all_bands[i - 1]
                    band_data = src.read(i).astype('float32')
                    band_data[band_data == 0] = np.nan  # mask invalid data

                    # Extract statistics
                    band_stats[f"{band_name}_mean"] = np.nanmean(band_data)
                    band_stats[f"{band_name}_std"] = np.nanstd(band_data)
                    band_stats[f"{band_name}_min"] = np.nanmin(band_data)
                    band_stats[f"{band_name}_max"] = np.nanmax(band_data)
                    band_stats[f"{band_name}_median"] = np.nanmedian(band_data)

        row_data = {
            "ID": row["ID"],
            "month": row.get("month", None),
            **band_stats
        }

        if is_train:
            if "Target" in row:
                row_data["Crop"] = row["Target"]
            if "class" in row:
                row_data["class"] = row["class"]

        return row_data

    except Exception as e:
        print(f"Error processing {tif_path}: {e}")
        return None

def extract_sentinel2_statistics(
    df: pd.DataFrame, 
    data_path: str, 
    output_filename: str, 
    is_train: bool = False, 
    workers: int = 4) -> pd.DataFrame:
    
    ALL_BANDS = ['B1', 'B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B9', 'B11', 'B12']
    process_func = partial(process_statistics, all_bands=ALL_BANDS, is_train=is_train)
    features = []

    print(f"Processing {len(df)} files using {workers} parallel workers...")

    with ThreadPoolExecutor(max_workers=workers) as executor:
        future_to_row = {executor.submit(process_func, row): idx for idx, row in df.iterrows()}

        for future in tqdm(as_completed(future_to_row), total=len(future_to_row), desc="Extracting Band Statistics"):
            result = future.result()
            if result:
                features.append(result)

    features_df = pd.DataFrame(features)

    # Calculate success rate
    success_rate = len(features_df) / len(df) * 100
    print(f"Successfully processed {len(features_df)} out of {len(df)} files ({success_rate:.1f}%)")

    # Save to CSV
    output_path = os.path.join(DATA_PATH, output_filename)
    features_df.to_csv(output_path, index=False)
    print(f"Saved features to {output_path}")

    return features_df

# Lets save the features in the drive
train_features_path = os.path.join(DATA_PATH, 'train_features.csv')
test_features_path = os.path.join(DATA_PATH, 'test_features.csv')

train_features_df = extract_sentinel2_statistics(
        train_dataset,
        data_path=train_images_path,
        output_filename='train_features.csv',
        is_train=True,
        workers=4
    )

test_features_df = extract_sentinel2_statistics(
        test_dataset,
        data_path=test_images_path,
        output_filename='test_features.csv',
        is_train=False,
        workers=4
    )

Processing 7433 files using 4 parallel workers...


Extracting Band Statistics:   0%|          | 0/7433 [00:00<?, ?it/s]

Successfully processed 7433 out of 7433 files (100.0%)
Saved features to datasets\train_features.csv
Processing 2201 files using 4 parallel workers...


Extracting Band Statistics:   0%|          | 0/2201 [00:00<?, ?it/s]

Successfully processed 2201 out of 2201 files (100.0%)
Saved features to datasets\test_features.csv


In [11]:
train_features_df = pd.read_csv(os.path.join(DATA_PATH, 'train_features.csv'))
test_features_df = pd.read_csv(os.path.join(DATA_PATH, 'test_features.csv'))

In [12]:
train_features_df.head()

Unnamed: 0,ID,month,B1_mean,B1_std,B1_min,B1_max,B1_median,B2_mean,B2_std,B2_min,...,B11_min,B11_max,B11_median,B12_mean,B12_std,B12_min,B12_max,B12_median,Crop,class
0,ID_ZUfp59_Jan,Jan,3119.697,593.1587,2584.0,4595.0,2810.0,2963.4856,760.8518,2267.0,...,2730.0,6633.0,3372.0,2835.8235,1114.4021,1759.0,5613.0,2238.0,Palm,2
1,ID_KfCbOO_Jan,Jan,4412.24,496.0062,3406.0,5444.0,4367.0,4380.461,642.4509,2979.0,...,2716.0,7161.0,5151.0,4236.176,830.2943,2072.0,5860.0,4260.0,Palm,2
2,ID_t4Tmmn_Jan,Jan,2530.2866,10.403985,2501.0,2600.0,2529.0,2249.944,24.179565,2187.0,...,1977.0,3596.0,2988.0,1940.8265,145.27899,1460.0,2422.0,1937.0,Rubber,3
3,ID_XKiksa_Jan,Jan,2682.8823,279.12048,2526.0,3972.0,2577.0,2397.5918,364.70447,2194.0,...,1980.0,5821.0,2401.0,1892.7155,639.4767,1446.0,4789.0,1669.0,Rubber,3
4,ID_yipWoC_Jan,Jan,2559.1108,28.09847,2530.0,2823.0,2555.0,2277.1003,32.87494,2210.0,...,2476.0,3731.0,2978.0,1906.5609,87.35511,1695.0,2477.0,1896.0,Rubber,3


In [13]:
train_features_df.columns.tolist()

['ID',
 'month',
 'B1_mean',
 'B1_std',
 'B1_min',
 'B1_max',
 'B1_median',
 'B2_mean',
 'B2_std',
 'B2_min',
 'B2_max',
 'B2_median',
 'B3_mean',
 'B3_std',
 'B3_min',
 'B3_max',
 'B3_median',
 'B4_mean',
 'B4_std',
 'B4_min',
 'B4_max',
 'B4_median',
 'B5_mean',
 'B5_std',
 'B5_min',
 'B5_max',
 'B5_median',
 'B6_mean',
 'B6_std',
 'B6_min',
 'B6_max',
 'B6_median',
 'B7_mean',
 'B7_std',
 'B7_min',
 'B7_max',
 'B7_median',
 'B8_mean',
 'B8_std',
 'B8_min',
 'B8_max',
 'B8_median',
 'B8A_mean',
 'B8A_std',
 'B8A_min',
 'B8A_max',
 'B8A_median',
 'B9_mean',
 'B9_std',
 'B9_min',
 'B9_max',
 'B9_median',
 'B11_mean',
 'B11_std',
 'B11_min',
 'B11_max',
 'B11_median',
 'B12_mean',
 'B12_std',
 'B12_min',
 'B12_max',
 'B12_median',
 'Crop',
 'class']