## In this notebook

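- Download satellite-imagery features (Sentinel-2 / Landsat, via the Planetary Computer STAC API) for every sample in the metadata and cache them on disk as `.npy` arrays.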

In [1]:
import os
from tqdm import tqdm

# analytics
import pandas as pd
pd.set_option("display.max_rows", 100)
pd.set_option("display.max_columns", 100)
import numpy as np

# features
import planetary_computer as pc
from pystac_client import Client

# plot
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()

In [2]:
cd ..

/Users/strvmac/projects/competition-drivendata-tick-tick-bloom


In [3]:
from utils.features import (
    crop_landsat_image,
    crop_sentinel_image,
    get_bounding_box,
    get_date_range,
    select_best_item,
)

In [4]:
# All paths are derived from a single project root instead of repeating a
# user-specific absolute path three times. The working directory is the project
# root at this point (the notebook changes into it before importing `utils`).
PROJECT_ROOT = os.getcwd()

# Trailing "" keeps the trailing path separator that the folder constants had before.
DATA_FOLDER = os.path.join(PROJECT_ROOT, "data", "")
FEATURES_FOLDER = os.path.join(DATA_FOLDER, "features", "")
METADATA_FILEPATH = os.path.join(DATA_FOLDER, "metadata.csv")

# Root endpoint of the Planetary Computer STAC API.
# NOTE(review): the name reads like a typo for "PYSTAC"; it is kept because
# other cells reference it.
PYSTAT_CLIENT_URL = "https://planetarycomputer.microsoft.com/api/stac/v1"

In [5]:
! ls -l $DATA_FOLDER

total 3016
drwxr-xr-x  5 strvmac  staff      160 Feb  7 20:59 features
-rw-r--r--@ 1 strvmac  staff  1071430 Feb  6 16:02 metadata.csv
-rw-r--r--@ 1 strvmac  staff    89581 Feb  6 16:02 submission_format.csv
-rw-r--r--@ 1 strvmac  staff   377581 Feb  6 16:02 train_labels.csv


## Load metadata

In [6]:
%%time

df = (
    pd
    .read_csv(METADATA_FILEPATH)
    .sort_values(by=["split", "date"], ascending=[False, True])
    .reset_index(drop=True)
)

df.date = pd.to_datetime(df.date)
df.tail()

CPU times: user 30.5 ms, sys: 8.16 ms, total: 38.7 ms
Wall time: 37.4 ms


Unnamed: 0,uid,latitude,longitude,date,split
23565,howu,36.7085,-121.749,2021-12-29,test
23566,nsoi,36.7368,-121.734,2021-12-29,test
23567,prfi,36.7518,-121.742,2021-12-29,test
23568,teuu,36.7723,-121.788,2021-12-29,test
23569,thki,36.7254,-121.73,2021-12-29,test


## Load features

In [7]:
# Open a client on the STAC API; `pc.sign_inplace` is passed as the modifier
# hook of the client.
catalog = Client.open(PYSTAT_CLIENT_URL, modifier=pc.sign_inplace)

In [8]:
%%time

errored_ids = []
    
for i, row in tqdm(df.iterrows(), total=len(df)):
    
    image_array_path = os.path.join(FEATURES_FOLDER, f"{row.split}/{row.uid}.npy")

    if os.path.exists(image_array_path):
        continue

    try:
        ## QUERY STAC API
        # get query ranges for location and date
        search_bbox = get_bounding_box(row.latitude, row.longitude, meter_buffer=50000)
        search_date_range = get_date_range(row.date, time_buffer_days=15)

        # search the planetary computer
        search = catalog.search(
            collections=["sentinel-2-l2a", "landsat-c2-l2"],
            # collections=["sentinel-2-l2a", "landsat-8-c2-l2", "landsat-9-c2-l2"],
            bbox=search_bbox,
            datetime=search_date_range,
        )
        items = [item for item in search.get_all_items()]

        ## GET BEST IMAGE
        if len(items) == 0:
            pass
        else:
            best_item, item_platform, item_date = select_best_item(
                items, row.date, row.latitude, row.longitude
            )

        ## CONVERT TO FEATURES
        # get small bbox just for features
        feature_bbox = get_bounding_box(row.latitude, row.longitude, meter_buffer=100)

        # crop the image
        if "sentinel" in item_platform.lower():
            image_array = crop_sentinel_image(best_item, feature_bbox)
        else:
            image_array = crop_landsat_image(best_item, feature_bbox)

        # save image array so we don't have to rerun
        with open(image_array_path, "wb") as f:
            np.save(f, image_array)

    # keep track of any that ran into errors without interrupting the process
    except Exception as e:
        errored_ids.append(row.uid)
    
    # break

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23570/23570 [2:49:55<00:00,  2.31it/s]

CPU times: user 1h 7min 59s, sys: 7min 36s, total: 1h 15min 35s
Wall time: 2h 49min 55s





In [9]:
# Summarise how many samples ended up without an image array.
n_failed = len(errored_ids)
print(f"❌ Could not pull satellite imagery for {n_failed} samples!")

❌ Could not pull satellite imagery for 264 samples!


## Results

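- Data downloaded and saved into proper folders: one `.npy` image array per sample under `data/features/<split>/`.
- Satellite imagery could not be pulled for 264 of the 23,570 samples.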