<a href="https://colab.research.google.com/github/issmythe/ccai_crop_mapping/blob/main/tutorial_cleaned.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# OpenMapFlow Tutorial

### Sections
1. Installing OpenMapFlow
2. Exploring labeled earth observation data
3. Training a model
4. Doing inference over a small region
5. Deploying the best model

### Prerequisites:
- Github account
- Github access token (obtained [here](https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/creating-a-personal-access-token))
- Forked OpenMapFlow repository
- Basic Python knowledge  

### Editable Google Doc for Q&A:
https://docs.google.com/document/d/1Kp6MphER1G5tdLYeAzl4n19S10TweIxiYT64rXsjKm4/edit?usp=sharing

## 1. Clone Github repo and install OpenMapFlow


In [None]:
# Quietly install ipywidgets constrained to >=7,<8; the link on the install
# line points at the Colab issue given as the reason for the constraint.
!pip install "ipywidgets>=7,<8" -q # https://github.com/googlecolab/colabtools/issues/3020

In [None]:
#@title Git credentials
from ipywidgets import HTML, Password, Text, Textarea, VBox

# Credential widgets, in the order later cells index them:
# [0] token (masked input), [1] email, [2] user name.
inputs = [
    Password(description="Github Token:"),
    *(Text(description=f'Github {field}:') for field in ("Email", "User")),
]
# Last expression of the cell, so the notebook renders the stacked widgets.
VBox(inputs)

In [None]:
#@title Clone directory
token = inputs[0].value
email = inputs[1].value
username = inputs[2].value

github_url_input = Textarea(value=f'https://github.com/{username}/openmapflow.git')
VBox([HTML(value="<b>Github Clone URL</b>"), github_url_input])

! git clone -q https://$token@github.com/nasaharvest/openmapflow.git

In [None]:
#@title Config
from pathlib import Path

github_url = github_url_input.value
project_name = "crop-mask-example" # maize-example
country_name = "Togo" # Kenya

for input_value in [token, email, username, github_url]:
  if input_value.strip() == "":
    raise ValueError("Found input with blank value.")

path_to_project = f"{Path(github_url).stem}/{project_name}"

!git config --global user.email $username
!git config --global user.name $email
!git clone {github_url.replace("https://", f"https://{username}:{token}@")}

%cd {path_to_project}

In [None]:
#@title Installs
# Quietly install openmapflow with its "all" extra, then dvc with its "gs"
# extra and cmocean. Output is redirected to /dev/null, so a failed install
# will not be visible in this cell.
!pip install openmapflow[all] -q &> /dev/null
!pip install dvc[gs] cmocean -q &> /dev/null

In [None]:
#@title Download GDAL
%%shell
# Install one pinned GDAL version (python3-gdal, gdal-bin, libgdal-dev) after
# adding the ubuntugis-unstable PPA. The output of each command is discarded.
GDAL_VERSION="3.6.4+dfsg-1~jammy0"
add-apt-repository -y ppa:ubuntugis/ubuntugis-unstable &> /dev/null
apt-get -qq update &> /dev/null
apt-get -qq install python3-gdal=$GDAL_VERSION gdal-bin=$GDAL_VERSION libgdal-dev=$GDAL_VERSION &> /dev/null

In [None]:
# CLI
# Run the openmapflow command-line tool with no arguments
# (presumably prints its usage/help - confirm).
!openmapflow

## 2. Exploring labeled earth observation data 🛰️



### Setup

In [None]:
# A Google Cloud Account is required to access the data
# Runs gcloud's application-default login command to authenticate this session.
!gcloud auth application-default login

In [None]:
# Pull in data already available
# (runs `dvc pull` for this project; its output is redirected to /dev/null)
! dvc pull &> /dev/null

In [None]:
# See report of data already available
# (invokes the `datasets` subcommand of the openmapflow CLI)
! openmapflow datasets

### Exploring labels

In [None]:
#@title Imports + read data
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from datasets import datasets, label_col
from openmapflow.constants import LAT, LON, DATASET, SUBSET

# Load only the first registered dataset (marked "Global only" in the original
# note) and combine the result into a single dataframe.
df = pd.concat(dataset.load_df(to_np=True) for dataset in datasets[:1])

In [None]:
#@title Convert pandas dataframe to geopandas dataframe
# Build one point per row from the longitude/latitude columns in a single
# vectorised call, and tag the frame as EPSG:4326 (lon/lat degrees) so the
# spatial join against the country boundaries is not run with a missing CRS.
# NOTE(review): assumes the LON/LAT columns hold WGS84 degrees - confirm.
gdf = gpd.GeoDataFrame(
    df,
    geometry=gpd.points_from_xy(df[LON], df[LAT]),
    crs="EPSG:4326",
)

In [None]:
#@title Plot labels
# Light-grey world basemap from the low-resolution Natural Earth layer that
# geopandas exposes through gpd.datasets.
world = gpd.read_file(gpd.datasets.get_path('naturalearth_lowres'))
ax = world.plot(facecolor="lightgray", figsize=(20, 20))
ax.axis('off')
ax.set_title("Label Locations")

# One small marker per label, coloured by the dataset column, legend at the
# lower left. The trailing semicolon suppresses the cell's text output.
marker_options = {'marker': 'o', 'markersize': 1}
legend_options = {'legend': True, 'legend_kwds': {'loc': 'lower left'}}
gdf.plot(ax=ax, column=DATASET, categorical=True, **marker_options, **legend_options);

## 3. Train a model 🏋️‍♂️

In [None]:
# Reload openmapflow.train_utils in the running kernel (presumably so that
# changes made to the module after its first import are picked up - confirm).
import importlib
from openmapflow import train_utils
importlib.reload(train_utils)


In [None]:
#@title Imports
import warnings
from argparse import ArgumentParser

import numpy as np
import geopandas as gpd
import pandas as pd
import torch
import yaml
from datasets import datasets, label_col
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
)
from torch.utils.data import DataLoader
from tsai.models.TransformerModel import TransformerModel

from openmapflow.bands import BANDS_MAX
from openmapflow.constants import SUBSET
from openmapflow.pytorch_dataset import PyTorchDataset
from openmapflow.train_utils import (
    generate_model_name,
    get_x_y,
    model_path_from_name,
    upsample_df,
)
from openmapflow.utils import tqdm

# IN_COLAB records whether the google.colab package can be imported; the
# training cell uses it to disable the per-batch progress bars.
try:
    import google.colab  # noqa
except ImportError:
    IN_COLAB = False
else:
    IN_COLAB = True


# TorchScript throws excessive warnings
warnings.filterwarnings("ignore", category=UserWarning)

In [None]:
#@title Overwrite get_x_y
from typing import List, Tuple
from openmapflow.constants import CLASS_PROB, EO_DATA, MONTHS
from openmapflow.utils import str_to_np

def get_x_y(
    df: pd.DataFrame,
    label_col: str = CLASS_PROB,
    start_month: str = "February",
    input_months: int = 12,
) -> Tuple[List[np.ndarray], List[float]]:
    """Extract model inputs and labels from a labeled dataframe.

    Args:
        df: Dataframe with an ``EO_DATA`` column holding one 2-D series per
            row (an array, or a string that ``str_to_np`` can parse) and a
            label column.
        label_col: Name of the column returned as ``y``.
        start_month: Month name, looked up in ``MONTHS``; its position is the
            first row kept from every series.
        input_months: Number of consecutive rows kept from that position on.

    Returns:
        ``(x, y)``: the list of sliced arrays and the list of labels, in
        dataframe row order.

    Raises:
        ValueError: If ``start_month`` is not in ``MONTHS``.
    """
    start = MONTHS.index(start_month)
    stop = start + input_months

    def to_numpy(x):
        # isinstance rather than an exact type comparison, so str subclasses
        # (e.g. numpy.str_) are parsed as well instead of failing on the slice.
        if isinstance(x, str):
            x = str_to_np(x)
        return x[start:stop, :]

    # Needed before progress_apply can be called on the column
    # (presumably registers tqdm's pandas integration - confirm).
    tqdm.pandas()
    return df[EO_DATA].progress_apply(to_numpy).to_list(), df[label_col].to_list()


In [None]:
# Name under which the trained model is saved; if left as an empty string the
# "Init model" cell generates a name instead.
model_name = "m1"

# Window of each time series fed to the model.
start_month, input_months = "February", 12

# Data loading and optimisation settings.
batch_size, num_epochs = 32, 100
upsample_minority_ratio = 0.5
lr = 1e-4


In [None]:
#@title Get country boundaries
# Country polygons are read from a fixed path in the Colab filesystem.
# NOTE(review): nothing in the visible cells creates this file - confirm it is
# uploaded or downloaded beforehand.
countries = gpd.read_file('/content/world_adm.geojson')

# Keep every (country, label) pair whose geometries intersect, which attaches
# the country name to each label row.
gdf_lab = gpd.sjoin(
    countries[['name', 'geometry']],
    gdf,
    how='inner',
    predicate='intersects',
)


In [None]:
#@title Make Mexico training data
# Labels located in Mexico, as a plain dataframe without the geometry column.
mex_train = pd.DataFrame(gdf_lab[gdf_lab['name'] == 'Mexico'].drop('geometry', axis=1))

# Shuffle reproducibly: with no random_state given, DataFrame.sample draws from
# numpy's global random state, which is seeded here.
np.random.seed(123)
mex_train = mex_train.sample(frac=1).reset_index(drop=True)

# Positional 70 / 20 / 10 split. Every row is first marked as training so that
# subset labels carried over from the source dataset cannot leak validation or
# testing rows into the first 70%.
mex_train[SUBSET] = 'training'
mex_train.loc[mex_train.index > int(len(mex_train) * 0.7), SUBSET] = 'validation'
mex_train.loc[mex_train.index > int(len(mex_train) * 0.9), SUBSET] = 'testing'


In [None]:
#@title Dataloaders
# Binarise the label column in place: values above 0.5 become 1, the rest 0.
mex_train[label_col] = mex_train[label_col].gt(0.5).astype(int)

# Validation rows are used as they are; training rows go through upsample_df
# with the configured minority ratio.
val_df = mex_train.loc[mex_train[SUBSET].eq("validation")]
train_df = upsample_df(
    mex_train.loc[mex_train[SUBSET].eq("training")],
    label_col,
    upsample_minority_ratio,
)

x_train, y_train = get_x_y(train_df, label_col, start_month, input_months)
x_val, y_val = get_x_y(val_df, label_col, start_month, input_months)

# Wrap as datasets and batch them; only the training loader is shuffled.
train_data = PyTorchDataset(x=x_train, y=y_train)
val_data = PyTorchDataset(x=x_val, y=y_val)
train_dataloader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
val_dataloader = DataLoader(val_data, shuffle=False, batch_size=batch_size)


In [None]:
# @title Init model
# The shape of the first training sample gives the number of timesteps and bands.
num_timesteps, num_bands = train_data[0][0].shape

class Model(torch.nn.Module):
    """Transformer classifier that normalises its input and outputs probabilities.

    Args:
        normalization_vals: Per-band divisors applied to the input before it
            reaches the transformer (defaults to ``BANDS_MAX``).
    """

    def __init__(self, normalization_vals=BANDS_MAX):
        super().__init__()
        self.model = TransformerModel(c_in=num_bands, c_out=1)
        # Registered as a buffer rather than a plain attribute so that
        # `.to(device)` moves it together with the parameters; otherwise the
        # division in forward() mixes a CUDA input with a CPU tensor.
        self.register_buffer("normalization_vals", torch.tensor(normalization_vals))

    def forward(self, x):
        # Normalisation and axis swap need no gradient tracking.
        with torch.no_grad():
            x = x / self.normalization_vals
            # Swap axes 1 and 2 for the transformer
            # (assumes x is (batch, timesteps, bands) - TODO confirm).
            x = x.transpose(2, 1)
        x = self.model(x).squeeze(dim=1)
        # Sigmoid maps the single output to (0, 1), as required by BCELoss.
        x = torch.sigmoid(x)
        return x

# Use the first CUDA device when one is available, otherwise the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = Model().to(device)

# ------------ Model hyperparameters -------------------------------------
params_to_update = model.parameters()
optimizer = torch.optim.Adam(params_to_update, lr=lr)
criterion = torch.nn.BCELoss()

# An empty name means "generate one" from the validation data and start month.
if model_name == "":
    model_name = generate_model_name(val_df=val_df, start_month=start_month)

# State updated by the training cell.
lowest_validation_loss = None
metrics = {}
# len(DataLoader) is the exact number of batches per epoch; the previous
# `1 + len(data) // batch_size` over-counted by one whenever the dataset size
# was an exact multiple of the batch size.
train_batches = len(train_dataloader)
val_batches = len(val_dataloader)

In [None]:
#@title Train model
# Per-epoch history, consumed by the "Plot results" cell.
train_loss_arr, val_loss_arr = [], []
acc_arr, f1_arr, recall_arr, prec_arr = [], [], [], []

with tqdm(range(num_epochs), desc="Epoch") as tqdm_epoch:
    for epoch in tqdm_epoch:

        # ------------------------ Training ----------------------------------------
        total_train_loss = 0.0
        model.train()
        for batch in tqdm(
            train_dataloader,
            total=train_batches,
            desc="Train",
            leave=False,
            disable=IN_COLAB,
        ):
            # Deliberately not named `inputs`: that notebook-level name holds
            # the credential widgets read by the "Clone directory" cell.
            batch_x, batch_y = batch[0].to(device), batch[1].to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # Get model outputs and calculate loss
            outputs = model(batch_x)
            loss = criterion(outputs, batch_y)
            loss.backward()
            optimizer.step()

            # Weight the batch loss by the batch size so the epoch average
            # computed below is per sample.
            total_train_loss += loss.item() * len(batch_x)

        # ------------------------ Validation --------------------------------------
        total_val_loss = 0.0
        y_true = []
        y_score = []
        y_pred = []
        model.eval()
        with torch.no_grad():
            for batch in tqdm(
                val_dataloader,
                total=val_batches,
                desc="Validate",
                leave=False,
                disable=IN_COLAB,
            ):
                batch_x, batch_y = batch[0].to(device), batch[1].to(device)

                # Get model outputs and calculate loss
                outputs = model(batch_x)
                loss = criterion(outputs, batch_y)
                total_val_loss += loss.item() * len(batch_x)

                # Collect labels, probabilities and 0.5-thresholded predictions.
                y_true += batch_y.tolist()
                y_score += outputs.tolist()
                y_pred += (outputs > 0.5).long().tolist()

        # ------------------------ Metrics + Logging -------------------------------
        train_loss = total_train_loss / len(train_data)
        val_loss = total_val_loss / len(val_data)

        # Decide once whether this epoch improved on the best validation loss,
        # instead of re-deriving it later through a float equality test.
        is_best = lowest_validation_loss is None or val_loss < lowest_validation_loss
        if is_best:
            lowest_validation_loss = val_loss

        metrics = {
            "accuracy": accuracy_score(y_true, y_pred),
            "f1": f1_score(y_true, y_pred),
            "precision": precision_score(y_true, y_pred),
            "recall": recall_score(y_true, y_pred),
            "roc_auc": roc_auc_score(y_true, y_score),
        }
        metrics = {k: round(float(v), 4) for k, v in metrics.items()}
        print(round(train_loss, 3), round(val_loss, 3), metrics)

        train_loss_arr.append(train_loss)
        val_loss_arr.append(val_loss)
        acc_arr.append(metrics['accuracy'])
        f1_arr.append(metrics['f1'])
        recall_arr.append(metrics['recall'])
        prec_arr.append(metrics['precision'])

        tqdm_epoch.set_postfix(loss=val_loss)

        # ------------------------ Model saving --------------------------
        if is_best:
            # Some models in tsai need to be modified to be TorchScriptable
            # https://github.com/timeseriesAI/tsai/issues/561
            sm = torch.jit.script(model)
            model_path = model_path_from_name(model_name=model_name)
            # Replace a previously saved model, or create the target directory
            # when nothing has been saved under this name yet.
            if model_path.exists():
                model_path.unlink()
            else:
                model_path.parent.mkdir(parents=True, exist_ok=True)
            sm.save(str(model_path))



In [None]:
#@title Plot results
import plotly
import plotly.graph_objects as go
from plotly import subplots

# Two panels side by side: losses on the left, validation metrics on the right.
fig = subplots.make_subplots(rows=1, cols=2)

# One x value per recorded epoch. Using the history length rather than
# num_epochs keeps x and y aligned if training stopped early or num_epochs
# was changed after training.
x = list(range(len(train_loss_arr)))

# (legend name, values, line colour, subplot column), in drawing order.
traces = [
    ('Train Loss', train_loss_arr, 'cornflowerblue', 1),
    ('Val Loss', val_loss_arr, 'orange', 1),
    ('Accuracy', acc_arr, 'blue', 2),
    ('F1', f1_arr, 'green', 2),
    ('Precision', prec_arr, 'purple', 2),
    ('Recall', recall_arr, 'red', 2),
]
for trace_name, trace_values, trace_color, trace_col in traces:
    fig.add_trace(
        go.Scatter(name=trace_name, x=x, y=trace_values, line_color=trace_color),
        row=1,
        col=trace_col,
    )

fig.show()


In [None]:
# Summary: the model name, the path it is saved under, and the metrics
# dictionary left by the last completed epoch, dumped as block-style YAML.
print("MODEL_NAME=", model_name, sep="")
print(model_path_from_name(model_name=model_name))
print(yaml.dump(metrics, default_flow_style=False, allow_unicode=True))
