# Create synthetic dataset

The data was kindly provided by the Plantix team, who develop a mobile application for smallholder farmers. To protect the privacy of the Plantix users, the data used in this example does not contain the exact locations of the observations. Thus, the results are slightly different from the ones presented in the paper. To generate the synthetic data for this paper, locations were randomly sampled within each 11x11 km grid cell.

This notebook is for transparency. You need access to the original non-public dataset to run it.

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab filesystem; force_remount=True remounts
# even if the mount point is already in use.
MOUNT_POINT = '/content/gdrive'
drive.mount(MOUNT_POINT, force_remount=True)

Mounted at /content/gdrive


In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np

In [None]:
# Load the original observation table from the mounted drive.
OBSERVATIONS_PATH = '/content/gdrive/MyDrive/data/ap/combined1.feather'
df = pd.read_feather(OBSERVATIONS_PATH)

In [None]:
# Load the grid-cell geometries from the mounted drive.
GRID_PATH = "/content/gdrive/MyDrive/data/ap/grid.geojson"
grid = gpd.read_file(GRID_PATH)

In [None]:
def get_xy_minmax(row):
    """Return the bounding box of a row's geometry as (xmin, xmax, ymin, ymax).

    Parameters
    ----------
    row : object with a ``geometry`` attribute
        The geometry must expose a shapely-style ``bounds`` attribute.

    Returns
    -------
    tuple
        ``(xmin, xmax, ymin, ymax)`` of the geometry's bounding box.
    """
    # `bounds` is ordered (minx, miny, maxx, maxy); reorder it to the
    # (xmin, xmax, ymin, ymax) layout callers expect. Unlike reading
    # `exterior.coords`, this also works for geometries without a single
    # exterior ring (e.g. multipart geometries).
    xmin, ymin, xmax, ymax = row.geometry.bounds
    return xmin, xmax, ymin, ymax

In [None]:
# Compute one (xmin, xmax, ymin, ymax) tuple per grid row, then store the four
# values as separate columns aligned on the grid's index.
cell_bounds = grid.apply(get_xy_minmax, axis=1).tolist()
cell_bounds_frame = pd.DataFrame(cell_bounds, index=grid.index)
grid[["xmin", "xmax", "ymin", "ymax"]] = cell_bounds_frame

In [None]:
# Attach the grid's non-geometry columns to each row of df via cell_id;
# the left join keeps every row of df.
grid_attributes = grid.drop(columns="geometry")
df = df.merge(grid_attributes, on="cell_id", how="left")

In [None]:
# Rename the original coordinate columns with a `real_` prefix.
real_coordinate_names = {"latitude": "real_latitude", "longitude": "real_longitude"}
df = df.rename(columns=real_coordinate_names)

In [None]:
# Draw a synthetic location for every row, uniformly at random within the
# [xmin, xmax] x [ymin, ymax] bounding box of the row's grid cell.
# A seeded generator keeps the synthetic dataset reproducible across runs.
rng = np.random.default_rng(42)
df["longitude"] = df.xmin + rng.random(len(df)) * (df.xmax - df.xmin)
df["latitude"] = df.ymin + rng.random(len(df)) * (df.ymax - df.ymin)

In [None]:
DISEASE = 600038 # Tuta absoluta
NEGATIVES_PER_POSITIVE = 5  # absence rows kept per presence row
SAMPLE_SEED = 42  # fixed seed so the down-sampling is reproducible

# Binary target: 1 where the row's dnn_peat_id equals DISEASE, else 0.
df["presence"] = (df["dnn_peat_id"] == DISEASE).astype(int)

# Remove helper columns, the real coordinates and the grid-cell bounds.
df = df.drop(columns=["index", "level_0", "dnn_peat_id", "dnn_variety", "real_latitude", "real_longitude", "xmin", "xmax", "ymin", "ymax"])

positives = df[df.presence == 1]
negatives = df[df.presence == 0]
# Cap the request at the number of available negatives: sampling without
# replacement raises ValueError when asked for more rows than exist.
n_negatives = min(len(positives) * NEGATIVES_PER_POSITIVE, len(negatives))
df = pd.concat([negatives.sample(n_negatives, random_state=SAMPLE_SEED), positives])
df = df.sort_values(["date", "cell_id"], ascending=True) # for time series cross validation

In [None]:
# After filtering, concatenating and sorting, df no longer has a default
# 0..n-1 index. pandas documents to_feather as requiring a default index, so
# reset it (discarding the old labels) before writing.
df.reset_index(drop=True).to_feather('/content/gdrive/MyDrive/data/ap/combined_synthetic1.feather')