In [None]:
import numpy as np
import pandas as pd
import dask.dataframe as dd
from tqdm.notebook import tqdm

import datetime
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

import ipywidgets as widgets
from IPython.display import display
from ipywidgets import IntProgress, HTML, VBox

import warnings
warnings.filterwarnings("ignore")

In [4]:
# Load each observation table from its CSV file in the working directory.
Adults, Swarms, Hoppers, Ecology, Control_Operations = (
    pd.read_csv(csv_path)
    for csv_path in (
        "Adults.csv",
        "Swarms.csv",
        "Hoppers.csv",
        "Ecology.csv",
        "Control_Operations.csv",
    )
)

In [5]:
# Stack the four observation tables into a single frame.
# ignore_index=True gives the result a fresh 0..n-1 index; without it each
# source frame keeps its own default labels and the combined index contains
# duplicates, which makes label-based selection ambiguous.
# NOTE(review): Control_Operations is loaded above but is not part of this
# concat — confirm that is intentional.
fao_data = pd.concat([Ecology, Swarms, Hoppers, Adults], ignore_index=True)

In [6]:
# Parse the date columns (loaded as generic objects) into pandas datetimes,
# one column at a time.
date_columns = ['STARTDATE', 'FINISHDATE', 'CTLSTDATE', 'CTLFNDATE']
fao_data[date_columns] = fao_data.loc[:, date_columns].apply(pd.to_datetime, axis="index")

In [7]:
# Rows labelled 'Ad' and rows labelled 'Adult' must share one category:
# relabel every 'Ad' as 'Adult' and leave all other values untouched.
fao_data['CAT'] = fao_data['CAT'].mask(fao_data['CAT'] == 'Ad', 'Adult')

In [8]:
# Copy STARTDATE into a TIME column; a later cell reads TIME to derive
# month / day / year.
fao_data["TIME"] = fao_data["STARTDATE"]

In [9]:
# English month names in calendar order (index 0 is January).
month_name = [
    'January', 'February', 'March', 'April',
    'May', 'June', 'July', 'August',
    'September', 'October', 'November', 'December',
]

In [10]:
# Grid cell size along x (longitude) and y (latitude), in the same units as the
# coordinates — presumably degrees, given the +180 / +90 offsets below.
resx, resy = (0.01, 0.01)


def lat_to_bucket_id(x):
    """Return the integer grid-row id for latitude ``x``.

    The latitude is shifted by +90 so that -90 maps to bucket 0, divided by
    ``resy`` and truncated with ``int``.

    NOTE(review): the quotient is a float, so a value sitting on a bucket edge
    may truncate one bucket low — confirm that is acceptable.
    """
    return int((x + 90) / resy)


def lon_to_bucket_id(x):
    """Return the integer grid-column id for longitude ``x``.

    The longitude is shifted by +180 so that -180 maps to bucket 0, divided by
    ``resx`` and truncated with ``int``.
    """
    return int((x + 180) / resx)


def bucket_id_to_lat(x):
    """Return the latitude of the lower edge of grid row ``x``."""
    return (x * resy) - 90


def bucket_id_to_lon(x):
    """Return the longitude of the lower edge of grid column ``x``."""
    return (x * resx) - 180

In [10]:
# Tag every row with the id of the grid bucket that contains its (X, Y)
# position and add calendar columns derived from STARTDATE.
# assign() works on a copy, so fao_data itself is left unchanged.
truncated_data = fao_data.assign(
    lat_bucket_id=fao_data['Y'].apply(lat_to_bucket_id),
    lon_bucket_id=fao_data['X'].apply(lon_to_bucket_id),
    date=fao_data['STARTDATE'],
    year=fao_data['STARTDATE'].dt.year,
    month=fao_data['STARTDATE'].dt.month_name(),
)

In [9]:
def _count_equal(value):
    """Return an aggregator that counts the entries of a column equal to ``value``.

    A lambda is returned on purpose: pandas accepts several anonymous
    aggregators on the same source column, but rejects several named functions
    that share one name.
    """
    # Vectorised comparison + sum instead of the Python-level builtin sum().
    return lambda col: (col == value).sum()


# Collapse the rows of every (date, lat bucket, lon bucket) group into one row
# of counts.
# NOTE(review): the following cell rebinds `aggregate_data` to the
# un-aggregated frame, so this result is currently unused downstream —
# confirm which of the two is intended.
aggregate_data = truncated_data.groupby(['date', 'lat_bucket_id', 'lon_bucket_id']).agg(
    # meta data
    TIME=('STARTDATE', lambda col: col.mean()),

    # data count
    Count=('CAT', len),
    Swarm=('CAT', _count_equal('Swarm')),
    Adult=('CAT', _count_equal('Adult')),
    Hopper=('CAT', _count_equal('Hopper')),
    Ecology=('CAT', _count_equal('Ecology')),

    # Ecology data count
    # NATVEGCAT___Ecology
    NATVEGCAT_Green=('NATVEGCAT', _count_equal('Green')),
    NATVEGCAT_Drying=('NATVEGCAT', _count_equal('Drying')),
    NATVEGCAT_Dry=('NATVEGCAT', _count_equal('Dry')),
    NATVEGCAT_Greening=('NATVEGCAT', _count_equal('Greening')),

    # NATVEGDEN___Ecology
    NATVEGDEN_Moderate=('NATVEGDEN', _count_equal('Moderate')),
    NATVEGDEN_Sparse=('NATVEGDEN', _count_equal('Sparse')),
    NATVEGDEN_Dense=('NATVEGDEN', _count_equal('Dense')),

    # SOILMOIST___Ecology
    SOILMOIST_Dry=('SOILMOIST', _count_equal('Dry')),
    SOILMOIST_Wet=('SOILMOIST', _count_equal('Wet')),

    # BREEDING (codes as stored in the column: 1, 2 and -1)
    BREEDING_yes=("BREEDING", _count_equal(1)),
    BREEDING_no=("BREEDING", _count_equal(2)),
    BREEDING_na=("BREEDING", _count_equal(-1)),
)

In [11]:
# Continue with the un-aggregated rows and add month / day / year plus the
# bucket-origin coordinates.
# assign() returns a new frame, so truncated_data is no longer modified through
# an alias and the cell gives the same result when re-run.
aggregate_data = truncated_data.assign(
    month=truncated_data['TIME'].dt.month_name(),
    day=truncated_data['TIME'].dt.day,
    year=truncated_data['TIME'].dt.year,
    # The bucket helpers are plain arithmetic, so they work on a whole column.
    lat=bucket_id_to_lat(truncated_data["lat_bucket_id"]),
    lon=bucket_id_to_lon(truncated_data["lon_bucket_id"]),
)

# Distinct years present in the data, in order of first appearance.
available_years = aggregate_data["year"].unique().tolist()

# Merge ISRIC Data

In [3]:
def preprocess_isric_data(filename):
    """Load one ISRIC layer CSV lazily and reduce it to one row per grid bucket.

    Parameters
    ----------
    filename : str
        Path of a CSV that has ``x``, ``y`` and ``band`` columns in addition to
        the layer's value column(s).

    Returns
    -------
    dask DataFrame
        Lazy frame indexed by ``lat_bucket_id``, holding ``lon_bucket_id`` and
        the per-bucket maximum of every remaining column.
    """
    data = dd.read_csv(filename)
    # Explicit meta states the output dtype so dask does not have to infer it
    # by calling the function on placeholder data.
    data["lat_bucket_id"] = data["y"].apply(lat_to_bucket_id, meta=("y", "int64"))
    data["lon_bucket_id"] = data["x"].apply(lon_to_bucket_id, meta=("x", "int64"))
    data = data.drop(["band", 'y', 'x'], axis=1)
    # Several source pixels can fall in one bucket; keep the largest value.
    data = data.groupby(['lat_bucket_id', 'lon_bucket_id']).agg("max")
    return data.reset_index().set_index("lat_bucket_id")

# Wrap the pandas frame as a single-partition dask frame for the merges below.
# The isinstance guard keeps the cell re-runnable: on a second run the name
# already refers to a dask frame, which dd.from_pandas must not receive.
if isinstance(aggregate_data, pd.DataFrame):
    aggregate_data = dd.from_pandas(aggregate_data, npartitions=1)

NameError: name 'aggregate_data' is not defined

In [13]:
# Attach the clay 0-5 cm layer to each row by grid bucket. The left join keeps
# rows that have no matching bucket.
clay_0_5cm = preprocess_isric_data("data/fao_clay_0_5cm_mean.csv")
fao_data_with_isric = dd.merge(
    left=aggregate_data,
    right=clay_0_5cm,
    on=['lat_bucket_id', 'lon_bucket_id'],
    how="left",
).compute()
del clay_0_5cm  # the layer is no longer needed once merged

In [14]:
# Attach the clay 5-15 cm layer: re-wrap the merged pandas frame for dask,
# left-join on the bucket ids, then materialise the result again.
clay_5_15cm = preprocess_isric_data("data/fao_clay_5_15cm_mean.csv")
fao_data_with_isric = dd.merge(
    left=dd.from_pandas(fao_data_with_isric, npartitions=1),
    right=clay_5_15cm,
    on=['lat_bucket_id', 'lon_bucket_id'],
    how="left",
).compute()
del clay_5_15cm  # the layer is no longer needed once merged

In [15]:
# Attach the sand 0-5 cm layer: re-wrap the merged pandas frame for dask,
# left-join on the bucket ids, then materialise the result again.
sand_0_5cm = preprocess_isric_data("data/fao_sand_0_5cm_mean.csv")
fao_data_with_isric = dd.merge(
    left=dd.from_pandas(fao_data_with_isric, npartitions=1),
    right=sand_0_5cm,
    on=['lat_bucket_id', 'lon_bucket_id'],
    how="left",
).compute()
del sand_0_5cm  # the layer is no longer needed once merged

In [16]:
# Attach the sand 5-15 cm layer: re-wrap the merged pandas frame for dask,
# left-join on the bucket ids, then materialise the result again.
sand_5_15cm = preprocess_isric_data("data/fao_sand_5_15cm_mean.csv")
fao_data_with_isric = dd.merge(
    left=dd.from_pandas(fao_data_with_isric, npartitions=1),
    right=sand_5_15cm,
    on=['lat_bucket_id', 'lon_bucket_id'],
    how="left",
).compute()
del sand_5_15cm  # the layer is no longer needed once merged

In [17]:
# Attach the silt 0-5 cm layer: re-wrap the merged pandas frame for dask,
# left-join on the bucket ids, then materialise the result again.
silt_0_5cm = preprocess_isric_data("data/fao_silt_0_5cm_mean.csv")
fao_data_with_isric = dd.merge(
    left=dd.from_pandas(fao_data_with_isric, npartitions=1),
    right=silt_0_5cm,
    on=['lat_bucket_id', 'lon_bucket_id'],
    how="left",
).compute()
del silt_0_5cm  # the layer is no longer needed once merged

In [18]:
# Attach the silt 5-15 cm layer: re-wrap the merged pandas frame for dask,
# left-join on the bucket ids, then materialise the result again.
silt_5_15cm = preprocess_isric_data("data/fao_silt_5_15cm_mean.csv")
fao_data_with_isric = dd.merge(
    left=dd.from_pandas(fao_data_with_isric, npartitions=1),
    right=silt_5_15cm,
    on=['lat_bucket_id', 'lon_bucket_id'],
    how="left",
).compute()
del silt_5_15cm  # the layer is no longer needed once merged

In [19]:
# Persist the merged table as CSV (with default options the frame's index is
# written as the first column).
fao_data_with_isric.to_csv(path_or_buf="fao_data_with_isric_raw.csv")

In [None]:
def preprocess_isric_data(filename):
    """Load one ISRIC layer CSV and reduce it to one row per grid bucket.

    NOTE(review): this redefines the ``preprocess_isric_data`` from the earlier
    cell. That version returns a lazy dask frame indexed by ``lat_bucket_id``;
    this one computes eagerly. Confirm which definition should remain.

    Parameters
    ----------
    filename : str
        Path of a CSV that has ``x``, ``y`` and ``band`` columns in addition to
        the layer's value column(s).

    Returns
    -------
    pandas DataFrame
        Computed frame indexed by ``(lat_bucket_id, lon_bucket_id)``, holding
        the per-bucket maximum of every remaining column.
    """
    data = dd.read_csv(filename)
    # Explicit meta states the output dtype so dask does not have to infer it
    # by calling the function on placeholder data.
    data["lat_bucket_id"] = data["y"].apply(lat_to_bucket_id, meta=("y", "int64"))
    data["lon_bucket_id"] = data["x"].apply(lon_to_bucket_id, meta=("x", "int64"))
    data = data.drop(["band", 'y', 'x'], axis=1)
    # Several source pixels can fall in one bucket; keep the largest value.
    data = data.groupby(['lat_bucket_id', 'lon_bucket_id']).agg("max")
    return data.compute()

In [None]:
# Load the three 5-15 cm soil layers (clay, sand, silt), in that order.
clay_5_15cm, sand_5_15cm, silt_5_15cm = (
    preprocess_isric_data(layer_csv)
    for layer_csv in (
        "data/fao_clay_5_15cm_mean.csv",
        "data/fao_sand_5_15cm_mean.csv",
        "data/fao_silt_5_15cm_mean.csv",
    )
)

In [None]:
# Start the combined soil table from the clay layer. This binds a second name
# to the same object, so the `del` below only removes the old name, not the data.
soil_profile = clay_5_15cm
del clay_5_15cm

In [None]:
# The value column of the sand frame is named "sand_5-15cm_mean" (see the
# frame's displayed header further down), so "sand_5_15cm" is not a valid
# source key and raised KeyError. The destination column name is unchanged.
soil_profile["sand_5_15cm"] = sand_5_15cm["sand_5-15cm_mean"]
del sand_5_15cm

In [13]:
# The value column of the silt frame is named "silt_5-15cm_mean" (see the
# frame's displayed header further down), so "silt_5_15cm" is not a valid
# source key and raised KeyError. The destination column name is unchanged.
soil_profile["silt_5_15cm"] = silt_5_15cm["silt_5-15cm_mean"]
del silt_5_15cm

In [None]:
# Materialise only when soil_profile is still a lazy dask object. The
# preprocess_isric_data defined just above already returns a computed pandas
# frame, which has no .compute() method.
if hasattr(soil_profile, "compute"):
    soil_profile = soil_profile.compute()

In [None]:
# NOTE(review): this looks like an alternative way of combining the layers.
# In top-to-bottom order clay_5_15cm, sand_5_15cm and silt_5_15cm have already
# been deleted by the cells above, so this cell would raise NameError —
# confirm whether it should be removed.
del clay_5_15cm
del sand_5_15cm

# Column-wise concat of the profile built so far with the silt layer.
soil_profile = dd.concat([soil_profile, silt_5_15cm], axis=1).compute()

In [13]:
# Display the aggregated sand layer for inspection.
sand_5_15cm

Unnamed: 0_level_0,Unnamed: 1_level_0,sand_5-15cm_mean
lat_bucket_id,lon_bucket_id,Unnamed: 2_level_1
7600,10200,0
7600,10201,0
7600,10202,0
7600,10203,0
7600,10204,0
...,...,...
13765,26595,372
13765,26596,355
13765,26597,348
13765,26598,351


In [16]:
# Display the aggregated silt layer for inspection.
silt_5_15cm

Unnamed: 0_level_0,Unnamed: 1_level_0,silt_5-15cm_mean
lat_bucket_id,lon_bucket_id,Unnamed: 2_level_1
7600,10200,0
7600,10201,0
7600,10202,0
7600,10203,0
7600,10204,0
...,...,...
13581,26595,0
13581,26596,0
13581,26597,0
13581,26598,0


In [2]:
# Place the sand, clay and silt layers side by side; rows are aligned on the
# frames' index.
soil_profile = pd.concat(
    objs=[sand_5_15cm, clay_5_15cm, silt_5_15cm],
    axis="columns",
)

NameError: name 'pd' is not defined

In [1]:
# Display the combined soil profile for inspection.
soil_profile

NameError: name 'soil_profile' is not defined