## Dependencies

In [1]:
#import libraries

import os
import glob
import json
import getpass
import rasterio
import numpy as np
import pandas as pd
from tqdm import tqdm
from radiant_mlhub import Dataset

import warnings
warnings.filterwarnings('ignore')

# DOWNLOAD DATA FROM MLHUB

In [2]:
# Band selection: all 12 bands listed below are kept (no subset is taken).
# selected_bands is bound to the same list object as Full_bands.

Full_bands = ['B01', 'B02', 'B03', 'B04','B05', 'B06', 'B07', 'B08','B8A', 'B09', 'B11', 'B12']

selected_bands = Full_bands
selected_bands

['B01',
 'B02',
 'B03',
 'B04',
 'B05',
 'B06',
 'B07',
 'B08',
 'B8A',
 'B09',
 'B11',
 'B12']

In [3]:
# Define the dataset id, the label asset keys, and the three collection names
# that are derived from the dataset id.

# Dataset id; it is also the first path component of every file path built in this notebook.
main = 'ref_agrifieldnet_competition_v1'

# Asset keys requested from the label collections.
assets = ['field_ids','raster_labels']

# Collection names: the dataset id followed by a fixed suffix.
source_collection = f'{main}_source'
train_label_collection = f'{main}_labels_train'
test_label_collection = f'{main}_labels_test'

In [4]:
# Enter your MLHub API key at the prompt shown when this cell runs. The key is
# read without echo and stored in the MLHUB_API_KEY environment variable of the
# current process, so it never appears in the notebook source.

os.environ['MLHUB_API_KEY'] = getpass.getpass(prompt="MLHub API Key: ")

dataset = Dataset.fetch(main)

# Map each collection id to the asset keys wanted from it. The keys are taken
# from the collection-name variables so they cannot drift from `main`.
my_filter = {
    train_label_collection: assets,       # every asset key listed in `assets`
    test_label_collection: [assets[0]],   # first asset key only
    source_collection: selected_bands,    # one asset per selected band
}

dataset.download(collection_filter=my_filter)

MLHub API Key: ········


unarchive ref_agrifieldnet_competition_v1.tar.gz: 100%|██████████| 6186/6186 [00:01<00:00, 5793.46it/s]
filter by collection ids and asset keys: 231716it [00:00, 3408205.70it/s]         
download assets: 100%|██████████| 7905/7905 [16:28<00:00,  8.00it/s]


## Prepare Train data


### Utility Functions

In [5]:
#Extract field_crop Pairs 

def field_crop_extractor(crop_field_files):
    '''
        Pair every field id found in the train label tiles with one crop id.

        crop_field_files: iterable of tile folder ids inside the train label
            collection. For each id, field_ids.tif and raster_labels.tif are
            read from a folder whose path is built from the notebook-level
            `main` and `train_label_collection` variables.

        returns: DataFrame with columns 'field_id' (string form of the raster
            value) and 'crop_id', one row per field id, with field id '0'
            removed. If a field id is seen with several crop ids, the first one
            met is kept (row-major scan inside a tile, tiles in the given order).

        raises: ValueError if a field-id raster does not cover the extent of
            its label raster.
    '''
    field_crops = {}

    for label_field_file in tqdm(crop_field_files):
        tile_dir = f'{main}/{train_label_collection}/{train_label_collection}_{label_field_file}'
        with rasterio.open(f'{tile_dir}/field_ids.tif') as src:
            field_data = src.read()[0]
        with rasterio.open(f'{tile_dir}/raster_labels.tif') as src:
            crop_data = src.read()[0]

        # Only the extent of the label raster is scanned.
        n_rows, n_cols = crop_data.shape
        field_flat = field_data[:n_rows, :n_cols].ravel()
        crop_flat = crop_data.ravel()
        if field_flat.size != crop_flat.size:
            raise ValueError(
                f'field_ids.tif is smaller than raster_labels.tif for tile {label_field_file}'
            )

        # np.unique reports the first flat position of each distinct field id.
        # Visiting those positions in ascending order reproduces the order in
        # which a pixel-by-pixel scan would first meet each field, without
        # looping over every pixel in Python.
        _, first_positions = np.unique(field_flat, return_index=True)
        for pos in np.sort(first_positions):
            # setdefault keeps the crop recorded by an earlier pixel or tile.
            field_crops.setdefault(str(field_flat[pos]), crop_flat[pos])

    field_crop_map = [[field_id, crop_id] for field_id, crop_id in field_crops.items()]
    field_crop = pd.DataFrame(field_crop_map, columns=['field_id', 'crop_id'])

    # Field id '0' is dropped from the result.
    return field_crop[field_crop['field_id'] != '0']


# Assumed edge length, in pixels, of one square tile raster.
img_sh = 256
# Bands read for every tile. This rebinds selected_bands to a new list holding
# the same twelve names as Full_bands.
selected_bands = ["B01","B02","B03","B04","B05","B06","B07","B08","B8A","B09","B11","B12"]
# Number of selected bands.
n_selected_bands= len(selected_bands)
def feature_extractor(data_ , path):
    '''
        Read band values, field ids and bounding boxes for every tile in data_.

        data_: Dataframe with 'field_paths' and 'unique_folder_id' columns
        path: Path to source collections files (collection folder name under
            the notebook-level `main` directory)

        returns: tuple of three dataframes
            pixel_df: one row per pixel whose field id is not 0, with one
                column per band in `selected_bands`, plus 'field_id' and the
                pixel's column ('x') and row ('y') index inside its tile
            tile_df: one row per tile with 'tile_id' and the 'bbox' values from
                the tile's json file as minx, miny, maxx, maxy
            tile_field_df: one row per distinct (field_id, tile_id) pair
            All three are empty dataframes when data_ has no rows.
    '''
    pixel_frames = []
    tile_records = []
    tile_field_frames = []

    # Walk both columns together so each tile id stays paired with its own path.
    tile_pairs = zip(data_['unique_folder_id'], data_['field_paths'].values)
    for tile_id, field_path in tqdm(tile_pairs, total=len(data_)):
        tile_dir = f'{main}/{path}/{path}_{tile_id}'

        # Context managers close each raster as soon as it has been read.
        with rasterio.open(field_path) as field_src:
            field_array = field_src.read(1)
        n_rows, n_cols = field_array.shape

        band_columns = []
        for band in selected_bands:
            with rasterio.open(f'{tile_dir}/{band}.tif') as band_src:
                band_columns.append(band_src.read(1).flatten())

        # read json file for tile
        with open(f'{tile_dir}/{path}_{tile_id}.json') as tile_file:
            tile_info = json.load(tile_file)
        tile_bbox = tile_info['bbox']
        tile_records.append({
            "tile_id" : tile_id,
            "minx": tile_bbox[0],
            "miny": tile_bbox[1],
            "maxx": tile_bbox[2],
            "maxy": tile_bbox[3]
        })

        # build a dataframe with pixel values for each band
        df = pd.DataFrame(np.column_stack(band_columns), columns=selected_bands)
        df['field_id'] = field_array.flatten()
        # flatten() is row-major, so the column index cycles fastest. The grid
        # is sized from the raster itself rather than a fixed tile size.
        df['x'] = np.tile(np.arange(n_cols), n_rows)
        df['y'] = np.repeat(np.arange(n_rows), n_cols)

        # keep only pixels whose field id is not 0
        labelled = df[df['field_id'] != 0]
        pixel_frames.append(labelled)

        # record which field ids occur in this tile
        tile_field_frames.append(
            labelled[['field_id']].drop_duplicates().assign(tile_id=tile_id)
        )

    if not tile_records:
        # pd.concat rejects an empty list, so return empty frames directly.
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame()

    # Concatenate once at the end instead of growing a dataframe per tile;
    # DataFrame.append no longer exists in pandas 2.x.
    pixel_df = pd.concat(pixel_frames, ignore_index=True)
    tile_df = pd.DataFrame(tile_records)
    tile_field_df = pd.concat(tile_field_frames, ignore_index=True)

    return pixel_df, tile_df, tile_field_df



In [6]:
# Read the train label collection.json and take the folder id of every tile:
# the text between the last '_' and the first following '.' of each link href,
# skipping the first four links.
# The folder ids are then expanded into field-id and label raster paths.

with open(f'{main}/{train_label_collection}/collection.json') as f:
    train_json = json.load(f)

train_folder_ids = [
    link['href'].rsplit('_', 1)[-1].split('.', 1)[0]
    for link in train_json['links'][4:]
]

train_field_paths = [
    f'{main}/{train_label_collection}/{train_label_collection}_{folder_id}/field_ids.tif'
    for folder_id in train_folder_ids
]
train_label_paths = [
    f'{main}/{train_label_collection}/{train_label_collection}_{folder_id}/raster_labels.tif'
    for folder_id in train_folder_ids
]

In [7]:
# One row per train tile: its folder id next to the path of its field-id raster.

competition_train_data = pd.DataFrame({
    'unique_folder_id': train_folder_ids,
    'field_paths': train_field_paths,
})
competition_train_data.head()

Unnamed: 0,unique_folder_id,field_paths
0,28852,ref_agrifieldnet_competition_v1/ref_agrifieldn...
1,d987c,ref_agrifieldnet_competition_v1/ref_agrifieldn...
2,ca1d4,ref_agrifieldnet_competition_v1/ref_agrifieldn...
3,2ec18,ref_agrifieldnet_competition_v1/ref_agrifieldn...
4,7575d,ref_agrifieldnet_competition_v1/ref_agrifieldn...


In [8]:
# Pair each train field with its crop id and store the field id as an integer.
# astype returns a new frame, so nothing is written into the filtered slice
# that field_crop_extractor returns and the cell gives the same result on re-run.
field_crop_pair = field_crop_extractor(train_folder_ids).astype({'field_id': int})
field_crop_pair.head()

100%|██████████| 1165/1165 [04:51<00:00,  4.00it/s]


Unnamed: 0,field_id,crop_id
1,757,6
2,756,6
3,1372,5
4,1374,1
5,1986,4


In [9]:
# extract features from train data
# Band files are read from the source collection; the field-id rasters come
# from the paths stored in competition_train_data.

train_pixel_df, train_tile_df, train_tile_field_df = feature_extractor(competition_train_data, source_collection)
display(train_pixel_df.head(), train_tile_df.head(), train_tile_field_df.head())

100%|██████████| 1165/1165 [01:22<00:00, 14.19it/s]


Unnamed: 0,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B09,B11,B12,field_id,x,y
0,43,39,38,38,41,54,63,61,64,12,57,37,757,23,43
1,43,39,38,38,42,57,67,63,72,12,63,42,757,23,44
2,43,39,38,37,41,59,69,65,78,12,68,43,757,24,44
3,43,38,37,36,41,59,69,64,78,12,68,43,757,25,44
4,43,39,38,38,42,57,67,64,72,12,63,42,757,23,45


Unnamed: 0,tile_id,minx,miny,maxx,maxy
0,28852,82.293829,27.327138,82.319974,27.350486
1,d987c,81.542781,27.193544,81.568744,27.216758
2,ca1d4,81.543229,27.285989,81.569214,27.309203
3,2ec18,87.712321,25.343604,87.7379,25.366846
4,7575d,83.676445,19.10429,83.701132,19.127756


Unnamed: 0,field_id,tile_id
0,757,28852
1,756,28852
2,1372,28852
3,1374,28852
4,1986,d987c


In [10]:
# Read the test label collection.json and take the folder id of every tile:
# the text between the last '_' and the first following '.' of each link href,
# skipping the first four links. Each id is then expanded into a field-id
# raster path.
with open(f'{main}/{test_label_collection}/collection.json') as f:
    test_json = json.load(f)

test_folder_ids = [
    link['href'].rsplit('_', 1)[-1].split('.', 1)[0]
    for link in test_json['links'][4:]
]

test_field_paths = [
    f'{main}/{test_label_collection}/{test_label_collection}_{folder_id}/field_ids.tif'
    for folder_id in test_folder_ids
]

In [11]:
# One row per test tile: its folder id next to the path of its field-id raster.
competition_test_data = pd.DataFrame({
    'unique_folder_id': test_folder_ids,
    'field_paths': test_field_paths,
})
competition_test_data.head()

Unnamed: 0,unique_folder_id,field_paths
0,6199c,ref_agrifieldnet_competition_v1/ref_agrifieldn...
1,6c81d,ref_agrifieldnet_competition_v1/ref_agrifieldn...
2,1ebeb,ref_agrifieldnet_competition_v1/ref_agrifieldn...
3,586a2,ref_agrifieldnet_competition_v1/ref_agrifieldn...
4,65812,ref_agrifieldnet_competition_v1/ref_agrifieldn...


In [12]:
# extract features from test data
# Band files are read from the source collection; the field-id rasters come
# from the paths stored in competition_test_data.

test_pixel_df, test_tile_df, test_tile_field_df = feature_extractor(competition_test_data, source_collection)
display(test_pixel_df.head(), test_tile_df.head(), test_tile_field_df.head())

100%|██████████| 707/707 [00:44<00:00, 15.94it/s]


Unnamed: 0,B01,B02,B03,B04,B05,B06,B07,B08,B8A,B09,B11,B12,field_id,x,y
0,39,35,35,35,38,48,55,59,60,11,53,39,5407,211,137
1,39,34,33,34,37,49,58,58,63,11,54,40,5407,212,137
2,39,36,36,37,39,59,70,56,76,14,55,37,5407,210,138
3,39,35,36,34,39,59,70,75,76,14,55,37,5407,211,138
4,39,33,34,31,37,70,85,79,90,14,54,34,5407,212,138


Unnamed: 0,tile_id,minx,miny,maxx,maxy
0,6199c,76.846067,24.617101,76.871689,24.640523
1,6c81d,87.305872,25.576258,87.331425,25.599431
2,1ebeb,81.624621,27.955832,81.650785,27.979061
3,586a2,83.070523,19.274051,83.095164,19.297449
4,65812,83.412359,19.33917,83.437053,19.362609


Unnamed: 0,field_id,tile_id
0,5407,6199c
1,6309,6c81d
2,6311,6c81d
3,6314,6c81d
4,6310,6c81d


In [13]:
# merge tile_df, tile_field_df, pixel_df: train rows first, then test rows
tile_df = pd.concat([train_tile_df, test_tile_df], ignore_index=True)
tile_field_df = pd.concat([train_tile_field_df, test_tile_field_df], ignore_index=True)
pixel_df = pd.concat([train_pixel_df, test_pixel_df], ignore_index=True)

# drop duplicate rows from the two tile-level tables (pixel_df is left as is)
tile_df = tile_df.drop_duplicates().reset_index(drop=True)
tile_field_df = tile_field_df.drop_duplicates().reset_index(drop=True)

In [15]:
# export data to csv

# Prefix prepended verbatim to every output file name. Include a trailing
# separator when pointing at a folder; the empty string gives bare file names.
path = ""

# Each table is written without its index column.
tile_df.to_csv(f'{path}tile_df.csv', index=False)
tile_field_df.to_csv(f'{path}tile_field_df.csv', index=False)
pixel_df.to_csv(f'{path}pixel_df.csv', index=False)
field_crop_pair.to_csv(f'{path}field_crop_df.csv', index=False)