# About

Over the last decades, Earth Observation has opened up many new perspectives, from geosciences to the monitoring of human activity. As more data became available, artificial intelligence techniques have led to very successful results in understanding remote sensing data. Moreover, various acquisition techniques such as Synthetic Aperture Radar (SAR) can be used for problems that cannot be tackled through optical images alone. This is the case for weather-related disasters such as floods or hurricanes, which are generally associated with heavy cloud cover. Yet, machine learning on SAR data is still considered challenging due to the lack of available labeled data. This dataset is composed of co-registered optical and SAR image time series for the detection of flood events.

Downloaded from Radiant ML Hub : https://mlhub.earth/data/sen12floods

Citation
Clément Rambour, Nicolas Audebert, Elise Koeniguer, Bertrand Le Saux, Michel Crucianu, Mihai Datcu, September 14, 2020, "SEN12-FLOOD : a SAR and Multispectral Dataset for Flood Detection", IEEE Dataport, doi: https://dx.doi.org/10.21227/w6xz-s898.

# Dataset

The dataset is composed of 412 time series with 4 to 20 optical images and 10 to 58 SAR images in each sequence. On average, there are 9 optical and 14 SAR images per sequence. The period of acquisition goes from December 2018 to May 2019. A flood event is occurring in 40% of the optical Sentinel 2 images and in 47% of the SAR Sentinel 1 images. As in the MediaEval dataset, once a flood occurred in a sequence, all the subsequent images are labeled as flooded, which corresponds to the hypothesis that the surface still presents characteristic modifications after the event.

# Import All libraries

In [1]:
import json
import tensorflow as tf
import tensorflow.keras.backend as K
import numpy as np 
import pandas as pd
import seaborn as sns
import os
import gc
import rasterio as rio
from PIL import Image
import matplotlib.pyplot as plt
from matplotlib import  cm
import cv2
from matplotlib import animation
from IPython.display import HTML
from tqdm import tqdm
from sklearn.model_selection import train_test_split

# Config

In [4]:

class CFG:
    """
    Central place for the constants of the flood / no-flood image
    classification pipeline. All values are plain class attributes and are
    read as ``CFG.<name>``; the class is not meant to be instantiated.
    """
    # Seed value; presumably used to make random initialisation reproducible
    # (its use is outside this section - confirm against the training code).
    seed = 7

    # Image dimensions as a 2-tuple: 256 x 256 pixels.
    img_size = (256,256)

    # Number of samples per batch fed to the model during training.
    BATCH_SIZE = 3

    # tf.data.AUTOTUNE is a sentinel constant (not a module): passing it to
    # tf.data operations lets the runtime tune the corresponding setting
    # dynamically.
    Autotune = tf.data.AUTOTUNE

    # Fraction (0.2 = 20%) of the training data held out for validation.
    validation_size = 0.2

    # Human-readable name for each integer class label.
    class_dict= {
        0: 'No Flooding',
        1: 'Flooding',
    }

    # False = in training mode.
    test_run = False

# Input Data

Read more about the dataset here : https://clmrmb.github.io/SEN12-FLOOD/

In [8]:
# Sanity check of the label/source pairing: count the label entries whose
# matching source entry exists, then compare that count with the number of
# entries in the source directory.

# Dataset locations (s1 = Sentinel-1, s2 = Sentinel-2)
s1_labels = 'sen12flood/sen12floods_s1_labels/sen12floods_s1_labels/'
s1_tiles = 'sen12flood/sen12floods_s1_source/sen12floods_s1_source/'

s2_tiles = 'sen12flood/sen12floods_s2_source/sen12floods_s2_source/'
s2_labels = 'sen12flood/sen12floods_s2_labels/sen12floods_s2_labels/'

# A source entry carries the same name as its label entry, with 'labels'
# replaced by 'source'. Summing the booleans counts the matches.
s1_check = sum(
    os.path.exists(s1_tiles + '/' + entry.replace('labels','source'))
    for entry in os.listdir(s1_labels)
)
assert s1_check == len(os.listdir(s1_tiles)), 'Not present'

s2_check = sum(
    os.path.exists(s2_tiles + '/' + entry.replace('labels','source'))
    for entry in os.listdir(s2_labels)
)
assert s2_check == len(os.listdir(s2_tiles)), 'Not present'

# Last expression of the cell: displays both counts
(s1_check, s2_check)

(3332, 2237)

In [9]:
def load_json(path):
    '''Load a JSON file and return its decoded content.

    Inputs
    path : path of the JSON file to read

    Returns
    The Python object produced by json.load (dict, list, ...).

    The file is opened explicitly as UTF-8 so that decoding does not depend
    on the platform's locale encoding, which would mis-decode or fail on
    non-ASCII characters on systems whose default is not UTF-8.
    '''
    with open(path,'r',encoding='utf-8') as file:
        return json.load(file)


# Helper Functions

In [11]:
def process_label_json(label_json):
    '''Flatten a single label JSON into a small dict.

    Reads label_json['geometry']['coordinates'] and the 'FLOODING', 'date'
    and 'tile' entries of label_json['properties'], and returns them under
    the keys 'geometry', 'label', 'date' and 'tile_number'.
    The 'FULL-DATA-COVERAGE' property is not extracted.
    '''
    coordinates = label_json['geometry']['coordinates']
    properties = label_json['properties']

    return {
        'geometry': coordinates,
        'label': properties['FLOODING'],
        'date': properties['date'],
        'tile_number': properties['tile'],
    }


def process_label_stac(stac_json):
    '''Return the value stored under the 'id' key of a STAC JSON.'''
    item_id = stac_json['id']
    return item_id
    
    


def image_path_from_label_dir(image_parent_dir:str,
                              label_file :str)->str:
    '''Build the image path that corresponds to a label entry.

    Every occurrence of 'labels' in label_file is replaced by 'source' and
    the result is appended to image_parent_dir with a '/' separator.
    '''
    source_name = label_file.replace('labels','source')
    return '/'.join((image_parent_dir, source_name))
    
    

def process_json(label_path,image_directory):
    '''Collect the metadata for a single example.

    Inputs
    label_path : path to the label folder of one example. The folder is
        expected to hold one file whose name starts with 'labels' and one
        other file, which is read as the STAC JSON.
    image_directory: path to the directory holding the image folders

    Returns
    A dict with the keys 'geometry', 'label', 'date', 'tile_number', 'id',
    'location_id' and 'image_dir'. When the matching image folder does not
    exist, {'File_not_found': <expected image folder path>} is returned
    instead.

    Raises
    FileNotFoundError when the label folder lacks the label file or the
    STAC file.
    '''
    # The folder name is the last path component. normpath drops trailing
    # separators, so 'a/b/' still yields 'b' rather than an empty name that
    # would silently resolve to the image root directory.
    folder_id = os.path.basename(os.path.normpath(label_path))
    image_dir_path = image_path_from_label_dir(image_directory,folder_id)

    if not os.path.exists(image_dir_path):
        return {'File_not_found':image_dir_path}

    # Files starting with 'labels' hold the label data; any other file in
    # the folder is treated as the STAC JSON (the last one read wins).
    label_json = None
    stac_json = None
    for file in os.listdir(label_path):
        file_path = os.path.join(label_path,file)
        if file.startswith('labels'):
            label_json = load_json(file_path)
        else:
            stac_json = load_json(file_path)

    # Fail with a clear message instead of an UnboundLocalError further down
    if label_json is None or stac_json is None:
        missing = 'label' if label_json is None else 'STAC'
        raise FileNotFoundError(
            f'No usable {missing} file found in label folder: {label_path}'
        )

    #get data
    info_dict = process_label_json(label_json)

    #get id
    info_dict['id'] = process_label_stac(stac_json)

    #location id : the fourth '_'-separated token of the id
    info_dict['location_id'] = info_dict['id'].split('_')[3]

    info_dict['image_dir'] = image_dir_path

    return info_dict


In [12]:
def get_dataframe(label_directory,image_directory):
    '''Build a dataframe with one row per example folder.

    Every entry of label_directory whose name starts with 'sen12' is passed
    to process_json together with image_directory; the returned dicts become
    the records of the dataframe.
    '''
    records = [
        process_json(label_path=label_directory + '/' + entry,
                     image_directory=image_directory)
        for entry in os.listdir(label_directory)
        if entry.startswith('sen12')
    ]

    return pd.DataFrame.from_records(data = records)



def type_cast_dataset(dataset):
    '''Cast the 'label', 'date' and 'tile_number' columns in place.

    'label' becomes int, 'date' is parsed with pd.to_datetime and
    'tile_number' becomes int8. The same dataframe object is returned.
    '''
    # (column name, conversion) pairs, applied in this order
    conversions = (
        ('label', lambda column: column.astype(int)),
        ('date', pd.to_datetime),
        ('tile_number', lambda column: column.astype('int8')),
    )

    for name, convert in conversions:
        dataset[name] = convert(dataset[name])

    return dataset

In [13]:
%%time
# Sentinel-1 (SAR): collect the label records, then cast the column types
s1_data = type_cast_dataset(
    get_dataframe(label_directory=s1_labels, image_directory=s1_tiles)
)

# Sentinel-2 (optical): same two steps
s2_data = type_cast_dataset(
    get_dataframe(label_directory=s2_labels, image_directory=s2_tiles)
)

print(f'Number of unique locations in Sentinel1 (SAR) data : {s1_data.location_id.nunique()}')
print(f'Number of unique locations in Sentinel2 (optical) data : {s2_data.location_id.nunique()}')

# Last expression of the cell: displays the shapes of both tables
(s1_data.shape, s2_data.shape)

Number of unique locations in Sentinel1 (SAR) data : 335
Number of unique locations in Sentinel2 (optical) data : 335
CPU times: total: 1.38 s
Wall time: 1min 27s


((3331, 7), (2236, 7))

In [14]:
 # saving datasets
s1_data.to_csv(path_or_buf='s1_data.csv', index=False)  # Sentinel-1 table, row index not written
s2_data.to_csv(path_or_buf='s2_data.csv', index=False)  # Sentinel-2 table, row index not written