# 3. Data Preparation

*Date: August 1, 2023*  
*Author: Alicia Larsen*     
*Institution: The Research Institute of Sweden (RISE)*   
*Contact: alicia.hh.larsen@gmail.com*  

This is the 4th notebook of 7, in the series "RISE Wildfire Prediction Using Machine Learning"
##### Keywords: LST, LSR, Fire detection, MODIS, Python

## Reference
This notebook is based on the procedures in the notebook found on this [link](https://github.com/ornldaac/modis_restservice_qc_filter_Python/blob/master/modis_restservice_qc_filter_Python.ipynb). This notebook can also be found in /initial-eda/data-procurement/reference-notebook/download-modis-data-example-notebook.ipynb, on github.com:larsenalicia/RISE-wildfire-prediction.git

## Overview
The notebook will handle the following:
* Standardisation of spatial and temporal resolution.
* Value and unit derivations
* Area differentiation
* Normalization of features

## Prerequisites: 

* Python 3.6 or later (the code uses f-strings)   
* Libraries: requests, json, datetime, pandas, numpy, matplotlib
* Having run 1_data_procurement.ipynb, and have the resulting csv files in the directory /data/..

---
## Set-up
### Imports:

In [None]:
# Imports
import requests
import json
import pandas as pd
import numpy as np
import pickle
import sys
from pathlib import Path
import os

from globals.global_vars import bands, coordinate_description, start_year, end_year, products, data_points_in_time_interval_8, product_names, time_intervals, alternative_mapping, types, date_indices, original_dimensions, space_resolution, above_below_left_right
from procerdures.b_aggregation import half_time_interval, double_time_interval, resolution_decrement

### Load required data:

In [None]:
# Every filtered dataset that could be loaded, keyed by '<product>_<restriction>'
dataframes: dict = {}

# Iterate through the different filtering restrictions
for restriction in ['hard', 'loose']:

    # Iterate through the different products
    for product in product_names:
        csv_path = f'data/filtered/{restriction}/{product}_{start_year}-{end_year}_{coordinate_description}.csv'

        # Only a missing file is skipped; any other problem (e.g. a malformed CSV)
        # is raised instead of being silently swallowed.
        try:
            df_data = pd.read_csv(csv_path)
        except FileNotFoundError:
            print(f'Skipping missing file: {csv_path}')
            continue

        # The unnamed first CSV column is renamed to 'date'
        df_data = df_data.rename(columns={'Unnamed: 0': 'date'})

        # Add the dataframe to the dictionary, for access
        dataframes[f'{product}_{restriction}'] = df_data

dataframes.keys()

## Standardize time-intervals

In [None]:
# Gather the first 'date' entry of every dataframe;
# a single unique value means that they all start on the same date.
first_index = [frame.loc[0, 'date'] for frame in dataframes.values()]

len(set(first_index)) == 1

In [None]:
dataframes_time_interval_least_frequent: dict = {}
dataframes_time_interval_most_frequent: dict = {}

# NOTE: pickle can execute arbitrary code while loading; only load files produced by this project.
with open('globals/most_frequent.ob', 'rb') as fp:
    most_frequent_index = pickle.load(fp)

# The target intervals are the same for every dataframe, so they are computed once.
least_freq_interval = int(max(time_intervals.values()))
most_freq_interval = int(min(time_intervals.values()))

# Iterate through the keys for the dataframes
for key in dataframes:

    # Remove 'hard', and 'loose' from the key
    key_core = key.split('_')[0]

    # Some core-keys have multiple data-bands, and then we need the key that identifies the band
    if key_core not in products:
        key_core = alternative_mapping[key_core]

    # Determine the aggregation type (max/mean), e.g. max for ´fire´, and mean for all numerical data.
    # (Not named `type`, so that the builtin stays usable in the rest of the notebook.)
    aggregation_type = types[key_core]

    # The time interval the input dataframe currently has
    current_interval = int(time_intervals[products[key_core]])

    # ------------------------------------------------
    # Standardize the *least* frequent time interval
    # ------------------------------------------------
    df = dataframes[key]

    # If the dataframe already has this time interval, then store it as it is
    if current_interval == least_freq_interval:
        dataframes_time_interval_least_frequent[key] = df
    else:
        # Assumption: the date-intervals from modis will always be 2^n.
        # Halve the date-index until it is no longer than the least frequent index.
        # NOTE: if the loop body never runs, no entry is stored for this key.
        while len(df.index) > date_indices[least_freq_interval]:
            df = half_time_interval(df, aggregation_type)
            dataframes_time_interval_least_frequent[key] = df

    # ------------------------------------------------
    # Standardize the *most* frequent time interval
    # ------------------------------------------------
    df = dataframes[key]

    # If the dataframe already has this time interval, then store it as it is
    if current_interval == most_freq_interval:
        dataframes_time_interval_most_frequent[key] = df
    else:
        # Assumption: the date-intervals from modis will always be 2^n.
        # Double the date-index until it is as long as the most frequent index.
        # NOTE: if the loop body never runs, no entry is stored for this key.
        while len(df.index) < len(most_frequent_index):
            df = double_time_interval(df, most_frequent_index)
            dataframes_time_interval_most_frequent[key] = df

In [None]:
# Verify that every most-frequent dataframe has the same number of dates,
# and that an entry was stored for every input dataframe.
index_len_long = [len(frame.index) for frame in dataframes_time_interval_most_frequent.values()]
len(set(index_len_long)) == 1 and len(dataframes_time_interval_most_frequent) == len(dataframes)

In [None]:
# Check that every least-frequent dataframe has the same number of dates,
# and that an entry was stored for every input dataframe.
index_len_short = []
for df in dataframes_time_interval_least_frequent.values():
    index_len_short.append(len(df.index))

# The key count must be taken from the *least* frequent dictionary, the one being checked here.
len(set(index_len_short)) == 1 and len(dataframes_time_interval_least_frequent.keys()) == len(dataframes.keys())

In [None]:
# Group both sets of time-standardized dataframes under a label for their frequency
time_standardized_dataframes: dict = {
    'least_freq': dataframes_time_interval_least_frequent,
    'most_freq': dataframes_time_interval_most_frequent,
}

## Standardize the number of pixels

Before stacking the dataframes to a two-level multiindex, we will reduce the number of pixels in the <code>df_nir</code> and <code>df_swir</code> dataframes. Since the NDMI inputs have a higher resolution than the other two, their pixels will be aggregated so that all dataframes have equal resolution.

In [None]:
standardized_dataframes: dict = {}
total = len(dataframes_time_interval_least_frequent) + len(dataframes_time_interval_most_frequent)

# Every dataframe is brought down to the smallest dimension listed in ´original_dimensions´
desired_dimension = min(original_dimensions.values())

# Iterate through the dictionaries of time-standardized dataframes
for time_key, dict_ in time_standardized_dataframes.items():

    # Iterate through every key, which identifies the dataframe.
    # The dataframe is taken from the time-standardized dictionary (not from the raw
    # ´dataframes´), otherwise the time standardisation would be thrown away here.
    for key, df in dict_.items():

        # Remove 'hard', and 'loose' from the key
        key_core = key.split('_')[0]
        current_dimension = original_dimensions[key_core]

        # Only decrease the dimension if it is not already the desired (lowest)
        if desired_dimension == current_dimension:
            standardized_dataframes[f'{time_key}_{key}'] = df
        else:
            standardized_dataframes[f'{time_key}_{key}'] = resolution_decrement(df, current_dimension, desired_dimension)

        # Progress report
        print(f'✅ {key} {len(standardized_dataframes.keys())}/{total}')


## Change format

In [None]:
# Stack every dataframe: 'date' becomes the index, the remaining columns are stacked
# into a second index level and the values end up in a single-column frame.
formatted_standardized_dataframes: dict = {
    name: frame.set_index('date').stack().to_frame()
    for name, frame in standardized_dataframes.items()
}

In [None]:
# Display the third stacked dataframe as a sanity check
df_test = [*formatted_standardized_dataframes.values()][2]
df_test

## Land Surface Temperature (LST): Kelvin to Celsius
Although the Kelvin-scaled dataframe will be used for the data mining methods, we will prepare a Celsius-based dataframe for exploratory data analysis.

In [None]:
celsius_dataframes: dict = {}

# 0 °C corresponds to 273.15 K
KELVIN_OFFSET = 273.15

# Iterate through the keys in the most recent dictionary of dataframes
for key in formatted_standardized_dataframes:

    # Retrieve the ´predictor´ (third part of the key)
    core_key = key.split('_')[2]

    # If the ´predictor´ is land surface temperature (LST), then:
    if core_key == 'lst':

        # Convert Kelvin to Celsius, and store the result in ´celsius_dataframes´ under the same key
        df_lst = formatted_standardized_dataframes[key]
        df_lst_celsius = df_lst - KELVIN_OFFSET
        celsius_dataframes[key] = df_lst_celsius

In [None]:
# Register every Celsius dataframe in ´formatted_standardized_dataframes´,
# under the LST key with its predictor part replaced by 'celsius'.
for key, df in celsius_dataframes.items():
    key_lst = key.split('_')
    key_lst[2] = 'celsius'
    new_key = '_'.join(key_lst)
    formatted_standardized_dataframes[new_key] = df

## Surface reflectance in NIR and SWIR to NDMI Index
The desired index to use as a predictor in the analysis is the normalised difference moisture index (NDMI). 

The current data is the percentage of surface reflectance in the near infrared (NIR) and the short wave-length infrared (SWIR) bands. These bands can be used to calculate the NDMI:

<code>NDMI = (NIR - SWIR) / (NIR + SWIR)</code>

Let's make a function describing this formula:

In [None]:
# NDMI formula
def ndmi(nir, swir):
    """Return the normalised difference moisture index: (nir - swir) / (nir + swir).

    The arithmetic is applied as-is, so any operands that support ``-``, ``+``
    and ``/`` can be passed (the notebook passes pandas Series).
    """
    difference = nir - swir
    total = nir + swir
    return difference / total

In [None]:
ndmi_dataframes: dict = {}
seen: list = []

# Iterate through the dataframe-identifiers (dictionary keys) in ´formatted_standardized_dataframes´
for key1 in formatted_standardized_dataframes.keys():

    # Dissect the key, to access key-words
    key_lst1 = key1.split('_')

    # Only NIR dataframes that have not been handled yet start a pairing.
    # (The check must use ´key1´, the key of this loop, not a leftover ´key´ from an earlier cell.)
    if key_lst1[2] != 'nir' or key1 in seen:
        continue
    seen.append(key1)

    # Iterate through the keys again, to find the SWIR partner of the NIR dataframe
    for key2 in formatted_standardized_dataframes.keys():
        key_lst2 = key2.split('_')

        # The keys must be the same, apart from the predictor, which is "swir" for the second key
        is_swir_partner = (
            (key_lst1[0] == key_lst2[0])
            and (key_lst1[1] == key_lst2[1])
            and (key_lst2[2] == 'swir')
            and (key_lst1[3] == key_lst2[3])
        )
        if not is_swir_partner:
            continue
        seen.append(key2)

        # Retrieve the dataframes for both keys
        df_nir = formatted_standardized_dataframes[key1]
        df_swir = formatted_standardized_dataframes[key2]

        # Calculate the NDMI from the single value column (0) of both frames,
        # and store it under the NIR key with 'ndmi' as predictor
        df_ndmi = ndmi(df_nir[0], df_swir[0]).to_frame()
        ndmi_key = '_'.join(key_lst1[:2] + ['ndmi'] + key_lst1[3:])
        ndmi_dataframes[ndmi_key] = df_ndmi

In [None]:
# Drop the handled NIR and SWIR dataframes from ´formatted_standardized_dataframes´
for key in seen:
    formatted_standardized_dataframes.pop(key)

# ... and add the NDMI dataframes that were derived from them
formatted_standardized_dataframes.update(ndmi_dataframes)

## Change of fire detection classification
The filtered data include:

- 5: non-fire land pixel
- 7: fire (low confidence, land or water)
- 8: fire (nominal confidence, land or water)
- 9: fire (high confidence, land or water)

Let's renumber these into a binary schema, with two levels of restriction: one that classifies "low" confidence fire as fire and one that classifies it as non-fire:

More restricted aggregation:
- 0: non-fire land pixel, low confidence of fire.
- 1: fire (average/high confidence, land or water)

Less restricted aggregation:
- 0: non-fire land pixel
- 1: fire (low/average/high confidence, land or water)

In [None]:
fire_dataframes: dict = {}
seen_fire = []

# Per restriction: values at or below the threshold become 0, values above it become 1.
# NOTE(review): with only the classes 5/7/8/9 in the data both thresholds give the same
# result (7 counts as fire in both) — confirm against the description of the two schemas.
fire_thresholds = {'hard': 6.0, 'loose': 5.0}

# Iterate through the dataframes in ´formatted_standardized_dataframes´
for key, df in formatted_standardized_dataframes.items():

    # Dissect the key, to access key-words
    key_lst = key.split('_')

    # Only the fire-detection dataframes are re-classified
    if key_lst[2] != 'fire':
        continue

    if key_lst[3] not in fire_thresholds:
        raise ValueError('The naming of the dataframes are wrong')
    threshold = fire_thresholds[key_lst[3]]

    # The dataframe is modified in place: first the non-fire rows, then the fire rows
    df.loc[(df[0] <= threshold)] = 0
    df.loc[(df[0] > threshold)] = 1

    seen_fire.append(key)
    fire_dataframes[key] = df

In [None]:
# Drop the original fire dataframes from ´formatted_standardized_dataframes´
for key in seen_fire:
    formatted_standardized_dataframes.pop(key)

# ... and add the re-classified (binary) fire dataframes back
formatted_standardized_dataframes.update(fire_dataframes)

## Aggregate all data to one dataframe

In [None]:
# Number of rows of the least-frequent, 'hard' dataframe of every predictor
lengths = {
    predictor: len(formatted_standardized_dataframes[f'least_freq_{predictor}_hard'].index)
    for predictor in ('lst', 'ndmi', 'evi', 'fire')
}

print(f'''
LENGTH OF DATAFRAMES
--------------------
temp: {lengths['lst']}
ndmi: {lengths['ndmi']}
evi:  {lengths['evi']}
fire: {lengths['fire']}
''')

In [None]:
# Build one wide dataframe per (frequency, restriction) combination, with one column per predictor.
dataframes_aggregated: dict = {}
# Holds the '<frequency>_<restriction>' combinations that have already been aggregated
seen: list = []

# Iterate through the dataframe-identifiers (dictionary keys) in ´formatted_standardized_dataframes´
for key1 in formatted_standardized_dataframes.keys():

    # Dissect the key, to access key-words
    key_lst1 = key1.split('_')

    # Define core-words: first, third and fourth part of the key
    # (e.g. 'least_freq_lst_hard' gives 'least', 'lst' and 'hard')
    frequency1 = key_lst1[0]
    predictor1 = key_lst1[2]
    restriction1 = key_lst1[3]
    core_key1 = '_'.join([frequency1, restriction1])

    # The LST dataframe is used as the starting point of every aggregated dataframe
    if predictor1 == 'lst':
        print('lst')
        # If this frequency/restriction combination has not already been handled:
        if core_key1 not in seen:    
            seen.append(core_key1)
            
            # Define the initial dataframe; its value column (0) is renamed to the predictor name
            df_data = formatted_standardized_dataframes[key1].rename(columns={0: f'{predictor1}'})

            # Iterate through the keys again, to find the dataframes to attach
            for key2 in formatted_standardized_dataframes.keys():

                # Dissect the key, to access key-words
                key_lst2 = key2.split('_')
                frequency2 = key_lst2[0]
                predictor2 = key_lst2[2]
                restriction2 = key_lst2[3]
                core_key2 = '_'.join([frequency2, restriction2]) # same format as core_key1

                # Check if the categories of restriction, and frequency are the same, then:
                if core_key1 == core_key2:
                    
                    # Skip the LST dataframe itself, it is already the first column
                    if predictor1 != predictor2:

                        # Attach the dataframe as a new column, named after its predictor
                        df_data = pd.concat([df_data, formatted_standardized_dataframes[key2].rename(columns={0: f'{predictor2}'})], axis=1)
                
                # An NDMI dataframe with the same frequency but another restriction is attached as well.
                # NOTE(review): if an NDMI dataframe also exists for this restriction, the result
                # gets two 'ndmi' columns — confirm that NDMI only exists for one restriction.
                elif predictor2 == 'ndmi' and (frequency1 == frequency2):
                    # Attach the dataframe as a new column, named after its predictor
                    df_data = pd.concat([df_data, formatted_standardized_dataframes[key2].rename(columns={0: f'{predictor2}'})], axis=1)
                
            # Lastly, add every aggregated dataframe (per category) in a dictionary for storage
            dataframes_aggregated[core_key1] = df_data
        else:
            pass

print('done!')

In [None]:
# Check that dataframes_aggregated has the expected keys
# (one '<frequency>_<restriction>' key per aggregated dataframe)
dataframes_aggregated.keys()

In [None]:
# Look at an example: the first rows of the aggregated dataframe stored under 'least_hard'
df_test = dataframes_aggregated['least_hard']
df_test.head()

## Data Normalisation

In [None]:
dataframes_aggregated_normalized: dict = {}

# Output column -> source column, for every column that is standardized
standardized_columns = {
    'ndmi': 'ndmi',
    'temperature_k': 'lst',
    'temperature_c': 'celsius',
    'evi': 'evi',
}

# Iterate through the aggregated dataframes
for key, aggregated in dataframes_aggregated.items():
    normalized = pd.DataFrame()

    # Standardize every predictor: subtract the column mean and divide by the column standard deviation
    for target, source in standardized_columns.items():
        column = aggregated[source]
        normalized[target] = (column - column.mean()) / column.std()

    # The fire label is copied without scaling
    normalized['fire'] = aggregated['fire']

    # Store the result without the rows that contain NaN values
    dataframes_aggregated_normalized[key] = normalized.dropna()

In [None]:
# Show which distinct fire values each normalized dataframe contains
for key, frame in dataframes_aggregated_normalized.items():
    print(key, frame['fire'].unique())

## Area differentiation

In [None]:
# Helper for calculating the dimension of an area.
def dimension(above_below_left_right):
    """Return ``int(2 * above_below_left_right / largest space resolution) + 1``.

    The divisor is the largest value in the global ``space_resolution``.
    Presumably the result is the number of pixels along one side of an area
    that extends ``above_below_left_right`` from its centre in every
    direction — confirm the units against the globals module.
    """
    largest_resolution = max(space_resolution.values())
    pixels_across = (2 * above_below_left_right) / largest_resolution
    return int(pixels_across) + 1

In [None]:
# Side lengths of the three areas; every smaller area should cover roughly half the pixels
# of the previous one, so the side length is divided by the square root of two.
dimension_largest = dimension(above_below_left_right)
dimension_middle = int(np.ceil((dimension_largest ** 2 / 2) ** 0.5))
dimension_smallest = int((dimension_middle ** 2 / 2) ** 0.5)

print(f"""
DIMENSIONS
------------------------
Largest dimension:    {dimension_largest}
Middle dimension:     {dimension_middle}
Smallest dimension:   {dimension_smallest}
""")

In [None]:
# Function that defines the pixel range
def pixel_range(dimension_self):
    """Return the pixel numbers of a ``dimension_self`` x ``dimension_self`` block.

    The block is cut out of the grid whose side length is the global
    ``dimension_largest``. ``padding`` rows are skipped at the top, and every
    selected row starts exactly ``dimension_largest`` pixels after the previous
    one. (The stride used to be ``2 * padding + dimension_self``, which is one
    pixel short whenever the size difference is odd, so the rows drifted.)

    Prints whether the expected number of pixels was collected.

    NOTE(review): the first column of every row is ``padding - 1``; whether that
    centres the block depends on how the pixels are numbered — confirm.
    """
    # Rows skipped above the block (the size difference is split in two, rounded down)
    padding = int((dimension_largest - dimension_self) / 2)
    first_column = padding - 1
    pixels_lst = []

    # Walk through the rows of the block and collect their pixel numbers
    for row in range(dimension_self):
        row_start = dimension_largest * (padding + row)
        first_pixel = row_start + first_column
        pixels_lst.extend(range(first_pixel, first_pixel + dimension_self))

    # Check if the number of pixels were added as expected.
    print(f'Expected number of pixels: {len(pixels_lst) == dimension_self**2}')

    return pixels_lst

In [None]:
# Build the pixel lists of the two smaller areas;
# ´pixel_range´ prints whether each list has the expected number of pixels.
pixels_middle, pixels_smallest = (
    pixel_range(side) for side in (dimension_middle, dimension_smallest)
)

In [None]:
# Function that reduces a dataframe to a selection of pixels.
def smaller_dataframe(df, pixels):
    """Return the rows of ``df`` whose pixel is listed in ``pixels``.

    ``df`` is expected to carry 'date' and an unnamed second index level (which
    shows up as 'level_1' after resetting the index). The result is indexed by
    ('date', 'pixel').
    """
    # The pixel labels in the dataframe are strings, so the numbers are converted to match
    wanted = [str(pixel) for pixel in pixels]

    # Flatten the index, keep the wanted pixels and rebuild the two-level index
    flat = df.reset_index().rename(columns={'level_1': 'pixel'})
    selected = flat[flat['pixel'].isin(wanted)]
    return selected.set_index(['date', 'pixel'])

In [None]:
dataframes: dict = {}
pixel_nums: dict = {'largest': dimension_largest**2, 'middle': pixels_middle, 'smallest': pixels_smallest}

# Combine every area with every normalized dataframe
for pix_name, pixels in pixel_nums.items():
    for key, df in dataframes_aggregated_normalized.items():

        # The smaller areas are cut out with ´smaller_dataframe´
        if pix_name != 'largest':
            dataframes[f'{key}_{pix_name}'] = smaller_dataframe(df, pixels)
            continue

        # The largest area keeps every pixel; only the index is rebuilt as ('date', 'pixel')
        flat = df.reset_index().rename(columns={'level_1': 'pixel'})
        dataframes[f'{key}_{pix_name}'] = flat.set_index(['date', 'pixel'])

In [None]:
# Show which distinct fire values each final dataframe contains
for key, frame in dataframes.items():
    print(key, frame['fire'].unique())

## Data storage

In [None]:
# Output directories; created up front because ´to_csv´ does not create missing directories
raw_dir = Path('data/aggregation/raw')
normalized_dir = Path('data/aggregation/normalized')
for directory in (raw_dir, normalized_dir):
    directory.mkdir(parents=True, exist_ok=True)

# Save the raw, but aggregated data to CSVs
for key, df in dataframes_aggregated.items():
    df.to_csv(raw_dir / f'alldata_{key}_{start_year}-{end_year}_{coordinate_description}.csv')

# Save the normalized, and pixel-differentiated dataframes to CSVs
for key, df in dataframes.items():
    df.to_csv(normalized_dir / f'alldata_{key}_{start_year}-{end_year}_{coordinate_description}.csv')

## Wrap-up
Now you should have pre-processed all datasets.

Have a nice day!

/ Alicia