# 📚 Import Libraries

In [None]:
%matplotlib inline
import pandas as pd
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
import random
InteractiveShell.ast_node_interactivity = "all"
import os
from datetime import datetime
import plotly.express as px
import glob
from tqdm import tqdm
from shutil import copyfile
from string import Template
import gc

## NASA FIRMS - Fire Information for Resource Management System

![FIRMS](https://cdn.earthdata.nasa.gov/conduit/upload/17937/Screen_Shot_2021-09-20_at_2.41.29_PM.png)

NASA's FIRMS (https://earthdata.nasa.gov/firms) distributes Near Real-Time (NRT) active fire data within 3 hours of satellite observation from the Moderate Resolution Imaging Spectroradiometer ([MODIS](https://modis.gsfc.nasa.gov/)) aboard the Aqua and Terra satellites, and the Visible Infrared Imaging Radiometer Suite ([VIIRS](https://www.jpss.noaa.gov/viirs.html)) aboard S-NPP and NOAA 20.


# 📌 Key-Points

### Read and check the chunks

The archive fire datasets can be requested at 
https://firms.modaps.eosdis.nasa.gov/download/ in yearly chunks for each instrument.

* [MODIS](https://earthdata.nasa.gov/earth-observation-data/near-real-time/firms/c6-mcd14dl) Collection 6.1: Temporal Coverage: 11 November 2000 - present
* [VIIRS](https://ncc.nesdis.noaa.gov/VIIRS/) S-NPP 375m: Temporal Coverage: 20 January 2012 - present
* [VIIRS NOAA-20](https://ncc.nesdis.noaa.gov/NOAA-20/NOAA20VIIRS.php) 375m: Temporal Coverage: 1 January 2020 - present

Since NOAA-20 has less than two years of data, let's focus on the other instruments.

In [None]:
# DATA_DIR keeps datasets fetched from https://firms.modaps.eosdis.nasa.gov/download/
DATA_DIR = './wildfire-firms-dataset'
# glob returns matches in arbitrary, filesystem-dependent order; sorting keeps
# every later per-file loop (and the final concatenation) reproducible.
filenames = sorted(glob.glob(DATA_DIR + '/data/*/*.csv'))
filenames
len(filenames)

### Prepare file stats

In [None]:
# Build one summary row per CSV (size, date range, source identifiers and a
# few cardinalities) so the chunks can be sanity-checked before processing.
rows = []
for f in tqdm(filenames):
    # No date parsing is requested: `start`/`end` are the min/max of
    # `acq_date` exactly as read from the file.
    df = pd.read_csv(f, low_memory=False)
    # basename copes with whichever path separator glob produced on this OS.
    csv_name = os.path.basename(f)
    row = [
        f, csv_name, df.shape[0], df.shape[1], df.acq_date.min(), df.acq_date.max(),
        df.satellite.max(), df.instrument.max(), df.version.max(),
        df.latitude.nunique(), df.longitude.nunique(),
        df.confidence.nunique(), df.satellite.nunique(), df.acq_date.nunique()
    ]
    rows.append(row)

    # Release the frame before the next file is loaded to keep peak memory down.
    del df
    gc.collect()

# Column labels, in the same order as the values collected in each row above.
cols = [
    'path', 'csv', 'rows', 'cols', 'start', 'end',
    'satellite', 'instrument', 'version',
    'lats', 'lons', 'confs', 'sats', 'days'
]
filestats = pd.DataFrame(rows, columns=cols)
# Display only: the sorted copy is not assigned, so `filestats` keeps file order.
filestats.sort_values(by=['start', 'instrument'])
filestats.head()

### Raw fire readings

The satellite takes a ‘snapshot’ of events as it passes over the earth. Each hotspot/active fire detection represents the center of a pixel flagged as containing one or more fires, or other thermal anomalies (such as volcanoes). For MODIS the pixel is approximately 1km and for VIIRS the pixel is approximately 375m. The “location” is the center point of the pixel (not necessarily the coordinates of the actual fire).

In [None]:
# Peek at the first two per-file summary rows.
filestats.head(n=2)

In [None]:
# Pass the stats table through `my_utils.reduce_mem_usage` and keep the result.
# NOTE(review): no import of `my_utils` is visible in this notebook — confirm it
# is available in the session, otherwise this cell raises NameError.
# Presumably the helper shrinks column dtypes; verify against its definition.
filestats = my_utils.reduce_mem_usage(filestats)
filestats.head(2)

### Confidence

The raw dataset has more detailed sensor measurements:

* **brightness**: Channel 21/22 brightness temperature of the fire pixel measured in Kelvin.
* **bright_t31**: Channel 31 brightness temperature of the fire pixel measured in Kelvin.
* **frp**: Fire Radiative Power depicts the pixel-integrated fire radiative power in MW (megawatts). 
* **type**: Inferred hot spot type (0 = presumed vegetation fire, 1 = active volcano, 2 = other static land source, 3 = offshore)
* **confidence**: This value is based on a collection of intermediate algorithm quantities used in the detection process. It is intended to help users gauge the quality of individual hotspot/fire pixels. Confidence estimates range between 0 and 100% and are assigned one of the three fire classes (low-confidence fire, nominal-confidence fire, or high-confidence fire).

For the machine learning model I will keep the provided confidence values and use them to filter out the less confident fire detection records.

In [None]:
# Count, file by file, how many detections carry each confidence value.
dfs = []
for f in tqdm(filenames):
    # Only the confidence column is needed, which keeps the read cheap.
    c = pd.read_csv(f, usecols=['confidence'], low_memory=False)
    # basename copes with whichever path separator glob produced on this OS.
    csv_name = os.path.basename(f)
    # One row per distinct confidence value with its number of occurrences.
    cnt = c.groupby('confidence').size().reset_index()
    # Tag the counts with their source file so they stay distinguishable
    # once the per-file tables are combined.
    cnt['csv'] = csv_name
    dfs.append(cnt)

    # Release the frame before the next file is loaded.
    del c
    gc.collect()

In [None]:
# Stack the per-file confidence counts into a single table.
confidences = pd.concat(objs=dfs)

# Process each chunk

I removed fire readings whose confidence is low (class `l`) or numerically below 50. For simplicity, the coordinates are rounded to two decimal degrees, which is roughly 1.1 km at the Equator. For better spatial resolution, the original VIIRS records could be used.


In [None]:
# Detections whose confidence is below this value are discarded.
FIRE_LOW_CONF = 50

# For every accepted file: round coordinates, reduce to one record per
# (location, day, satellite, instrument), drop low-confidence records, write
# the result to its own CSV and keep it in `chunks` for the final dataset.
chunks = []
cols_to_read = ['latitude', 'longitude', 'acq_date', 'satellite', 'instrument', 'confidence']
for f in tqdm(filenames):
    fire = pd.read_csv(f, usecols=cols_to_read, parse_dates=['acq_date'], low_memory=False)
    # The first row alone decides whether a file is processed. An empty file
    # has no first row, so it is skipped instead of raising on the lookup.
    if fire.empty or fire['satellite'].iloc[0] not in ('Terra', 'Aqua', 'N'):
        # A bare expression inside a loop body displays nothing, so the skip
        # is reported with an explicit print.
        print('skip', f)
    else:
        fire['latitude'] = fire['latitude'].round(2)
        fire['longitude'] = fire['longitude'].round(2)
        # Letter classes are mapped onto the numeric scale; values that are
        # not keys of the mapping pass through unchanged. to_numeric makes the
        # column numeric whether or not `replace` infers that dtype itself.
        fire['confidence'] = pd.to_numeric(
            fire['confidence'].replace({'l': 0, 'n': 50, 'h': 100}))
        # Keep the highest confidence seen per location/day/satellite/instrument.
        daily_fires = fire.groupby(
            ['latitude', 'longitude', 'acq_date', 'satellite', 'instrument']
        )['confidence'].max().reset_index()
        # Remove low confidence records
        daily_fires = daily_fires[daily_fires['confidence'] >= FIRE_LOW_CONF]

        # The output file is named after the first row's instrument and the
        # earliest acquisition date found in the file.
        instrument = fire['instrument'].iloc[0]
        start = fire['acq_date'].min()
        print(instrument, start, fire.shape[0], daily_fires.shape[0])
        daily_fires.to_csv(f'{instrument}_{start.strftime("%Y%m%d")}.csv', index=False)
        chunks.append(daily_fires)

    # Release the raw frame before the next file is loaded.
    del fire
    gc.collect()

### Globalfire dataset is ready!

In [None]:
# Combine every processed chunk into one table, show its size and first rows,
# then persist it as a gzip-compressed CSV without the index column.
full_dataset = pd.concat(objs=chunks)
full_dataset.shape
full_dataset.head()
full_dataset.to_csv(
    'firms_fire_daily.csv.gz',
    compression='gzip',
    index=False,
)