# 5. Machine Learning Methods
 
*Date: July 31, 2023*  
*Author: Alicia Larsen*     
*Institution: The Research Institute of Sweden (RISE)*   
*Contact: alicia.hh.larsen@gmail.com*   

This is the 6th notebook of 7, in the series "RISE Wildfire Prediction Using Machine Learning"

References: This notebook is based on the procedures in the notebook found on this [link](https://github.com/ornldaac/modis_restservice_qc_filter_Python/blob/master/modis_restservice_qc_filter_Python.ipynb). This notebook can also be found in /initial-eda/data-procurement/reference-notebook/download-modis-data-example-notebook.ipynb, on github.com:larsenalicia/RISE-wildfire-prediction.git

##### Keywords: LST, LSR, Fire, MODIS, Python

## Overview
This notebook will explore different prediction models and datasets using:
- Logistic Regression
- Random forest
- Support Vector Machines (SVM)

## Prerequisites: 

* Python 3.6 or later (the code uses f-strings and variable annotations)
* Libraries: pandas, numpy, scikit-learn
---

## Set-up
### Imports:

In [None]:
# General imports
import pandas as pd
import numpy as np

# Import 'LogisticRegression', plus the feature-selection and train/test-split utilities
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split

# Import 'RandomForestRegressor'
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score

# Import modules to visualise the random forest
from sklearn.tree import export_graphviz
# import pydot

# Import Support vector machine
from sklearn import svm

# Import for Cross validation
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report


from globals.global_vars import url, header, coordinate_description, lat, lon, start_year, end_year, products, bands, random_state, product_names
from procerdures.d_model import performace_matrix


In [None]:
# Settings for the train/test splits:
# fraction of samples held out for testing, and the seed that makes the split reproducible
test_size, seed = 0.33, 42

In [None]:
# One dataframe per data-set variant, keyed '<frequency>_<restriction>_<size>'
dataframes: dict = {}

for frequency in ['least', 'most']:                      # frequency variants
    for restriction in ['hard', 'loose']:                # filtering restrictions
        for size in ['largest', 'middle', 'smallest']:   # size variants

            # Locate and read the CSV that belongs to this combination
            csv_path = f'data/aggregation/normalized/alldata_{frequency}_{restriction}_{size}_{start_year}-{end_year}_{coordinate_description}.csv'
            raw_frame = pd.read_csv(csv_path)

            # The unnamed first CSV column becomes 'date'; rows are then indexed by (date, pixel)
            indexed_frame = raw_frame.rename(columns={'Unnamed: 0': 'date'})
            indexed_frame = indexed_frame.set_index(['date', 'pixel'])

            # Register the dataframe under its variant identifier
            dataframes[f'{frequency}_{restriction}_{size}'] = indexed_frame

# Show which variants were loaded
dataframes.keys()

In [None]:
# Preview the first rows of one arbitrarily chosen dataframe (they all look the same)
sample_frame = dataframes['least_loose_largest']
sample_frame.head()

In [None]:
# For every data-set variant, print its identifier and the distinct values in its 'fire' column
for key, frame in dataframes.items():
    print(key, frame['fire'].unique())

In [None]:
# Pick the data-set variant to model
df_data = dataframes['least_hard_largest']

# Features (three predictor columns) and target ('fire' column) as plain arrays
feature_columns = ['temperature_k', 'ndmi', 'evi']
X = df_data[feature_columns].values
y = df_data['fire'].values

# Hold out `test_size` of the samples for evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

# The three estimators to compare, kept in a fixed order: LR, RF, SVM
lr = LogisticRegression(random_state=random_state)
rf = RandomForestRegressor(n_estimators=50, random_state=random_state)
clf = svm.SVC(random_state=random_state)
models = [lr, rf, clf]

# Run every estimator through performace_matrix with the same split.
# NOTE(review): performace_matrix is a project helper not visible here; presumably it fits
# the model and returns it together with a 2x2 validation dataframe - confirm.
results = [performace_matrix(model, X_train, X_test, y_train, y_test) for model in models]

# Split the (model, validation dataframe) pairs into two parallel lists
trained_models = [fitted for fitted, _ in results]
validations_lst = [matrix for _, matrix in results]

In [None]:
# Summarise the validation dataframes in `validations_lst` (one per model) as a table
# with one row per model, each cell expressed as a share of all evaluated samples.
df_validation = pd.DataFrame(columns = ['frequency', 'restriction', 'size', 'model', 'true_negative', 'false_positive', 'false_negative', 'true_positive'])

# Identifier of the data set the models in `validations_lst` were trained on.
# It is spelled out here rather than read from the loop variable `key`, which at this
# point still holds the LAST key of `dataframes` from an earlier loop and would label
# the rows with the wrong data set. Keep it in sync with the dataframe chosen for training.
data_key = 'least_hard_largest'
key_lst = data_key.split('_')

# Labels of the models, in the order they were stored in `validations_lst`
model_names = ['LR', 'RF', 'SVM']

# Iterate through the models
for model_name, validation_model in zip(model_names, validations_lst):

    # Total number of samples in the 2x2 dataframe, computed once per model
    total = validation_model.sum().sum()

    # One row of the result table: data-set description, model label, normalised cells
    row: dict = {
        'frequency': key_lst[0],
        'restriction': key_lst[1],
        'size': key_lst[2],
        'model': model_name,
        'true_negative': validation_model.loc['Negative', 'predicted_negative'] / total,
        'false_positive': validation_model.loc['Negative', 'predicted_positive'] / total,
        'false_negative': validation_model.loc['Positive', 'predicted_negative'] / total,
        'true_positive': validation_model.loc['Positive', 'predicted_positive'] / total,
    }

    # Add the dictionary as the last row in the dataframe
    df_validation.loc[len(df_validation)] = row

# Change the format to multi-index, and show the result
df_validation_mi = df_validation.set_index(['frequency', 'restriction', 'size', 'model'])
df_validation_mi

In [None]:
# Per data-set identifier: the list of trained models and the list of their
# validation dataframes, both in the order LR, RF, SVM.
models_per_data: dict = {}
validations: dict = {}

# Iterate through the dataframe-identifiers
for key, df in dataframes.items():
    # Progress message (replaces a leftover debug print), so long runs show which data set is in work
    print(f'Training models on data set: {key}')

    # Define the independent and dependent variables
    X = df[['temperature_k', 'ndmi', 'evi']].values
    y = df['fire'].values

    # Split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

    # Initialize the models, and store them in a list for iterable access
    # NOTE(review): the random forest here is a regressor while the target is a class label;
    # whether that is handled depends on performace_matrix, which is not visible here - confirm.
    lr = LogisticRegression(random_state=random_state)
    rf = RandomForestRegressor(n_estimators = 50, random_state = random_state)
    clf = svm.SVC(random_state=random_state)
    models = [lr, rf, clf]
    trained_models = []
    validations_lst = []

    # Iterate through the models: pass each through performace_matrix and keep both returned values
    for model in models:
        trained_model, df_validation = performace_matrix(model, X_train, X_test, y_train, y_test)
        trained_models.append(trained_model)
        validations_lst.append(df_validation)

    # Store the trained models and their validation dataframes per input data
    models_per_data[key] = trained_models
    validations[key] = validations_lst

In [None]:
# Empty result table; one row per (data set, model) pair is appended below
df_validation = pd.DataFrame(columns = ['frequency', 'restriction', 'size', 'model', 'true_negative', 'false_positive', 'false_negative', 'true_positive'])

# Position of a model inside each per-data-set list -> short label
model_labels = {0: 'LR', 1: 'RF', 2: 'SVM'}

# Result column -> (row label, column label) of the cell to read from a 2x2 validation dataframe
matrix_cells = {
    'true_negative': ('Negative', 'predicted_negative'),
    'false_positive': ('Negative', 'predicted_positive'),
    'false_negative': ('Positive', 'predicted_negative'),
    'true_positive': ('Positive', 'predicted_positive'),
}

# Walk through the data sets; each maps to a list of validation dataframes, one per model
for key, validation_lst in validations.items():

    # The identifier has the form '<frequency>_<restriction>_<size>'
    key_lst = key.split('_')

    for i, validation_model in enumerate(validation_lst):

        # Describe the data set the row belongs to
        row: dict = {
            'frequency': key_lst[0],
            'restriction': key_lst[1],
            'size': key_lst[2],
        }

        # Label the model by its position in the list
        if i in model_labels:
            row['model'] = model_labels[i]

        # Copy the four raw cell values of the validation dataframe
        for column, (actual, predicted) in matrix_cells.items():
            row[column] = validation_model.loc[actual, predicted]

        # Append the row at the end of the table
        df_validation.loc[len(df_validation)] = row

# Re-index by (frequency, restriction, size, model) and display the table
df_validation_mi = df_validation.set_index(['frequency', 'restriction', 'size', 'model'])
df_validation_mi

### Statistics per data-set category

In [None]:
# Mean of every validation column, grouped by the 'frequency' index level
df_validation_freq = df_validation_mi.groupby(level='frequency').mean()
df_validation_freq

In [None]:
# Mean of every validation column, grouped by the 'restriction' (filtering) index level
df_validation_rest = df_validation_mi.groupby(level='restriction').mean()
df_validation_rest

In [None]:
# Mean of every validation column, grouped by the 'size' index level
df_validation_size = df_validation_mi.groupby(level='size').mean()
df_validation_size

In [None]:
# Mean of every validation column, grouped by the 'model' index level
df_validation_mod = df_validation_mi.groupby(level='model').mean()
df_validation_mod

### Statistics for random forest specifically, per data-set category

In [None]:
# Statistics for random forest only.
# Keep just the rows of the random-forest model, then index the result by
# (frequency, restriction, size). The previous version discarded the filtered frame and
# grouped the table of ALL models instead, so the "RF" numbers were averages over LR, RF and SVM.
df_validation_rf = df_validation[df_validation['model'] == 'RF']

# The 'model' column is constant after filtering and not numeric, so drop it before averaging
df_validation_rf = (
    df_validation_rf
    .drop(columns='model')
    .groupby(['frequency', 'restriction', 'size'])
    .mean()
)
df_validation_rf

In [None]:
# Random-forest statistics averaged per 'frequency' index level
df_validation_freq = df_validation_rf.groupby(level='frequency').mean()
df_validation_freq

In [None]:
# Random-forest statistics averaged per 'restriction' (filtering) index level
df_validation_rest = df_validation_rf.groupby(level='restriction').mean()
df_validation_rest

In [None]:
# Random-forest statistics averaged per 'size' index level
df_validation_size = df_validation_rf.groupby(level='size').mean()
df_validation_size

In [None]:
# Empty table that receives one row of coefficients per data set
lr_coefficients = pd.DataFrame(columns = ['frequency', 'restriction', 'size', 'lst', 'ndmi', 'evi'])

# Walk through the data sets together with their list of models
for key, trained_models in models_per_data.items():

    # The identifier has the form '<frequency>_<restriction>_<size>'
    key_lst = key.split('_')

    # The first model in each list is read as the logistic regression; its first
    # coefficient row holds one value per feature, in the order the features were passed in
    coefficients = trained_models[0].coef_[0]

    # Append the data-set description and the three coefficients as a new last row
    lr_coefficients.loc[len(lr_coefficients)] = {
        'frequency': key_lst[0],
        'restriction': key_lst[1],
        'size': key_lst[2],
        'lst': coefficients[0],
        'ndmi': coefficients[1],
        'evi': coefficients[2],
    }

# Re-index by (frequency, restriction, size) and display the table
lr_coefficients = lr_coefficients.set_index(['frequency', 'restriction', 'size'])
lr_coefficients

## F1 score validation

In [None]:
# F1 score of a class-weighted random-forest classifier, computed separately for every data set.
df_f1_validation = pd.DataFrame(columns = ['frequency', 'restriction', 'size', 'f1'])
k_folds = 10  # NOTE(review): defined but not read anywhere in this cell

# Iterate through the data sets
for key, df in dataframes.items():
    key_lst = key.split('_')

    # Build the features/target and the train/test split from THIS data set.
    # The previous version never split `df` and silently reused X_train/X_test/y_train/y_test
    # left over from an earlier cell, so every row reported the F1 score of the same data set.
    X = df[['temperature_k', 'ndmi', 'evi']].values
    y = df['fire'].values
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=seed)

    # Initialize the model; class 1 is weighted far more heavily than class 0
    rf = RandomForestClassifier(n_estimators = 100,
                        random_state = random_state,
                        class_weight = {0: 0.01, 1: 0.99})

    # Fit on the training part and score the predictions on the held-out part
    rf.fit(X_train, y_train)
    y_pred = np.rint(rf.predict(X_test))

    # One row of the result table: data-set description and its F1 score
    row: dict = {
        'frequency': key_lst[0],
        'restriction': key_lst[1],
        'size': key_lst[2],
        'f1': f1_score(y_test, y_pred, average='binary'),
    }

    # Add the dictionary as the last row in the dataframe
    df_f1_validation.loc[len(df_f1_validation)] = row

# Change the format to multi-index, and show the result
df_f1_validation_mi = df_f1_validation.set_index(['frequency', 'restriction', 'size'])
df_f1_validation_mi

In [None]:
# Mean F1 score per 'frequency' index level
df_k_validation_freq = df_f1_validation_mi.groupby(level='frequency').mean()
df_k_validation_freq

In [None]:
# Mean F1 score per 'restriction' (filtering) index level
df_k_validation_rest = df_f1_validation_mi.groupby(level='restriction').mean()
df_k_validation_rest

In [None]:
# Mean F1 score per 'size' index level
df_k_validation_size = df_f1_validation_mi.groupby(level='size').mean()
df_k_validation_size

## Wrap-up
Now you should know which model performs best.

Have a nice day!

/ Alicia