# 5. Machine Learning Methods
 
*Date: July 31, 2023*  
*Author: Alicia Larsen*     
*Institution: The Research Institute of Sweden (RISE)*   
*Contact: alicia.hh.larsen@gmail.com*   

This is the 7th notebook of 7, in the series "RISE Wildfire Prediction Using Machine Learning"

References: This notebook is based on the procedures in the notebook found on this [link](https://github.com/ornldaac/modis_restservice_qc_filter_Python/blob/master/modis_restservice_qc_filter_Python.ipynb). This notebook can also be found in /initial-eda/data-procurement/reference-notebook/download-modis-data-example-notebook.ipynb, on github.com:larsenalicia/RISE-wildfire-prediction.git

##### Keywords: LST, LSR, Fire, MODIS, Python

## Overview
This notebook will mainly explore under- and oversampling techniques.

## Prerequisites: 

* Python 3.6 or later (the code uses f-strings)   
* Libraries: pandas, numpy, scikit-learn, imbalanced-learn, seaborn, matplotlib, pydot
---

## Set-up
### Imports:

In [None]:
# General imports
import pandas as pd
import numpy as np

# Import 'LogisticRegression' and create a LogisticRegression object
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.model_selection import train_test_split

# Import 'RandomForestRegressor'
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import IsolationForest

# Import modules to visualise the random forest
from sklearn.tree import export_graphviz
import pydot

# SMOTE
from imblearn.over_sampling import SMOTE
from collections import Counter

# UnderSampling
from imblearn.under_sampling import RandomUnderSampler

# Import for Cross validation
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import f1_score

# Visualisations
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt

from globals.global_vars import url, header, coordinate_description, lat, lon, start_year, end_year, products, bands, random_state, product_names
from procerdures.d_model import performace_matrix

In [None]:
# Experiment-wide settings: the fraction of samples held out for testing and
# the seed handed to the train/test split and the resamplers.
test_size, seed = 0.33, 42

In [None]:
# Every aggregated dataset variant, keyed by '<frequency>_<restriction>_<size>'
dataframes: dict = {}

# Walk through all combinations of frequency, filtering restriction and size
for frequency in ('least', 'most'):
    for restriction in ('hard', 'loose'):
        for size in ('largest', 'middle', 'smallest'):
            variant = f'{frequency}_{restriction}_{size}'

            # Load the normalized CSV that belongs to this variant
            df_data = pd.read_csv(f'data/aggregation/normalized/alldata_{frequency}_{restriction}_{size}_{start_year}-{end_year}_{coordinate_description}.csv')

            # The unnamed first column becomes 'date'; together with 'pixel'
            # it forms the index of the stored dataframe
            renamed = df_data.rename(columns={'Unnamed: 0': 'date'})
            dataframes[variant] = renamed.set_index(['date', 'pixel'])

# Show which variants were loaded
dataframes.keys()

In [None]:
# Continue with the dataset variant judged to be the most promising
selected_variant = 'least_hard_largest'
df_data = dataframes[selected_variant]

# Preview the first rows
df_data.head()

In [None]:
# Independent variables (features) and the dependent variable (target)
feature_columns = ['temperature_k', 'ndmi', 'evi']
X = df_data[feature_columns].values
y = df_data['fire'].values

# Hold out a share of the samples (test_size) for evaluation; the split is seeded
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=test_size,
    random_state=seed,
)

## Train a model without compensating for value imbalance

In [None]:
# Baseline: a random forest fitted on the original (unresampled) training split
rf = RandomForestClassifier(random_state=random_state, n_estimators=100)
rf.fit(X_train, y_train)

# Binary F1 score of the predictions on the held-out test split
y_pred = np.rint(rf.predict(X_test))
f1 = f1_score(y_test, y_pred, average='binary')

print(f1)

## Compensate for value imbalance

In [None]:
# Count how many training samples carry the label 1.0 and how many carry 0.0
train_num_pos = np.count_nonzero(y_train == 1.0)
train_num_neg = np.count_nonzero(y_train == 0.0)

# Display both counts to see how imbalanced the classes are
train_num_pos, train_num_neg

### Undersampling

In [None]:
# Sweep over the number of negative samples that are kept and record the
# F1 score reached with each setting
under_over_sampling = {'param': [], 'f1_un': []}
f1 = 0
undersampling_parameter = 0

# Candidates are the cubes 1, 8, 27, ... below the number of negatives
cube_root_limit = int(np.floor(train_num_neg**(1/3)))

for root in range(1, cube_root_limit):
    n_negatives = root**3

    # Randomly reduce the negatives to n_negatives; the positives are untouched
    rus = RandomUnderSampler(
        sampling_strategy={0.0: n_negatives, 1.0: train_num_pos},
        random_state=seed,
    )
    X_train_res, y_train_res = rus.fit_resample(X_train, y_train)

    # Fit a fresh forest on the resampled training data
    rf = RandomForestClassifier(random_state=random_state, n_estimators=100)
    rf.fit(X_train_res, y_train_res)

    # Score it on the untouched test split
    y_pred = np.rint(rf.predict(X_test))
    f1_new = f1_score(y_test, y_pred, average='binary')

    # Keep every result so it can be inspected and plotted afterwards
    under_over_sampling['param'].append(n_negatives)
    under_over_sampling['f1_un'].append(f1_new)

    # Remember the setting with the highest F1 score seen so far
    if f1_new > f1:
        f1, undersampling_parameter = f1_new, n_negatives

print(undersampling_parameter)

In [None]:
# Turn the recorded sweep results into a dataframe.
# NOTE: despite the '_ov' suffix, this frame holds the undersampling
# results (columns 'param' and 'f1_un').
df_data_ov = pd.DataFrame(under_over_sampling)
df_data_ov.head()

In [None]:
# Plot the F1 score against the number of negative samples that were kept
fig, ax = plt.subplots()
sns.lineplot(x='param', y='f1_un', data=df_data_ov, ax=ax)

# Restrict the x-axis to the lower part of the sweep
ax.set_xlim(left=-300, right=0.1*10**6)

### Oversampling
#### Maximal oversampling

In [None]:
f1 = 0
under_over_sampling = {'param': [], 'f1_ov': []}

# Oversampling: let SMOTE synthesise positives until both classes contain
# train_num_neg samples.
# The resampler must be fitted on the ORIGINAL training split. Fitting it on
# X_train_res / y_train_res would reuse whatever the last iteration of the
# undersampling sweep left behind and make this result depend on that sweep.
sm = SMOTE(random_state=seed, sampling_strategy={0.0: train_num_neg, 1.0: train_num_neg})
X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

# Initialize the model
rf = RandomForestClassifier(n_estimators=100,
                            random_state=random_state)

# Fit on the fully balanced data and score on the untouched test split
rf.fit(X_train_res, y_train_res)
y_pred = np.rint(rf.predict(X_test))
f1 = f1_score(y_test, y_pred, average='binary')

print(f1)

#### Iterative oversampling

In [None]:
# Sweep over the number of positive samples SMOTE should produce and record
# the F1 score reached with each setting
f1 = 0
# NOTE: the name is kept from the undersampling cell; in this cell it holds
# the best oversampling target (number of positives after SMOTE).
undersampling_parameter = 0
under_over_sampling = {'param': [], 'f1_ov': []}

for i in range(0, int(np.floor(train_num_neg**(1/4))), 1):

    # Candidate targets are the fourth powers 0, 1, 16, 81, ...
    # The clamp has to be applied to the target itself, not to the loop
    # index: an oversampler cannot request fewer samples than the class
    # already has, so targets below train_num_pos fall back to train_num_pos
    # (i.e. no synthetic samples are added).
    n_positives = max(i**4, train_num_pos)

    # Oversampling
    sm = SMOTE(random_state=seed, sampling_strategy={0.0: train_num_neg, 1.0: n_positives})
    X_train_res, y_train_res = sm.fit_resample(X_train, y_train)

    # Initialize the model
    rf = RandomForestClassifier(n_estimators=100,
                                random_state=random_state)

    # Calculate the f1 score on the untouched test split
    rf.fit(X_train_res, y_train_res)
    y_pred = np.rint(rf.predict(X_test))
    f1_new = f1_score(y_test, y_pred, average='binary')

    # Append the values to look at later
    under_over_sampling['param'].append(n_positives)
    under_over_sampling['f1_ov'].append(f1_new)

    # Find the param with the best f1 score
    if f1_new > f1:
        f1 = f1_new
        undersampling_parameter = n_positives

print(undersampling_parameter)

In [None]:
# Turn the recorded sweep results into a dataframe.
# NOTE: despite the '_un' suffix, this frame holds the oversampling
# results (columns 'param' and 'f1_ov').
df_data_un = pd.DataFrame(under_over_sampling)
df_data_un.head()

In [None]:
# Plot the F1 score against the oversampling target stored in 'param'
sns.lineplot(x='param', y='f1_ov', data=df_data_un)

### Combination of Under- and Oversampling

In [None]:
# Sweep over the number of negatives that are kept; after each undersampling
# step SMOTE is applied with its default strategy, and the F1 score is recorded
f1 = 0
undersampling_parameter = 0
under_over_sampling = {'param': [], 'f1_unov': []}

for i in range(1, int(np.floor(train_num_neg**(1/2))), 1):

    # Candidates are the squares 1, 4, 9, ..., but never fewer negatives
    # than there are positives
    n_negatives = max(i**2, train_num_pos)

    # Undersampling
    rus = RandomUnderSampler(random_state=seed, sampling_strategy={0.0: n_negatives, 1.0: train_num_pos})
    X_train_res, y_train_res = rus.fit_resample(X_train, y_train)

    # Oversampling of the undersampled data; uses the shared seed like the
    # other resamplers in this notebook instead of a separate literal
    sm = SMOTE(random_state=seed)
    X_train_res, y_train_res = sm.fit_resample(X_train_res, y_train_res)

    # Initialize the model
    rf = RandomForestClassifier(n_estimators=100,
                                random_state=random_state)

    # Calculate the f1 score on the untouched test split
    rf.fit(X_train_res, y_train_res)
    y_pred = np.rint(rf.predict(X_test))
    f1_new = f1_score(y_test, y_pred, average='binary')

    # Append the values to look at later
    under_over_sampling['param'].append(n_negatives)
    under_over_sampling['f1_unov'].append(f1_new)

    # Find the param with the best f1 score
    if f1_new > f1:
        f1 = f1_new
        undersampling_parameter = n_negatives

print(undersampling_parameter)

In [None]:
# Results of the combined under- and oversampling sweep as a dataframe
# (columns 'param' and 'f1_unov')
df_data_ovun = pd.DataFrame(under_over_sampling)
df_data_ovun.head()

In [None]:
# Plot the F1 score of the model after applying both under- and oversampling
sns.lineplot(x='param', y='f1_unov', data=df_data_ovun)

### Visualisation of all three experiments

In [None]:
# Combine the results of the undersampling, oversampling and combined
# experiments into one table with the columns 'param', 'f1_un', 'f1_ov'
# and 'f1_unov'.
# The three sweeps use different parameter grids and have different lengths,
# so the frames are aligned on their own 'param' values rather than on row
# position. Re-attaching only one frame's 'param' column would pair the
# other two score columns with x values that do not belong to them.
# Rows that share a 'param' value within one experiment are averaged;
# parameters an experiment did not evaluate are left as NaN in its column.
result_frames = [df_data_ov, df_data_un, df_data_ovun]
df_all = pd.concat(
    [frame.groupby('param').mean() for frame in result_frames],
    axis=1,
)
df_all = df_all.sort_index().rename_axis('param').reset_index()
df_all.head()

In [None]:
# Overlay the F1 curves of all three experiments on one shared axis
fig, ax = plt.subplots()

# Each curve carries its own label, so the legend does not depend on how many
# artists (lines, error bands) seaborn adds to the axis or in which order
sns.lineplot(data=df_all, x='param', y='f1_un', label='Undersampling', ax=ax)
sns.lineplot(data=df_all, x='param', y='f1_ov', label='Oversampling', ax=ax)
sns.lineplot(data=df_all, x='param', y='f1_unov', label='Combination', linewidth=3, ax=ax)

ax.set_title('Model performance depending on under- and over sampling', size=14)
ax.set_xlabel('Number of original values used', size=12, weight='bold')
ax.set_ylabel('F1 score', size=12, weight='bold')

# The trailing semicolon suppresses the notebook's text output for this line
ax.legend(loc="upper right");


## Wrap-up
Now you should know which under- and oversampling settings yield the best model performance.

Have a nice day!

/ Alicia