In [None]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler

In [None]:
# Read the two datasets into DataFrames.
# (pandas is already imported as `pd` in the notebook's import cell, so the
# import is not repeated here.)
bejaia = pd.read_csv('Bejaia.csv')
sidi = pd.read_csv('Sidi.csv')

# These files contain the following columns:
#   - day: the date of the observation
#   - month: the month of the observation
#   - year: the year of the observation
#   - temperature noon (temperature max) in Celsius degrees: 22 to 42
#   - RH: Relative Humidity in %: 21 to 90
#   - Ws: Wind speed in km/h: 6 to 29
#   - Rain: total day in mm: 0 to 16.8
#   - FFMC: Fine Fuel Moisture Code (FFMC) index from the FWI system: 28.6 to 92.5
#   - DMC: Duff Moisture Code (DMC) index from the FWI system: 1.1 to 65.9
#   - DC: Drought Code (DC) index from the FWI system: 7 to 220.4
#   - Classes: the label column ('fire' / 'not fire'), recoded to 1 / 0 further down

# Show the first five rows of each DataFrame. display() renders them as rich
# tables, which stay readable for wide frames where print() wraps the columns.
display(bejaia.head())
display(sidi.head())

In [None]:
# Clean the data: strip stray whitespace from the column names and the class
# labels, drop rows with missing values, and remove the year column.
bejaia.columns = bejaia.columns.str.strip()
display(bejaia.columns)
sidi.columns = sidi.columns.str.strip()
display(sidi.columns)

# Each frame is rebuilt in a single chain rather than mutated with inplace=True.
# errors='ignore' lets this cell be run again once 'year' has already been
# removed; the plain drop raised KeyError on a second run.
bejaia = (
    bejaia
    .dropna()
    .assign(Classes=lambda frame: frame['Classes'].str.strip())
    .drop(columns=['year'], errors='ignore')
)
sidi = (
    sidi
    .dropna()
    .assign(Classes=lambda frame: frame['Classes'].str.strip())
    .drop(columns=['year'], errors='ignore')
)

# Show the distinct class labels left after stripping
display(bejaia['Classes'].unique())
display(sidi['Classes'].unique())

In [None]:
# Recode the class label as a dummy variable (not fire: 0, fire: 1)
replacement_mapping = {'fire': 1, 'not fire': 0}

# Series.replace used to downcast the object column to int implicitly; pandas 2.2
# deprecates that downcast, and under the future behaviour the column stays
# object dtype. Mapping explicitly and pinning the dtype with astype(int) gives
# the same integer column on every pandas version.
# Falling back to the value itself keeps the cell re-runnable (labels that are
# already 0/1 pass through unchanged), while an unexpected label makes
# astype(int) raise here instead of surfacing later in corr().
bejaia['Classes'] = (
    bejaia['Classes']
    .map(lambda label: replacement_mapping.get(label, label))
    .astype(int)
)
sidi['Classes'] = (
    sidi['Classes']
    .map(lambda label: replacement_mapping.get(label, label))
    .astype(int)
)

In [None]:
# get the coefficient matrix
# Show the pairwise correlation matrix of each dataset, Bejaia first
for current_frame in (bejaia, sidi):
    display(current_frame.corr())

In [None]:
# Heatmap of the Bejaia correlation matrix, to get a better idea of the
# correlation among variables
sns.set(rc={'figure.figsize': (12, 10)})
ax = sns.heatmap(bejaia.corr())
# Title the figure so it can be told apart from the Sidi heatmap below;
# the trailing semicolon hides the Text repr in the cell output.
ax.set_title('Bejaia: correlation matrix');

In [None]:
# Heatmap of the Sidi correlation matrix
sns.set(rc={'figure.figsize': (12, 10)})
ax = sns.heatmap(sidi.corr())
# Title the figure so it can be told apart from the Bejaia heatmap above;
# the trailing semicolon hides the Text repr in the cell output.
ax.set_title('Sidi: correlation matrix');

Based on these two heatmaps we can find a general pattern:
(1) The FWI indices seem to have a strong, positive correlation with the occurrence of forest fires.
(2) Natural factors like rain and relative humidity seem to have a strong, negative correlation with the occurrence of forest fires.
(3) Natural factors like temperature seem to have a moderate, positive correlation with the occurrence of forest fires.


Another pattern we can identify is that some pairs of factors have a correlation coefficient very close to 1, which means we may have multicollinearity that can make the coefficient estimates of our regression model unstable and hard to interpret. In this case, the correlation coefficient between BUI and DMC seems to be too high. Instead of judging by eye, we will use an algorithm to help us filter out the highly similar variables.

In [None]:
def multicolinearity_id(dataset, threshold):
    """Find pairs of columns whose correlation is stronger than ``threshold``.

    The correlation matrix of ``dataset`` is computed with ``dataset.corr()``
    and only the entries below the diagonal are inspected, so each pair is
    reported once and no column is paired with itself.

    Parameters
    ----------
    dataset : DataFrame
        Frame whose columns are correlated against each other.
    threshold : float
        A pair is reported when the absolute value of its correlation is
        strictly greater than this value.

    Returns
    -------
    set of tuple
        ``(row_label, column_label, correlation)`` triples, where
        ``row_label`` is the column that comes earlier in the matrix and
        ``correlation`` is rounded to two decimals.
    """
    corr_matrix = dataset.corr()
    n_features = len(corr_matrix.columns)

    # Walk the strict lower triangle: `later` indexes the row position,
    # `earlier` every position before it.
    return {
        (
            corr_matrix.index[earlier],
            corr_matrix.columns[later],
            round(corr_matrix.iloc[later, earlier], 2),
        )
        for later in range(n_features)
        for earlier in range(later)
        if abs(corr_matrix.iloc[later, earlier]) > threshold
    }

# Flag the variable pairs whose absolute correlation exceeds 0.9 in each dataset
mul_bejaia, mul_sidi = (
    multicolinearity_id(frame, 0.9) for frame in (bejaia, sidi)
)
print(mul_bejaia, mul_sidi, sep="\n")

So it seems that for the Bejaia dataset we need to drop BUI and/or DMC, since they are so similar to other indices; for the Sidi dataset we also need to drop BUI and FWI.