# Importing libraries

In [1]:
import numpy as np
import pandas as pd
import math

import matplotlib.pyplot as plt

# Loading data

In [2]:
def load_data(file_name):
    # read file
    file = pd.read_csv(file_name,
                       sep="\t",
                       skipinitialspace=True,
                       skip_blank_lines=True,
                       converters={'fire': str.strip})
    
    # replace newlines
    file.replace('\n','')

    # convert txt file to csv
    file.to_csv("wildfires.csv", index=False)
    
    df = pd.read_csv("wildfires.csv")

    return df

In [3]:
df = load_data("wildfires(1).txt")
df

Unnamed: 0,fire,year,temp,humidity,rainfall,drought_code,buildup_index,day,month,wind_speed
0,no,2015,28,59,0.0,8.06,3.47,1,6,19
1,no,2010,30,61,1.3,8.17,4.03,2,6,13
2,no,2009,26,83,13.1,8.08,3.59,3,6,22
3,no,2017,25,87,2.5,7.18,2.42,4,6,15
4,no,2014,28,77,0.0,14.98,4.63,5,6,18
...,...,...,...,...,...,...,...,...,...,...
199,yes,2017,31,67,0.0,45.15,17.89,26,9,15
200,no,2017,29,89,4.4,8.74,6.52,27,9,15
201,no,2009,27,88,0.5,8.87,3.71,28,9,30
202,no,2016,25,56,0.1,15.54,6.10,29,9,20


# Data pre-processing

## Normalising data

In [4]:
# normalise data
def normalisation(df):
    
    df_norm = df.copy()
    
    cols = [col for col in df_norm.columns]
    
    for col in cols[1:]:
        x = np.array(df_norm[col])
    
        # Z-Normalisation: D ← (D - Mean) / StDev
        #x = (x-np.mean(x))/np.std(x)
    
        # 0-1 Normalisation: D ← (D – Min) / (Max – Min)
        x = (x-np.min(x))/(np.max(x)-np.min(x))
    
        df_norm[col] = x
    
    return df_norm

In [5]:
df_norm = normalisation(df)
df_norm

Unnamed: 0,fire,year,temp,humidity,rainfall,drought_code,buildup_index,day,month,wind_speed
0,no,0.8,0.285714,0.535211,0.000000,0.004109,0.032114,0.000000,0.0,0.541667
1,no,0.3,0.380952,0.563380,0.077381,0.004622,0.040478,0.033333,0.0,0.291667
2,no,0.2,0.190476,0.873239,0.779762,0.004202,0.033906,0.066667,0.0,0.666667
3,no,1.0,0.142857,0.929577,0.148810,0.000000,0.016430,0.100000,0.0,0.375000
4,no,0.7,0.285714,0.788732,0.000000,0.036420,0.049440,0.133333,0.0,0.500000
...,...,...,...,...,...,...,...,...,...,...
199,yes,1.0,0.428571,0.647887,0.000000,0.177289,0.247498,0.833333,1.0,0.375000
200,no,1.0,0.333333,0.957746,0.261905,0.007284,0.077670,0.866667,1.0,0.375000
201,no,0.2,0.238095,0.943662,0.029762,0.007891,0.035698,0.900000,1.0,1.000000
202,no,0.9,0.142857,0.492958,0.005952,0.039034,0.071397,0.933333,1.0,0.583333


## Dividing dataset into testing set and remainder set

In [6]:
# randomly devide the file, 2/3 for training/validation and 1/3 for testing

def divide_test_data(df, ratio_test=1.0/3.0):
    
    df_norm = normalisation(df)
    
    data_norm = df_norm.to_numpy()
    
    np.random.shuffle(data_norm) # shuffle the data before dividing 
    
    test_data = data_norm[:int(len(data_norm)*ratio_test)]
    remainder_data = data_norm[int(len(data_norm)*ratio_test):]
    
    return test_data, remainder_data 

In [7]:
test_data, remainder_data = divide_test_data(df_norm)

print(len(test_data))
print(len(remainder_data))
print(remainder_data.shape)

68
136
(136, 10)


## Dividing remainder set into training set and validation set

#### Use the N-folders cross validation technique

In [8]:
# randomly divide remainder_data into training sets and validation sets

# divide remainder set into 4/5 for training
def divide_train_val(remainder_data, ratio_train=4./5.):
#def divide_train_val(remainder_data, N=5):
    
    np.random.shuffle(remainder_data) # shuffle the data before dividing 

    train_data = remainder_data[:int(len(remainder_data)*ratio_train)]
    validation_data = remainder_data[int(len(remainder_data)*ratio_train):]
    
    #batches = []
    
    #for i in range(N):
        #batches.append((i+1)*len(remainder_data))
     #   print()
    
    return train_data, validation_data 

In [9]:
train_data, validation_data = divide_train_val(remainder_data)
print(len(train_data))
print(len(validation_data))

108
28


In [10]:
# Preparing the training/validation dataset

def wildfires_dataloader(data):
    
    X = data[:,1:] # features
    y = data[:,0] # labels
    
    # covert label from string to numerial format
    yes_index = np.where(y=="yes")[0]
    for i in yes_index:
        #y[i] = np.array(([1],[0]))
        y[i] = 1
    
    no_index = np.where(y=="no")[0]
    for i in no_index:
        #y[i] = np.array(([0],[1]))
        y[i] = 0
              
    return X, y

In [11]:
X, y = wildfires_dataloader(train_data)

print(len(X))
print(len(y))
print(X.shape)
print(y.shape)


108
108
(108, 9)
(108,)


# Generate one perceptron layer

Define the function set:

$$
y=f(x)
$$
denotes the output from the perceptron for an input vector $z$.

And:

$$
y=X_{j,i}\times w_{i} + b_{j}
$$

where $x_{j,i}$ represents the value of the $ith$ feature of the $jth$ training input vector, $w_{i}$ denoted the the $ith$ value in the weight vector, and $b_{j}$ represents the $jth$ value in the bias vector.

$$
y = f(x) = \sigma(W^{L}... \sigma(W^2\times \sigma(W^{1}\times x + b^{1})+b^{2})...+b^{L})
$$

#### techiniques for random initialising weight matrix

$$
W^{L}=np.random.randn(size_{L}, size_{L-1})*np.sqrt(1/size_{L-1})
$$

Xavier initialization.

ref: https://towardsdatascience.com/weight-initialization-techniques-in-neural-networks-26c649eb3b78

#### weight initialisation for different activation function

ref: https://www.deeplearningwizard.com/deep_learning/boosting_models_pytorch/weight_initialization_activation_functions/

#### technique of calculating Root Mean Squared Error

$$
RMSD=\sqrt{\frac{\sum_{n=1}^{N}(\hat{y_{n}}-y_{n})^2}{N}}
$$


#### technique of calculating cross entropy
$$
C(y,\hat{y})= -\sum_{i=1}^{2}(\hat{y_{i}}\times \log_{2}y_{i})=-\hat{y_{1}}\times \log_{2}y_{1}-\hat{y_{2}}\times \log_{2}y_{2}
$$

In [12]:
lr=0.0001  # learning rate

# randomly initialise parameters for each layer
# we design one perceptron first

W = np.random.randn(len(X[1]), 1)
b = np.random.randn()

print(W.shape)
print(b)

(9, 1)
-0.8236358218357203


### One Perceptron

$$
y_{pred}=x\times W + b 
$$

cost:
$$
c^1 = (y_{truth} - y_{pred})^2
$$

$$
\frac{\partial c}{\partial w} = \frac{\partial z}{\partial w}\times \frac{\partial c}{\partial z}
$$

### If we have one perceptron with an sigmpid function

$$
\sigma(z) = \frac{1}{1+e^{-z}}
$$

$$
y_{pred}=\sigma(x\times W + b)
$$

cost:
$$
c^1 = (y_{truth} - y_{pred})^2
$$

\begin{align}
\frac{\partial c}{\partial w} \\
&= \frac{\partial z}{\partial w}\times \frac{\partial c}{\partial z} \\
&= forward pass \times backward pass \\
&= \frac{\partial z}{\partial w}\times \frac{\partial a}{\partial z}\times \frac{c}{a} \\
&= input(eg. x1, x2,...)\times \sigma'(z)\times \frac{\partial z}{\partial a}
\end{align}

However, $\sigma'(z)$ is a constant. Therefore the derivate of one perceptron and one perceptron with sigmoind function is the same.

\begin{align}
\sigma'(z)
&=\frac{e^{-z}}{(1+e^{-z})^{2}} \\
&=\frac{1}{1+e^{-z}}\times (1-\frac{1}{1+e^{-z}}) \\
&=\sigma(z)\times (1-\sigma(z))
\end{align}

ref: https://towardsdatascience.com/derivative-of-the-sigmoid-function-536880cf918e

In [13]:
#def one_perceptron(X, y, W, b):
def one_perceptron(x, y_truth, W, b):

    # reshape x from (9,) to (9, 1)
    #x = x.reshape(x.shape[0], 1)
        
    y_pred = x.T @ W + b # x.T.shape=(1, 9), W.shape=(9, 1)
    y_pred = y_pred[0][0] # convert array to int
    
    # RMSD
    c = (y_pred-y_truth)**2 # represents the distance between y_pred and y_truth for one perceptron
    
    #update the parameters
    #dloss_dw = x * (y_pred-y_truth) #* 2
    #dloss_db = 1 * (y_pred-y_truth) #* 2
        
    #W = W - lr * dloss_dw
    #b = b - lr * dloss_db
        
    return y_pred, y_truth, c, #W, b

In [14]:
y_pred, y_truth, c = one_perceptron(X[0].reshape(X[0].shape[0], 1), y[0], W, b)
print(f"y_pred={y_pred}, y_truth={y_truth}, c={c}")

y_pred=-1.6000752536485336, y_truth=1, c=6.760391324635487


In [15]:
def sigmoid(z, derivative=False):
    
    if derivative == False:
        y_sigma = 1 / (1+math.exp(-z))
        
    if derivative == True:
        y_sigma = (1 / (1+math.exp(-z))) * (1 - (1 / (1+math.exp(-z))))
        
    return y_sigma

In [17]:
y_sigma = sigmoid(y_pred, derivative=True)
y_sigma

0.13975680783398306

# Training

## training one batch

We use N-folder cross validation technique. We set $N=5$ here, four folders for train and 1 folder for validation, then, repeat until every folder has been a validation folder. We obtain 5 hypothesis with the MSE score for each validation batch. We obtain the parameters (W, b) of the smallest MSE score of validation set and use them to train the whole training data. 

$$
MSE = \frac{1}{N}\times \sum_{i=1}^{n}(Y_{i}-\hat{Y_{i}})^2
$$

In [15]:
def train_one_batch(X, y, W, b): 
          
    # set up some constants
    loss_sum = 0.0
    mse = 0.0
    accuracy_sum = 0.0
    accuracy_avg = 0.0
    
    for idx, x in enumerate(X):
        
        # reshape x from (9,) to (9, 1)
        x = x.reshape(x.shape[0], 1)
  
        y_pred, y_truth, c = one_perceptron(x, y[idx], W, b)
    
        if c == 0:
            accuracy_sum += 1
                  
        loss_sum += c
        
            
        #update the parameters
        dloss_dw = x * (y_pred-y_truth) #* 2
        dloss_db = 1 * (y_pred-y_truth) #* 2
        
        W = W - lr * dloss_dw
        b = b - lr * dloss_db


    mse = loss_sum/len(X)
    accuracy_avg = accuracy_sum/len(X)
    
    print(f"loss_sum={loss_sum}, mse={mse}, \
    accuracy_sum={accuracy_sum}, accuracy_avg={accuracy_avg}")
    
    return W, b, mse

In [16]:
W1, b1, mse1 = train_one_batch(X, y, W, b)

print(f"W1={W1.shape}, b1={b1}, mse1={mse1}")

loss_sum=647.574839780206, mse=5.996063331298204,     accuracy_sum=0.0, accuracy_avg=0.0
W1=(9, 1), b1=-0.3184974634156516, mse1=5.996063331298204


In [None]:
# Generating 5 batches of train/validation data 
for i in 
train_data = np.random.shuffle(train_data)
train1 = 

In [17]:
def one_epoch(): # in this case, one epoch = 4 training folders and 1 validation set
    
    return W, b, validation_mse

In [None]:
epochs = 5