In [6]:
###Analysis for HEC with DNN and MLP 
#Last updated: May 16, 2024

import numpy as np
import pandas as pd
from numpy import loadtxt
from keras.models import Sequential
from keras.layers import Dense
from matplotlib import pyplot
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix

In [7]:
#Import data
dataset=pd.read_csv("C:/Users/kem99059/Desktop/Zambezi/Incidents/MachineLearningAnalysis/HEC2015_2021_From_R_100m.csv")
print(type(dataset))
dataset.head(5)

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,km_id,NDVI_max,NDVI_min,DISTPA,BUILDINGDE,DISTBUILDING,DISTROADS,DISTWATER,precipitation_max,Dist_Ag,Dist_CF,HV,slope,RAIDED
0,1105,2.310251,-0.316098,-0.390276,1.208716,-0.823016,-0.668585,-0.007311,0.54295,-0.327156,-1.518134,0.932397,1.346166,0
1,1106,1.158316,0.072941,-0.619508,0.421592,0.478244,-0.668585,-0.484078,-0.34037,-0.429011,-0.340782,1.371103,1.345913,0
2,1107,0.848879,0.154333,-0.619508,-0.670483,-0.823016,-0.668585,-0.03181,2.857097,-0.860476,0.146892,0.115055,1.346084,0
3,1108,2.109068,1.457296,2.091907,0.772304,-0.823016,-0.668585,1.901148,1.044164,0.865384,-0.340782,1.314744,0.975369,0
4,1109,0.013167,-0.503012,4.105167,-1.885284,-0.823016,2.287614,-1.807891,0.579173,3.35508,-0.340782,-0.324148,2.159631,0


In [11]:
# split into input (X) and output (y) variables
X = dataset.iloc[:,1:13].values
Y = dataset.iloc[:,13].values


#Scale data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)


StandardScaler()

In [9]:
#kfold from 
#https://github.com/christianversloot/machine-learning-articles/blob/main
#/how-to-use-k-fold-cross-validation-with-keras.md

#############################
#Define keras DNN model
#############################

# Model configuration
num_folds = 10
batch_size=200
loss_function='binary_crossentropy'
optimizer='AdaGrad'
no_epochs=5000
# Define per-fold score containers
acc_per_fold = []
loss_per_fold = []

# Define the K-fold Cross Validator
kfold = KFold(n_splits=num_folds, shuffle=True)

# K-fold Cross Validation model evaluation
fold_no = 1
for train, test in kfold.split(X,Y):
    
 # Define the model architecture    
    model = Sequential()
    model.add(Dense(12, input_dim=12, activation='relu'))
    model.add(Dense(12, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
        
   # Compile the model
    model.compile(loss=loss_function,
                optimizer=optimizer,
                metrics=['accuracy'])
    
    # Generate a print
    print('------------------------------------------------------------------------')
    print(f'Training for fold {fold_no} ...')

    # Fit data to model
    history = model.fit(X[train], Y[train],
              batch_size=batch_size,
              epochs=no_epochs,
              verbose=0)

    # Generate generalization metrics
    print(model)
    scores = model.evaluate(X[test], Y[test], verbose=0)
    print(scores)
    print(f'Score for fold {fold_no}: {model.metrics_names[0]} of {scores[0]}; {model.metrics_names[1]} of {scores[1]*100}%')
    acc_per_fold.append(scores[1] * 100)
    loss_per_fold.append(scores[0])
    
    #Make predictions using test data
    p_pred = model.predict(X)
    p_pred = p_pred.flatten()
    y_pred = np.where(p_pred > 0.5, 1, 0)
    #print(predictions)
    #Generate confusion matrices
    print(confusion_matrix(Y,y_pred))

    # Increase fold number
    fold_no = fold_no + 1

# == Provide average scores ==
print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
    print('------------------------------------------------------------------------')
print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')
print('------------------------------------------------------------------------')

------------------------------------------------------------------------
Training for fold 1 ...
<tensorflow.python.keras.engine.sequential.Sequential object at 0x000001C3F1FC80D0>
[0.6639571189880371, 0.6083915829658508]
Score for fold 1: loss of 0.6639571189880371; accuracy of 60.83915829658508%
[[496 217]
 [324 389]]
------------------------------------------------------------------------
Training for fold 2 ...
<tensorflow.python.keras.engine.sequential.Sequential object at 0x000001C3FBF3C880>
[0.6239559054374695, 0.692307710647583]
Score for fold 2: loss of 0.6239559054374695; accuracy of 69.2307710647583%
[[469 244]
 [272 441]]
------------------------------------------------------------------------
Training for fold 3 ...
<tensorflow.python.keras.engine.sequential.Sequential object at 0x000001C3FC39D9A0>
[0.6474098563194275, 0.5804196000099182]
Score for fold 3: loss of 0.6474098563194275; accuracy of 58.04196000099182%
[[516 197]
 [322 391]]
------------------------------------

In [12]:
# Summarize the DNN model
print(model.summary())


Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_27 (Dense)             (None, 12)                156       
_________________________________________________________________
dense_28 (Dense)             (None, 12)                156       
_________________________________________________________________
dense_29 (Dense)             (None, 1)                 13        
Total params: 325
Trainable params: 325
Non-trainable params: 0
_________________________________________________________________
None


In [13]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix


# predict probabilities for test set
yhat_probs = (model.predict(X) > 0.5).astype("int32")
# predict crisp classes for test set
yhat_classes = (model.predict(X) > 0.5).astype("int32")
# reduce to 1d array
yhat_probs = yhat_probs[:, 0]
yhat_classes = yhat_classes[:, 0]


# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(Y, yhat_classes)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(Y, yhat_classes)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(Y, yhat_classes)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(Y, yhat_classes)
print('F1 score: %f' % f1)
# kappa
kappa = cohen_kappa_score(Y, yhat_classes)
print('Cohens kappa: %f' % kappa)
# ROC AUC
auc = roc_auc_score(Y, yhat_probs)
print('ROC AUC: %f' % auc)
# confusion matrix
matrix = confusion_matrix(Y, yhat_classes)
print(matrix)

Accuracy: 0.644460
Precision: 0.656061
Recall: 0.607293
F1 score: 0.630736
Cohens kappa: 0.288920
ROC AUC: 0.644460
[[486 227]
 [280 433]]


In [14]:
######
#Neural networks with Multi-layer Perceptron classifier
#######
#Import data
dataset=pd.read_csv("C:/Users/kem99059/Desktop/Zambezi/Incidents/MachineLearningAnalysis/HEC2015_2021_From_R_100m.csv")

In [15]:
# split into input (X) and output (y) variables
X = dataset.iloc[:,1:13].values
Y = dataset.iloc[:,13].values

#Scale data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)


StandardScaler()

In [16]:
#kfold from 
#https://github.com/christianversloot/machine-learning-articles/blob/main
#/how-to-use-k-fold-cross-validation-with-keras.md
from sklearn.model_selection import KFold
from sklearn.neural_network import MLPClassifier

#############################
#Define MLP model
#############################

# Model configuration
kf = KFold(n_splits=10)
mlp = MLPClassifier(hidden_layer_sizes=(100, 100, 100), max_iter=100000, 
                    random_state=13)

#Train the MLP model
for train_indices,test_indices in kf.split(X):
    mlp.fit(X[train_indices], Y[train_indices.ravel()])

    #Make predictions 
    predictions = mlp.predict(X)
    print(Y)
    print(predictions)
    #Generate confusion matrices
    print(confusion_matrix(Y,predictions))
    #print(classification_report(Y,predictions))
    
# == Provide average scores ==
print('------------------------------------------------------------------------')
print('Score per fold')
for i in range(0, len(acc_per_fold)):
    print('------------------------------------------------------------------------')
print(f'> Fold {i+1} - Loss: {loss_per_fold[i]} - Accuracy: {acc_per_fold[i]}%')
print('------------------------------------------------------------------------')
print('Average scores for all folds:')
print(f'> Accuracy: {np.mean(acc_per_fold)} (+- {np.std(acc_per_fold)})')
print(f'> Loss: {np.mean(loss_per_fold)}')
print('------------------------------------------------------------------------')

[0 0 0 ... 1 1 1]
[1 0 0 ... 1 1 1]
[[654  59]
 [ 13 700]]
[0 0 0 ... 1 1 1]
[0 0 0 ... 1 1 1]
[[706   7]
 [ 73 640]]
[0 0 0 ... 1 1 1]
[0 0 0 ... 1 1 1]
[[667  46]
 [ 36 677]]
[0 0 0 ... 1 1 1]
[0 0 0 ... 1 1 1]
[[669  44]
 [ 49 664]]
[0 0 0 ... 1 1 1]
[0 0 0 ... 1 1 1]
[[627  86]
 [  6 707]]
[0 0 0 ... 1 1 1]
[0 0 0 ... 1 1 1]
[[706   7]
 [ 86 627]]
[0 0 0 ... 1 1 1]
[0 0 0 ... 1 1 1]
[[706   7]
 [ 98 615]]
[0 0 0 ... 1 1 1]
[0 0 0 ... 1 1 1]
[[626  87]
 [  4 709]]
[0 0 0 ... 1 1 1]
[0 0 0 ... 1 1 1]
[[676  37]
 [ 41 672]]
[0 0 0 ... 1 1 1]
[0 0 0 ... 0 0 0]
[[709   4]
 [114 599]]
------------------------------------------------------------------------
Score per fold
------------------------------------------------------------------------
------------------------------------------------------------------------
------------------------------------------------------------------------
------------------------------------------------------------------------
------------------------------

In [18]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score
from sklearn.metrics import cohen_kappa_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix


# predict probabilities for test set
yhat_probs = (mlp.predict(X) > 0.5).astype("int32")
# predict crisp classes for test set
yhat_classes = (mlp.predict(X) > 0.5).astype("int32")
# reduce to 1d array
yhat_probs = yhat_probs[:, 0]
yhat_classes = yhat_classes[:, 0]


# accuracy: (tp + tn) / (p + n)
accuracy = accuracy_score(Y, yhat_classes)
print('Accuracy: %f' % accuracy)
# precision tp / (tp + fp)
precision = precision_score(Y, yhat_classes)
print('Precision: %f' % precision)
# recall: tp / (tp + fn)
recall = recall_score(Y, yhat_classes)
print('Recall: %f' % recall)
# f1: 2 tp / (2 tp + fp + fn)
f1 = f1_score(Y, yhat_classes)
print('F1 score: %f' % f1)
# kappa
kappa = cohen_kappa_score(Y, yhat_classes)
print('Cohens kappa: %f' % kappa)
# ROC AUC
auc = roc_auc_score(Y, yhat_probs)
print('ROC AUC: %f' % auc)
# confusion matrix
matrix = confusion_matrix(Y, yhat_classes)
print(matrix)

IndexError: too many indices for array: array is 1-dimensional, but 2 were indexed