# Mapping the vegetation distribution in Norway using deep learning
### Analysis pt. 1
Main workflow for reading the preprocessed data and setting up the experiments.

In [None]:
### Import required packages
# Data handling
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import LabelBinarizer

# Spatial
#import spacv
#import geopandas as gpd

# Metrics?
from sklearn.model_selection import cross_val_score

## 1. Read and partition data 

In [None]:
### Initialize one-hot encoder and scaler
onehotencoder = OneHotEncoder(categories="auto")
sc = StandardScaler()  # fitted on the numeric feature columns in a later cell

### Read feature matrices: one with dummy-coded categoricals (for the NN),
### one with the categorical variables kept as-is (for decision trees)
# NOTE: unpickling can execute arbitrary code; only load files written by this
# project's own preprocessing step.
X_cat = pd.read_pickle("./DataFiles/FeatureMatrixCats.pkl")
X_dum = pd.read_pickle("./DataFiles/FeatureMatrixDummy.pkl")

# Read target and binarize the class labels for the NN
y_cat = pd.read_pickle("./DataFiles/TargetVTs.pkl")
y_dum = LabelBinarizer().fit_transform(y_cat)

### Copy the NN feature matrix into a NumPy array (scaled in place later on)
# NOTE(review): the array dtype is inferred from X_dum; confirm it is a float
# dtype, otherwise the scaled values assigned later may be cast.
X_dum_scaled = np.array(X_dum)

In [None]:
### Standardize the numeric block of the NN feature matrix in place
# Only columns 3..64 are scaled; x/y/plot_id and the categorical dummy
# variables are left untouched.
# NOTE(review): the scaler is fitted on all rows before the train/test split,
# so test rows contribute to the scaling statistics. Consider fitting on the
# training rows only.
scaled_cols = slice(3, 65)
X_dum_scaled[:, scaled_cols] = sc.fit_transform(X_dum_scaled[:, scaled_cols])

In [None]:
### Build the final feature matrices

# Categorical frame: drop the variables that must not act as predictors
rmvars = ["x", "y", "plot_id", "geology_norge1"]
X_cat_final = X_cat.drop(columns=rmvars)

# Scaled array: same removal, but here the geology variable is spread over
# dummy columns with their own names. Column positions are looked up in X_dum,
# which X_dum_scaled was created from; names that are absent are skipped.
rmvars = [
    "x",
    "y",
    "plot_id",
    "geology_norge1_1",
    "geology_norge1_2",
    "geology_norge1_3",
]
idx_rm = [
    X_dum.columns.get_loc(name) for name in rmvars if name in X_dum.columns
]
X_dum_scaled_final = np.delete(X_dum_scaled, idx_rm, axis=1)

In [None]:
### Hold-out split of the scaled NN data
# Splitting parameters
testSetRatio = 0.2  # share of the rows that goes into the test set
seed = 77           # fixed random seed so the split is reproducible

# Scaled numerical features and binarized targets are split together
(X_dummy_train, X_dummy_test,
 y_dummy_train, y_dummy_test) = train_test_split(
    X_dum_scaled_final,
    y_dum,
    test_size=testSetRatio,
    random_state=seed,
)

In [None]:
# Custom dense neural network

def getNN(input_dim=None, n_classes=None):
    """Build and compile the dense network used for vegetation-type classification.

    The network stacks four hidden Dense layers (256, 128, 64, 64 units), each
    followed by LeakyReLU(alpha=0.1) and Dropout(0.1). Only the first hidden
    layer carries an L1 kernel regularizer. The output layer is a softmax over
    the classes, and the model is compiled with categorical cross-entropy and
    the Adamax optimizer.

    Args:
        input_dim: Number of input features. Defaults to the column count of
            the module-level ``X_dummy_train``, as in the original version.
        n_classes: Number of output units. Defaults to the column count of
            the module-level ``y_dummy_train``, as in the original version.

    Returns:
        A compiled, untrained ``keras.models.Sequential`` model.
    """
    from keras.models import Sequential
    from keras.layers import Dense
    from keras.layers import LeakyReLU
    from keras.layers import Dropout
    # Aliased so the metric list below does not shadow the module.
    from keras import metrics as keras_metrics

    # Fall back to the notebook globals so existing getNN() calls keep working.
    if input_dim is None:
        input_dim = X_dummy_train.shape[1]
    if n_classes is None:
        n_classes = y_dummy_train.shape[1]

    ### Define metrics (the names are the keys used to look the values up later)
    # Cohen's kappa (tensorflow_addons) was disabled in the original and is
    # therefore not tracked.
    tracked_metrics = [
        keras_metrics.CategoricalAccuracy(name="categorical_accuracy"),
        keras_metrics.FalseNegatives(name="fn"),
        keras_metrics.FalsePositives(name="fp"),
        keras_metrics.TrueNegatives(name="tn"),
        keras_metrics.TruePositives(name="tp"),
        keras_metrics.Precision(name="precision"),
        keras_metrics.Recall(name="recall"),
        keras_metrics.AUC(name='auc'),
    ]

    # define the keras model
    nn = Sequential()

    # First hidden layer: fixes the input width and is the only regularized one
    nn.add(Dense(256, input_dim=input_dim, kernel_regularizer='l1'))
    nn.add(LeakyReLU(alpha=0.1))
    nn.add(Dropout(0.1))

    # Remaining hidden layers share the same activation/dropout pattern
    for units in (128, 64, 64):
        nn.add(Dense(units))
        nn.add(LeakyReLU(alpha=0.1))
        nn.add(Dropout(0.1))

    # One softmax output unit per class
    nn.add(Dense(n_classes, activation='softmax'))

    nn.compile(loss='categorical_crossentropy', optimizer='Adamax',
               metrics=tracked_metrics)

    return nn


In [None]:
### Set up neural network
# Builds and compiles a fresh, untrained network via getNN()
nn = getNN()

In [None]:
# Fit the model for 20 epochs; the hold-out split is passed as validation data
# so that the val_* curves are available in `history` for plotting
history = nn.fit(
    X_dummy_train,
    y_dummy_train,
    validation_data=(X_dummy_test, y_dummy_test),
    epochs=20,
    verbose=0,
)

In [None]:
### Print available metrics to plot
# Names of the values reported by the compiled model
# NOTE(review): the exact content of metrics_names may depend on the installed
# Keras version; confirm before relying on it.
print(nn.metrics_names)
# Loss and metric values of the fitted model on the training data
train_metrics = nn.evaluate(X_dummy_train, y_dummy_train, verbose=0)
print(train_metrics)

In [None]:
# Display the dimensions of the training feature matrix
X_dummy_train.shape

In [None]:
import matplotlib.pyplot as plt

# Score the fitted network on both partitions
train_metrics = nn.evaluate(X_dummy_train, y_dummy_train, verbose=0)
test_metrics = nn.evaluate(X_dummy_test, y_dummy_test, verbose=0)
# Entry 1 is taken as the accuracy (entry 0 being the loss), following the
# order in which the metrics are listed in getNN
print('Accuracy assessment. Train: %.3f, Test: %.3f' % (train_metrics[1], test_metrics[1]))

# Learning curves: upper panel shows the loss, lower panel the accuracy.
# The validation curves are labelled 'test' because the hold-out split was
# passed as validation data during fitting.
for panel_pos, panel_title, hist_key in (
    (211, 'Loss', 'loss'),
    (212, 'Accuracy', 'categorical_accuracy'),
):
    plt.subplot(panel_pos)
    plt.title(panel_title)
    plt.plot(history.history[hist_key], label='train')
    plt.plot(history.history['val_' + hist_key], label='test')
    plt.legend()
plt.show()

In [None]:
### Save trained model in file?
#nn.save('./Results/TrainedModel/firstMLP')
#nn.save('./Results/TrainedModel/first_MLP.h5') # For representation

### Train a fresh model on each fold to calculate k-fold CV scores
------------------------------------------------------

In [None]:
from sklearn.model_selection import KFold

### Create cross validation folds
# Shuffle with the notebook-wide seed so fold membership is reproducible
kf = KFold(n_splits=5, shuffle=True, random_state=seed)
# split() returns a generator of (train_index, test_index) pairs; it can be
# iterated only once, so re-run this cell before re-running the fold loop.
split_idx = kf.split(X_dum_scaled_final)

### Metric names reported by the network (kept for reference)
# ['loss', 'categorical_accuracy', 'fn', 'fp', 'tn', 'tp', 'precision', 'recall', 'auc']

### Initialize empty lists
MODELS = []  # fitted network of each fold
# Per-fold scores on the training part of the fold
cat_acc_cv, prec_cv, rec_cv, f1_cv, auc_cv = [], [], [], [], []
# Per-fold scores on the held-out part of the fold
cat_acc_cv_test, prec_cv_test, rec_cv_test, f1_cv_test, auc_cv_test = [], [], [], [], []

In [None]:
### Loop through folds

def _f1_from(precision, recall):
    """Return the harmonic mean of precision and recall, or 0.0 if both are 0."""
    denom = precision + recall
    if denom == 0:
        # No positive prediction was correct: avoid ZeroDivisionError
        return 0.0
    return 2 * (precision * recall) / denom


for train_index, test_index in split_idx:

    ### Subset data to current fold
    # NOTE: this rebinds the module-level split variables; getNN() reads the
    # layer sizes from X_dummy_train / y_dummy_train, and the earlier hold-out
    # split is no longer available under these names afterwards.
    X_dummy_train = X_dum_scaled_final[train_index, :]
    y_dummy_train = y_dum[train_index, :]

    X_dummy_test = X_dum_scaled_final[test_index, :]
    y_dummy_test = y_dum[test_index, :]

    ### Build a fresh, untrained network for this fold
    cur_nn = getNN()

    ### fit model
    cur_history = cur_nn.fit(X_dummy_train, y_dummy_train,
                             validation_data=(X_dummy_test, y_dummy_test),
                             epochs=10, verbose=0)
    # Keep the fitted model of every fold
    MODELS.append(cur_nn)

    ### Evaluate the model on both parts of the fold
    train_metrics = cur_nn.evaluate(X_dummy_train, y_dummy_train, verbose=0)
    test_metrics = cur_nn.evaluate(X_dummy_test, y_dummy_test, verbose=0)

    # Look values up by metric name rather than by hard-coded position
    # NOTE(review): newer Keras releases may report different entries in
    # metrics_names; confirm for the installed version.
    names = cur_nn.metrics_names

    # Save values (Cohen's kappa is not collected; it is disabled in getNN)
    cat_acc_cv.append(train_metrics[names.index('categorical_accuracy')])
    prec_cv.append(train_metrics[names.index('precision')])
    rec_cv.append(train_metrics[names.index('recall')])
    auc_cv.append(train_metrics[names.index('auc')])

    cat_acc_cv_test.append(test_metrics[names.index('categorical_accuracy')])
    prec_cv_test.append(test_metrics[names.index('precision')])
    rec_cv_test.append(test_metrics[names.index('recall')])
    auc_cv_test.append(test_metrics[names.index('auc')])


## Finally, derive the F1-score of each fold from its precision and recall
# NOTE(review): this uses the single precision/recall pair reported by Keras
# per evaluation, not per-class values, so it is presumably a pooled
# (micro-style) F1 rather than a macro average. Confirm against the Keras
# Precision/Recall documentation.
f1_cv = [_f1_from(pr, rc) for pr, rc in zip(prec_cv, rec_cv)]
f1_cv_test = [_f1_from(pr, rc) for pr, rc in zip(prec_cv_test, rec_cv_test)]

In [None]:
# Collect the per-fold scores in one table per partition (one row per fold)
df_train = pd.DataFrame(
    data={
        "categorical_accuracy": cat_acc_cv,
        "precision": prec_cv,
        "recall": rec_cv,
        "auc": auc_cv,
        "f1_score": f1_cv,
    }
)
df_test = pd.DataFrame(
    data={
        "categorical_accuracy": cat_acc_cv_test,
        "precision": prec_cv_test,
        "recall": rec_cv_test,
        "auc": auc_cv_test,
        "f1_score": f1_cv_test,
    }
)

# Optional export of both tables (disabled)
#df_train.to_csv("./Results/Tables/NN_trainmetrics.csv", sep=',',index=False)
#df_test.to_csv("./Results/Tables/NN_testmetrics.csv", sep=',',index=False)

In [None]:
# Show the per-fold scores on the held-out part of each fold
print(df_test)