In [1]:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import compute_class_weight
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,recall_score,confusion_matrix, cohen_kappa_score, precision_score, recall_score
from sklearn.metrics import precision_recall_curve, auc, average_precision_score, f1_score, roc_auc_score, roc_curve
import pandas as pd
import math
from joblib import dump, load
import random
import pickle
import scipy.stats
from PIL import Image
import scipy.stats as stats

from Functions import post_processing
from Functions import general_functions
from Functions import feature_creation

from skimage.restoration import denoise_bilateral

import operator

In [2]:
# Pickled per-zone datasets used throughout the notebook.
# NOTE: the notebook-local names zone_1 / zone_2 / zone_3 point at the dataset
# files for zones 4, 6 and 9 respectively.
zone_1_path = "dataset/zone_4.pickle"
zone_2_path = "dataset/zone_6.pickle"
zone_3_path = "dataset/zone_9.pickle"

In [3]:
# Load the three full (un-resampled) zone datasets.
# Each file is opened in a `with` block so its handle is closed as soon as the
# pickle has been read, even if unpickling raises.
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open(zone_1_path, "rb") as file:
    zone_1 = pickle.load(file)

with open(zone_2_path, "rb") as file:
    zone_2 = pickle.load(file)

with open(zone_3_path, "rb") as file:
    zone_3 = pickle.load(file)

# Keep the name bound to None afterwards, as before, so no stale handle lingers.
file = None

In [4]:
# Disabled one-off preparation step: when uncommented, builds a balanced dataset per zone
# with general_functions.create_balanced_dataset and pickles the results to
# dataset/zone_{1,2,3}_resampled.pickle -- the files the next cell loads.
# Re-enable only when those cached files need to be regenerated.
#zone_1_resampled = general_functions.create_balanced_dataset([zone_1_path])
#zone_2_resampled = general_functions.create_balanced_dataset([zone_2_path])
#zone_3_resampled = general_functions.create_balanced_dataset([zone_3_path])

#with open("dataset/zone_1_resampled.pickle", "wb") as file:
#    pickle.dump(zone_1_resampled, file)
#with open("dataset/zone_2_resampled.pickle", "wb") as file:
#    pickle.dump(zone_2_resampled, file)
#with open("dataset/zone_3_resampled.pickle", "wb") as file:
#    pickle.dump(zone_3_resampled, file)

In [5]:
# Load the cached, resampled version of each zone (used only as training data below).
# NOTE: pickle.load can execute arbitrary code -- only load files from a trusted source.
with open("dataset/zone_1_resampled.pickle", "rb") as file:
    zone_1_resampled = pickle.load(file)
with open("dataset/zone_2_resampled.pickle", "rb") as file:
    zone_2_resampled = pickle.load(file)
with open("dataset/zone_3_resampled.pickle", "rb") as file:
    zone_3_resampled = pickle.load(file)

In [6]:
# Show (rows, columns) of the first zone frame.
# The training cells reshape per-zone predictions to 2997 x 2620, so the row count
# shown here is expected to equal 2997 * 2620.
zone_1.shape

(7852140, 82)

In [7]:
# Two cross-zone folds, each a (training frame, test frame) pair:
#   fold A trains on resampled zones 1 + 3 and is evaluated on the full zone 2,
#   fold B trains on resampled zones 1 + 2 and is evaluated on the full zone 3.
_fold_a_train = pd.concat([zone_1_resampled, zone_3_resampled], ignore_index=True)
_fold_b_train = pd.concat([zone_1_resampled, zone_2_resampled], ignore_index=True)
experiment_arr = [
    (_fold_a_train, zone_2),
    (_fold_b_train, zone_3),
]

In [8]:
# Drop the notebook-level names for the loaded frames.
# experiment_arr still holds zone_2, zone_3 and the concatenated training frames,
# so only objects with no other reference can actually be reclaimed here.
zone_1 = zone_2 = zone_3 = None
zone_1_resampled = zone_2_resampled = zone_3_resampled = None

In [19]:
# Recursive feature elimination, coarse pass (81 -> 40 features).
# Every round trains one random forest per fold on the currently best-ranked features,
# scores the pooled post-processed predictions, and re-ranks the features by their
# fold-averaged importance so the next round can keep a shorter prefix of the ranking.
RASTER_SHAPE = (2997, 2620)   # rows x cols of one zone raster; product == rows per zone frame
LABEL_COLUMN = "label_3m"

# Start from every column except the first one of the training frame.
most_important_features = experiment_arr[0][0].columns.tolist()[1:]
for num_features in [81, 70, 60, 50, 40]:

    # Keep only the top `num_features` of the previous round's ranking.
    most_important_features = most_important_features[:num_features]
    print(f"Features used in experiment:\n{most_important_features}")
    print("\n")
    feature_importances = {name: 0 for name in most_important_features}
    # The label must never be used as a predictor, even if it slipped into the ranking.
    feature_columns = [name for name in most_important_features if name != LABEL_COLUMN]
    y_test_folds = []
    pred_folds = []

    for training_dataset, test_dataset in experiment_arr:

        # filter() silently skips names missing from the frame, so it is evaluated once
        # and the resulting columns (not the requested list) are treated as authoritative.
        X_train = training_dataset.filter(items=feature_columns)
        y_train = training_dataset[LABEL_COLUMN]

        # Cap the per-split candidate count at 25, but never above the available columns.
        clf = RandomForestClassifier(n_estimators=200, class_weight="balanced",
                                     max_features=min(25, X_train.shape[1]),
                                     random_state=42, n_jobs=-1)
        clf.fit(X_train, y_train)

        X_test = test_dataset.filter(items=feature_columns)
        y_test = test_dataset[LABEL_COLUMN]

        print(f"Amount of features X_train: {len(X_train.columns)}, X_test: {len(X_test.columns)}")

        # Probability column(s) after the first, laid out on the zone raster; the reshape
        # only succeeds when exactly one column remains (binary problem).
        proba = clf.predict_proba(X_test)[:, 1:].reshape(RASTER_SHAPE)

        # NOTE(review): semantics of the (6, 0.4) / (6, 4) arguments live in
        # Functions/post_processing.py -- confirm there before changing them.
        proba_post_process = post_processing.proba_post_process(proba, 6, 0.4)
        labels_grid = post_processing.raster_to_zones(np.array(y_test).reshape(RASTER_SHAPE), 6, 4)

        fold_pred = proba_post_process.reshape(-1)
        fold_true = labels_grid.reshape(-1)
        if fold_pred.shape != fold_true.shape:
            raise ValueError(f"Post-processed predictions {fold_pred.shape} and zone labels "
                             f"{fold_true.shape} differ in size")
        # Collect whole arrays instead of appending millions of scalars one by one.
        y_test_folds.append(fold_true)
        pred_folds.append(fold_pred)

        # Attribute each importance to the column the forest was actually trained on.
        for name, importance in zip(X_train.columns, clf.feature_importances_):
            feature_importances[name] += importance

    y_test_all = np.concatenate(y_test_folds)
    pred_all = np.concatenate(pred_folds)

    # Average the summed importances over however many folds were run.
    for importance_name in feature_importances:
        feature_importances[importance_name] /= len(experiment_arr)

    # Rank features by mean importance, best first (stable for ties).
    most_important_features = sorted(feature_importances, key=feature_importances.get, reverse=True)

    print("\n")
    print("Importances for experiment:")
    for key in most_important_features:
        print(key, " - ", feature_importances[key])
    print("\n")
    print(f"Amount of features used:   {num_features}")
    print("Cohen's kappa score        ", cohen_kappa_score(y_test_all, pred_all))
    print("Accuracy score             ", accuracy_score(y_test_all, pred_all))
    print("Recall score               ", recall_score(y_test_all, pred_all))
    print("Precision score            ", precision_score(y_test_all, pred_all))
    print("\n------------------------------------------------------\n")

Features used in experiment:
['hpmf_raw', 'skyview_raw', 'impundment_raw', 'slope_raw', 'DEM_ditch_detection', 'DEM_ditch_detection_no_streams', 'conic_mean', 'skyview_non_ditch', 'skyview_gabor', 'conic_mean_no_streams', 'skyview_gabor_no_streams', 'skyview_mean_2', 'skyview_mean_3', 'skyview_mean_4', 'skyview_mean_6', 'skyview_median_2', 'skyview_median_4', 'skyview_median_6', 'skyview_min_2', 'skyview_min_4', 'skyview_min_6', 'skyview_max_2', 'skyview_max_4', 'skyview_max_6', 'skyview_std_2', 'skyview_std_4', 'skyview_std_6', 'impundment_amplified', 'impoundment_amplified_no_streams', 'impundment_mean_2', 'impundment_mean_3', 'impundment_mean_4', 'impundment_mean_6', 'impundment_median_2', 'impundment_median_4', 'impundment_median_6', 'impundment_min_2', 'impundment_min_4', 'impundment_min_6', 'impundment_max_2', 'impundment_max_4', 'impundment_max_6', 'impundment_std_2', 'impundment_std_4', 'impundment_std_6', 'hpmf_filter', 'hpmf_gabor', 'hpmf_gabor_no_streams', 'hpmf_filter_no_st

Cohen's kappa score         0.6074974677335433
Accuracy score              0.9842839022228336
Recall score                0.5781367986653789
Precision score             0.6579925650557621

------------------------------------------------------

Features used in experiment:
['impundment_mean_3', 'impundment_mean_4', 'impundment_mean_2', 'hpmf_mean_4', 'hpmf_mean_3', 'impundment_median_4', 'hpmf_min_2', 'impundment_amplified', 'impoundment_amplified_no_streams', 'hpmf_median_4', 'skyview_max_6', 'skyview_gabor', 'impundment_std_4', 'skyview_gabor_no_streams', 'slope_std_6', 'impundment_median_2', 'skyview_max_4', 'slope_min_6', 'hpmf_mean_6', 'impundment_mean_6', 'skyview_non_ditch', 'slope_min_4', 'impundment_max_6', 'impundment_max_2', 'hpmf_min_4', 'hpmf_mean_2', 'slope_median_6', 'slope_non_ditch', 'impundment_std_6', 'hpmf_filter', 'impundment_median_6', 'skyview_max_2', 'hpmf_filter_no_streams', 'skyview_min_6', 'impundment_max_4', 'hpmf_gabor_no_streams', 'slope_mean_6', 'slope_st

Amount of features X_train: 40, X_test: 40
Amount of features X_train: 40, X_test: 40


Importances for experiment:
impundment_mean_3  -  0.22955582521855084
impundment_mean_4  -  0.12024382036414162
hpmf_mean_4  -  0.030181728607523647
impundment_amplified  -  0.028162619232376652
hpmf_median_4  -  0.027641721435081604
impundment_mean_2  -  0.026668935044070093
impoundment_amplified_no_streams  -  0.02426657240282676
skyview_max_6  -  0.02189465348457963
slope_std_6  -  0.020489618137113676
hpmf_mean_3  -  0.01806767137202291
skyview_max_4  -  0.01743659389469335
slope_min_6  -  0.017415744121230314
hpmf_mean_6  -  0.017250574352831345
skyview_gabor_no_streams  -  0.017122958016857495
hpmf_min_4  -  0.016838186083621715
skyview_gabor  -  0.01675801012920184
impundment_max_6  -  0.016529549983806343
skyview_median_6  -  0.01645913950140155
skyview_non_ditch  -  0.01641407015374431
slope_median_6  -  0.016371222499315323
slope_min_4  -  0.01566820129883951
impundment_median_2  -  0.0154

In [20]:
# Recursive feature elimination, fine pass (81 -> 60 features in small steps).
# Every round trains one random forest per fold on the currently best-ranked features,
# scores the pooled post-processed predictions, and re-ranks the features by their
# fold-averaged importance so the next round can keep a shorter prefix of the ranking.
RASTER_SHAPE = (2997, 2620)   # rows x cols of one zone raster; product == rows per zone frame
LABEL_COLUMN = "label_3m"

# Start from every column except the first one of the training frame.
most_important_features = experiment_arr[0][0].columns.tolist()[1:]
for num_features in [81, 78, 76, 74, 72, 70, 68, 66, 64, 62, 60]:

    # Keep only the top `num_features` of the previous round's ranking.
    most_important_features = most_important_features[:num_features]
    print(f"Features used in experiment:\n{most_important_features}")
    print("\n")
    feature_importances = {name: 0 for name in most_important_features}
    # The label must never be used as a predictor, even if it slipped into the ranking.
    feature_columns = [name for name in most_important_features if name != LABEL_COLUMN]
    y_test_folds = []
    pred_folds = []

    for training_dataset, test_dataset in experiment_arr:

        # filter() silently skips names missing from the frame, so it is evaluated once
        # and the resulting columns (not the requested list) are treated as authoritative.
        X_train = training_dataset.filter(items=feature_columns)
        y_train = training_dataset[LABEL_COLUMN]

        # Cap the per-split candidate count at 25, but never above the available columns.
        clf = RandomForestClassifier(n_estimators=200, class_weight="balanced",
                                     max_features=min(25, X_train.shape[1]),
                                     random_state=42, n_jobs=-1)
        clf.fit(X_train, y_train)

        X_test = test_dataset.filter(items=feature_columns)
        y_test = test_dataset[LABEL_COLUMN]

        print(f"Amount of features X_train: {len(X_train.columns)}, X_test: {len(X_test.columns)}")

        # Probability column(s) after the first, laid out on the zone raster; the reshape
        # only succeeds when exactly one column remains (binary problem).
        proba = clf.predict_proba(X_test)[:, 1:].reshape(RASTER_SHAPE)

        # NOTE(review): semantics of the (6, 0.4) / (6, 4) arguments live in
        # Functions/post_processing.py -- confirm there before changing them.
        proba_post_process = post_processing.proba_post_process(proba, 6, 0.4)
        labels_grid = post_processing.raster_to_zones(np.array(y_test).reshape(RASTER_SHAPE), 6, 4)

        fold_pred = proba_post_process.reshape(-1)
        fold_true = labels_grid.reshape(-1)
        if fold_pred.shape != fold_true.shape:
            raise ValueError(f"Post-processed predictions {fold_pred.shape} and zone labels "
                             f"{fold_true.shape} differ in size")
        # Collect whole arrays instead of appending millions of scalars one by one.
        y_test_folds.append(fold_true)
        pred_folds.append(fold_pred)

        # Attribute each importance to the column the forest was actually trained on.
        for name, importance in zip(X_train.columns, clf.feature_importances_):
            feature_importances[name] += importance

    y_test_all = np.concatenate(y_test_folds)
    pred_all = np.concatenate(pred_folds)

    # Average the summed importances over however many folds were run.
    for importance_name in feature_importances:
        feature_importances[importance_name] /= len(experiment_arr)

    # Rank features by mean importance, best first (stable for ties).
    most_important_features = sorted(feature_importances, key=feature_importances.get, reverse=True)

    print("\n")
    print("Importances for experiment:")
    for key in most_important_features:
        print(key, " - ", feature_importances[key])
    print("\n")
    print(f"Amount of features used:   {num_features}")
    print("Cohen's kappa score        ", cohen_kappa_score(y_test_all, pred_all))
    print("Accuracy score             ", accuracy_score(y_test_all, pred_all))
    print("Recall score               ", recall_score(y_test_all, pred_all))
    print("Precision score            ", precision_score(y_test_all, pred_all))
    print("\n------------------------------------------------------\n")

Features used in experiment:
['hpmf_raw', 'skyview_raw', 'impundment_raw', 'slope_raw', 'DEM_ditch_detection', 'DEM_ditch_detection_no_streams', 'conic_mean', 'skyview_non_ditch', 'skyview_gabor', 'conic_mean_no_streams', 'skyview_gabor_no_streams', 'skyview_mean_2', 'skyview_mean_3', 'skyview_mean_4', 'skyview_mean_6', 'skyview_median_2', 'skyview_median_4', 'skyview_median_6', 'skyview_min_2', 'skyview_min_4', 'skyview_min_6', 'skyview_max_2', 'skyview_max_4', 'skyview_max_6', 'skyview_std_2', 'skyview_std_4', 'skyview_std_6', 'impundment_amplified', 'impoundment_amplified_no_streams', 'impundment_mean_2', 'impundment_mean_3', 'impundment_mean_4', 'impundment_mean_6', 'impundment_median_2', 'impundment_median_4', 'impundment_median_6', 'impundment_min_2', 'impundment_min_4', 'impundment_min_6', 'impundment_max_2', 'impundment_max_4', 'impundment_max_6', 'impundment_std_2', 'impundment_std_4', 'impundment_std_6', 'hpmf_filter', 'hpmf_gabor', 'hpmf_gabor_no_streams', 'hpmf_filter_no_st

Cohen's kappa score         0.6035136831363332
Accuracy score              0.9841417753631494
Recall score                0.5738168408113091
Precision score             0.6546459911047001

------------------------------------------------------

Features used in experiment:
['impundment_mean_3', 'impundment_mean_4', 'impundment_median_4', 'impundment_mean_2', 'hpmf_mean_4', 'hpmf_mean_3', 'hpmf_min_2', 'impundment_amplified', 'impoundment_amplified_no_streams', 'hpmf_median_4', 'hpmf_mean_2', 'skyview_max_6', 'impundment_std_4', 'skyview_gabor', 'impundment_median_2', 'skyview_gabor_no_streams', 'impundment_max_2', 'slope_std_6', 'slope_min_6', 'skyview_max_4', 'hpmf_mean_6', 'impundment_mean_6', 'skyview_non_ditch', 'slope_min_4', 'impundment_max_6', 'slope_median_6', 'impundment_std_6', 'slope_non_ditch', 'hpmf_min_4', 'hpmf_filter', 'hpmf_filter_no_streams', 'impundment_median_6', 'impundment_max_4', 'skyview_min_6', 'skyview_max_2', 'hpmf_median_2', 'slope_std_4', 'slope_mean_6', 'h

Cohen's kappa score         0.6037633512992364
Accuracy score              0.9841830379998319
Recall score                0.572868557379928
Precision score             0.6564046842931305

------------------------------------------------------

Features used in experiment:
['impundment_mean_3', 'impundment_mean_4', 'impundment_mean_2', 'hpmf_mean_3', 'hpmf_mean_4', 'impundment_median_4', 'hpmf_min_2', 'impundment_amplified', 'impoundment_amplified_no_streams', 'impundment_max_2', 'hpmf_median_4', 'skyview_max_6', 'skyview_gabor', 'impundment_std_4', 'skyview_gabor_no_streams', 'impundment_median_2', 'slope_std_6', 'skyview_max_4', 'hpmf_mean_6', 'slope_min_6', 'impundment_mean_6', 'skyview_non_ditch', 'slope_min_4', 'hpmf_min_4', 'impundment_max_6', 'hpmf_mean_2', 'slope_median_6', 'impundment_std_2', 'impundment_std_6', 'slope_non_ditch', 'hpmf_filter', 'impundment_median_6', 'hpmf_filter_no_streams', 'skyview_min_6', 'skyview_max_2', 'impundment_max_4', 'slope_mean_6', 'slope_std_4', 

Cohen's kappa score         0.5959455126570885
Accuracy score              0.9838621063811903
Recall score                0.5660198437088418
Precision score             0.6477752321234777

------------------------------------------------------

Features used in experiment:
['impundment_mean_3', 'impundment_mean_4', 'impundment_mean_2', 'hpmf_mean_4', 'hpmf_mean_3', 'impundment_median_4', 'hpmf_min_2', 'impundment_amplified', 'impoundment_amplified_no_streams', 'hpmf_median_4', 'skyview_max_6', 'skyview_gabor', 'impundment_max_2', 'skyview_gabor_no_streams', 'impundment_std_4', 'slope_std_6', 'skyview_max_4', 'hpmf_mean_6', 'slope_min_6', 'impundment_median_2', 'skyview_non_ditch', 'impundment_max_6', 'impundment_mean_6', 'slope_min_4', 'hpmf_min_4', 'slope_median_6', 'impundment_std_6', 'hpmf_mean_2', 'slope_non_ditch', 'hpmf_filter', 'impundment_median_6', 'skyview_max_2', 'hpmf_filter_no_streams', 'skyview_min_6', 'impundment_max_4', 'skyview_median_6', 'slope_mean_6', 'slope_std_4',

Cohen's kappa score         0.6063989150532858
Accuracy score              0.9842770251167198
Recall score                0.5757134076740714
Precision score             0.6586306653809064

------------------------------------------------------

Features used in experiment:
['impundment_mean_3', 'impundment_mean_4', 'hpmf_mean_4', 'impundment_mean_2', 'hpmf_mean_3', 'impundment_amplified', 'impundment_median_4', 'impoundment_amplified_no_streams', 'hpmf_median_4', 'hpmf_min_2', 'skyview_max_6', 'impundment_median_2', 'skyview_gabor', 'skyview_gabor_no_streams', 'slope_std_6', 'impundment_std_4', 'skyview_max_4', 'slope_min_6', 'impundment_max_2', 'hpmf_mean_6', 'skyview_non_ditch', 'impundment_mean_6', 'slope_min_4', 'hpmf_min_4', 'impundment_max_6', 'slope_median_6', 'slope_non_ditch', 'impundment_median_6', 'hpmf_filter', 'impundment_std_6', 'skyview_max_2', 'hpmf_filter_no_streams', 'skyview_min_6', 'impundment_max_4', 'skyview_median_6', 'slope_mean_6', 'slope_std_4', 'hpmf_mean_2',

Cohen's kappa score         0.6016658284447876
Accuracy score              0.9841211440448082
Recall score                0.5701290719114935
Precision score             0.6552434003390651

------------------------------------------------------

Features used in experiment:
['impundment_mean_3', 'impundment_mean_4', 'hpmf_mean_4', 'impundment_mean_2', 'hpmf_mean_3', 'impundment_median_4', 'impundment_amplified', 'impoundment_amplified_no_streams', 'hpmf_min_2', 'hpmf_median_4', 'skyview_max_6', 'skyview_gabor', 'skyview_gabor_no_streams', 'impundment_std_4', 'slope_std_6', 'skyview_max_4', 'slope_min_6', 'hpmf_mean_6', 'impundment_median_2', 'skyview_non_ditch', 'slope_min_4', 'impundment_max_6', 'impundment_mean_6', 'hpmf_min_4', 'slope_median_6', 'skyview_max_2', 'impundment_median_6', 'slope_non_ditch', 'hpmf_mean_2', 'hpmf_filter', 'impundment_std_6', 'hpmf_filter_no_streams', 'skyview_min_6', 'skyview_median_6', 'slope_std_4', 'slope_mean_6', 'hpmf_gabor', 'impundment_max_4', 'slop

In [21]:
# Recursive feature elimination, 60 -> 30 features in steps of two.
# Every round trains one random forest per fold on the currently best-ranked features,
# scores the pooled post-processed predictions, and re-ranks the features by their
# fold-averaged importance so the next round can keep a shorter prefix of the ranking.
RASTER_SHAPE = (2997, 2620)   # rows x cols of one zone raster; product == rows per zone frame
LABEL_COLUMN = "label_3m"

# Hard-coded starting ranking of 60 features.
# NOTE(review): presumably pasted from the printed ranking of an earlier run -- confirm provenance.
most_important_features = ['impundment_mean_3', 'impundment_mean_4', 'hpmf_mean_4', 'impundment_mean_2', 'hpmf_mean_3',
                           'impundment_median_4', 'impundment_amplified', 'impoundment_amplified_no_streams', 'hpmf_min_2',
                           'hpmf_median_4', 'skyview_max_6', 'skyview_gabor', 'skyview_gabor_no_streams', 'impundment_std_4',
                           'slope_std_6', 'skyview_max_4', 'slope_min_6', 'hpmf_mean_6', 'impundment_median_2',
                           'skyview_non_ditch', 'slope_min_4', 'impundment_max_6', 'impundment_mean_6', 'hpmf_min_4',
                           'slope_median_6', 'skyview_max_2', 'impundment_median_6', 'slope_non_ditch', 'hpmf_mean_2',
                           'hpmf_filter', 'impundment_std_6', 'hpmf_filter_no_streams', 'skyview_min_6', 'skyview_median_6',
                           'slope_std_4', 'slope_mean_6', 'hpmf_gabor', 'impundment_max_4', 'slope_min_2', 'hpmf_std_6',
                           'skyview_std_6', 'hpmf_gabor_no_streams', 'slope_max_4', 'skyview_mean_6', 'hpmf_min_6',
                           'skyview_median_4', 'hpmf_max_6', 'skyview_min_4', 'skyview_std_4', 'hpmf_std_4',
                           'impundment_max_2', 'slope_std_2', 'skyview_median_2', 'impundment_std_2', 'slope_median_4',
                           'slope_median_2', 'hpmf_std_2', 'slope_mean_2', 'skyview_std_2', 'slope_mean_4']
for num_features in [60, 58, 56, 54, 52, 50, 48, 46, 44, 42, 40, 38, 36, 34, 32, 30]:

    # Keep only the top `num_features` of the previous round's ranking.
    most_important_features = most_important_features[:num_features]
    print(f"Features used in experiment:\n{most_important_features}")
    print("\n")
    feature_importances = {name: 0 for name in most_important_features}
    # The label must never be used as a predictor, even if it slipped into the ranking.
    feature_columns = [name for name in most_important_features if name != LABEL_COLUMN]
    y_test_folds = []
    pred_folds = []

    for training_dataset, test_dataset in experiment_arr:

        # filter() silently skips names missing from the frame, so it is evaluated once
        # and the resulting columns (not the requested list) are treated as authoritative.
        X_train = training_dataset.filter(items=feature_columns)
        y_train = training_dataset[LABEL_COLUMN]

        # Cap the per-split candidate count at 25, but never above the available columns.
        clf = RandomForestClassifier(n_estimators=200, class_weight="balanced",
                                     max_features=min(25, X_train.shape[1]),
                                     random_state=42, n_jobs=-1)
        clf.fit(X_train, y_train)

        X_test = test_dataset.filter(items=feature_columns)
        y_test = test_dataset[LABEL_COLUMN]

        print(f"Amount of features X_train: {len(X_train.columns)}, X_test: {len(X_test.columns)}")

        # Probability column(s) after the first, laid out on the zone raster; the reshape
        # only succeeds when exactly one column remains (binary problem).
        proba = clf.predict_proba(X_test)[:, 1:].reshape(RASTER_SHAPE)

        # NOTE(review): semantics of the (6, 0.4) / (6, 4) arguments live in
        # Functions/post_processing.py -- confirm there before changing them.
        proba_post_process = post_processing.proba_post_process(proba, 6, 0.4)
        labels_grid = post_processing.raster_to_zones(np.array(y_test).reshape(RASTER_SHAPE), 6, 4)

        fold_pred = proba_post_process.reshape(-1)
        fold_true = labels_grid.reshape(-1)
        if fold_pred.shape != fold_true.shape:
            raise ValueError(f"Post-processed predictions {fold_pred.shape} and zone labels "
                             f"{fold_true.shape} differ in size")
        # Collect whole arrays instead of appending millions of scalars one by one.
        y_test_folds.append(fold_true)
        pred_folds.append(fold_pred)

        # Attribute each importance to the column the forest was actually trained on.
        for name, importance in zip(X_train.columns, clf.feature_importances_):
            feature_importances[name] += importance

    y_test_all = np.concatenate(y_test_folds)
    pred_all = np.concatenate(pred_folds)

    # Average the summed importances over however many folds were run.
    for importance_name in feature_importances:
        feature_importances[importance_name] /= len(experiment_arr)

    # Rank features by mean importance, best first (stable for ties).
    most_important_features = sorted(feature_importances, key=feature_importances.get, reverse=True)

    print("\n")
    print("Importances for experiment:")
    for key in most_important_features:
        print(key, " - ", feature_importances[key])
    print("\n")
    print(f"Amount of features used:   {num_features}")
    print("Cohen's kappa score        ", cohen_kappa_score(y_test_all, pred_all))
    print("Accuracy score             ", accuracy_score(y_test_all, pred_all))
    print("Recall score               ", recall_score(y_test_all, pred_all))
    print("Precision score            ", precision_score(y_test_all, pred_all))
    print("\n------------------------------------------------------\n")

Features used in experiment:
['impundment_mean_3', 'impundment_mean_4', 'hpmf_mean_4', 'impundment_mean_2', 'hpmf_mean_3', 'impundment_median_4', 'impundment_amplified', 'impoundment_amplified_no_streams', 'hpmf_min_2', 'hpmf_median_4', 'skyview_max_6', 'skyview_gabor', 'skyview_gabor_no_streams', 'impundment_std_4', 'slope_std_6', 'skyview_max_4', 'slope_min_6', 'hpmf_mean_6', 'impundment_median_2', 'skyview_non_ditch', 'slope_min_4', 'impundment_max_6', 'impundment_mean_6', 'hpmf_min_4', 'slope_median_6', 'skyview_max_2', 'impundment_median_6', 'slope_non_ditch', 'hpmf_mean_2', 'hpmf_filter', 'impundment_std_6', 'hpmf_filter_no_streams', 'skyview_min_6', 'skyview_median_6', 'slope_std_4', 'slope_mean_6', 'hpmf_gabor', 'impundment_max_4', 'slope_min_2', 'hpmf_std_6', 'skyview_std_6', 'hpmf_gabor_no_streams', 'slope_max_4', 'skyview_mean_6', 'hpmf_min_6', 'skyview_median_4', 'hpmf_max_6', 'skyview_min_4', 'skyview_std_4', 'hpmf_std_4', 'impundment_max_2', 'slope_std_2', 'skyview_median

Amount of features X_train: 56, X_test: 56
Amount of features X_train: 56, X_test: 56


Importances for experiment:
impundment_mean_3  -  0.17908589880184866
impundment_mean_4  -  0.10894420935050472
hpmf_mean_4  -  0.042994232522818615
impundment_mean_2  -  0.03945388116244267
impundment_median_4  -  0.02636487030485648
impundment_amplified  -  0.024699534699629634
hpmf_mean_3  -  0.024269234824795052
impoundment_amplified_no_streams  -  0.02162971225534819
hpmf_median_4  -  0.020334613942374943
skyview_max_6  -  0.017177862494069076
hpmf_min_2  -  0.014578557150701194
skyview_gabor_no_streams  -  0.014415942092000209
skyview_gabor  -  0.014414373938641457
slope_std_6  -  0.01428661332580039
slope_min_6  -  0.013882682017468739
skyview_max_4  -  0.013855884576944092
hpmf_mean_6  -  0.013673741357441115
impundment_median_2  -  0.013587886965462377
impundment_max_2  -  0.013079920014369917
skyview_non_ditch  -  0.012823698182254902
slope_min_4  -  0.012282876431870295
hpmf_min_4  -  0.0

Cohen's kappa score         0.6045200560203954
Accuracy score              0.9843320419656297
Recall score                0.5690754236544033
Precision score             0.6630247974465996

------------------------------------------------------

Features used in experiment:
['impundment_mean_3', 'impundment_mean_4', 'hpmf_mean_4', 'impundment_mean_2', 'impundment_amplified', 'hpmf_mean_3', 'impoundment_amplified_no_streams', 'impundment_median_4', 'hpmf_median_4', 'skyview_max_6', 'hpmf_min_2', 'skyview_gabor', 'slope_std_6', 'hpmf_mean_6', 'skyview_gabor_no_streams', 'slope_min_6', 'skyview_max_4', 'hpmf_min_4', 'impundment_median_2', 'skyview_non_ditch', 'slope_min_4', 'impundment_max_6', 'slope_median_6', 'impundment_std_4', 'hpmf_gabor_no_streams', 'impundment_mean_6', 'impundment_median_6', 'skyview_max_2', 'hpmf_filter', 'slope_non_ditch', 'skyview_min_6', 'hpmf_filter_no_streams', 'hpmf_mean_2', 'impundment_std_6', 'slope_mean_6', 'skyview_median_6', 'slope_min_2', 'slope_std_4',

Cohen's kappa score         0.5999635967817281
Accuracy score              0.9840936356203531
Recall score                0.567073491965932
Precision score             0.6553823672674135

------------------------------------------------------

Features used in experiment:
['impundment_mean_3', 'impundment_mean_4', 'impundment_mean_2', 'hpmf_mean_4', 'impundment_amplified', 'hpmf_median_4', 'hpmf_mean_3', 'impoundment_amplified_no_streams', 'impundment_median_4', 'skyview_max_6', 'slope_std_6', 'skyview_max_4', 'hpmf_mean_6', 'hpmf_min_2', 'skyview_gabor', 'skyview_gabor_no_streams', 'slope_min_6', 'hpmf_min_4', 'skyview_non_ditch', 'skyview_median_6', 'slope_min_4', 'impundment_median_2', 'impundment_max_6', 'skyview_min_6', 'slope_median_6', 'impundment_std_4', 'skyview_max_2', 'hpmf_gabor_no_streams', 'impundment_median_6', 'hpmf_std_6', 'hpmf_filter', 'skyview_std_6', 'impundment_mean_6', 'slope_non_ditch', 'slope_mean_6', 'hpmf_filter_no_streams', 'slope_min_2', 'skyview_median_2',

Cohen's kappa score         0.6029024672319632
Accuracy score              0.9841922074746502
Recall score                0.5704451663886206
Precision score             0.6575974735819264

------------------------------------------------------

Features used in experiment:
['impundment_mean_3', 'impundment_mean_4', 'hpmf_mean_4', 'hpmf_median_4', 'impundment_amplified', 'impundment_mean_2', 'impoundment_amplified_no_streams', 'skyview_max_6', 'hpmf_mean_3', 'slope_std_6', 'hpmf_mean_6', 'slope_min_6', 'skyview_max_4', 'hpmf_min_4', 'hpmf_min_2', 'skyview_gabor', 'skyview_gabor_no_streams', 'slope_median_6', 'impundment_max_6', 'skyview_non_ditch', 'skyview_median_6', 'skyview_min_6', 'slope_min_4', 'slope_std_4', 'impundment_median_2', 'impundment_median_6', 'skyview_max_2', 'hpmf_std_6', 'hpmf_gabor_no_streams', 'slope_mean_6', 'impundment_std_4', 'skyview_std_6', 'hpmf_filter', 'slope_min_2', 'impundment_mean_6', 'skyview_median_2', 'hpmf_filter_no_streams', 'impundment_std_6']


Amo

Amount of features X_train: 32, X_test: 32
Amount of features X_train: 32, X_test: 32


Importances for experiment:
impundment_mean_3  -  0.30333097095313954
impundment_mean_4  -  0.07987929172708863
hpmf_median_4  -  0.033858728438593555
impundment_amplified  -  0.03160370002356851
impoundment_amplified_no_streams  -  0.026258510997367666
skyview_max_6  -  0.0256187038885613
skyview_non_ditch  -  0.022404665198738877
hpmf_filter  -  0.02239251367416987
impundment_mean_2  -  0.02224815514425965
slope_std_6  -  0.022018910654939144
impundment_max_6  -  0.02166811112403384
hpmf_mean_6  -  0.021006091313490665
skyview_min_6  -  0.020969152854043775
slope_min_6  -  0.02066555744164135
slope_median_6  -  0.020235334308618496
skyview_gabor_no_streams  -  0.01978148185691334
skyview_max_4  -  0.01960035124733274
hpmf_mean_4  -  0.019475427422090623
skyview_median_6  -  0.01934198856069541
hpmf_min_4  -  0.01894694948232018
slope_min_4  -  0.018784750000795363
skyview_gabor  -  0.0182960207323

In [22]:
# Recursive feature elimination, 30 -> 2 features in steps of two.
# Every round trains one random forest per fold on the currently best-ranked features,
# scores the pooled post-processed predictions, and re-ranks the features by their
# fold-averaged importance so the next round can keep a shorter prefix of the ranking.
RASTER_SHAPE = (2997, 2620)   # rows x cols of one zone raster; product == rows per zone frame
LABEL_COLUMN = "label_3m"

# Hard-coded starting ranking of 30 features.
# NOTE(review): presumably pasted from the printed ranking of an earlier run -- confirm provenance.
most_important_features = ['impundment_mean_3', 'impundment_mean_4', 'hpmf_median_4', 'impundment_amplified',
                           'impoundment_amplified_no_streams', 'skyview_max_6', 'skyview_non_ditch', 'hpmf_filter',
                           'impundment_mean_2', 'slope_std_6', 'impundment_max_6', 'hpmf_mean_6', 'skyview_min_6',
                           'slope_min_6', 'slope_median_6', 'skyview_gabor_no_streams', 'skyview_max_4', 'hpmf_mean_4',
                           'skyview_median_6', 'hpmf_min_4', 'slope_min_4', 'skyview_gabor', 'skyview_max_2',
                           'impundment_median_6', 'hpmf_std_6', 'slope_std_4', 'impundment_mean_6', 'hpmf_min_2',
                           'slope_mean_6', 'impundment_std_4']
for num_features in [30, 28, 26, 24, 22, 20, 18, 16, 14, 12, 10, 8, 6, 4, 2]:

    # Keep only the top `num_features` of the previous round's ranking.
    most_important_features = most_important_features[:num_features]
    print(f"Features used in experiment:\n{most_important_features}")
    print("\n")
    feature_importances = {name: 0 for name in most_important_features}
    # The label must never be used as a predictor, even if it slipped into the ranking.
    feature_columns = [name for name in most_important_features if name != LABEL_COLUMN]
    y_test_folds = []
    pred_folds = []

    for training_dataset, test_dataset in experiment_arr:

        # filter() silently skips names missing from the frame, so it is evaluated once
        # and the resulting columns (not the requested list) are treated as authoritative.
        X_train = training_dataset.filter(items=feature_columns)
        y_train = training_dataset[LABEL_COLUMN]

        # Cap the per-split candidate count at 25, but never above the available columns
        # (this matters once fewer than 25 features remain).
        clf = RandomForestClassifier(n_estimators=200, class_weight="balanced",
                                     max_features=min(25, X_train.shape[1]),
                                     random_state=42, n_jobs=-1)
        clf.fit(X_train, y_train)

        X_test = test_dataset.filter(items=feature_columns)
        y_test = test_dataset[LABEL_COLUMN]

        print(f"Amount of features X_train: {len(X_train.columns)}, X_test: {len(X_test.columns)}")

        # Probability column(s) after the first, laid out on the zone raster; the reshape
        # only succeeds when exactly one column remains (binary problem).
        proba = clf.predict_proba(X_test)[:, 1:].reshape(RASTER_SHAPE)

        # NOTE(review): semantics of the (6, 0.4) / (6, 4) arguments live in
        # Functions/post_processing.py -- confirm there before changing them.
        proba_post_process = post_processing.proba_post_process(proba, 6, 0.4)
        labels_grid = post_processing.raster_to_zones(np.array(y_test).reshape(RASTER_SHAPE), 6, 4)

        fold_pred = proba_post_process.reshape(-1)
        fold_true = labels_grid.reshape(-1)
        if fold_pred.shape != fold_true.shape:
            raise ValueError(f"Post-processed predictions {fold_pred.shape} and zone labels "
                             f"{fold_true.shape} differ in size")
        # Collect whole arrays instead of appending millions of scalars one by one.
        y_test_folds.append(fold_true)
        pred_folds.append(fold_pred)

        # Attribute each importance to the column the forest was actually trained on.
        for name, importance in zip(X_train.columns, clf.feature_importances_):
            feature_importances[name] += importance

    y_test_all = np.concatenate(y_test_folds)
    pred_all = np.concatenate(pred_folds)

    # Average the summed importances over however many folds were run.
    for importance_name in feature_importances:
        feature_importances[importance_name] /= len(experiment_arr)

    # Rank features by mean importance, best first (stable for ties).
    most_important_features = sorted(feature_importances, key=feature_importances.get, reverse=True)

    print("\n")
    print("Importances for experiment:")
    for key in most_important_features:
        print(key, " - ", feature_importances[key])
    print("\n")
    print(f"Amount of features used:   {num_features}")
    print("Cohen's kappa score        ", cohen_kappa_score(y_test_all, pred_all))
    print("Accuracy score             ", accuracy_score(y_test_all, pred_all))
    print("Recall score               ", recall_score(y_test_all, pred_all))
    print("Precision score            ", precision_score(y_test_all, pred_all))
    print("\n------------------------------------------------------\n")

Features used in experiment:
['impundment_mean_3', 'impundment_mean_4', 'hpmf_median_4', 'impundment_amplified', 'impoundment_amplified_no_streams', 'skyview_max_6', 'skyview_non_ditch', 'hpmf_filter', 'impundment_mean_2', 'slope_std_6', 'impundment_max_6', 'hpmf_mean_6', 'skyview_min_6', 'slope_min_6', 'slope_median_6', 'skyview_gabor_no_streams', 'skyview_max_4', 'hpmf_mean_4', 'skyview_median_6', 'hpmf_min_4', 'slope_min_4', 'skyview_gabor', 'skyview_max_2', 'impundment_median_6', 'hpmf_std_6', 'slope_std_4', 'impundment_mean_6', 'hpmf_min_2', 'slope_mean_6', 'impundment_std_4']


Amount of features X_train: 30, X_test: 30
Amount of features X_train: 30, X_test: 30


Importances for experiment:
impundment_mean_3  -  0.3044545087715057
impundment_mean_4  -  0.08096936707104063
hpmf_median_4  -  0.03780765766355358
impundment_amplified  -  0.03200924004739092
hpmf_mean_4  -  0.027199578972953605
impoundment_amplified_no_streams  -  0.026710614934202077
impundment_mean_2  -  0.02644767

Amount of features X_train: 22, X_test: 22
Amount of features X_train: 22, X_test: 22


Importances for experiment:
impundment_mean_3  -  0.3661024998568824
impundment_mean_4  -  0.04808080710710179
hpmf_median_4  -  0.046243122126269624
impundment_amplified  -  0.036131605079245906
skyview_gabor_no_streams  -  0.03550554496503846
skyview_max_6  -  0.03300648633188505
impundment_max_6  -  0.03216149952154544
hpmf_mean_6  -  0.030251528927090765
slope_median_6  -  0.0301935203470806
hpmf_filter  -  0.029397876238147255
impoundment_amplified_no_streams  -  0.029080839097043135
slope_std_6  -  0.028748274048098453
skyview_non_ditch  -  0.02829470819490504
hpmf_min_4  -  0.027382680340581396
skyview_min_6  -  0.026817355286710767
skyview_median_6  -  0.02636100059224395
slope_min_6  -  0.025975206878105395
skyview_max_4  -  0.025800441670476452
impundment_median_6  -  0.025297615582538145
impundment_mean_2  -  0.023759382908327034
slope_min_4  -  0.02279837746946569
slope_std_4  -  0.02260

Amount of features X_train: 10, X_test: 10
Amount of features X_train: 10, X_test: 10


Importances for experiment:
impundment_mean_3  -  0.3908593663868433
impundment_amplified  -  0.08534543733425909
skyview_max_6  -  0.0753470020552641
impundment_mean_4  -  0.07259573216225848
hpmf_median_4  -  0.06535152604180632
skyview_gabor_no_streams  -  0.06419530852297099
impundment_max_6  -  0.06275577746898112
slope_median_6  -  0.0619892707736107
slope_std_6  -  0.060864265239933615
skyview_median_6  -  0.06069631401407219


Amount of features used:   10
Cohen's kappa score         0.5594106453964741
Accuracy score              0.9807857475796407
Recall score                0.5835104047765388
Precision score             0.5556335908498043

------------------------------------------------------

Features used in experiment:
['impundment_mean_3', 'impundment_amplified', 'skyview_max_6', 'impundment_mean_4', 'hpmf_median_4', 'skyview_gabor_no_streams', 'impundment_max_6', 'slope_median_6']




In [23]:
from itertools import product

In [24]:
# Candidate values for the Random Forest grid search. One inner list per
# hyper-parameter; the position of each inner list is what the grid-search
# loop relies on (combo[0] .. combo[4]), so the order must not change.
params = [
    ["gini", "entropy"],    # combo[0] -> criterion
    [35, None],             # combo[1] -> max_depth
    [2, 10],                # combo[2] -> min_samples_split
    ["balanced", None],     # combo[3] -> class_weight
    [200, 300],             # combo[4] -> n_estimators
]

In [25]:
# Column names selected as model inputs for the grid search below.
# The names are used verbatim with DataFrame.filter(items=...), so the
# spellings (including 'impundment' vs. 'impoundment') must stay exactly as-is.
most_important_features = [
    'impundment_mean_3',
    'impundment_mean_4',
    'hpmf_mean_4',
    'impundment_mean_2',
    'impundment_amplified',
    'hpmf_median_4',
    'impoundment_amplified_no_streams',
    'hpmf_mean_3',
    'skyview_max_6',
    'slope_std_6',
    'impundment_median_4',
    'hpmf_min_2',
    'hpmf_mean_6',
    'skyview_max_4',
    'slope_min_6',
    'skyview_gabor',
    'hpmf_min_4',
    'skyview_gabor_no_streams',
    'impundment_max_6',
    'skyview_non_ditch',
    'skyview_median_6',
    'impundment_std_4',
    'skyview_min_6',
    'slope_min_4',
    'slope_median_6',
    'slope_std_4',
    'impundment_median_2',
    'skyview_max_2',
    'impundment_median_6',
    'hpmf_std_6',
    'hpmf_gabor_no_streams',
    'skyview_std_6',
    'hpmf_filter',
    'impundment_mean_6',
    'slope_mean_6',
    'slope_min_2',
    'skyview_median_2',
    'hpmf_filter_no_streams',
    'impundment_std_6',
    'slope_non_ditch',
]

In [26]:
# Every combination of the candidate values: one tuple per combination, with
# elements in the same order as the inner lists of `params`.
param_combos = [*product(*params)]

In [27]:
# Sanity check on the grid size: five hyper-parameters with two candidate
# values each should give 2**5 = 32 combinations.
len(param_combos)

32

In [None]:
# Grid search over the Random Forest hyper-parameters.
#
# For every combination in `param_combos`, a classifier is trained and
# evaluated on each (training, test) pair in `experiment_arr`. The
# post-processed predictions of all pairs are pooled and scored together, and
# (parameter dict, Cohen's kappa) is recorded in `params_and_score`.

# Shape the flat per-pixel test vectors are reshaped to before post-processing.
# 2997 * 2620 = 7,852,140, which matches the zone row count shown earlier.
RASTER_SHAPE = (2997, 2620)
LABEL_COLUMN = "label_3m"

params_and_score = []
for j, combo in enumerate(param_combos, start=1):
    # Deliberately NOT named `params`: that name holds the grid definition
    # consumed by `product(*params)`. Rebinding it to this dict would make a
    # re-run of that cell build a wrong grid from the dict's keys.
    rf_params = {'criterion': combo[0],
                 'max_depth': combo[1],
                 'min_samples_split': combo[2],
                 'class_weight': combo[3],
                 'n_estimators': combo[4]}
    print(f"params: {rf_params}")

    # One flat array per experiment; concatenated once after the inner loop
    # instead of appending millions of scalars one by one.
    y_test_parts = []
    pred_parts = []

    for (training_dataset, test_dataset) in experiment_arr:

        # Select the feature columns once; the label mask is a safeguard in
        # case the label column is ever added to the feature list.
        train_features = training_dataset.filter(items=most_important_features)
        X_train = train_features.loc[:, train_features.columns != LABEL_COLUMN]
        y_train = training_dataset[LABEL_COLUMN]

        clf = RandomForestClassifier(**rf_params, n_jobs=-1, random_state=42)
        print(clf)

        clf.fit(X_train, y_train)

        test_features = test_dataset.filter(items=most_important_features)
        X_test = test_features.loc[:, test_features.columns != LABEL_COLUMN]
        y_test = test_dataset[LABEL_COLUMN]

        print(f"Amount of features X_train: {len(X_train.columns)}, X_test: {len(X_test.columns)}")

        # Drop the first probability column and lay the rest out as a raster.
        # Assumes binary labels so exactly one column remains; the reshape
        # fails otherwise.
        proba = clf.predict_proba(X_test)[:, 1:].reshape(RASTER_SHAPE)

        # NOTE(review): the literals 6, 0.4 and 4 are passed straight through to
        # Functions.post_processing; their meaning is not visible here - confirm
        # against that module before tuning them.
        proba_post_process = post_processing.proba_post_process(proba, 6, 0.4)

        labels_grid = post_processing.raster_to_zones(np.array(y_test).reshape(RASTER_SHAPE), 6, 4)

        # Pair prediction i with label i, as the element-wise loop did: labels
        # beyond the number of predictions are ignored, too few labels is an error.
        pred_flat = proba_post_process.reshape(-1)
        labels_flat = labels_grid.reshape(-1)
        if len(labels_flat) < len(pred_flat):
            raise ValueError(
                f"Got {len(labels_flat)} labels for {len(pred_flat)} predictions"
            )
        y_test_parts.append(labels_flat[:len(pred_flat)])
        pred_parts.append(pred_flat)

    y_test_all = np.concatenate(y_test_parts)
    pred_all = np.concatenate(pred_parts)

    # Computed once and reused for both the report and the stored result.
    kappa = cohen_kappa_score(y_test_all, pred_all)

    print("\n")
    print(f"Iteration {j} of {len(param_combos)}")
    print("Cohen's kappa score        ", kappa)
    print("Accuracy score             ", accuracy_score(y_test_all, pred_all))
    print("Recall score               ", recall_score(y_test_all, pred_all))
    print("Precision score            ", precision_score(y_test_all, pred_all))
    print("\n------------------------------------------------------\n")
    params_and_score.append((rf_params, kappa))

print(params_and_score)

params: {'criterion': 'gini', 'max_depth': 35, 'min_samples_split': 2, 'class_weight': 'balanced', 'n_estimators': 200}
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight='balanced',
                       criterion='gini', max_depth=35, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=200,
                       n_jobs=-1, oob_score=False, random_state=89, verbose=0,
                       warm_start=False)


In [None]:
# Rank the evaluated parameter sets from highest to lowest Cohen's kappa.
# Only the score is used as the sort key: the first tuple element is a dict,
# and dicts do not support `<`, so using it as a tie-breaker raises TypeError
# as soon as two combinations reach the same kappa. The sort is stable, so
# tied entries keep the order in which they were evaluated.
sorted(params_and_score, key=lambda entry: entry[1], reverse=True)