### Accuracy Analysis of the Created Classification Map

Import libraries

In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import rasterio
import os
import time
from sklearn.metrics import f1_score, classification_report, confusion_matrix, cohen_kappa_score, accuracy_score, precision_score, recall_score, roc_auc_score, roc_curve, auc
import seaborn as sns


##### 1) Comparing classification results with assigned classes for validation data

In [None]:
# Sample every classification raster at the validation points of every
# shapefile and store (geometry, target, predicted) as a CSV per shapefile.

# NOTE(review): geopandas documents that USE_PYGEOS has to be set before
# geopandas is first imported. geopandas is imported in an earlier cell, so
# this assignment probably has no effect here - confirm and move if needed.
os.environ['USE_PYGEOS'] = '0'
shp_folders = ['']
output_folder = ''
tif_folder = ''

tif_files = [file for file in os.listdir(tif_folder) if file.endswith('.tif')]


def get_raster_values(tif_file, coords):
    """Return the first-band value (cast to int) of tif_file at each (x, y) in coords.

    coords must be expressed in the coordinate reference system of the raster.
    """
    with rasterio.open(tif_file) as src:
        # src.sample yields one array per coordinate with one entry per band.
        return [int(band_values[0]) for band_values in src.sample(coords)]


start = time.process_time()

for shp_folder in shp_folders:
    for shp_file_name in os.listdir(shp_folder):
        if not shp_file_name.endswith('.shp'):
            continue

        shp_file_path = os.path.join(shp_folder, shp_file_name)
        base_name_without_extension = os.path.splitext(shp_file_name)[0]

        # Read the shapefile once per file instead of twice per raster.
        source_points = gpd.read_file(shp_file_path)
        shp_crs = source_points.crs
        # .copy() so the geometry assignment below works on an independent frame.
        source_points = source_points[source_points['geometry'].notnull()].copy()
        # Non-point geometries are represented by their centroid.
        # geom_type replaces the deprecated Shapely ``type`` alias.
        source_points['geometry'] = source_points['geometry'].apply(
            lambda geom: geom if geom.geom_type == 'Point' else geom.centroid
        )

        for tif_file_name in tif_files:
            tif_file_to_process = os.path.join(tif_folder, tif_file_name)
            with rasterio.open(tif_file_to_process) as tif_src:
                tif_crs = tif_src.crs

            # Raster sampling expects coordinates in the raster CRS. Reproject a
            # copy of the points for sampling when both CRSs are known; the CSV
            # keeps the geometry in the shapefile's own CRS as before.
            if tif_crs is not None and source_points.crs is not None:
                sampling_points = source_points.to_crs(tif_crs.to_wkt())
            else:
                sampling_points = source_points

            column_name = os.path.splitext(os.path.basename(tif_file_to_process))[0]
            coord_list = list(zip(sampling_points.geometry.x, sampling_points.geometry.y))

            pointData = source_points.copy()
            pointData[column_name] = get_raster_values(tif_file_to_process, coord_list)
            result_df = pd.DataFrame({
                'geometry': pointData['geometry'],
                'target': pointData['class'],
                'predicted': pointData[column_name]
            })

            # With a single raster the historical file name is kept. With several
            # rasters the raster name is added so results no longer overwrite
            # each other.
            if len(tif_files) > 1:
                output_file_name = f'{base_name_without_extension}_{column_name}_predicted.csv'
            else:
                output_file_name = f'{base_name_without_extension}_predicted.csv'
            output_csv_path = os.path.join(output_folder, output_file_name)
            result_df.to_csv(output_csv_path, index=False)

            print(f"\nCRS of {shp_file_name}: {shp_crs}")
            print(f"CRS of {tif_file_name}: {tif_crs}")
            print(f"\nFor the number of training samples in {shp_file_name}:", result_df.shape[0])
            print(f"Saved result to {output_csv_path}")
            print(result_df.head(6000))

# time.process_time reports CPU time of this process, not wall-clock time.
print("\nProcessing time is [s]", time.process_time() - start)
print("\nAll results saved successfully")

##### 2) Accuracy Metrics

In [None]:
# For every CSV produced by the sampling step: compute accuracy metrics for
# classes 1-6, append them to one text report, and save ROC-curve and
# confusion-matrix figures.
csv_folder = ''
output_file_path = ''
confusion_matrix_folder = ''
class_labels = list(range(1, 7))
csv_files = [file for file in os.listdir(csv_folder) if file.endswith('.csv')]
confusion_matrices = []


def _percent(numerator, denominator):
    """Return numerator / denominator as a percentage, or NaN if the denominator is 0."""
    return numerator / denominator * 100 if denominator else float('nan')


start = time.process_time()
with open(output_file_path, 'w') as output_file:
    for csv_file_name in csv_files:
        csv_file_path = os.path.join(csv_folder, csv_file_name)
        df = pd.read_csv(csv_file_path)
        # Non-numeric entries become NaN and are dropped by the range filter.
        df['target'] = pd.to_numeric(df['target'], errors='coerce')
        df['predicted'] = pd.to_numeric(df['predicted'], errors='coerce')
        df_filtered = df[(df['target'].between(1, 6)) & (df['predicted'].between(1, 6))]
        y_true = df_filtered['target']
        y_pred = df_filtered['predicted']

        # The header is written first so every metric below is attributed to
        # the right file.
        output_file.write("_" * 70)
        output_file.write(f"\nResult for {csv_file_name}:\n")

        if df_filtered.empty:
            # The metric functions cannot be evaluated on zero samples.
            output_file.write("\nNo samples with target and predicted class in 1-6; metrics skipped.\n")
            continue

        # --- F1 scores -----------------------------------------------------
        per_class_f1 = f1_score(y_true, y_pred, labels=class_labels, average=None)
        f1_scores_per_class = {
            class_label: "{:.4f}".format(score)
            for class_label, score in zip(class_labels, per_class_f1)
        }
        f1_all_classes = "{:.4f}".format(f1_score(y_true, y_pred, average='weighted'))
        f1_class_6 = f1_scores_per_class[6]

        # --- Overall accuracy and Cohen's kappa ------------------------------
        overall_accuracy = "{:.4f}".format(accuracy_score(y_true, y_pred))
        kappa_score = "{:.4f}".format(cohen_kappa_score(y_true, y_pred))

        # --- One-vs-rest ROC/AUC, user (precision) and producer (recall) ----
        auc_per_class = {}
        roc_per_class = {}
        user_accuracy_per_class = {}
        producer_accuracy_per_class = {}
        for class_label in class_labels:
            binary_target = (y_true == class_label).astype(int)
            binary_predicted = (y_pred == class_label).astype(int)
            # Predictions are hard labels, so each curve has a single
            # operating point between (0, 0) and (1, 1).
            fpr, tpr, thresholds = roc_curve(binary_target, binary_predicted)
            auc_per_class[class_label] = auc(fpr, tpr)
            roc_per_class[class_label] = (fpr, tpr, thresholds)
            user_accuracy_per_class[class_label] = "{:.4f}".format(
                precision_score(binary_target, binary_predicted))
            producer_accuracy_per_class[class_label] = "{:.4f}".format(
                recall_score(binary_target, binary_predicted))
        auc_average = "{:.4f}".format(sum(auc_per_class.values()) / len(auc_per_class))

        # --- Text report -----------------------------------------------------
        output_file.write("\nF1-scores for each class:\n")
        for class_label, score in f1_scores_per_class.items():
            output_file.write(f"Class {class_label}: {score}\n")

        output_file.write(f"\nF1-score for all classes: {f1_all_classes}\n")
        output_file.write(f"F1-score for class 6: {f1_class_6}\n")

        output_file.write(f"\nCohen's Kappa: {kappa_score}\n")
        output_file.write(f"Overall Accuracy: {overall_accuracy}\n")

        output_file.write(f"\nAUC average for all classes: {auc_average}\n")
        for class_label, auc_value in auc_per_class.items():
            output_file.write("AUC for class {}: {:.4f}\n".format(class_label, auc_value))

        class_report = classification_report(y_true, y_pred, digits=4)
        output_file.write("\nClassification Report:\n")
        output_file.write(class_report + '\n')

        for class_label in class_labels:
            output_file.write(f"\nUser Accuracy for class {class_label}: {user_accuracy_per_class[class_label]}\n")
            output_file.write(f"Producer Accuracy for class {class_label}: {producer_accuracy_per_class[class_label]}\n")

        # labels= pins the matrix to 6x6 even when a class is absent, so the
        # indices below and the heatmap tick labels always line up.
        conf_matrix = confusion_matrix(y_true, y_pred, labels=class_labels)

        # One-vs-rest rates per class. Reading the top-left 2x2 corner of a
        # multiclass matrix does not give TP/FP/TN/FN.
        total_samples = conf_matrix.sum()
        for position, class_label in enumerate(class_labels):
            TP = conf_matrix[position, position]
            FN = conf_matrix[position, :].sum() - TP
            FP = conf_matrix[:, position].sum() - TP
            TN = total_samples - TP - FN - FP
            TPR = "{:.4f}".format(_percent(TP, TP + FN))
            FPR = "{:.4f}".format(_percent(FP, FP + TN))
            TNR = "{:.4f}".format(_percent(TN, TN + FP))
            FNR = "{:.4f}".format(_percent(FN, TP + FN))

            output_file.write(f"\nOne-vs-rest rates for class {class_label}:\n")
            output_file.write(f"True Positive Rate (TPR): {TPR}%\n")
            output_file.write(f"False Positive Rate (FPR): {FPR}%\n")
            output_file.write(f"True Negative Rate (TNR): {TNR}%\n")
            output_file.write(f"False Negative Rate (FNR): {FNR}%\n")

        # --- ROC figures -----------------------------------------------------
        for class_label, roc_values in roc_per_class.items():
            fpr, tpr, _ = roc_values
            fpr_percent = [val * 100 for val in fpr]
            tpr_percent = [val * 100 for val in tpr]

            plt.figure(figsize=(7 / 2.54, 7 / 2.54))  # 7 cm x 7 cm
            plt.plot(fpr_percent, tpr_percent, color='darkorange', lw=2, label='Krzywa ROC')
            plt.plot([0, 100], [0, 100], color='navy', lw=2, linestyle='--')
            plt.xlim([0.0, 100.0])
            plt.ylim([0.0, 100.0])
            plt.xlabel('FPR [%]', fontsize=10)
            plt.ylabel('TPR [%]', fontsize=10)
            plt.title(f'Krzywa ROC dla klasy {class_label}', fontsize=12)
            plt.legend(loc='lower right')
            # NOTE: the file name keeps the '.csv' part of csv_file_name, as before.
            roc_curve_filename = f'{class_label}_{csv_file_name}_roc_curve.png'
            roc_curve_filepath = os.path.join(confusion_matrix_folder, roc_curve_filename)
            # The layout has to be applied before saving to have any effect.
            plt.tight_layout()
            plt.savefig(roc_curve_filepath, bbox_inches='tight', pad_inches=0.1, dpi=400)
            plt.close()

        # --- Confusion-matrix figure (row-normalised, in percent) -----------
        confusion_matrix_filename = os.path.splitext(csv_file_name)[0] + '_confusion_matrix.png'
        confusion_matrix_filepath = os.path.join(confusion_matrix_folder, confusion_matrix_filename)
        labels = [str(class_label) for class_label in class_labels]
        row_sums = conf_matrix.sum(axis=1, keepdims=True)
        # Rows of classes without any reference sample stay NaN instead of
        # triggering a division by zero.
        conf_matrix_percent = np.divide(
            conf_matrix * 100.0,
            row_sums,
            out=np.full(conf_matrix.shape, np.nan),
            where=row_sums > 0,
        )
        plt.figure(figsize=(10 / 2.54, 8 / 2.54))  # 10 cm x 8 cm
        ax = sns.heatmap(conf_matrix_percent, annot=True, cmap="Greens", fmt='.2f', cbar=True, xticklabels=labels, yticklabels=labels)
        ax.set_xlabel("Przewidywane", fontsize=9)
        ax.set_ylabel("Rzeczywiste", fontsize=9)
        plt.title("Macierz Pomyłek [%]", fontsize=9)
        ax.set_xticklabels(ax.get_xticklabels(), ha="right", fontsize=9)
        ax.set_yticklabels(ax.get_yticklabels(), fontsize=9)
        for text in ax.texts:
            text.set_fontsize(8)
        plt.tight_layout(pad=1.5)
        plt.savefig(confusion_matrix_filepath, bbox_inches='tight', pad_inches=0.1, dpi=800)
        plt.close()
        confusion_matrices.append(conf_matrix)

# time.process_time reports CPU time of this process, not wall-clock time.
print("Processing time in [s]", time.process_time() - start)
print("\nAll results saved successfully")