### Import libraries

In [None]:
import gc
import os
import pickle
import time

import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rasterio
import seaborn as sns
from geocube.api.core import make_geocube
from raster2xyz.raster2xyz import Raster2xyz
from rasterio.plot import show
from shapely import wkt
from shapely.geometry import Point
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel, RFE
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, roc_curve, precision_score, recall_score, roc_auc_score, classification_report
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_val_score, validation_curve
from sklearn.model_selection import cross_val_predict
from sklearn.preprocessing import MinMaxScaler, RobustScaler, StandardScaler


This code creates a DataFrame `df` from a list of lists `data`. Each inner list contains two elements: a class name and its corresponding class code. The class names represent different land cover types:
- water
- urbanized
- soil
- cropland
- grassland
- forest/treecover

The class codes (1–6) are the numerical identifiers for each class.

In [None]:
# Lookup table pairing every land-cover class name with its numeric class code.
class_names = ('water', 'urbanized', 'soil', 'cropland', 'grassland', 'forest/treecover')
# Codes are consecutive integers starting at 1, in the order listed above.
data = [[class_name, class_code] for class_code, class_name in enumerate(class_names, start=1)]
df = pd.DataFrame(data=data, columns=['classname', 'classcode'])
df


This cell loads the training samples from a CSV file, replaces infinite values and the values 0, -1e9 and 9999 (treated as missing) with NaN, and drops every row that contains a NaN. It then counts the samples per class, computes each class's share of the total, and merges in the class names. Finally, it prints the shape of the sample table before and after cleaning and displays a DataFrame with the class contributions, sorted by class code.



In [None]:
# Values that are turned into NaN before incomplete rows are dropped: +/-inf
# plus the 0.0, -1e9 and 9999 markers.
# NOTE(review): the replacement is applied to every column, so a row holding a
# genuine 0.0 in any feature is dropped as well - confirm this is intended.
NODATA_VALUES = [np.inf, -np.inf, 0.0, -1000000000.0, 9999.0]

# TODO: fill in the path of the training-sample CSV (empty in this notebook).
# `replace` returns a new frame, so re-running the cell never depends on an
# earlier in-place mutation.
training_samples_inf = pd.read_csv("").replace(NODATA_VALUES, np.nan)

# Keep only the rows that are complete after the replacement above.
training_samples = training_samples_inf.dropna(axis=0)
# Every column from position 3 onwards is used as a model input feature.
original_features = training_samples.iloc[:, 3:]
column_names = training_samples.columns

# Number of samples per class. `value_counts` replaces the manual
# filter-and-count loop; codes and counts keep their own dtypes instead of
# being forced into one common dtype by np.vstack.
class_counts = training_samples['class'].value_counts(sort=False)
unique = class_counts.index.to_numpy()
stat = class_counts.to_numpy()

df1 = pd.DataFrame({'classcode': unique, 'no_of_found_images_within_AOI': stat})
# Share of each class in the cleaned training set, as a fraction of 1.
df1['%'] = (df1['no_of_found_images_within_AOI'] / df1['no_of_found_images_within_AOI'].sum()).round(2)

# `df` must still be the class-name lookup table (classname / classcode) when
# this cell runs; the merge attaches the readable class names to the counts.
df2 = df1.merge(df, on='classcode')
df2 = df2.sort_values('classcode')

print('shape of the training samples with NaN, Inf/no.features',training_samples_inf.shape )
print('shape of the training samples without NaN, Inf/no.features', training_samples.shape)
print('Contribution of each class in training samples')
df2

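This cell trains a Random Forest classifier on the training samples and computes the confusion matrix and classification report on those same training samples, so these scores describe how well the model fits the training data, not how well it generalizes. It then saves the confusion-matrix plot and the trained model, and reports the out-of-bag (OOB) error and score.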

In [None]:
# Fixed seed so the forest, its OOB score and the pickled model are reproducible
# across Restart & Run All.
RANDOM_STATE = 42

target = training_samples['class']
start = time.process_time()
randomforest = RandomForestClassifier(
    n_jobs=-1,
    n_estimators=500,
    class_weight="balanced",
    bootstrap=True,
    oob_score=True,  # needed for the OOB error / score printed at the end
    random_state=RANDOM_STATE,
)
rf_model_initial = randomforest.fit(original_features, target)
# Predictions for the very samples the model was trained on (resubstitution).
y_rf_train_pred = rf_model_initial.predict(original_features)

conf = confusion_matrix(target, y_rf_train_pred)
labels = ['1:water','2:urbanized','3:soil','4:cropland','5:grassland','6:forest/treecover']

# Create the figure BEFORE drawing: the heatmap is drawn on `ax`, and the same
# figure is the one written to disk (previously a new, empty figure was opened
# after plotting and the saved PNG was blank).
fig, ax = plt.subplots(figsize=(9, 8))
sns.heatmap(conf, annot=True, cmap="Blues", fmt='g', cbar=True,
            xticklabels=labels, yticklabels=labels, ax=ax)
ax.set(title='confusion matrix', xlabel="Predicted", ylabel="Actual")

# `classification_report` is imported directly from sklearn.metrics; the result
# is kept in its own variable so the class lookup table `df` is not overwritten.
report = classification_report(target, y_rf_train_pred, digits=3, output_dict=True)
report_df = pd.DataFrame(report).transpose()
print("Classification accuracy report s:", report_df)
print("Procesing time in[ [s]", time.process_time() - start)
fig.savefig(""+'Confusion_matrix_original.png', dpi=300)

filename = ""+'Original_model2022.sav'

# Context manager guarantees the file is flushed and closed after dumping.
with open(filename, 'wb') as model_file:
    pickle.dump(rf_model_initial, model_file)

oob_error = 1 - randomforest.oob_score_
print(f'OOB error: {oob_error:.3f}')
print(f'OOB score: {randomforest.oob_score_:.3f}')


This cell obtains predictions from a 5-fold cross-validation of the Random Forest classifier, computes and plots the confusion matrix, saves the plot, prints the classification accuracy report, and extracts the key evaluation metrics (overall accuracy and macro-averaged precision, recall and F1-score).

In [None]:
# Out-of-fold predictions from 5-fold cross-validation: every sample is
# predicted by a model that did not see it during fitting.
# (`cross_val_predict` comes from sklearn.model_selection.)
y_train_pred = cross_val_predict(rf_model_initial, original_features, target, cv=5)

# Compute confusion matrix
conf = confusion_matrix(target, y_train_pred)

# Define class labels
labels = ['1:water', '2:urbanized', '3:soil', '4:cropland', '5:grassland', '6:forest/treecover']

# Plot confusion matrix on an explicit figure/axes pair so the figure that is
# saved is guaranteed to be the one the heatmap was drawn on.
fig, ax = plt.subplots(figsize=(9, 8))
sns.heatmap(conf, annot=True, cmap="Blues", fmt='g', cbar=True,
            xticklabels=labels, yticklabels=labels, ax=ax)
ax.set(title='Confusion Matrix', xlabel="Predicted", ylabel="Actual")
# Save before plt.show(); showing first may leave nothing to save.
fig.savefig("" + 'Confusion_matrix.png', dpi=300)
plt.show()

# Per-class and averaged metrics; kept in a dedicated variable so the class
# lookup table `df` is not overwritten.
report = classification_report(target, y_train_pred, digits=3, output_dict=True)
cv_report_df = pd.DataFrame(report).transpose()
print("Classification accuracy report:\n", cv_report_df)

overall_accuracy = report['accuracy']
macro_avg = report['macro avg']
precision_macro = macro_avg['precision']
recall_macro = macro_avg['recall']
f1_macro = macro_avg['f1-score']

print("Overall Accuracy:", overall_accuracy)
print("Precision (Macro):", precision_macro)
print("Recall (Macro):", recall_macro)
print("F1-Score (Macro):", f1_macro)



This cell:
- Collects the predicted class, the true class and the point geometry of every training sample in one DataFrame and renames the columns.
- Adds a Boolean column indicating whether the prediction is correct.
- Converts the point geometry strings (WKT) to Shapely geometries.
- Filters for falsely or correctly classified instances.
- Creates GeoDataFrames for both groups, saves each one to a shapefile, and prints the number of samples in each group.

In [None]:
# Column 2 of the sample table holds the point geometry as WKT text (it is
# parsed with wkt.loads below).
point_geometry = training_samples.iloc[:, 2]

# Attach the ORIGINAL row labels to the prediction array. `training_samples`
# keeps its pre-dropna index, so a fresh 0..n-1 index on the predictions would
# pair them with the wrong points and silently drop rows when aligned.
# NOTE(review): these are the predictions on the training data itself
# (y_rf_train_pred), not the cross-validated ones (y_train_pred) - confirm
# which of the two should be mapped.
predicted = pd.Series(y_rf_train_pred, index=training_samples.index)

# One row per sample: geometry text, predicted class, true class.
df1 = pd.DataFrame({
    'point_geom': point_geometry,
    'pred_class': predicted,
    'target': target,
})
df1['equals'] = df1['pred_class'] == df1['target']
df1['id_geom'] = df1['point_geom'].apply(wkt.loads)

df1_false = df1[~df1['equals']]
df_true = df1[df1['equals']]

# geopandas is imported as `gpd` at the top of the notebook.
gdf = gpd.GeoDataFrame(df1_false, geometry='id_geom', crs='EPSG:32631')
gdf.to_file(""+'False_classifciation.shp')
gdf = gpd.GeoDataFrame(df_true, geometry='id_geom', crs='EPSG:32631')
gdf.to_file(""+'true_classification.shp')

print('number of falsely classified',df1_false.shape[0])
print('number of correctly classified',df_true.shape[0])
print('Total classified',df_true.shape[0]+df1_false.shape[0])


This code snippet calculates the absolute correlation matrix for original features and visualizes it using a heatmap. It then saves the heatmap plot as an image file named 'Correlation_matrix.png'.

In [None]:
# Absolute pairwise correlation between the input features, rounded to 2 decimals.
corr_matrix = original_features.corr().abs().round(2)

# Large square canvas so that every feature name stays readable on both axes.
corr_fig, corr_ax = plt.subplots(figsize=(20, 20))
feature_names = corr_matrix.columns
sns.heatmap(
    corr_matrix,
    annot=False,
    xticklabels=feature_names,
    yticklabels=feature_names,
    ax=corr_ax,
)

# Both axes list the same features, hence the identical label and styling.
axis_label_style = {'fontsize': 22, 'fontweight': 'semibold'}
corr_ax.set_xlabel('Input features', **axis_label_style)
corr_ax.set_ylabel('Input features', **axis_label_style)
corr_fig.savefig(""+'Correlation_matrix.png', dpi=300)