### Establish current working directory

In [1]:
# sorting out working directory
import os

# Get the current working directory
current_directory = os.getcwd()

# Print the current working directory
print("Current Working Directory:", current_directory)


Current Working Directory: c:\Users\klara\OneDrive\Skrivebord\peatland_classification


### Imports

In [2]:
from rasterio.plot import show
import matplotlib.pyplot as plt
import geopandas as gpd
import rasterio
import rasterio.mask
import numpy as np
import pandas as pd
import os
import pickle

### Check directory

In [3]:
# Define the list of folder paths
folder_paths = ["2017", "2018", "2020", "2021", "2022", "2023"]

# Loop through each folder path
for folder_path in folder_paths:
    # Check if the folder exists
    if os.path.exists(folder_path) and os.path.isdir(folder_path):
        # List all files in the folder
        files = os.listdir(folder_path)

        if files:
            print(f"Files in the '{folder_path}' folder:")
            for file in files:
                print(file)
        else:
            print(f"The '{folder_path}' folder is empty.")
    else:
        print(f"The '{folder_path}' folder does not exist in the current working directory.")


Files in the '2017' folder:
2017_modified.pkl
Files in the '2018' folder:
2018_AOI_2_actively_eroding.pkl
2018_AOI_2_drained.pkl
2018_AOI_3_actively_eroding.pkl
aoi_1_gridpoints_bare_peat.pkl
AOI_3_bare_peat.pkl
AOI_4_bare_peat.pkl
AOI_5_bare_peat.pkl
Files in the '2020' folder:
2020_AOI_1_drained.pkl
2020_AOI_2_drained.pkl
2020_AOI_2_modified.pkl
2020_AOI_3_restored.pkl
2020_AOI_4_restored.pkl
Files in the '2021' folder:
2021_AOI_1_drained.pkl
2021_AOI_1_modified.pkl
2021_AOI_2_drained.pkl
2021_AOI_3_restored.pkl
2021_AOI_4_drained.pkl
2021_AOI_5_restored.pkl
2021_AOI_6_actively_eroding.pkl
2021_AOI_6_drained.pkl
2021_AOI_6_modified.pkl
2021_AOI_6_restored.pkl
2021_AOI_7_actively_eroding.pkl
2021_AOI_7_drained.pkl
2021_AOI_7_modified.pkl
2021_AOI_8_modified.pkl
Files in the '2022' folder:
2022_AOI_1_restored.pkl
2022_AOI_2_drained.pkl
2022_AOI_2_modified.pkl
2022_AOI_4_actively_eroding.pkl
2022_AOI_4_drained.pkl
2022_AOI_4_restored.pkl
2022_AOI_5_actively_eroding.pkl
2022_AOI_5_draine

### Concatenating datasets

In [4]:
# List of file paths
file_paths = [
    # 2017
    "2017/2017_modified.pkl",
    
    # 2018
    "2018/2018_AOI_2_actively_eroding.pkl",
    "2018/2018_AOI_2_drained.pkl",
    "2018/2018_AOI_3_actively_eroding.pkl",
    "2018/aoi_1_gridpoints_bare_peat.pkl",
    "2018/AOI_3_bare_peat.pkl",
    "2018/AOI_4_bare_peat.pkl",
    "2018/AOI_5_bare_peat.pkl",

    # 2020
    "2020/2020_AOI_1_drained.pkl",
    "2020/2020_AOI_2_drained.pkl",
    "2020/2020_AOI_2_modified.pkl",
    "2020/2020_AOI_3_restored.pkl",
    "2020/2020_AOI_4_restored.pkl",

    # 2021
    "2021/2021_AOI_1_drained.pkl",
    "2021/2021_AOI_1_modified.pkl",
    "2021/2021_AOI_2_drained.pkl",
    "2021/2021_AOI_3_restored.pkl",
    "2021/2021_AOI_4_drained.pkl",
    "2021/2021_AOI_5_restored.pkl",
    "2021/2021_AOI_6_actively_eroding.pkl",
    "2021/2021_AOI_6_drained.pkl",
    "2021/2021_AOI_6_modified.pkl",
    "2021/2021_AOI_6_restored.pkl",
    "2021/2021_AOI_7_actively_eroding.pkl",
    "2021/2021_AOI_7_drained.pkl",
    "2021/2021_AOI_7_modified.pkl",
    "2021/2021_AOI_8_modified.pkl",

    # 2022
    "2022/2022_AOI_1_restored.pkl",
    "2022/2022_AOI_2_drained.pkl",
    "2022/2022_AOI_2_modified.pkl",
    "2022/2022_AOI_4_actively_eroding.pkl",
    "2022/2022_AOI_4_drained.pkl",
    "2022/2022_AOI_4_restored.pkl",
    "2022/2022_AOI_5_actively_eroding.pkl",
    "2022/2022_AOI_5_drained.pkl",
    "2022/2022_AOI_5_modified.pkl",
    "2022/2022_AOI_5_restored.pkl",
    "2022/2022_AOI_7_restored.pkl",
    "2022/2022_AOI_8_actively_eroding.pkl",
    "2022/2022_AOI_8_drained.pkl",
    "2022/2022_AOI_8_restored.pkl",
    "2022/2022_AOI_9_restored.pkl",

    # 2023
    "2023/2023_AOI_1_restored.pkl",
    "2023/2023_AOI_2_drained.pkl",
    "2023/2023_AOI_3_drained.pkl",
    "2023/2023_AOI_3_modified.pkl",
    "2023/2023_AOI_3_restored.pkl",
    "2023/2023_AOI_5_restored.pkl",
    "2023/2023_AOI_6_restored.pkl",
    "2023/2023_AOI_7_restored.pkl",
    "2023/2023_AOI_8_actively_eroding.pkl",
    "2023/2023_AOI_8_drained.pkl",
    "2023/2023_AOI_8_modified.pkl",
    "2023/2023_AOI_9_restored.pkl",
    "2023/2023_AOI_10_actively_eroding.pkl",
    "2023/2023_AOI_10_modified.pkl",
    "2023/2023_AOI_10_restored.pkl",
    "2023/2023_AOI_11_restored.pkl",
    "2023/2023_AOI_12_drained.pkl",
    "2023/2023_AOI_12_modified.pkl",
]

# Initialize an empty list to store dataframes
dataframes = []

# Load each file and append to the list
for file_path in file_paths:
    with open(file_path, 'rb') as file:
        df = pd.read_pickle(file)
        dataframes.append(df)

# Concatenate all dataframes into one
combined_df = pd.concat(dataframes, ignore_index=True)

# Now combined_df is your combined dataset
# You can inspect it by printing or using combined_df.head()

# Optionally, save the combined dataframe as a new pickle file
combined_df.to_pickle("combined_dataset.pkl")

# To confirm, you can load and view the combined dataset
with open("combined_dataset.pkl", 'rb') as file:
    loaded_combined_df = pd.read_pickle(file)
print(loaded_combined_df.head())



        B1       B2       B3       B4       B5       B6       B7       B8  \
0  0.00495  0.01885  0.03610  0.05020  0.09065  0.13930  0.15855  0.18270   
1  0.01040  0.02405  0.03980  0.05720  0.10385  0.16755  0.19370  0.21600   
2  0.01510  0.02905  0.06635  0.04515  0.12920  0.38280  0.47410  0.49955   
3  0.01345  0.02925  0.06315  0.04920  0.12335  0.28515  0.33710  0.39055   
4  0.01575  0.02970  0.06900  0.04870  0.13360  0.39940  0.49485  0.52430   

       B8A       B9      B11     B12  sample_location_id  class  
0  0.19205  0.19175  0.20290  0.1097              201701      2  
1  0.23420  0.24700  0.22290  0.1204              201701      2  
2  0.50775  0.50890  0.24765  0.1219              201701      2  
3  0.37655  0.42050  0.23555  0.1175              201701      2  
4  0.53900  0.48935  0.23950  0.1177              201701      2  


In [5]:
# The final dataset
combined_df

Unnamed: 0,B1,B2,B3,B4,B5,B6,B7,B8,B8A,B9,B11,B12,sample_location_id,class
0,0.00495,0.01885,0.03610,0.05020,0.09065,0.13930,0.15855,0.18270,0.19205,0.19175,0.20290,0.1097,201701,2
1,0.01040,0.02405,0.03980,0.05720,0.10385,0.16755,0.19370,0.21600,0.23420,0.24700,0.22290,0.1204,201701,2
2,0.01510,0.02905,0.06635,0.04515,0.12920,0.38280,0.47410,0.49955,0.50775,0.50890,0.24765,0.1219,201701,2
3,0.01345,0.02925,0.06315,0.04920,0.12335,0.28515,0.33710,0.39055,0.37655,0.42050,0.23555,0.1175,201701,2
4,0.01575,0.02970,0.06900,0.04870,0.13360,0.39940,0.49485,0.52430,0.53900,0.48935,0.23950,0.1177,201701,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1152806,0.03720,0.04660,0.06660,0.08560,0.13300,0.21200,0.24360,0.26880,0.28690,0.27960,0.28600,0.1640,202312,2
1152807,0.03840,0.04250,0.07690,0.08000,0.14000,0.25190,0.28530,0.34760,0.33920,0.32180,0.29830,0.1563,202312,2
1152808,0.04280,0.04960,0.06970,0.08880,0.12820,0.18960,0.21990,0.25540,0.25480,0.25390,0.27920,0.1567,202312,2
1152809,0.03170,0.04110,0.06480,0.07660,0.13350,0.23340,0.27290,0.30900,0.30940,0.29640,0.30290,0.1617,202312,2


In [6]:
# Select one row for each class (0-4)
rows_for_classes = []
for class_value in range(5):  # Assuming classes are labeled 0-4
    rows_for_classes.append(combined_df[combined_df['class'] == class_value].head(1))

# Concatenate the selected rows into a new DataFrame
subset_df = pd.concat(rows_for_classes)

# Reset the index of the new DataFrame (optional)
subset_df.reset_index(drop=True, inplace=True)

In [7]:
# Convert the subset DataFrame to LaTeX code
latex_table = subset_df.to_latex(index=False)  # 'index=False' omits the DataFrame index from the table
print(latex_table)

\begin{tabular}{rrrrrrrrrrrrrr}
\toprule
B1 & B2 & B3 & B4 & B5 & B6 & B7 & B8 & B8A & B9 & B11 & B12 & sample_location_id & class \\
\midrule
0.021300 & 0.057550 & 0.079600 & 0.083700 & 0.131700 & 0.262550 & 0.328350 & 0.357700 & 0.363300 & 0.344850 & 0.256700 & 0.149200 & 201802 & 0 \\
0.018650 & 0.051050 & 0.075800 & 0.061800 & 0.126550 & 0.279400 & 0.324800 & 0.358750 & 0.370500 & 0.353950 & 0.208050 & 0.106750 & 201802 & 1 \\
0.004950 & 0.018850 & 0.036100 & 0.050200 & 0.090650 & 0.139300 & 0.158550 & 0.182700 & 0.192050 & 0.191750 & 0.202900 & 0.109700 & 201701 & 2 \\
0.029700 & 0.025800 & 0.042300 & 0.074550 & 0.097150 & 0.116900 & 0.140800 & 0.154600 & 0.187250 & 0.182100 & 0.307900 & 0.237500 & 201801 & 3 \\
0.058200 & 0.068800 & 0.088200 & 0.114600 & 0.162900 & 0.219800 & 0.248800 & 0.262200 & 0.292050 & 0.294050 & 0.293600 & 0.178200 & 202003 & 4 \\
\bottomrule
\end{tabular}



In [8]:
class_counts = combined_df['class'].value_counts()

# This will print the count of each class
print(class_counts)

# 0 = actively eroding
# 1 = drained
# 2 = modified
# 3 = bare peat
# 4 = restored

class
1    494861
4    249503
2    218725
0    133347
3     56375
Name: count, dtype: int64


In [13]:
sum(class_counts)

1152811

In [9]:
# Group by 'sample_location_id', then count the occurrences of each class
location_class_counts = combined_df.groupby('sample_location_id')['class'].value_counts()

# Convert the Series to a DataFrame and reset the index
location_class_counts_df = location_class_counts.unstack().fillna(0)

# Sum across the rows to get the total number of pixels for each location
location_class_counts_df['total'] = location_class_counts_df.sum(axis=1)

# Now the DataFrame includes a 'Total_Pixels' column
location_class_counts_df

class,0,1,2,3,4,total
sample_location_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
201701,0.0,0.0,5553.0,0.0,0.0,5553.0
201801,0.0,0.0,0.0,26778.0,0.0,26778.0
201802,4289.0,3567.0,0.0,0.0,0.0,7856.0
201803,22341.0,0.0,0.0,319.0,0.0,22660.0
201804,0.0,0.0,0.0,24553.0,0.0,24553.0
201805,0.0,0.0,0.0,4725.0,0.0,4725.0
202001,0.0,8323.0,0.0,0.0,0.0,8323.0
202002,0.0,14663.0,20088.0,0.0,0.0,34751.0
202003,0.0,0.0,0.0,0.0,10059.0,10059.0
202004,0.0,0.0,0.0,0.0,26037.0,26037.0


In [10]:
# Storing and exporting as pickle file
location_class_counts_df.to_pickle("location_class_counts.pkl")

In [12]:
# Convert the subset DataFrame to LaTeX code
latex_table_1 = location_class_counts_df.to_latex(index=True)  # 'index=False' omits the DataFrame index from the table
print(latex_table_1)

\begin{tabular}{lrrrrrr}
\toprule
class & 0 & 1 & 2 & 3 & 4 & total \\
sample_location_id &  &  &  &  &  &  \\
\midrule
201701 & 0.000000 & 0.000000 & 5553.000000 & 0.000000 & 0.000000 & 5553.000000 \\
201801 & 0.000000 & 0.000000 & 0.000000 & 26778.000000 & 0.000000 & 26778.000000 \\
201802 & 4289.000000 & 3567.000000 & 0.000000 & 0.000000 & 0.000000 & 7856.000000 \\
201803 & 22341.000000 & 0.000000 & 0.000000 & 319.000000 & 0.000000 & 22660.000000 \\
201804 & 0.000000 & 0.000000 & 0.000000 & 24553.000000 & 0.000000 & 24553.000000 \\
201805 & 0.000000 & 0.000000 & 0.000000 & 4725.000000 & 0.000000 & 4725.000000 \\
202001 & 0.000000 & 8323.000000 & 0.000000 & 0.000000 & 0.000000 & 8323.000000 \\
202002 & 0.000000 & 14663.000000 & 20088.000000 & 0.000000 & 0.000000 & 34751.000000 \\
202003 & 0.000000 & 0.000000 & 0.000000 & 0.000000 & 10059.000000 & 10059.000000 \\
202004 & 0.000000 & 0.000000 & 0.000000 & 0.000000 & 26037.000000 & 26037.000000 \\
202101 & 0.000000 & 4053.000000 & 7446.