In [None]:
import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
import seaborn as sns
import matplotlib.pyplot as plt

# 1 Load pickle file with results

In [None]:
# Location of the pickled results table, relative to the working directory.
results_file = 'results_pickle_file.pickle'

# NOTE: unpickling can execute arbitrary code -- only load files you trust.
df = pd.read_pickle(results_file)

# Show which columns are available for clustering and plotting.
print('Available Columns')
df.columns

# 2 K-Means Clustering

In [None]:
# Feature columns that K-Means clusters on.
cols_to_cluster = [
    'mon_images',
    # 'bfast_magnitude', 'bfast_means',  (currently excluded)
    'cusum_confidence', 'cusum_magnitude',
    'ts_mean', 'ts_sd', 'ts_min', 'ts_max',
    'bs_slope_mean', 'bs_slope_sd', 'bs_slope_min', 'bs_slope_max'
]

# Standardize the features so that columns with a large numeric range do not
# dominate the Euclidean distances K-Means works with.
X_std = StandardScaler().fit_transform(df[cols_to_cluster])

# Fit K-Means on the standardized matrix (not on the raw columns) and store
# the cluster label of every row in a new 'Kmeans' column.
kmeans = KMeans(n_clusters=8, random_state=42).fit(X_std)
df['Kmeans'] = kmeans.labels_

# Print and plot the number of points per cluster.
clusters, counts = np.unique(df.Kmeans, return_counts=True)
print(clusters, counts)
# Put the cluster id on the x-axis; otherwise it is drawn as a second bar series.
pd.DataFrame({'cluster': clusters, 'counts': counts}).plot(
    kind='bar', x='cluster', y='counts', legend=False,
    title='Nr. of Points per cluster'
)

# 3 Plot Stats

In [None]:
# Columns whose per-cluster distribution is drawn as a boxplot.
cols_to_plot = [
    'mon_images',
    'cusum_confidence',
    'cusum_magnitude',
    'ts_mean',
    'ts_sd',
    'ts_min',
    'ts_max',
    'bs_slope_mean',
    'bs_slope_sd',
    'bs_slope_min',
    'bs_slope_max',
]

# One separate figure per column, with the cluster label on the x-axis.
for fig_num, column in enumerate(cols_to_plot):
    plt.figure(fig_num)
    sns.boxplot(data=df, x="Kmeans", y=column)

# 4 Select subset of samples for each cluster

In [None]:
# Maximum number of rows drawn from each cluster.
nr_of_samples_per_cluster = 30
# Fixed seed so that re-running the notebook selects the same subset.
sample_seed = 42

# Draw the per-cluster samples in one pass. Clusters are visited in order of
# first appearance (sort=False). The sample size is capped at the cluster size
# so that a cluster with fewer rows than requested contributes all of its rows
# instead of raising a ValueError.
cluster_samples = [
    members.sample(
        n=min(nr_of_samples_per_cluster, len(members)),
        random_state=sample_seed,
    )
    for _, members in df.groupby('Kmeans', sort=False)
]

# Concatenate once; fall back to an empty frame with the same columns when
# there is nothing to sample from.
subset_df = pd.concat(cluster_samples) if cluster_samples else df.iloc[0:0].copy()

print(f'{len(subset_df)} samples have been selected in total')
gpd.GeoDataFrame(subset_df, geometry='geometry').plot()

# 5 Convert to CEO file

In [None]:
# Destination of the CSV file (placeholder path -- adjust before running).
out_csv_file = 'path/to/subset_results.csv'

# Build a single geo-aware view of the subset and derive the three leading
# columns from it: point coordinates and the point identifier.
# NOTE(review): LON/LAT are taken as-is from the geometry; presumably the data
# is in a geographic CRS -- confirm before uploading.
points = gpd.GeoDataFrame(subset_df)
subset_df['LON'] = points.geometry.x
subset_df['LAT'] = points.geometry.y
subset_df['PLOTID'] = points.point_id

# Move LON, LAT and PLOTID to the front, keeping all other columns in their
# existing order, then write the table without the index.
leading_cols = ['LON', 'LAT', 'PLOTID']
remaining_cols = [name for name in subset_df.columns.tolist() if name not in leading_cols]
subset_df = subset_df[leading_cols + remaining_cols]
subset_df.to_csv(out_csv_file, index=False)