In [1]:
from pathlib import Path
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
import pandas as pd

# Directory (relative to the notebook's working directory) that the CSV
# inputs are read from and the CSV outputs are written to.
csv_dir = Path("..") / "csvs"

In [2]:
# Read the metrics table, index it by the 'site' column, and remove the
# columns listed below so that only the remaining columns are carried forward.
metrics = (
    pd.read_csv(csv_dir / "site_representative_metrics.csv")
    .set_index('site')
    .drop(
        columns=[
            'site_type',
            'year_estab',
            'years_since_dist',
            'elev_mean',
            'slope_mean',
            'forest_type',
        ]
    )
)
metrics

Unnamed: 0_level_0,mean__chm,max__chm,sd__chm,cv__chm,mean__crr,mean__fhd,mean__veg_height_cv,cv__veg_height_median,mean__veg_height_kurt,sd__crr,...,mean__groundstorey_capture,mean__understorey_capture,mean__midstorey_capture,mean__upperstorey_capture,sd__groundstorey_capture,sd__understorey_capture,sd__midstorey_capture,sd__upperstorey_capture,mean__canopy_cover_gt1m,sd__canopy_cover_gt1m
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AGG_O_01,17.795598,38.8500,9.588873,0.449199,0.556861,1.905483,0.611210,0.630970,0.002143,0.145432,...,0.485461,0.340963,0.152032,0.372651,0.304348,0.217247,0.120561,0.233728,0.672886,0.202604
AGG_O_05,17.953837,32.0180,6.743576,0.347906,0.629936,1.809090,0.544776,0.527803,0.700429,0.163310,...,0.465148,0.274245,0.276296,0.404785,0.373857,0.258669,0.102098,0.230319,0.717404,0.165475
AGG_O_07,13.855632,40.6126,9.134959,0.704845,0.569777,1.561877,0.574764,0.897312,0.206545,0.156232,...,0.325641,0.536895,0.000000,0.150111,0.297278,0.243842,0.000000,0.149971,0.638135,0.236373
AGG_Y_02,9.439318,30.3848,4.969538,0.690717,0.564454,1.394114,0.450972,0.570737,0.347001,0.129658,...,0.636835,0.422181,0.285981,0.060774,0.287005,0.257476,0.083674,0.106688,0.655073,0.234546
AGG_Y_03,11.753187,41.9452,7.956366,0.938366,0.525111,1.623247,0.580116,1.097862,0.030835,0.135251,...,0.484721,0.498216,0.309569,0.029361,0.304852,0.278710,0.090056,0.103083,0.626108,0.205937
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
ULO_271,21.914483,36.3238,5.465952,0.251930,0.634345,1.806537,0.591421,0.500539,0.917677,0.175958,...,0.418785,0.268355,0.239480,0.481072,0.329154,0.268604,0.266516,0.246275,0.721113,0.197962
ULY_Y_231,25.492242,49.5954,14.996001,0.593612,0.430132,1.710456,0.952009,1.195313,1.402636,0.196310,...,0.561955,0.261851,0.124661,0.281549,0.277385,0.240317,0.185151,0.281334,0.548123,0.265201
ULY_Y_232,12.777133,52.2856,14.471975,1.145557,0.442244,1.315641,0.825957,1.815506,0.619507,0.164692,...,0.699039,0.297647,0.101104,0.117943,0.257525,0.241808,0.170153,0.205177,0.439321,0.271231
ULY_Y_25,31.919078,54.6412,17.079089,0.540517,0.459317,1.770590,0.928554,1.198691,0.853794,0.196590,...,0.589710,0.557484,0.094196,0.254375,0.331031,0.287227,0.167680,0.244311,0.721464,0.204327


In [3]:
# Scale and center the metrics (StandardScaler: zero mean, unit variance per
# column) so every metric enters the PCA on a comparable scale.
scaler = StandardScaler()
scaled_metrics = scaler.fit_transform(metrics)

# Diagnostic fit with the full set of components: cumulative share of variance
# explained, and the smallest number of components needed to reach 95%.
# Kept under its own name so it is not confused with the exported model below.
pca_full = PCA().fit(scaled_metrics)
cumulative_variance = np.cumsum(pca_full.explained_variance_ratio_)
# argmax returns the first index at which the condition is True; +1 turns that
# zero-based index into a component count.
n_components_95 = int(np.argmax(cumulative_variance >= 0.95)) + 1

# Number of components actually retained and exported. This is a fixed choice,
# not derived from n_components_95; compare the two to see how much variance
# the exported components leave unexplained.
n = 2

pca = PCA(n_components=n)
pca.fit(scaled_metrics)
pca_scores = pca.transform(scaled_metrics)

# Scores: one row per site, one column per retained component.
pca_scores_df = pd.DataFrame(
    pca_scores,
    columns=pca.get_feature_names_out(),
    index=metrics.index
)

# Loadings table: `components_` has one row per component, so the transpose
# gives one row per metric and one column per component.
# NOTE(review): these are the raw principal axes from `components_`; they are
# not rescaled by the explained variance - confirm that is what downstream
# consumers of the "loadings" file expect.
pca_loadings_df = pd.DataFrame(
    pca.components_.T,
    columns=pca.get_feature_names_out(),
    index=metrics.columns
)
pca_loadings_df.index.name = 'metric'

# Persist both tables; the component count is embedded in the file names.
pca_scores_df.to_csv(csv_dir / f"pca_scores_n{n}.csv")
pca_loadings_df.to_csv(csv_dir / f"pca_loadings_n{n}.csv")

In [8]:
# Cumulative percentage of variance explained by the components of `pca`.
pca.explained_variance_ratio_.cumsum() * 100

array([34.56560244, 61.31245597])

In [9]:
# Percentage of variance explained by each component of `pca` individually.
100 * pca.explained_variance_ratio_

array([34.56560244, 26.74685353])