In [None]:
import pandas as pd
import umap
import matplotlib.pyplot as plt
import os
import warnings

In [None]:
# Load the sector feature table; company_code is explicitly read as a string.
features_sector = pd.read_csv("../data_generator/data/features_sector.csv", dtype={"company_code": str})
# Every column other than the identifier is treated as a feature column.
feature_columns = [column for column in features_sector.columns if column != "company_code"]
# Row-wise total over all feature columns, appended below as a "sum" column.
# Bound to `feature_sum` rather than `sum` so the builtin sum() is not
# shadowed for every cell that runs afterwards in this kernel.
feature_sum = features_sector[feature_columns].sum(axis=1).to_frame(name="sum")
features_sector = pd.concat([features_sector, feature_sum], axis=1)
print(features_sector.head())
print(len(features_sector))

In [None]:
# Preprocessing: filtering
# Keep only rows whose "sum" column reaches this cutoff.
sum_threshold = 100
meets_threshold = features_sector["sum"] >= sum_threshold
# Select the surviving rows, renumber them from zero, drop the identifier and
# the helper total, and hand the remaining feature columns over as an ndarray.
features_filtering = (
    features_sector.loc[meets_threshold]
    .reset_index(drop=True)
    .drop(columns=["company_code", "sum"])
    .to_numpy()
)


In [None]:
# UMAP sweep: fit one 2-D embedding per (metric, n_neighbors, min_dist)
# combination and save each as its own scatter plot under figures/<metric>/.

# Same filter the parameter-comparison cell installs; applied here as well so
# this sweep does not print that warning once per fit.
# NOTE(review): presumably UMAP raises it because random_state is set - confirm.
warnings.filterwarnings("ignore", message="n_jobs value.*overridden")

os.makedirs("figures", exist_ok=True)
# The sample count is constant across the sweep, so compute it once.
n_samples = len(features_filtering)
# NOTE(review): "minkowski" and "mahalanobis" are run without metric_kwds;
# confirm the library defaults for those metrics are what this sweep intends.
for metric in ["euclidean", "manhattan", "chebyshev", "minkowski", "canberra", "braycurtis", "mahalanobis", "cosine", "correlation"]:
    export_dir = f"figures/{metric}"
    os.makedirs(export_dir, exist_ok=True)
    for n_neighbors in [5, 10, 15, 30, 50, 100]:
        for min_dist in [0.0, 0.1, 0.3, 0.5, 0.8, 0.99]:
            reducer = umap.UMAP(n_components=2, n_neighbors=n_neighbors, min_dist=min_dist, metric=metric, random_state=42)
            embedding = reducer.fit_transform(features_filtering)

            # Explicit Figure/Axes instead of the implicit pyplot "current
            # figure", so the save and close calls below are guaranteed to act
            # on the figure created for this combination.
            fig, ax = plt.subplots()
            ax.scatter(embedding[:, 0], embedding[:, 1], s=1)
            ax.set_title(f"UMAP (metric={metric}, n_neighbors={n_neighbors}, min_dist={min_dist})", fontsize=10)
            # Sample count in the top-right corner, positioned in axes
            # coordinates so it stays put regardless of the data range.
            ax.text(
                0.97,
                0.97,
                f"n = {n_samples}",
                horizontalalignment="right",
                verticalalignment="top",
                transform=ax.transAxes,
                fontsize=10,
                color="black")
            fig.tight_layout()
            fig.savefig(f"{export_dir}/umap_metric_{metric}_n_neighbors_{n_neighbors}_min_dist_{min_dist}.png")
            # Release the figure right away; the sweep creates one per combination.
            plt.close(fig)

            print(f"Saved: metric={metric}, n_neighbors={n_neighbors}, min_dist={min_dist}")


In [None]:
# Parameter comparison: one figure per metric, laid out as a grid with one row
# per n_neighbors value and one column per min_dist value.
n_neighbors_list = [5, 10, 15, 30, 50, 100]
min_dist_list = [0.0, 0.1, 0.3, 0.5, 0.8, 0.99]
nrows = len(n_neighbors_list)
ncols = len(min_dist_list)
warnings.filterwarnings("ignore", message="n_jobs value.*overridden")

os.makedirs("figures", exist_ok=True)
# NOTE(review): "minkowski" and "mahalanobis" are run without metric_kwds;
# confirm the library defaults for those metrics are what this comparison intends.
for metric in ["euclidean", "manhattan", "chebyshev", "minkowski", "canberra", "braycurtis", "mahalanobis", "cosine", "correlation"]:
    export_dir = f"figures/{metric}"
    os.makedirs(export_dir, exist_ok=True)

    # squeeze=False keeps `ax` two-dimensional even when one of the parameter
    # lists is cut down to a single value, so ax[i_row, i_col] always works.
    fig, ax = plt.subplots(nrows, ncols, figsize=(16, 9), squeeze=False)
    fig.suptitle(f"UMAP metric={metric}", fontsize=10)

    for i_row, n_neighbors in enumerate(n_neighbors_list):
        for i_col, min_dist in enumerate(min_dist_list):
            reducer = umap.UMAP(n_components=2, n_neighbors=n_neighbors, min_dist=min_dist, metric=metric, random_state=42)
            embedding = reducer.fit_transform(features_filtering)

            panel = ax[i_row, i_col]
            panel.scatter(embedding[:, 0], embedding[:, 1], s=0.1)
            # Label each panel with its own parameters; without this the grid
            # cannot be read without re-deriving the loop order.
            panel.set_title(f"n_neighbors={n_neighbors}, min_dist={min_dist}", fontsize=6)
            panel.tick_params(axis="both", labelsize=5)
            print(f"Plot: metric={metric}, n_neighbors={n_neighbors}, min_dist={min_dist}")

    # Run the layout pass only after every panel has its title and final
    # tick-label size, so spacing is computed from what is actually drawn.
    fig.tight_layout()
    fig.savefig(f"{export_dir}/umap_metric_{metric}_parameter_comparison.png")
    plt.show()
    # Make sure the figure is released before the next metric's grid is built.
    plt.close(fig)