## ideas:

- cluster map
- heatmap of spending by hour of day and weekday

In [26]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
import umap
from src.dashboard_utility import get_all_bank_data


import plotly.io as pio
# Make "plotly" the default template for figures created after this point.
pio.templates.default = "plotly"

In [30]:
# Fetch the data for all banks (presumably one DataFrame per bank -- confirm
# against get_all_bank_data), then unpack it into the three named frames.
bank_frames = get_all_bank_data()
comdirect_df, traderepublic_df, olb_df = bank_frames

# The analysis below works on a copy of the Comdirect frame so the loaded
# frame itself is never modified.
df = comdirect_df.copy()

In [31]:
import plotly.express as px

# One seed for every stochastic step so the cluster labels AND the 2-D layout
# are reproducible between runs.
RANDOM_STATE = 42
# Upper bound on the number of KMeans clusters.
MAX_CLUSTERS = 10

# Keep only rows with a negative amount and take an explicit copy: assigning
# into a boolean-filtered frame otherwise triggers pandas'
# SettingWithCopyWarning (chained assignment).
df = df[df["amount"] < 0].copy()
if df.empty:
    # Also hit when this cell is re-run: the amounts were already made
    # positive by the previous run, so the filter above removes every row.
    raise ValueError(
        "no rows with a negative amount left to cluster; "
        "re-run the data-loading cell first"
    )
# Flip the sign so the amounts can be used directly as marker sizes.
df["amount"] = df["amount"].abs()

# Embed every transaction's "details" text; normalize_embeddings=True is
# requested so the vectors come back normalised.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(df["details"].tolist(), normalize_embeddings=True)

# KMeans rejects n_clusters > n_samples, so clamp for small data sets.
kmeans = KMeans(
    n_clusters=min(MAX_CLUSTERS, len(df)),
    random_state=RANDOM_STATE,
)
df["cluster"] = kmeans.fit_predict(embeddings)

# Project the embeddings to 2-D for plotting.
reducer = umap.UMAP(
    n_neighbors=8,              # lower = more tightly clustered groups
    min_dist=0.0,               # closer packing
    spread=1.0,
    metric="cosine",            # IMPORTANT for embeddings
    random_state=RANDOM_STATE,  # fixed seed -> same layout on every run
)
df[["x", "y"]] = reducer.fit_transform(embeddings)

# One bubble per (details, cluster) pair: summed amount, mean 2-D position.
agg = df.groupby(["details", "cluster"]).agg(
    total_amount=("amount", "sum"),
    x=("x", "mean"),
    y=("y", "mean"),
).reset_index()

# Categorical labels make plotly use discrete colours instead of a
# continuous colour scale.
agg["cluster"] = agg["cluster"].astype("category")

fig = px.scatter(
    agg,
    x="x",
    y="y",
    size="total_amount",
    color="cluster",
    hover_name="details",
    size_max=60,
)
# NOTE: inside the Streamlit dashboard, render with
# st.plotly_chart(fig, use_container_width=True) instead of fig.show().
fig.show()

In [14]:
# Sanity check: number of distinct cluster labels and the first ten of them.
cluster_labels = agg["cluster"]
print(cluster_labels.nunique(), cluster_labels.unique()[:10])

10 [9 3 7 4 0 6 5 1 2 8]
