## ideas:

- cluster map
- heatmap: spending by hour of day and weekday

In [None]:
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
import umap
from src.dashboard_utility import get_all_bank_data
import plotly.express as px

import plotly.io as pio
# Make "plotly" the default template for figures created after this point.
pio.templates.default = "plotly"

In [None]:
# get_all_bank_data() returns three objects, unpacked here in the order
# comdirect, traderepublic, olb (presumably one transaction table per bank).
comdirect_df, traderepublic_df, olb_df = get_all_bank_data()

# Work on an independent (deep) copy so later cells can modify `df`
# without touching the loaded comdirect table.
df = comdirect_df.copy(deep=True)

In [None]:
# Keep only rows with a negative amount and store them as positive
# magnitudes, so they can be summed and used as marker sizes later on.
# `.copy()` makes the filtered frame independent of the unfiltered one;
# without it the column assignment below triggers pandas'
# SettingWithCopyWarning.
# NOTE: this cell is not idempotent. Re-running it without re-running the
# load cell leaves `df` empty, because all amounts are already positive.
df = df.loc[df["amount"] < 0].copy()
df["amount"] = df["amount"].abs()

# Encode the "details" text of every remaining row with a pretrained
# sentence-embedding model; normalize_embeddings=True asks the model for
# length-normalised vectors.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(df["details"].tolist(), normalize_embeddings=True)

In [None]:
# Partition the embeddings into 10 clusters; the fixed seed keeps the
# cluster assignment repeatable between runs.
kmeans = KMeans(n_clusters=10, random_state=9042003)
kmeans.fit(embeddings)

# labels_ holds the cluster index assigned to each fitted row, in input order.
df["cluster"] = kmeans.labels_

In [None]:
# Alternative 2-D projection (UMAP, cosine distance), currently disabled:
# reducer = umap.UMAP(
#     n_neighbors=8,
#     min_dist=0.0,
#     spread=1.0,
#     metric="cosine"
# )
# df[["x", "y"]] = reducer.fit_transform(embeddings)

# Project the embeddings to two dimensions for plotting.
# random_state pins the otherwise stochastic t-SNE layout so the map is
# repeatable across runs, in line with the fixed seed used for KMeans.
# NOTE: scikit-learn requires perplexity to be smaller than the number of
# rows, so this cell fails on very small inputs.
tsne = TSNE(
    n_components=2,
    perplexity=30,
    learning_rate=200,
    metric="cosine",
    random_state=9042003,
)

df[["x", "y"]] = tsne.fit_transform(embeddings)

# One row per distinct (details, cluster) pair: the summed amount plus the
# mean x/y position of all rows sharing that pair.
agg = df.groupby(["details", "cluster"]).agg(
    total_amount=("amount", "sum"),
    x=("x", "mean"),
    y=("y", "mean"),
).reset_index()

# Store the cluster label as a categorical rather than an integer
# (presumably so the plot colours clusters discretely instead of on a
# continuous scale; confirm against the plotting cell).
agg["cluster"] = agg["cluster"].astype("category")

In [None]:
# Bubble chart of the 2-D projection: one marker per row of `agg`,
# coloured by cluster, sized by total amount (largest marker capped at 60),
# with the details text shown as the hover title.
fig = px.scatter(
    data_frame=agg,
    x="x",
    y="y",
    color="cluster",
    size="total_amount",
    size_max=60,
    hover_name="details",
)
# Alternative rendering call (presumably for a Streamlit app), disabled here:
# st.plotly_chart(fig, use_container_width=True)
fig.show()

In [None]:
# Sanity check: number of distinct cluster labels in `agg`, followed by up
# to the first ten distinct labels.
print(
    agg["cluster"].nunique(),
    agg["cluster"].unique()[:10],
    sep=" ",
)

In [None]:
# Show the aggregated table (one row per details/cluster pair) as cell output.
agg

In [None]:
# Total amount per cluster.
# `cluster` is categorical, so `observed` is passed explicitly: relying on
# the default raises a FutureWarning on recent pandas versions, and the
# default itself is changing. Every category occurs in `agg`, so
# observed=True yields the same rows as the old default.
cluster_sum = agg.groupby("cluster", as_index=False, observed=True)[
    "total_amount"
].sum()
cluster_sum