In [1]:
import warnings
warnings.filterwarnings('ignore')
import umap

import seaborn as sns
import math
import os
import torch
import numpy as np
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import hdbscan

from PIL import Image
from sklearn.metrics import silhouette_score
from matplotlib.colors import ListedColormap, Normalize
from scipy import  ndimage
from scipy.stats import skew, kurtosis
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import DBSCAN, KMeans

from jb3 import nasdaq

# Data prepare

In [29]:
sc = nasdaq.get_nasdaq_screener(limit=500,country="United States", sector="Health care")
sc.to_csv("sc.csv", index=False)
sc = pd.read_csv("sc.csv")
sc

Unnamed: 0,symbol,name,lastsale,netchange,pctchange,marketCap,url
0,LLY,Eli Lilly and Company Common Stock,"$1,010.31",-4.18,-0.412%,955130663535,/market-activity/stocks/lly
1,JNJ,Johnson & Johnson Common Stock,$201.93,-0.55,-0.272%,486508959947,/market-activity/stocks/jnj
2,ABBV,AbbVie Inc. Common Stock,$226.08,-2.63,-1.15%,399570317603,/market-activity/stocks/abbv
3,UNH,UnitedHealth Group Incorporated Common Stock (DE),$330.91,-2.58,-0.774%,299751057744,/market-activity/stocks/unh
4,MRK,"Merck & Company, Inc. Common Stock (new)",$99.72,-1.17,-1.16%,247507287290,/market-activity/stocks/mrk
...,...,...,...,...,...,...,...
495,IRD,"Opus Genetics, Inc. Common Stock",$2.27,0.04,1.794%,156548752,/market-activity/stocks/ird
496,RVMDW,"Revolution Medicines, Inc. Warrant",$0.8011,0.0011,0.138%,154868496,/market-activity/stocks/rvmdw
497,SAVA,"Cassava Sciences, Inc. Common Stock",$3.19,-0.13,-3.916%,154102188,/market-activity/stocks/sava
498,CGTX,"Cognition Therapeutics, Inc. Common Stock",$1.73,-0.03,-1.705%,152714466,/market-activity/stocks/cgtx


In [None]:
df = nasdaq.download_ticker_data(sc.symbol.tolist(), pivot_col="close", clean=True)

In [None]:
df.to_csv("CG-data.csv", index=True)

In [13]:
df = pd.read_csv("CG-data.csv", parse_dates=True, index_col = 0)
df

Unnamed: 0_level_0,AARD,ABBV,ABEO,ABSI,ABT,ACAD,ACHC,ACHV,ACLX,ACRS,...,XOMA,XOMAO,XOMAP,XRAY,YDES,ZBH,ZBIO,ZTS,ZVRA,ZYME
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2015-11-30,,58.15,116.25,,44.92,37.95,69.01,5675.9432,,23.55,...,26.600,,,60.66,,99.0881,,46.70,277.12,
2015-12-01,,59.02,111.75,,45.46,37.51,69.34,5763.9424,,23.24,...,29.200,,,61.97,,100.7656,,47.01,268.64,
2015-12-02,,57.72,104.50,,45.22,37.48,69.21,3585.9641,,21.40,...,28.000,,,62.44,,100.3143,,46.37,266.88,
2015-12-03,,56.12,101.50,,44.15,35.24,66.13,3101.9690,,20.55,...,27.600,,,62.61,,97.4695,,45.20,277.44,
2015-12-04,,57.18,101.25,,45.30,36.15,67.62,2661.9734,,21.13,...,27.200,,,63.08,,98.6369,,46.46,271.36,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2025-11-21,9.37,236.28,4.48,2.93,128.11,23.56,15.18,3.9600,90.19,2.51,...,31.750,25.4100,26.56,10.58,8.45,92.1800,35.80,122.06,8.60,24.02
2025-11-24,9.50,229.51,4.59,2.96,127.19,23.90,15.54,3.9700,74.96,2.75,...,31.800,25.4400,26.49,10.73,8.83,93.5200,38.25,122.87,8.65,24.99
2025-11-25,9.37,231.80,4.88,3.07,128.05,24.39,16.57,4.5900,73.00,2.73,...,31.845,25.3900,26.49,11.09,9.29,97.0700,39.86,127.89,8.40,26.18
2025-11-26,9.67,227.66,5.15,3.09,128.54,24.81,17.15,4.8400,73.25,2.88,...,32.290,25.4199,26.62,11.29,10.52,97.5500,40.53,127.69,8.60,27.33


In [14]:
df_2020 = df[df.index.year == 2020]
df_2021 = df[df.index.year == 2021]
df_2022 = df[df.index.year == 2022]
df_2023 = df[df.index.year == 2023]
df_2024 = df[df.index.year == 2024]
df_2025 = df[df.index.year == 2025]

# --- Odstranimo stolpce z NaN ---
df_2020 = df_2020.drop(columns=df_2020.columns[df_2020.isna().any()], errors="ignore")
df_2021 = df_2021.drop(columns=df_2021.columns[df_2021.isna().any()], errors="ignore")
df_2022 = df_2022.drop(columns=df_2022.columns[df_2022.isna().any()], errors="ignore")
df_2023 = df_2023.drop(columns=df_2023.columns[df_2023.isna().any()], errors="ignore")
df_2024 = df_2024.drop(columns=df_2024.columns[df_2024.isna().any()], errors="ignore")
df_2025 = df_2025.drop(columns=df_2025.columns[df_2025.isna().any()], errors="ignore")

df_full = df[df.index.year >= 2020].drop(columns=df.columns[df.isna().any()], errors="ignore")


In [15]:

def preprocess_prices(df):
    df = df.copy()

    # detrending
    df_detr = df - df.rolling(window=20, min_periods=1).mean()

    # standard scaling
    df_scaled = (df_detr - df_detr.mean()) / df_detr.std()

    # odstrani stolpce z NaN
    df_scaled = df_scaled.dropna(axis=1)

    return df_scaled


df_2020_scaled = preprocess_prices(df_2020)
df_2021_scaled = preprocess_prices(df_2021)
df_2022_scaled = preprocess_prices(df_2022)
df_2023_scaled = preprocess_prices(df_2023)
df_2024_scaled = preprocess_prices(df_2024)
df_2025_scaled = preprocess_prices(df_2025)

df_full_scaled = preprocess_prices(df[df.index.year >= 2020])

print(
    f"shape(df_2020) = {df_2020.shape},    log = {df_2020_scaled.shape}\n"
    f"shape(df_2021) = {df_2021.shape},    log = {df_2021_scaled.shape}\n"
    f"shape(df_2022) = {df_2022.shape},    log = {df_2022_scaled.shape}\n"
    f"shape(df_2023) = {df_2023.shape},    log = {df_2023_scaled.shape}\n"
    f"shape(df_2024) = {df_2024.shape},    log = {df_2024_scaled.shape}\n"
    f"shape(df_2025) = {df_2025.shape},    log = {df_2025_scaled.shape}\n"
    f"shape(df_full) = {df_full.shape},    log = {df_full_scaled.shape}\n"
)

shape(df_2020) = (253, 327),    log = (253, 327)
shape(df_2021) = (252, 376),    log = (252, 376)
shape(df_2022) = (251, 437),    log = (251, 437)
shape(df_2023) = (250, 448),    log = (250, 448)
shape(df_2024) = (252, 456),    log = (252, 456)
shape(df_2025) = (228, 486),    log = (228, 486)
shape(df_full) = (1486, 239),    log = (1486, 327)



In [16]:
def transfer_entropy(X, Y, delay=1, gaussian_sigma=None):

    X = np.asarray(X, dtype=float)
    Y = np.asarray(Y, dtype=float)

    # Filtri
    if not np.isfinite(X).all() or not np.isfinite(Y).all():
        return 0.0

    if len(X) <= delay or len(Y) <= delay:
        return 0.0

    X_t   = X[delay:]
    X_tm1 = X[:-delay]
    Y_tm1 = Y[:-delay]

    if np.std(X_t) < 1e-12 or np.std(X_tm1) < 1e-12 or np.std(Y_tm1) < 1e-12:
        return 0.0

    n = float(len(X_t))
    eps = 1e-12

    # Bins
    binX = min(20, max(4, int(np.sqrt(len(X)))))
    binY = min(20, max(4, int(np.sqrt(len(Y)))))

    pXYZ, _ = np.histogramdd(
        np.vstack([X_t, Y_tm1, X_tm1]).T,
        bins=[binX, binY, binX]
    )
    pXX, _ = np.histogramdd(np.vstack([X_t, X_tm1]).T, bins=[binX, binX])
    pYX, _ = np.histogramdd(np.vstack([Y_tm1, X_tm1]).T, bins=[binY, binX])
    pX,  _ = np.histogram(X_tm1, bins=binX)

    pXYZ = pXYZ / n + eps
    pXX  = pXX  / n + eps
    pYX  = pYX  / n + eps
    pX   = pX   / n + eps

    if gaussian_sigma is not None:
        sigma = min(gaussian_sigma, 0.8)
        pXYZ = ndimage.gaussian_filter(pXYZ, sigma=sigma)
        pXX  = ndimage.gaussian_filter(pXX,  sigma=sigma)
        pYX  = ndimage.gaussian_filter(pYX,  sigma=sigma)
        pX   = ndimage.gaussian_filter(pX,   sigma=sigma)

    TE = 0.0
    for i in range(binX):
        for j in range(binY):
            for k in range(binX):
                TE += pXYZ[i,j,k] * np.log2((pXYZ[i,j,k] * pX[k]) /
                                            (pXX[i,k] * pYX[j,k]))

    return max(0.0, TE)

def compute_te_matrix(df, delay=1, sigma=None):
    cols = df.columns
    n = len(cols)

    A = np.zeros((n, n))
    data = df.values

    for i in tqdm(range(n), desc="Calculating TE Matrix"):
        Xi = data[:, i]
        for j in range(n):
            if i == j:
                continue
            Yj = data[:, j]

            A[i, j] = transfer_entropy(
                Xi, Yj,
                delay=delay,
                gaussian_sigma=sigma
            )

    return pd.DataFrame(A, index=cols, columns=cols)

def autocorr(x, lag=1):
    return np.corrcoef(x[lag:], x[:-lag])[0,1]

def compute_node_features(df):
    features = {}
    for col in df.columns:
        x = df[col].values.astype(float)
        feats = {
            "min": np.min(x),
            "max": np.max(x),
            "mean": np.mean(x),
            "std": np.std(x),
            "var": np.var(x),
            "skew": skew(x),
            "kurtosis": kurtosis(x)
        }
        features[col] = feats
    return pd.DataFrame(features).T

def get_market_cap_only(tickers, sc_df):

    features = {}
    for ticker in tickers:
        try:
            row = sc_df.loc[sc_df["symbol"] == ticker]

            if not row.empty:
                raw_cap = row.iloc[0].marketCap
                clean_cap = float(str(raw_cap).replace(",", "").replace("$", ""))

                features[ticker] = {
                    "market_cap": clean_cap
                }
            else:
                features[ticker] = {"market_cap": 0}

        except Exception as e:
            features[ticker] = {"market_cap": 0}

    return pd.DataFrame(features).T

def get_node_sizes(nodes_list, node_features, min_size=50, max_size=400):
    caps = node_features.loc[nodes_list, "market_cap"].values.astype(float)

    positive_caps = caps[caps > 0]
    if len(positive_caps) == 0:
        positive_caps = np.array([1.0])

    min_positive = positive_caps.min()
    caps = np.where(caps > 0, caps, min_positive)

    caps = np.log(caps).reshape(-1, 1)

    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler(feature_range=(min_size, max_size))
    sizes = scaler.fit_transform(caps).flatten()

    return sizes

def build_graph(A, X):
    A = A.values
    n = A.shape[0]
    edge_index = []
    edge_attr = []

    for i in range(n):
        for j in range(n):
            if A[i,j]!=0 and i!=j:
                edge_index.append([i,j])
                edge_attr.append([A[i,j]])

    return torch.tensor(X.values, dtype=torch.float), \
           torch.tensor(edge_index, dtype=torch.long).T, \
           torch.tensor(edge_attr, dtype=torch.float)

def compute_all_metrics(G):
    in_deg = dict(G.in_degree(weight=None))
    out_deg = dict(G.out_degree(weight=None))
    in_deg_w = dict(G.in_degree(weight="weight"))
    out_deg_w = dict(G.out_degree(weight="weight"))
    total_deg = {n: in_deg[n]+out_deg[n] for n in G.nodes()}
    total_deg_w = {n: in_deg_w[n]+out_deg_w[n] for n in G.nodes()}
    pagerank = nx.pagerank(G, weight="weight")
    hubs, auth = nx.hits(G)
    bet = nx.betweenness_centrality(G, weight="weight")
    clo = nx.closeness_centrality(G, distance=lambda u,v,d: 1/d["weight"])
    try:
        eig = nx.eigenvector_centrality_numpy(G, weight="weight")
    except nx.AmbiguousSolution:
        eig = nx.eigenvector_centrality(G, weight="weight", max_iter=500)

    df = pd.DataFrame({
        "in_degree": in_deg, "out_degree": out_deg, "total_degree": total_deg,
        "in_degree_w": in_deg_w, "out_degree_w": out_deg_w, "total_degree_w": total_deg_w,
        "pagerank": pagerank, "hub": hubs, "authority": auth,
        "betweenness": bet, "closeness": clo, "eigenvector": eig
    })
    return df.sort_values("pagerank", ascending=False)

def plot_te_graph(G, cluster_labels, node_features, figsize=(10,8), seed=42, save_path=None):
    fig, ax = plt.subplots(figsize=figsize)
    pos = nx.spring_layout(G, seed=seed, k=0.15)

    cluster_dict = cluster_labels.to_dict()
    node_colors = [cluster_dict.get(node, -1) for node in G.nodes()]
    unique_clusters = sorted(list(set(node_colors)))
    n_clusters = len(unique_clusters)
    base_cmap = plt.colormaps['tab20'].resampled(n_clusters)
    cluster_cmap = ListedColormap(base_cmap.colors[:n_clusters])

    node_sizes = get_node_sizes(list(G.nodes()), node_features, min_size=50, max_size=800)

    nx.draw_networkx_nodes(
        G, pos,
        node_size=node_sizes,
        node_color=node_colors,
        cmap=cluster_cmap,
        edgecolors='black',
        linewidths=0.5,
        ax=ax
    )


    weights = [d['weight'] for u, v, d in G.edges(data=True)]
    if len(weights) > 0:
        norm = Normalize(vmin=min(weights), vmax=max(weights))

        nx.draw_networkx_edges(
            G, pos,
            arrowstyle="-|>",
            arrowsize=10,
            width=[0.5 + 2*(w - min(weights)) / (max(weights) - min(weights)) for w in weights],
            edge_color=weights,
            edge_cmap=plt.cm.plasma_r,   # OBRNJENA KOLORNA MAPA
            edge_vmin=min(weights),
            edge_vmax=max(weights),
            alpha=0.65,
            ax=ax
        )

        sm = plt.cm.ScalarMappable(cmap=plt.cm.plasma_r, norm=norm)
        sm.set_array([])
        cbar = fig.colorbar(sm, ax=ax, fraction=0.03, pad=0.02)
        cbar.set_label("Transfer Entropy Weight")

    nx.draw_networkx_labels(G, pos, font_size=6, font_weight="bold", ax=ax)

    ax.set_title("TE Graph – Size by Market Cap")
    ax.axis("off")
    plt.tight_layout()

    if save_path is not None:
        os.makedirs(save_path, exist_ok=True)
        fig.savefig(os.path.join(save_path, "te_graph_marketcap.png"),
                    dpi=300, bbox_inches='tight')

    plt.show()

def scale_features(node_features):
    scaler = StandardScaler()
    scaled = scaler.fit_transform(node_features.values)
    return pd.DataFrame(scaled, index=node_features.index, columns=node_features.columns)

def build_knn_graph(node_features, k=5):
    X = node_features.values
    tickers = node_features.index.tolist()
    nbrs = NearestNeighbors(n_neighbors=k+1).fit(X)
    distances, indices = nbrs.kneighbors(X)
    G_knn = nx.Graph()
    G_knn.add_nodes_from(tickers)
    for i, neigh_list in enumerate(indices):
        for j in neigh_list[1:]:
            G_knn.add_edge(tickers[i], tickers[j], weight=np.linalg.norm(X[i]-X[j]))
    return G_knn

def run_dbscan(node_features, eps=0.5, min_samples=5):
    X = node_features.values
    model = DBSCAN(eps=eps, min_samples=min_samples)
    labels = model.fit_predict(X)

    clusters = pd.Series(labels, index=node_features.index, name="dbscan_cluster")

    return clusters, model

def run_hdbscan(node_features, min_cluster_size=5, min_samples=None):
    X = node_features.values

    clusterer = hdbscan.HDBSCAN(
        min_cluster_size=min_cluster_size,
        min_samples=min_samples,
        metric='euclidean'
    )
    labels = clusterer.fit_predict(X)

    clusters = pd.Series(labels, index=node_features.index, name="hdbscan_cluster")

    return clusters, clusterer

def run_kmeans(node_features, n_clusters=5, random_state=42):
    X = node_features.values
    model = KMeans(n_clusters=n_clusters, random_state=random_state)
    labels = model.fit_predict(X)

    score = silhouette_score(X, labels)

    clusters = pd.Series(labels, index=node_features.index, name="kmeans_cluster")

    return clusters, model, score

def plot_knn_with_clusters(
    G_knn,
    cluster_labels,
    node_features,
    title="KNN Graph Colored by Clusters",
    figsize=(10, 8),
    seed=42,
    save_path=None
):
    fig, ax = plt.subplots(figsize=figsize)

    pos = nx.spring_layout(G_knn, seed=seed, k=0.15)

    cluster_dict = cluster_labels.to_dict()
    node_colors = [cluster_dict.get(n, -1) for n in G_knn.nodes()]
    unique_clusters = sorted(set(node_colors))

    cmap = plt.cm.get_cmap("tab20", len(unique_clusters))

    node_sizes = get_node_sizes(
        list(G_knn.nodes()),
        node_features,
        min_size=40,
        max_size=700
    )

    nx.draw_networkx_edges(
        G_knn, pos,
        alpha=0.15,
        edge_color='gray',
        width=1.0,
        ax=ax
    )

    nodes = nx.draw_networkx_nodes(
        G_knn, pos,
        node_size=node_sizes,
        node_color=node_colors,
        cmap=cmap,
        linewidths=0.4,
        edgecolors='black',
        ax=ax
    )

    for (node, (x, y), size) in zip(G_knn.nodes(), pos.values(), node_sizes):
        if size > np.percentile(node_sizes, 80):  # top 20% po velikosti
            ax.text(
                x, y, node,
                fontsize=7,
                fontweight="bold",
                ha="center",
                va="center"
            )

    for c in unique_clusters:
        ax.scatter([], [], c=cmap(unique_clusters.index(c)), label=f"Cluster {c}")

    ax.legend(
        loc="upper left",
        fontsize=8,
        frameon=True,
        facecolor='white',
        edgecolor='black'
    )

    ax.set_title(f"{title} (Node Size = Market Cap)", fontsize=12, fontweight='bold')
    ax.axis("off")

    if save_path is not None:
        os.makedirs(save_path, exist_ok=True)
        fig.savefig(
            os.path.join(save_path, "knn_clusters_marketcap.png"),
            dpi=300,
            bbox_inches='tight'
        )

    plt.show()

def plot_results_table(results, save_path=None):
    fig, ax = plt.subplots(figsize=(15,6))
    ax.axis('tight')
    ax.axis('off')
    table = ax.table(cellText=results["metrics"].round(3).values,
                     colLabels=results["metrics"].columns,
                     rowLabels=results["metrics"].index,
                     loc='center')
    table.auto_set_font_size(False)
    table.set_fontsize(8)
    table.scale(1, 1.5)

    if save_path is not None:
        os.makedirs(save_path, exist_ok=True)
        fig.savefig(os.path.join(save_path, "results_table.png"), dpi=300, bbox_inches='tight')
    plt.show()

def plot_top_per_metric(metrics_df, n=5, n_cols=3, save_path=None, csv_path=None):
    metrics = metrics_df.columns
    n_metrics = len(metrics)
    n_rows = math.ceil(n_metrics / n_cols)

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(6 * n_cols, 3.8 * n_rows))
    axes = axes.flatten()

    # zbiranje top-n v tabelo
    rows_for_csv = []

    for i, metric in enumerate(metrics):
        ax = axes[i]

        top_nodes = metrics_df[metric].nlargest(n).reset_index()
        top_nodes.columns = ["node", "value"]

        # dodaj rank za CSV
        top_nodes["rank"] = top_nodes["value"].rank(ascending=False, method="first").astype(int)
        top_nodes["metric"] = metric

        # shrani v zbirnik
        rows_for_csv.append(top_nodes)

        color = sns.color_palette("viridis", 8)[3]

        sns.barplot(
            data=top_nodes,
            x="value",
            y="node",
            color=color,
            ax=ax
        )

        ax.set_title(f"Top {n} Nodes by {metric}", fontsize=11, fontweight="bold")
        ax.set_xlabel(metric, fontsize=10)
        ax.set_ylabel("")

        ax.grid(axis="x", linestyle="--", alpha=0.4)

        ax.tick_params(axis='y', labelsize=9)
        ax.tick_params(axis='x', labelsize=8)

    # onemogoči prazne grafe
    for j in range(len(metrics), len(axes)):
        axes[j].axis("off")

    plt.tight_layout()

    # shrani sliko
    if save_path is not None:
        os.makedirs(save_path, exist_ok=True)
        fig.savefig(
            os.path.join(save_path, "top_per_metric.png"),
            dpi=300,
            bbox_inches="tight"
        )

    # shrani CSV z top-N metriko
    if csv_path is not None:
        os.makedirs(csv_path, exist_ok=True)
        final_df = pd.concat(rows_for_csv, ignore_index=True)
        final_df.to_csv(
            os.path.join(csv_path, "top_per_metric.csv"),
            index=False
        )

    plt.show()

def run_umap(
    node_features,
    n_neighbors=15,
    min_dist=0.1,
    n_components=2,
    metric="euclidean",
    random_state=42
):

    X = node_features.values

    reducer = umap.UMAP(
        n_neighbors=n_neighbors,
        min_dist=min_dist,
        n_components=n_components,
        metric=metric,
        random_state=random_state
    )

    embedding = reducer.fit_transform(X)

    cols = [f"UMAP_{i+1}" for i in range(n_components)]
    umap_df = pd.DataFrame(embedding, index=node_features.index, columns=cols)

    return umap_df, reducer

def plot_umap(
    umap_df,
    node_features,
    cluster_labels=None,
    figsize=(10, 8),
    title="UMAP Projection",
    save_path=None,
    suffix="default"
):
    plt.figure(figsize=figsize)


    if cluster_labels is not None:
        cluster_dict = cluster_labels.to_dict()
        labels = np.array([cluster_dict.get(node, -1) for node in umap_df.index])

        unique_labels = sorted(np.unique(labels))
        n_labels = len(unique_labels)

        palette = sns.color_palette("tab20", n_colors=n_labels)
        color_map = {lbl: palette[i] for i, lbl in enumerate(unique_labels)}

        if -1 in unique_labels:
            color_map[-1] = (0.6, 0.6, 0.6)

        colors = [color_map[lbl] for lbl in labels]
    else:
        colors = "gray"
        labels = None

    sizes = get_node_sizes(
        umap_df.index,
        node_features,
        min_size=40,
        max_size=600
    )

    plt.scatter(
        umap_df.iloc[:, 0],
        umap_df.iloc[:, 1],
        c=colors,
        s=sizes,
        alpha=0.75,
        edgecolor="black",
        linewidth=0.2
    )

    size_threshold = np.percentile(sizes, 85)
    for idx, (x, y) in enumerate(zip(umap_df.iloc[:, 0], umap_df.iloc[:, 1])):
        if sizes[idx] >= size_threshold:
            plt.text(
                x, y,
                umap_df.index[idx],
                fontsize=7,
                ha="center",
                va="center",
                fontweight="bold"
            )

    if cluster_labels is not None:
        legend_elements = []
        for lbl in unique_labels:
            legend_elements.append(
                plt.Line2D(
                    [0], [0],
                    marker='o',
                    color='w',
                    label=f"Cluster {lbl}",
                    markersize=8,
                    markerfacecolor=color_map[lbl],
                    markeredgecolor='black',
                    markeredgewidth=0.3
                )
            )
        plt.legend(
            handles=legend_elements,
            title="Clusters",
            loc="upper right",
            fontsize=8,
            frameon=True
        )

    plt.xlabel("UMAP 1", fontsize=11)
    plt.ylabel("UMAP 2", fontsize=11)
    plt.title(f"{title} (Size = Market Cap)", fontsize=14, fontweight="bold")

    plt.grid(False)

    if save_path is not None:
        os.makedirs(save_path, exist_ok=True)
        plt.savefig(
            os.path.join(save_path, f"umap_{suffix}.png"),
            dpi=300,
            bbox_inches="tight"
        )

    plt.show()

def plot_graph_metrics_distribution(results, n_cols=2, save_path=None):
    df = results["metrics"]
    metrics_names = df.columns
    n_metrics = len(metrics_names)
    n_rows = math.ceil(n_metrics / n_cols)

    fig, axes = plt.subplots(n_rows, n_cols, figsize=(16, 4.5 * n_rows))
    axes = axes.flatten()

    hist_color = sns.color_palette("Blues", 6)[3]
    kde_color = sns.color_palette("Reds", 6)[4]

    for i, metric in enumerate(metrics_names):
        ax = axes[i]
        data = df[metric].dropna()

        # Histogram
        sns.histplot(
            data,
            ax=ax,
            bins=30,
            color=hist_color,
            kde=False,
            edgecolor="black",
            alpha=0.75
        )

        # KDE
        sns.kdeplot(
            data,
            ax=ax,
            color=kde_color,
            linewidth=2
        )

        mean_val = data.mean()
        median_val = data.median()

        ax.axvline(
            mean_val,
            color="red",
            linestyle="dashed",
            linewidth=1,
            label=f"Mean = {mean_val:.3f}"
        )
        ax.axvline(
            median_val,
            color="black",
            linestyle="dotted",
            linewidth=1,
            label=f"Median = {median_val:.3f}"
        )

        ax.set_title(f"Porazdelitev metrike: {metric}", fontsize=12, fontweight="bold")
        ax.set_xlabel("Vrednost", fontsize=10)
        ax.set_ylabel("Frekvenca", fontsize=10)

        ax.grid(True, linestyle="--", linewidth=0.5, alpha=0.35)

        ax.legend(fontsize=8)

    for j in range(i + 1, len(axes)):
        axes[j].axis("off")

    plt.tight_layout()

    if save_path is not None:
        os.makedirs(save_path, exist_ok=True)
        fig.savefig(
            os.path.join(save_path, "graph_metrics_distribution.png"),
            dpi=300,
            bbox_inches="tight"
        )

    plt.show()

def dynamic_te_threshold(A):
    vals = A.values.flatten()
    vals = vals[vals > 0]

    soft = np.nanpercentile(vals, 80)
    medium = vals.mean()
    hard = np.nanpercentile(vals, 95)

    return {"soft": soft, "medium": medium, "hard": hard}

In [17]:
def run_full_pipeline(
        df,
        sc,
        delay=30,
        sigma=0.75,
        te_threshold_="medium",
        knn_k=10,
        dbscan_eps=1.2,
        dbscan_min=5,
        kmeans_n=12,
        topk=3,
        plot_top_n=5,
        umap_n_neighbors=20,
        umap_min_dist=0.25,
        min_hdbscan_cluster=6,
        metric="euclidean"
):

    results = {}

    # -------------------------------------------------------
    # 0) Folder
    # -------------------------------------------------------
    if not isinstance(df.index, pd.DatetimeIndex):
        df.index = pd.to_datetime(df.index)

    years = sorted(df.index.year.unique())
    folder = str(years[0]) if len(years) == 1 else "FULL"
    os.makedirs(folder, exist_ok=True)
    print("Folder:", folder)

    # -------------------------------------------------------
    # 1) Market Cap
    # -------------------------------------------------------
    market_cap_features = get_market_cap_only(df.columns, sc)
    results["market_cap"] = market_cap_features

    # -------------------------------------------------------
    # 2) Node features
    # -------------------------------------------------------
    node_features = compute_node_features(df)
    results["node_features"] = node_features

    # -------------------------------------------------------
    # 3) TE matrix
    # -------------------------------------------------------
    adj_path = os.path.join(folder, "A.csv")

    if os.path.exists(adj_path):
        A = pd.read_csv(adj_path, index_col=0)
        print("Loaded TE matrix:", A.shape)
    else:
        A = compute_te_matrix(df, delay=delay, sigma=sigma)
        A.to_csv(adj_path)
        print("Computed TE matrix:", A.shape)

    results["A_raw"] = A

    # -------------------------------------------------------
    # 4) Threshold / top-k
    # -------------------------------------------------------

    thr_dict = dynamic_te_threshold(A)
    te_threshold_value = thr_dict[te_threshold_]

    print(f"TE threshold ({te_threshold_}): {te_threshold_value:.6f}")

    A_thr = A.where(A >= te_threshold_value, 0)

    if topk is not None:
        for row in A_thr.index:
            topk_idx = A_thr.loc[row].nlargest(topk).index
            A_thr.loc[row] = 0
            A_thr.loc[row, topk_idx] = A.loc[row, topk_idx]

    max_val = A_thr.values.max() if A_thr.values.max() > 0 else 1
    A_scaled = A_thr / max_val
    results["A_thr"] = A_scaled

    # -------------------------------------------------------
    # 5) Directed TE graph
    # -------------------------------------------------------
    G = nx.from_pandas_adjacency(A_scaled, create_using=nx.DiGraph)
    G.remove_edges_from((u, v) for u, v, d in G.edges(data=True) if d["weight"] == 0)
    results["G"] = G

    print(f"TE graph: nodes={G.number_of_nodes()}, edges={G.number_of_edges()}")

    # -------------------------------------------------------
    # 6) Graph metrics
    # -------------------------------------------------------
    df_metrics = compute_all_metrics(G)
    results["metrics"] = df_metrics

    # -------------------------------------------------------
    # 7) Scaling for feature-space clustering
    # -------------------------------------------------------
    scaled_features = scale_features(node_features)
    results["scaled_features"] = scaled_features

    # -------------------------------------------------------
    # 8) KNN graph
    # -------------------------------------------------------
    G_knn = build_knn_graph(scaled_features, k=knn_k)
    results["G_knn"] = G_knn

    # -------------------------------------------------------
    # 9) DBSCAN / HDBSCAN / KMeans
    # -------------------------------------------------------
    db_labels, _ = run_dbscan(scaled_features, eps=dbscan_eps, min_samples=dbscan_min)
    hdb_labels, hdb_model = run_hdbscan(scaled_features, min_cluster_size=min_hdbscan_cluster)
    kmeans_labels, kmeans_model, k_sil = run_kmeans(scaled_features, n_clusters=kmeans_n)

    results["dbscan_labels"] = db_labels
    results["hdbscan_labels"] = hdb_labels
    results["kmeans_labels"] = kmeans_labels
    results["kmeans_model"] = kmeans_model
    results["kmeans_silhouette"] = k_sil

    # -------------------------------------------------------
    # 10) UMAP embedding
    # -------------------------------------------------------
    umap_df, umap_model = run_umap(
        scaled_features,
        n_neighbors=umap_n_neighbors,
        min_dist=umap_min_dist,
        metric=metric
    )

    results["umap_df"] = umap_df
    results["umap_model"] = umap_model

    # -------------------------------------------------------
    # 11) Plots
    # -------------------------------------------------------
    plot_umap(umap_df, market_cap_features, db_labels,
              save_path=folder, title="UMAP + DBSCAN", suffix="dbscan")

    plot_umap(umap_df, market_cap_features, hdb_labels,
              save_path=folder, title="UMAP + HDBSCAN", suffix="hdbscan")

    plot_umap(umap_df, market_cap_features, kmeans_labels,
              save_path=folder, title="UMAP + KMeans", suffix="kmeans")

    plot_te_graph(G, db_labels, node_features=market_cap_features, save_path=folder)

    plot_knn_with_clusters(G_knn, db_labels, node_features=market_cap_features,
                           save_path=folder, title="KNN + DBSCAN")

    plot_knn_with_clusters(G_knn, kmeans_labels, node_features=market_cap_features,
                           save_path=folder, title="KNN + KMeans")

    plot_top_per_metric(df_metrics, n=plot_top_n, n_cols=4, save_path=folder, csv_path=folder)

    plot_graph_metrics_distribution(results, n_cols=2, save_path=folder)
    plot_results_table({"metrics": df_metrics}, save_path=folder)

    return results

In [None]:
results = run_full_pipeline(
    df=df_2025_scaled,
    sc=sc,
    delay=2,
    sigma=0.45,
    te_threshold_="hard",
    topk=4,
    knn_k=10,
    dbscan_eps=0.44,
    dbscan_min=3,
    min_hdbscan_cluster=3,
    metric="correlation",
    kmeans_n=5,
    umap_n_neighbors=12,
    umap_min_dist=0.15,
    plot_top_n=5
)

In [None]:
dfs = [df_2020_scaled, df_2021_scaled, df_2022_scaled, df_2023_scaled, df_2024_scaled, df_2025_scaled, df_full_scaled]

for df in dfs:
    results = run_full_pipeline(
    df=df,
    sc=sc,
    delay=2,
    sigma=0.45,
    te_threshold_="hard",
    topk=4,
    knn_k=10,
    dbscan_eps=0.44,
    dbscan_min=3,
    min_hdbscan_cluster=3,
    metric="correlation",
    kmeans_n=5,
    umap_n_neighbors=12,
    umap_min_dist=0.15,
    plot_top_n=5
    )

In [25]:
ROOT = "."
SAVE_DIR = os.path.join(ROOT, "temporal")
os.makedirs(SAVE_DIR, exist_ok=True)

years = ["2020", "2021", "2022", "2023", "2024", "2025"]

image_types = [
    "knn_clusters_marketcap.png",
    "umap_hdbscan.png",
    "umap_kmeans.png",
    "umap_dbscan.png",
    "te_graph_marketcap.png",
    "top_per_metric.png",
    "graph_metrics_distribution.png"
]

for img_name in image_types:
    loaded = []

    if img_name == "top_per_metric.png":
        cols = 4
    else:
        cols = 3

    for year in years:
        folder = os.path.join(ROOT, year)
        path = os.path.join(folder, img_name)
        if os.path.exists(path):
            loaded.append((year, Image.open(path)))

    if len(loaded) == 0:
        continue

    rows = (len(loaded) + cols - 1) // cols

    fig, axes = plt.subplots(rows, cols, figsize=(cols * 4.5, rows * 3.8))

    if rows == 1 and cols == 1:
        axes = [axes]
    else:
        axes = axes.flatten()

    for ax, (year, img) in zip(axes, loaded):
        ax.imshow(img)
        ax.set_title(year, fontsize=10)
        ax.axis("off")

    for ax in axes[len(loaded):]:
        ax.axis("off")

    plt.subplots_adjust(
        left=0.01, right=0.99, top=0.97, bottom=0.03,
        wspace=0.05, hspace=0.05
    )

    out_path = os.path.join(SAVE_DIR, f"temporal_{img_name}")
    plt.savefig(out_path, dpi=200)
    plt.close()

    print(f"Shranjeno: {out_path}")

Shranjeno: ./temporal/temporal_knn_clusters_marketcap.png
Shranjeno: ./temporal/temporal_umap_hdbscan.png
Shranjeno: ./temporal/temporal_umap_kmeans.png
Shranjeno: ./temporal/temporal_umap_dbscan.png
Shranjeno: ./temporal/temporal_te_graph_marketcap.png
Shranjeno: ./temporal/temporal_top_per_metric.png
Shranjeno: ./temporal/temporal_graph_metrics_distribution.png


In [26]:


# ============================================
# 1) Load
# ============================================

def load_top5_all_years(base_folder, years):
    dfs = []
    for year in years:
        folder = os.path.join(base_folder, year)
        file_path = os.path.join(folder, "top_per_metric.csv")

        if os.path.exists(file_path):
            df = pd.read_csv(
                file_path,
                names=["node", "value", "rank", "metric"],
                dtype={"node": str, "metric": str}
            )
            df["value"] = pd.to_numeric(df["value"], errors="coerce")
            df["rank"] = pd.to_numeric(df["rank"], errors="coerce")
            df["year"] = year
            dfs.append(df)
        else:
            print(f"Manjka datoteka: {file_path}")

    return pd.concat(dfs, ignore_index=True)


# ============================================
# 2) RANK EVOLUTION – s stabilnimi nodi
# ============================================

def plot_rank_evolution(df, save_dir, min_years=3):

    metrics = df["metric"].unique()
    year_order = ["2020", "2021", "2022", "2023", "2024", "2025"]

    for metric in metrics:

        sub = df[df["metric"] == metric]

        # Izberi node, ki se pojavijo vsaj v min_years letih
        stable_nodes = sub["node"].value_counts()
        stable_nodes = stable_nodes[stable_nodes >= min_years].index

        sub = sub[sub["node"].isin(stable_nodes)]

        if sub.empty:
            continue

        pivot = sub.pivot_table(
            index="node",
            columns="year",
            values="rank",
            aggfunc="first"
        )

        # sortiraj stolpce po pravem vrstnem redu
        pivot = pivot[[y for y in year_order if y in pivot.columns]]

        if pivot.empty:
            continue

        plt.figure(figsize=(14, 7))

        for node in pivot.index:
            plt.plot(
                pivot.columns,
                pivot.loc[node],
                marker="o",
                linewidth=2,
                label=node
            )

        plt.gca().invert_yaxis()
        plt.title(f"Rank Evolution (Stable Nodes ≥ {min_years} Years) – {metric}", fontsize=18)
        plt.xlabel("Leto", fontsize=14)
        plt.ylabel("Rank (1 najboljši)", fontsize=14)
        plt.grid(alpha=0.3)
        plt.legend(title="Node", bbox_to_anchor=(1.02, 1), loc="upper left")

        plt.savefig(
            os.path.join(save_dir, f"rank_evolution_filtered_{metric}.png"),
            dpi=200, bbox_inches="tight"
        )
        plt.close()


# ============================================
# 3) HEATMAP vrednosti
# ============================================

def plot_value_heatmap(df, save_dir):

    metrics = df["metric"].unique()
    year_order = ["2020","2021","2022","2023","2024","2025"]

    for metric in metrics:

        sub = df[df["metric"] == metric]

        pivot = sub.pivot_table(
            index="node",
            columns="year",
            values="value",
            aggfunc="first"
        )

        pivot = pivot[[y for y in year_order if y in pivot.columns]]

        if pivot.empty or pivot.isna().all().all():
            continue

        plt.figure(figsize=(12, 7))
        sns.heatmap(pivot, annot=True, fmt=".2f", cmap="viridis")
        plt.title(f"Metric Values Through Years – {metric}", fontsize=16)

        plt.savefig(
            os.path.join(save_dir, f"value_heatmap_{metric}.png"),
            dpi=200, bbox_inches="tight"
        )
        plt.close()


# ============================================
# 4) JACCARD matrika
# ============================================

def plot_jaccard(df, save_dir):

    metrics = df["metric"].unique()
    years = ["2020","2021","2022","2023","2024","2025"]

    for metric in metrics:

        sub = df[df["metric"] == metric]

        jac = pd.DataFrame(index=years, columns=years, dtype=float)

        for y1 in years:
            s1 = set(sub[sub["year"] == y1]["node"])
            for y2 in years:
                s2 = set(sub[sub["year"] == y2]["node"])
                if len(s1 | s2) == 0:
                    jac.loc[y1, y2] = None
                else:
                    jac.loc[y1, y2] = len(s1 & s2) / len(s1 | s2)

        plt.figure(figsize=(10, 8))
        sns.heatmap(jac.astype(float), annot=True, cmap="Blues", vmin=0, vmax=1)
        plt.title(f"Jaccard Similarity – {metric}", fontsize=16)

        plt.savefig(
            os.path.join(save_dir, f"jaccard_{metric}.png"),
            dpi=200, bbox_inches="tight"
        )
        plt.close()


# ============================================
# 5) FREQUENCY
# ============================================

def plot_frequency(df, save_dir, top_n=20):

    freq = df["node"].value_counts().head(top_n)

    plt.figure(figsize=(10, 9))
    sns.barplot(x=freq.values, y=freq.index, palette="viridis")
    plt.title(f"Top {top_n} Most Frequent Top-5 Nodes", fontsize=16)
    plt.xlabel("Frekvenca", fontsize=14)
    plt.ylabel("Node", fontsize=14)

    plt.savefig(
        os.path.join(save_dir, f"top_frequency_top{top_n}.png"),
        dpi=200, bbox_inches="tight"
    )
    plt.close()


# ============================================
# 6) MASTER
# ============================================

def make_all_plots(base_folder=".", save_folder="top5_results"):

    os.makedirs(save_folder, exist_ok=True)

    years = ["2020","2021","2022","2023","2024","2025"]
    df = load_top5_all_years(base_folder, years)

    plot_rank_evolution(df, save_folder)
    plot_value_heatmap(df, save_folder)
    plot_jaccard(df, save_folder)
    plot_frequency(df, save_folder)

    print("Vsi grafi shranjeni v:", save_folder)

In [27]:
make_all_plots(".")

Vsi grafi shranjeni v: top5_results


In [31]:
sc[ sc["symbol"].isin(["ABBV", "ABEO", "ABSI", "ABT"]) ]

Unnamed: 0,symbol,name,lastsale,netchange,pctchange,marketCap,url
2,ABBV,AbbVie Inc. Common Stock,$226.08,-2.63,-1.15%,399570317603,/market-activity/stocks/abbv
6,ABT,Abbott Laboratories Common Stock,$125.08,-0.32,-0.255%,217498103131,/market-activity/stocks/abt
350,ABSI,Absci Corporation Common Stock,$3.58,-0.15,-4.021%,538330081,/market-activity/stocks/absi
429,ABEO,Abeona Therapeutics Inc. Common Stock,$4.87,UNCH,--,263911928,/market-activity/stocks/abeo
