In [1]:
# Import necessary libraries
import ast
import os
import warnings

import hdbscan
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import psycopg2
import seaborn as sns
from sklearn.manifold import TSNE
from sklearn.metrics import confusion_matrix
from sklearn.metrics import pairwise_distances_argmin_min
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
# Make "notebook" the default Plotly renderer for every figure shown below
pio.renderers.default = "notebook"

In [2]:
# Connect to PostgreSQL database.
# Connection settings are read from the standard libpq environment variables
# (PGDATABASE, PGUSER, PGPASSWORD, PGHOST, PGPORT) so that no credential is
# stored in the notebook. Non-secret settings fall back to the local defaults.
# The password has no in-notebook default: when PGPASSWORD is unset, None is
# passed and the usual libpq mechanisms (e.g. ~/.pgpass) are expected to
# supply it -- NOTE(review): confirm this matches your deployment.
conn = psycopg2.connect(
    dbname=os.environ.get("PGDATABASE", "manga_db"),
    user=os.environ.get("PGUSER", "user_master"),
    password=os.environ.get("PGPASSWORD"),
    host=os.environ.get("PGHOST", "localhost"),
    port=os.environ.get("PGPORT", "5432"),
)
cur = conn.cursor()

In [3]:
# Load one row per original_title together with both embedding columns.
# DISTINCT ON keeps the first row of each original_title group under the
# ORDER BY (is_editorial descending); rows missing either embedding are
# excluded by the WHERE clause.
df = pd.read_sql(
    con=conn,
    sql="""
    SELECT DISTINCT ON (mb.original_title)
    mb.id,
    mb.title,
    mb.is_editorial,
    mb.original_title,
    me.embeddings_synopsis_tags,
    me.embeddings_features
    FROM mangas_base mb
    JOIN mangas_embeddings me ON mb.id = me.manga_id
    WHERE me.embeddings_synopsis_tags IS NOT NULL
    AND me.embeddings_features IS NOT NULL
    ORDER BY mb.original_title, mb.is_editorial DESC;
""",
)


pandas only supports SQLAlchemy connectable (engine/connection) or database string URI or sqlite3 DBAPI2 connection. Other DBAPI2 objects are not tested. Please consider using SQLAlchemy.



In [4]:
# Function to safely evaluate string representations of lists
def safe_eval(val):
    """Convert a stored embedding value into a NumPy array.

    Parameters
    ----------
    val : str, list, tuple, numpy.ndarray, or a missing value
        Usually the textual form of a Python list (e.g. ``"[0.1, 0.2]"``).
        Values that are already sequences/arrays are converted directly.

    Returns
    -------
    numpy.ndarray
        The parsed values, or an empty array when ``val`` is missing
        (``None``/NaN) or cannot be parsed.
    """
    if val is None:
        return np.array([])

    # Already materialised (e.g. the DB driver returned a list): calling
    # pd.isna() on these yields an array whose truth value is ambiguous,
    # so they must be handled before the missing-value check.
    if isinstance(val, (list, tuple, np.ndarray)):
        return np.asarray(val)

    # Strings are never "missing"; only test scalars such as NaN / pd.NA.
    if not isinstance(val, str) and pd.isna(val):
        return np.array([])

    try:
        return np.array(ast.literal_eval(val))
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError) as e:
        # Best effort: report the offending value and treat it as missing.
        print(f"❌ Error con valor: {val}\n{e}")
        return np.array([])

# Apply the safe_eval function to the embeddings columns
df["emb_syn_tags_np"] = df["embeddings_synopsis_tags"].apply(safe_eval)
df["emb_features_np"] = df["embeddings_features"].apply(safe_eval)

# Keep only rows where both embeddings parsed to a non-empty array.
# .copy() makes df_filtered an independent frame, so adding columns to it
# later does not raise SettingWithCopyWarning.
has_tags = df["emb_syn_tags_np"].apply(lambda x: x.size > 0)
has_features = df["emb_features_np"].apply(lambda x: x.size > 0)
df_filtered = df[has_tags & has_features].copy()


# Stack the per-row embeddings into 2-D matrices (one row per DataFrame row)
X_tags = np.vstack(df_filtered["emb_syn_tags_np"].tolist())
X_feat = np.vstack(df_filtered["emb_features_np"].tolist())

In [5]:
def optimize_hdbscan(X, min_cluster_range=(5, 30), min_samples_range=(1, 15), metric="euclidean"):
    """Grid-search HDBSCAN hyperparameters and rank them by silhouette score.

    Every ``(min_cluster_size, min_samples)`` combination is fitted on ``X``.
    The silhouette score is computed on the non-noise points only (label
    ``-1`` marks noise) and needs at least two clusters; otherwise the
    combination is scored ``-1``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Data to cluster.
    min_cluster_range : tuple
        Arguments unpacked into ``range()`` for ``min_cluster_size``
        (the upper bound is therefore exclusive).
    min_samples_range : tuple
        Arguments unpacked into ``range()`` for ``min_samples``
        (upper bound exclusive).
    metric : str
        Distance metric passed to ``hdbscan.HDBSCAN``. The silhouette score
        itself is computed with ``silhouette_score``'s default metric.

    Returns
    -------
    pandas.DataFrame
        One row per combination with columns ``min_cluster_size``,
        ``min_samples``, ``n_clusters``, ``noise_ratio`` and ``silhouette``,
        sorted by ``silhouette`` then ``n_clusters``, both descending.
    """
    if isinstance(X, (list, tuple)):
        # Boolean-mask row selection below requires an array-like container.
        X = np.asarray(X)

    results = []

    for min_cluster_size in range(*min_cluster_range):
        for min_samples in range(*min_samples_range):
            try:
                clusterer = hdbscan.HDBSCAN(min_cluster_size=min_cluster_size,
                                            min_samples=min_samples,
                                            metric=metric)
                labels = np.asarray(clusterer.fit_predict(X))

                clustered = labels != -1  # True for points assigned to a cluster
                n_clusters = len(set(labels[clustered]))
                noise_ratio = np.sum(~clustered) / len(labels)

                if n_clusters > 1:  # At least 2 clusters are needed for the silhouette score
                    # Samples and labels must be filtered with the SAME mask;
                    # passing the full X with filtered labels raises on a
                    # length mismatch whenever any noise point exists.
                    sil_score = silhouette_score(X[clustered], labels[clustered])
                else:
                    sil_score = -1

                results.append({
                    "min_cluster_size": min_cluster_size,
                    "min_samples": min_samples,
                    "n_clusters": n_clusters,
                    "noise_ratio": round(noise_ratio, 3),
                    "silhouette": round(sil_score, 3)
                })
            except Exception as e:
                # Best effort: a failing combination must not abort the whole
                # grid, but it is reported instead of being silently hidden.
                warnings.warn(
                    f"HDBSCAN grid point failed (min_cluster_size={min_cluster_size}, "
                    f"min_samples={min_samples}): {e!r}",
                    RuntimeWarning,
                )
                results.append({
                    "min_cluster_size": min_cluster_size,
                    "min_samples": min_samples,
                    "n_clusters": 0,
                    "noise_ratio": 1.0,
                    "silhouette": -1
                })

    df_results = pd.DataFrame(results)
    df_results = df_results.sort_values(by=["silhouette", "n_clusters"], ascending=[False, False])
    return df_results

In [None]:
# Run the same parameter grid on both embedding spaces.
# The tuples are unpacked into range(), so the upper bounds are exclusive.
df_grid_tags = optimize_hdbscan(
    X_tags,
    min_samples_range=(2, 10),
    min_cluster_range=(5, 25),
)
df_grid_feat = optimize_hdbscan(
    X_feat,
    min_samples_range=(2, 10),
    min_cluster_range=(5, 25),
)


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


In [None]:
# Each grid is sorted by silhouette (then cluster count) in descending order,
# so the first row holds the best-scoring parameter combination.
best_params_tags = df_grid_tags.iloc[0]
print("Best params:", best_params_tags)

# The features embedding has its own grid; read it from df_grid_feat
# rather than reusing the synopsis/tags results.
best_params_feat = df_grid_feat.iloc[0]
print("Best params:", best_params_feat)

In [None]:
# Refit HDBSCAN on each embedding matrix with its selected parameters.
# The values come out of a pandas row, so they are cast back to int.
hdbscan_tags = hdbscan.HDBSCAN(
    min_cluster_size=int(best_params_tags["min_cluster_size"]),
    min_samples=int(best_params_tags["min_samples"]),
    metric="euclidean"
)

hdbscan_feat = hdbscan.HDBSCAN(
    min_cluster_size=int(best_params_feat["min_cluster_size"]),
    min_samples=int(best_params_feat["min_samples"]),
    metric="euclidean"
)

labels_tags = hdbscan_tags.fit_predict(X_tags)
labels_feat = hdbscan_feat.fit_predict(X_feat)

# assign() returns a new DataFrame instead of writing into what may be a
# slice of df, which avoids SettingWithCopyWarning and keeps the cell
# idempotent on re-run. Labels are positional: X_tags / X_feat were stacked
# from df_filtered's rows in order.
df_filtered = df_filtered.assign(
    cluster_tags=labels_tags,
    cluster_features=labels_feat,
)

In [None]:
# Count rows for every (cluster_tags, is_editorial) combination
cluster_counts = (
    df_filtered
    .groupby(["cluster_tags", "is_editorial"])
    .size()
    .reset_index(name="count")
)

# Grouped bar chart: one bar per cluster and is_editorial value
plt.figure(figsize=(10, 6))
sns.barplot(x="cluster_tags", y="count", hue="is_editorial", data=cluster_counts)
plt.xlabel("Cluster")
plt.ylabel("Number of licenses")
plt.title("Distribution by clusters (synopsis + tags)")
plt.show()

In [None]:
# Count rows for every (cluster_features, is_editorial) combination
cluster_counts = (
    df_filtered
    .groupby(["cluster_features", "is_editorial"])
    .size()
    .reset_index(name="count")
)

# Grouped bar chart: one bar per cluster and is_editorial value
plt.figure(figsize=(10, 6))
sns.barplot(x="cluster_features", y="count", hue="is_editorial", data=cluster_counts)
plt.xlabel("Cluster")
plt.ylabel("Number of licenses")
plt.title("Distribution by clusters (features)")
plt.show()