In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pandas as pd
import numpy as np
import sqlite3
import regex as re
from spacy.lang.en.stop_words import STOP_WORDS
import matplotlib.pyplot as plt
from sklearn.metrics import silhouette_score
from sklearn.metrics import calinski_harabasz_score, davies_bouldin_score
import umap
import matplotlib.cm as cm

In [None]:
# Load every row of the extended disease-article table into a DataFrame.
con = sqlite3.connect("wiki_articles_diseases.db")
try:
    df = pd.read_sql_query("SELECT * from wiki_articles_diseases_extended", con)
finally:
    # Release the database handle even when the query raises.
    con.close()

df

In [None]:
def _field_text(value):
    """Return *value* for interpolation into the vocabulary string.

    None, NaN and other falsy values become the empty string.
    """
    # NaN is truthy, so ``value or ''`` alone would let it through as the text "nan".
    if isinstance(value, float) and value != value:
        return ''
    return value or ''


def filter_words(row):
    """Build the vocabulary list for one row from its 'nav' and 'entities' fields.

    The two fields are concatenated, split on ``|`` and ``#``, and each
    resulting term is cleaned: single-letter abbreviations such as "a." are
    removed, non-breaking / zero-width / narrow no-break spaces are replaced
    by a plain space, and the term is lower-cased and stripped.

    Parameters
    ----------
    row : mapping
        Anything indexable by ``'nav'`` and ``'entities'`` (e.g. a DataFrame row).

    Returns
    -------
    list of str
        Cleaned terms longer than one character that are not in ``STOP_WORDS``.
    """
    # Combine both fields; missing values (None / NaN) contribute nothing.
    text = f"{_field_text(row['nav'])}|{_field_text(row['entities'])}"

    vocabulary = []
    # Split on | or #
    for term in re.split(r'\||\#', text):
        # Drop single-letter abbreviations like "a."
        term = re.sub(r'\b[A-Za-z]\.', '', term)
        # Normalise exotic space characters to a plain space
        term = re.sub(r'[\xa0\u200b\u202f]', ' ', term)
        term = term.lower().strip()
        # Test the *normalised* term, so case or padding cannot hide a stop word.
        if len(term) > 1 and term not in STOP_WORDS:
            vocabulary.append(term)
    return vocabulary

# Apply the function row-wise to create a list per row
# Build one vocabulary list per article by running filter_words over each row.
df['voc'] = df.apply(lambda row: filter_words(row), axis='columns')
# Peek at the vocabulary of the row labelled 0.
df.loc[0, 'voc']

In [None]:
# Each vocabulary list is flattened back into one space-separated document string.
rejoined_docs = df['voc'].map(' '.join).tolist()

# Unigrams and bigrams; terms must occur in at least 5 documents and in at most 70% of them.
tfidf = TfidfVectorizer(
    ngram_range=(1,2),
    min_df=5,
    max_df=0.7,
)

# Sparse document-term matrix, one row per article.
dt = tfidf.fit_transform(rejoined_docs)
dt

In [None]:
import warnings
# Silence the n_jobs/random_state notice (presumably raised by UMAP on every fit).
warnings.filterwarnings("ignore", message="n_jobs value 1 overridden to 1 by setting random_state")


# Exhaustive sweep over UMAP dimensionality, UMAP neighbourhood size and the
# number of KMeans clusters, keeping the combination with the highest
# silhouette score measured in the UMAP embedding space.
best_score = -1
best_params = None

for n_comp in range(2,11):
    for n_neigh in range(5,15):
        # The embedding depends only on (n_comp, n_neigh) and the fixed seed,
        # so fit it once here instead of once per k.
        umap_model = umap.UMAP(n_neighbors=n_neigh, n_components=n_comp, random_state=42)
        embedding = umap_model.fit_transform(dt)
        for k in range(4,11):
            kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
            labels = kmeans.fit_predict(embedding)
            score = silhouette_score(embedding, labels)
            if score > best_score:
                best_score = score
                best_params = (n_comp, n_neigh, k)
print(f"Best params: n_components={best_params[0]}, n_neighbors={best_params[1]}, k={best_params[2]} with silhouette={best_score:.4f}")

In [None]:
# Refit UMAP and KMeans with fixed settings.
# NOTE(review): the values are hard-coded; presumably they are the best
# parameters reported by the grid search — confirm after re-running it.
umap_model = umap.UMAP(
    n_neighbors=5,
    n_components=9,
    random_state=42,
)
embedding = umap_model.fit_transform(dt)

kmeans = KMeans(n_clusters=9, random_state=42, n_init=10)
labels = kmeans.fit_predict(embedding)

# Number of documents assigned to each of the 9 clusters.
sizes = [
    {"cluster": cluster_id, "size": np.sum(kmeans.labels_ == cluster_id)}
    for cluster_id in range(9)
]

# Overall silhouette of the final clustering, then a bar chart of cluster sizes.
print(silhouette_score(embedding, labels))
pd.DataFrame(sizes).set_index("cluster").plot.bar(figsize=(16,9))

In [None]:
# Save embeddings (disabled: uncomment the next line to write the UMAP embedding to disk)
# np.save("umap_embeddings.npy", embedding)

In [None]:
# Record each article's KMeans cluster id alongside its other columns.
df = df.assign(cluster=kmeans.labels_)

df.head(5)

In [None]:
# 2D projection: scatter the first two embedding dimensions, coloured by cluster.
# plt.get_cmap is used because matplotlib.cm.get_cmap is deprecated and no
# longer available in recent Matplotlib releases.
n_clusters = df['cluster'].nunique()  # one discrete colour per cluster present
cmap = plt.get_cmap('tab20', n_clusters)

plt.figure(figsize=(10, 6))
plt.scatter(embedding[:, 0], embedding[:, 1], c=df['cluster'], cmap=cmap)
plt.title("2D Projection of Clusters")
plt.xlabel("UMAP-1")
plt.ylabel("UMAP-2")
plt.colorbar(label="Cluster")
plt.show()

In [None]:
from sklearn.metrics import silhouette_samples

# Per-document silhouette values, computed in the embedding space against the
# stored cluster labels and kept on the frame for later inspection.
s_scores = silhouette_samples(embedding, df['cluster'])
df['silhouette_score'] = s_scores

# Average silhouette within each cluster.
df['silhouette_score'].groupby(df['cluster']).mean()

In [None]:
# Titles of the articles assigned to cluster 1.
df.loc[df['cluster'] == 1, 'title']

In [None]:
# Flatten each vocabulary list into a single "|"-delimited string
# (presumably so the column can be written to SQLite — confirm).
# Only lists are joined: if this cell is run a second time the values are
# already strings, and joining a string would insert "|" between its characters.
df["voc"] = df["voc"].apply(lambda x: "|".join(x) if isinstance(x, list) else x)
df['voc']

In [None]:
# Save clustered documents: write the frame (including its index) to the
# "clustered_diseases" table, replacing any existing table of that name.
sql = sqlite3.connect("clustered_diseases.db")
try:
    df.to_sql("clustered_diseases", sql, if_exists="replace")
    sql.commit()
finally:
    # Close the handle even if the write fails, so the database file is not left open.
    sql.close()

In [None]:
from scipy.sparse import csr_matrix, vstack

# Maps cluster id -> 1 x n_terms sparse row holding the mean TF-IDF vector of
# the cluster's documents. These are means in TF-IDF space, not the KMeans
# centres, which were fitted on the UMAP embedding.
cluster_centroids = {}

cluster_labels = df['cluster'].to_numpy()

# Clusters 0-N
for cluster_id in df['cluster'].unique():

    # Row *positions* of the docs in the cluster. `dt` is indexed by position,
    # so index labels would only be correct for a default RangeIndex.
    cluster_indices = np.flatnonzero(cluster_labels == cluster_id)

    # Get vectors of docs in cluster
    cluster_vectors = dt[cluster_indices]

    # Get mean TF-IDF of docs in cluster (mean() returns a dense matrix; re-sparsify)
    cluster_mean = csr_matrix(cluster_vectors.mean(axis=0))

    # Store in Dictionary
    cluster_centroids[cluster_id] = cluster_mean

In [None]:
# Stack the per-cluster mean vectors row-wise in ascending cluster-id order,
# so row i of the matrix belongs to the i-th smallest cluster id.
centroid_matrix = vstack(list(map(cluster_centroids.__getitem__, sorted(cluster_centroids.keys()))))

In [None]:
# Persist the stacked cluster mean vectors as a SciPy sparse .npz archive.
from scipy.sparse import save_npz

save_npz(file="cluster_centroids.npz", matrix=centroid_matrix)

In [None]:
# Turn 'voc' back into a list per row by recomputing it from the
# 'nav' and 'entities' fields with filter_words.
df['voc'] = df.apply(lambda row: filter_words(row), axis='columns')

In [None]:
import joblib

# Persist the fitted vectorizer itself (not the document-term matrix `dt`),
# so the same vocabulary and IDF weights can be applied to new text later.
joblib_filename = 'fitted_tfidf_vectorizer.joblib'
joblib.dump(tfidf, joblib_filename)

In [None]:
# Persist the fitted UMAP reducer next to the vectorizer.
joblib_umap_filename = 'fitted_umap_model.joblib'
joblib.dump(value=umap_model, filename=joblib_umap_filename)