# Directory Deep Content Cluster Visualization

## Overview

- Take a deep dive into the provided directory (a Zettelkasten note-taking repository) and visualize its content clusters.


In [None]:
#!/usr/bin/env python3

import os
import re
import yaml
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

## Configuration

In [None]:
# Root directory of the Zettelkasten notes; it is searched recursively for *.md files.
ZettelkastenRoot = "/home/hittjw/Documents/GitHub/obsidian/Zettelkasten" # Update this path
# Number of clusters requested from KMeans (also the number of plotted/printed groups).
n_clusters = 5

In [None]:

def extract_note_data(filepath):
    """Build the clustering feature text for one Markdown note.

    The note is read as UTF-8. Tags listed under the ``tags`` key of a leading
    YAML front-matter block and CamelCase hashtags found anywhere in the file
    are prepended to the note body (the body is the file content with the
    front-matter block removed).

    Args:
        filepath: Path-like object pointing at the note. It must expose a
            ``.name`` attribute (e.g. ``pathlib.Path``).

    Returns:
        A ``(feature_text, file_name)`` tuple, where ``feature_text`` is
        ``"<yaml tags> <hashtags> <body>"`` and ``file_name`` is
        ``filepath.name``.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    with open(filepath, 'r', encoding='utf-8') as f:
        content = f.read()

    # Front matter is only recognised at the very start of the file
    # ('^' without re.MULTILINE anchors to the beginning of the string).
    front_matter = r'^---\s*\n(.*?)\n---\s*\n'

    # 1. Extract YAML tags (best effort: broken front matter yields no tags).
    yaml_tags = []
    yaml_match = re.search(front_matter, content, re.DOTALL)
    if yaml_match:
        try:
            meta = yaml.safe_load(yaml_match.group(1))
        except (yaml.YAMLError, ValueError):
            # ValueError covers scalars that look like timestamps but are not
            # valid dates; such notes should lose their tags, not abort the run.
            meta = None
        # Front matter may parse to None, a string or a list; only a mapping
        # can carry a 'tags' key.
        raw_tags = meta.get('tags') if isinstance(meta, dict) else None
        if raw_tags is not None:
            if not isinstance(raw_tags, (list, tuple)):
                raw_tags = [raw_tags]
            # YAML may produce non-string tags (numbers, dates, null entries);
            # stringify them so the join below cannot raise TypeError.
            yaml_tags = [str(tag) for tag in raw_tags if tag is not None]

    # 2. Extract CamelCase hashtags (e.g., #DataScience)
    hashtags = re.findall(r'#([A-Z][a-z]+(?:[A-Z][a-z]+)+)', content)

    # 3. Clean body (front matter removed so metadata keys do not pollute the n-grams)
    body = re.sub(front_matter, '', content, flags=re.DOTALL)

    # Combine everything into a single "feature string". Hashtags are listed
    # here and also remain in the body, so they occur twice in the result;
    # YAML tags occur once because the front matter was stripped from the body.
    feature_text = " ".join(yaml_tags) + " " + " ".join(hashtags) + " " + body
    return feature_text, filepath.name

# --- Data Collection ---
# Walk the notes directory recursively and keep every note that yields
# non-blank feature text. The paths are sorted because directory traversal
# order depends on the filesystem; a fixed row order keeps the seeded
# clustering below reproducible from run to run.
data = []
for path in sorted(Path(ZettelkastenRoot).rglob('*.md')):
    text, name = extract_note_data(path)
    if text.strip():
        data.append({'name': name, 'text': text})

# Fail early with a clear message instead of a later KeyError on df['text'].
if not data:
    raise FileNotFoundError(
        f"No non-empty Markdown notes found under {ZettelkastenRoot!r}"
    )

df = pd.DataFrame(data, columns=['name', 'text'])

# --- Vectorization (Focusing on Trigrams and Tags) ---
# TF-IDF over 1- to 3-word phrases with English stop words removed;
# max_features limits the vocabulary to 1000 entries.
vectorizer = TfidfVectorizer(
    ngram_range=(1, 3),  # unigrams, bigrams and trigrams
    max_features=1000,
    stop_words='english',
)
X = vectorizer.fit_transform(df['text'])

# --- Clustering ---
# Fixed random_state so repeated runs on the same matrix give the same labels.
model = KMeans(n_init=10, random_state=42, n_clusters=n_clusters)
model.fit(X)
df['cluster'] = model.labels_

# --- Visualization (PCA) ---
# Reduce the densified TF-IDF matrix to two components so each note
# becomes a single (x, y) point.
pca = PCA(n_components=2)
coords = pca.fit_transform(X.toarray())
df['x'], df['y'] = coords[:, 0], coords[:, 1]

plt.figure(figsize=(10, 7))
# One scatter call per cluster id so each gets its own colour and legend entry.
for cluster_id in range(n_clusters):
    in_cluster = df['cluster'] == cluster_id
    plt.scatter(
        df.loc[in_cluster, 'x'],
        df.loc[in_cluster, 'y'],
        label=f'Cluster {cluster_id}',
    )

plt.title("Zettelkasten Content Clusters")
plt.legend()
plt.show()

# --- Print Top Clues per Cluster ---
print("### Top Cluster Identifiers ###")
terms = vectorizer.get_feature_names_out()
# For every centroid: feature indices ordered from highest to lowest weight.
order_centroids = model.cluster_centers_.argsort()[:, ::-1]
for cluster_id in range(n_clusters):
    strongest = order_centroids[cluster_id, :5]
    print(f"Cluster {cluster_id}: {', '.join(terms[idx] for idx in strongest)}")