# Directory Deep Content Cluster Visualization

## Overview

- Take a deep dive into the provided directory (a Zettelkasten note-taking repository) and visualize its content clusters.


In [4]:

import os
import re
import sys
import yaml
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# --- Configuration ---
# Root directory of the note vault. Set the ZETTEL_ROOT environment variable to
# point the notebook at another vault; when it is unset or empty, the original
# default path below is used.
ZETTEL_ROOT = (
    os.environ.get("ZETTEL_ROOT")
    or "/home/hittjw/Documents/GitHub/obsidian/Zettelkasten"
)
N_CLUSTERS = 5  # Number of K-Means clusters to fit and plot
TAG_WEIGHT = 10  # Number of times to repeat tags to boost their signal

def extract_note_data(filepath):
    """Read one note and build the text used as its clustering feature.

    The feature text is the note's tags (YAML front-matter ``tags`` plus
    CamelCase ``#Hashtags`` found anywhere in the file), repeated
    ``TAG_WEIGHT`` times, followed by the note body with the front matter
    removed.

    Args:
        filepath: Path of the note, as a ``pathlib.Path`` or a string.

    Returns:
        A ``(feature_text, file_name)`` tuple, or ``(None, None)`` when the
        file cannot be read or is not valid UTF-8. ``feature_text`` may be an
        empty string for an empty note.
    """
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, UnicodeDecodeError) as e:
        # UnicodeDecodeError is not an OSError; without it a single badly
        # encoded note would abort the whole run instead of being skipped.
        sys.stderr.write(f"read_error: {filepath} - {e}\n")
        return None, None

    # The pattern is anchored at the start of the file (no MULTILINE flag),
    # so it can match at most once.
    yaml_match = re.search(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)

    # 1. Extract YAML Tags
    yaml_tags = []
    if yaml_match:
        try:
            meta = yaml.safe_load(yaml_match.group(1))
        except yaml.YAMLError as e:
            # Malformed front matter is not fatal: report it and fall back to
            # hashtags + body only.
            sys.stderr.write(f"yaml_error: {filepath} - {e}\n")
            meta = None

        # safe_load can return a scalar or a list; only a mapping has tags.
        tags = meta.get('tags') if isinstance(meta, dict) else None
        if isinstance(tags, (list, tuple, set)):
            yaml_tags = [tag for tag in tags if tag is not None]
        elif tags is not None:
            yaml_tags = [tags]
        # An empty "tags:" key loads as None and is deliberately skipped so
        # the literal word "None" is not boosted into the feature text.

    # 2. Extract CamelCase Hashtags
    hashtags = re.findall(r'#([A-Z][a-z]+(?:[A-Z][a-z]+)+)', content)

    # 3. Strip front matter for body
    body = content[yaml_match.end():] if yaml_match else content

    # Boost weight of metadata by repeating strings
    weighted_metadata = (
        " ".join(map(str, yaml_tags)) + " " + " ".join(hashtags) + " "
    ) * TAG_WEIGHT

    feature_text = weighted_metadata + body
    return feature_text.strip(), Path(filepath).name

# --- Data Collection ---
# Fail early with a specific message when the vault path is wrong; otherwise
# rglob simply yields nothing and the generic "No content found" error hides
# the real cause.
root = Path(ZETTEL_ROOT)
if not root.is_dir():
    sys.stderr.write(f"extraction_failed: {root} is not a directory.\n")
    sys.exit(1)

data = []
# rglob yields entries in filesystem order. The row order feeds the seeded
# K-Means run, so the paths are sorted to keep cluster assignments reproducible
# across machines and runs.
files = sorted(root.rglob('*.[mM][dD]'))

for path in files:
    text, name = extract_note_data(path)
    # Unreadable notes come back as (None, None); empty notes as "". Skip both.
    if text:
        data.append({'name': name, 'text': text})

if not data:
    sys.stderr.write("extraction_failed: No content found.\n")
    sys.exit(1)

# One row per usable note: 'name' (file name) and 'text' (feature text).
df = pd.DataFrame(data)

# --- Vectorization with Bi-grams and Custom Stop Words ---
# URL fragments and similar tokens are dropped in addition to the stock
# English stop words.
custom_stops = {'https', 'http', 'com', 'www', 'org', 'net', 'html', 'eof'}
all_stop_words = list(ENGLISH_STOP_WORDS | custom_stops)

# Only two-word phrases become features (ngram_range=(2, 2)), and the
# vocabulary is capped at 1500 entries. Because single words are never kept,
# a repeated tag contributes only through pairs of adjacent tokens.
tfidf_options = {
    'stop_words': all_stop_words,
    'ngram_range': (2, 2),
    'max_features': 1500,
}
vectorizer = TfidfVectorizer(**tfidf_options)

# X: sparse TF-IDF matrix with one row per note in df.
corpus = df['text']
X = vectorizer.fit_transform(corpus)

# --- Clustering ---
# Fit K-Means on the TF-IDF rows and store each note's cluster label.
# The fixed seed keeps the labels stable between runs.
model = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
df['cluster'] = model.fit_predict(X)

# --- Dimensionality Reduction ---
# Project the densified TF-IDF matrix onto two principal components for
# plotting. PCA is seeded like K-Means: with the default svd_solver='auto' it
# may pick the randomized solver on larger inputs, and without a seed the
# plotted coordinates could then differ from run to run.
pca = PCA(n_components=2, random_state=42)
coords = pca.fit_transform(X.toarray())
df['x'], df['y'] = coords[:, 0], coords[:, 1]

# --- Visualization ---
# One scatter series per cluster label so each gets its own colour and legend
# entry; points are positioned by their two PCA coordinates.
fig, ax = plt.subplots(figsize=(12, 8))
for i in range(N_CLUSTERS):
    members = df.loc[df['cluster'] == i]
    ax.scatter(
        members['x'],
        members['y'],
        label=f"Cluster {i}",
        alpha=0.6,
        edgecolors='w',
    )

ax.set_title("Zettelkasten Bi-gram Clusters (Weighted by Tags)")
ax.set_xlabel("PC1")
ax.set_ylabel("PC2")
ax.legend()
ax.grid(True, linestyle='--', alpha=0.5)
plt.show()

# --- Cluster Summary ---
print("\n--- Top Bi-gram Identifiers per Cluster ---")
terms = vectorizer.get_feature_names_out()
# For every cluster, feature indices ordered from the largest centroid weight
# to the smallest (ascending argsort, then each row reversed).
centroids = model.cluster_centers_.argsort()[:, ::-1]

for i in range(N_CLUSTERS):
    # The seven highest-weighted bi-grams label the cluster.
    leading = centroids[i, :7]
    top_terms = terms[leading].tolist()
    print(f"Cluster {i}: {', '.join(top_terms)}")
    


--- Top Bi-gram Identifiers per Cluster ---
Cluster 0: details real, capture details, persona capture, useful customer, content vault, people reference, avatar buyer
Cluster 1: artificial intelligence, ai systems, internet marketer, write better, language model, decision making, emotional regulation
Cluster 2: book author, risk management, direct response, united states, social media, long term, lead generation
Cluster 3: martinsville va, national bank, 30 years, feels like, don want, point notes, notes reference
Cluster 4: mental focus, thinking critical, mental clarity, critical thinking, think clearly, problem solving, quality life
