# Directory Deep Content Cluster Visualization

## Overview

- Deep dive the provided directory, a Zettelkasten note-taking repo, to visualize content clusters.


In [7]:
import os
import re
import sys
import yaml
import pandas as pd
import matplotlib.pyplot as plt
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA

# --- Configuration ---
ZETTEL_ROOT = "/home/hittjw/Documents/GitHub/obsidian/Zettelkasten"
N_CLUSTERS = 5
TAG_WEIGHT = 5 

def clean_headers(text):
    """Removes Markdown headers with 2 words or fewer."""
    lines = text.split('\n')
    filtered_lines = []
    for line in lines:
        if line.startswith('#'):
            header_content = re.sub(r'^#+\s*', '', line).strip()
            if len(header_content.split()) > 2:
                filtered_lines.append(line)
        else:
            filtered_lines.append(line)
    return '\n'.join(filtered_lines)

def split_camel_case(text):
    """Splits #CamelCase into 'Camel Case'."""
    return re.sub(r'([a-z])([A-Z])', r'\1 \2', text)

def extract_note_data(filepath):
    try:
        with open(filepath, 'r', encoding='utf-8') as f:
            content = f.read()
    except (OSError, IOError) as e:
        sys.stderr.write(f"read_error: {filepath} - {e}\n")
        return None, None

    # 1. YAML Tags with NoneType protection
    yaml_tags = []
    yaml_match = re.search(r'^---\s*\n(.*?)\n---\s*\n', content, re.DOTALL)
    if yaml_match:
        try:
            meta = yaml.safe_load(yaml_match.group(1))
            if meta and 'tags' in meta:
                raw_tags = meta['tags']
                if isinstance(raw_tags, list):
                    # Filter out None values from empty list entries
                    yaml_tags = [str(t) for t in raw_tags if t is not None]
                elif raw_tags is not None:
                    yaml_tags = [str(raw_tags)]
        except yaml.YAMLError:
            pass

    # 2. Hashtags - Extracted and Split
    hashtag_raw = re.findall(r'#([A-Z][a-z]+(?:[A-Z][a-z]+)+)', content)
    hashtag_split = [split_camel_case(ht) for ht in hashtag_raw]
    
    # 3. Clean Body (Strip front matter, then filter headers)
    body = re.sub(r'^---\s*\n(.*?)\n---\s*\n', '', content, flags=re.DOTALL)
    body = clean_headers(body)
    
    # Safely join strings
    meta_str = (" ".join(yaml_tags) + " " + " ".join(hashtag_split) + " ") * TAG_WEIGHT
    
    feature_text = meta_str + body
    return feature_text.strip(), filepath.name

# --- Data Loading ---
data = []
files = list(Path(ZETTEL_ROOT).rglob('*.[mM][dD]'))
for path in files:
    text, name = extract_note_data(path)
    if text:
        data.append({'name': name, 'text': text})

if not data:
    sys.stderr.write("extraction_failed: No content found.\n")
    sys.exit(1)

df = pd.DataFrame(data)

# --- Stop Word Refinement ---
custom_stops = {'https', 'http', 'com', 'www', 'org', 'net', 'html', 'eof'}
all_stop_words = list(ENGLISH_STOP_WORDS.union(custom_stops))

# --- Vectorization ---
vectorizer = TfidfVectorizer(
    stop_words=all_stop_words,
    ngram_range=(2, 2), 
    max_features=2000
)

X = vectorizer.fit_transform(df['text'])

# --- Clustering ---
model = KMeans(n_clusters=N_CLUSTERS, random_state=42, n_init=10)
df['cluster'] = model.fit_predict(X)

# --- PCA ---
pca = PCA(n_components=2)
coords = pca.fit_transform(X.toarray())
df['x'], df['y'] = coords[:, 0], coords[:, 1]

# --- Identification ---
print("\n--- Top Bi-gram Identifiers per Cluster ---")
terms = vectorizer.get_feature_names_out()
centroids = model.cluster_centers_.argsort()[:, ::-1]

for i in range(N_CLUSTERS):
    top_terms = [terms[ind] for ind in centroids[i, :8]]
    print(f"Cluster {i}: {', '.join(top_terms)}")


# --- Visualization ---
plt.figure(figsize=(10, 6))
for i in range(N_CLUSTERS):
    slice_df = df[df['cluster'] == i]
    plt.scatter(slice_df['x'], slice_df['y'], label=f"Cluster {i}", alpha=0.5)

plt.title("Zettelkasten Bi-gram Clusters (Sanitized)")
plt.legend()
plt.show()




--- Top Bi-gram Identifiers per Cluster ---
Cluster 0: pasted image, knowledge workers, remote work, knowledge worker, worker knowledge, work work, white collar, knowledge management
Cluster 1: risk management, united states, long term, decision making, time management, subject matter, executive function, business development
Cluster 2: book author, direct response, social media, map content, lead generation, city martinsville, real estate, dan kennedy
Cluster 3: customer avatar, avatar buyer, buyer persona, details real, persona capture, people reference, capture details, useful customer
Cluster 4: martinsville va, 30 years, use space, point notes, notes reference, city martinsville, years old, feels like
