In [1]:
# built-in libs
import os
import importlib

# obsidiantools requirements
import numpy as np
import pandas as pd
import networkx as nx


In [2]:
# Similarities and cache embedding model.
# Building DocSim without a model argument loads the default GloVe vectors
# (see the log output of this cell). A reference to the loaded model is kept
# so a later cell can pass it back into a freshly reloaded DocSim
# (presumably to avoid loading the vectors a second time — confirm in docsim).
import docsim
import tfidf
docsim_obj = docsim.DocSim(verbose=True)
cached_model = docsim_obj.model

[nltk_data] Downloading package punkt to /Users/jacksong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Loading default GloVe word vector model: glove-wiki-gigaword-50
Model loaded


In [54]:
from pathlib import Path

import obsidiantools.api as otools  # api shorthand

# Set up vault and pull documents.
# The vault location may be overridden with the OBSIDIAN_VAULT_DIR environment
# variable so the notebook is not tied to a single machine; when the variable
# is unset the original hard-coded location is used, so existing runs behave
# exactly as before.
VAULT_DIR = Path(
    os.environ.get(
        "OBSIDIAN_VAULT_DIR",
        "/Users/jacksong/Library/Mobile Documents/iCloud~md~obsidian/Documents/Incredex",
    )
)
vault = otools.Vault(VAULT_DIR).connect().gather()

def get_full_text(name):
    """Return ``name`` followed by ". " and the text the vault holds for it.

    Reads the module-level ``vault`` and calls ``vault.get_text(name)``.
    """
    title_prefix = name + ". "
    return title_prefix + vault.get_text(name)
# Map every key of vault.file_index (presumably the note names — confirm
# against obsidiantools) to that note's name-prefixed text.
documents = {name: get_full_text(name) for name in vault.file_index.keys()}

In [55]:
# Get updated version of docsim, use the cached model.
# reload() re-executes the docsim module so edits to its source take effect;
# the new DocSim is given the model object captured in cached_model instead of
# being constructed without one.
importlib.reload(docsim)
docsim_obj = docsim.DocSim(model=cached_model, verbose=True)

In [56]:
# Get updated version of tfidf (re-executes the module so source edits apply).
# The trailing ';' stops the notebook from echoing the returned module object.
importlib.reload(tfidf);

[nltk_data] Downloading package punkt to /Users/jacksong/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [64]:
from collections import Counter

def intersects(lst1, lst2):
    """Return True if at least one element of ``lst1`` is contained in ``lst2``.

    Elements of ``lst1`` are checked in order and the scan stops at the first
    hit. An empty ``lst1`` yields False.
    """
    return any(item in lst2 for item in lst1)

def check_has_common_tags(first, second, vault):
    """Return True if the tags of ``first`` and ``second`` overlap.

    Tags are fetched with ``vault.get_tags`` for each name and compared with
    ``intersects``.
    """
    return intersects(vault.get_tags(first), vault.get_tags(second))

def check_has_link(first, second, vault):
    """Return True if either note appears in the other's backlinks.

    Both backlink collections are fetched from ``vault.get_backlinks`` up
    front; the result is True when ``first`` is among the backlinks of
    ``second`` or ``second`` is among the backlinks of ``first``.
    """
    first_backlinks = vault.get_backlinks(first)
    second_backlinks = vault.get_backlinks(second)
    return first in second_backlinks or second in first_backlinks

def trim_string(s: str, limit: int, ellipsis='…') -> str:
    """Strip ``s`` and shorten it to at most ``limit`` characters plus ``ellipsis``.

    If the stripped text fits within ``limit`` it is returned unchanged.
    Otherwise the first ``limit`` characters are kept, stripped again (so the
    cut never ends in whitespace) and ``ellipsis`` is appended; the result can
    therefore be longer than ``limit`` by the length of ``ellipsis``.
    """
    stripped = s.strip()
    if len(stripped) <= limit:
        return stripped
    return stripped[:limit].strip() + ellipsis

# Names that pretty() skips: a pair is dropped if either side is listed here.
ignores = ['root']

def pretty(pairs, vault):
    """Print one row per scored pair, then the five most frequent names.

    ``pairs`` is iterated as ``(first, second, score)`` triples. Pairs where
    either name is in the module-level ``ignores`` list are skipped. Each
    printed row is preceded by a blank line and shows:

    * ``@`` if ``check_has_link`` reports a link between the two, else a space
    * ``#`` if ``check_has_common_tags`` reports a shared tag, else a space
    * both names, cut to 30 characters by ``trim_string``
    * ``str(score)`` cut to its first 4 characters by ``trim_string``
      (this truncates the text; it does not round the number)

    After all rows, the five names that appeared in the most printed pairs are
    printed via ``Counter.most_common(5)``.
    """
    row_template = "{:<1}{:<1}   {:<35} {:<35}   {:<10}"
    appearances = Counter()

    for left, right, score in pairs:
        # Ignored names remove the whole pair, not just one side of it.
        if left in ignores or right in ignores:
            continue

        appearances.update((left, right))

        shares_tag = check_has_common_tags(left, right, vault)
        linked = check_has_link(left, right, vault)

        row = row_template.format(
            "@" if linked else " ",
            "#" if shares_tag else " ",
            trim_string(left, 30),
            trim_string(right, 30),
            trim_string(str(score), 4),
        )
        print()
        print(row)

    print()
    print(appearances.most_common(5))
    
    

In [58]:
# GloVe global scores over every document in `documents`.
# The second argument (30) is presumably the number of top pairs to return —
# confirm in docsim.DocSim.top_pairs.
glove_results = docsim_obj.top_pairs(documents, 30);

In [59]:
# TF-IDF global scores over every document in `documents`.
# The second argument (30) is presumably the number of top pairs to return —
# confirm in tfidf.top_pairs.
tfidf_results = tfidf.top_pairs(documents, 30);

In [65]:
# Show the TF-IDF pairs. Row legend: "@" = pretty() found a link between the
# two notes, "#" = it found a shared tag.
pretty(tfidf_results, vault)


     Mango and mayo in shrimp tacos      Cooking                               0.67…     

@    economic success is driven by…      innovation is iteration               0.55…     

@    economic success is driven by…      hypotheses just aren't that im…       0.51…     

     Incredex                            Artists to Copy                       0.49…     

     Finding good working groups         How to build collaborative web…       0.49…     

@    Technical Due Diligence             Due Diligence                         0.47…     

     Personal Brand and Persona          Backlog of things to do               0.45…     

     Stuff to 3D pPrint                  3D Print Board Games                  0.45…     

@    Bird in the hand                    Focus makes everything better         0.44…     

     Where meaning comes from in ev…     Finding good working groups           0.43…     

     Document similarity is a multi…     Incredex                              0.43…     

In [66]:
# Show the GloVe pairs. Row legend: "@" = pretty() found a link between the
# two notes, "#" = it found a shared tag.
pretty(glove_results, vault)


 #   Getting the most from work          Adjacent Opportunities                0.97…     

     hypotheses just aren't that im…     DNS retro                             0.83…     

     Purpose                             GDPR                                  0.81…     

     You're not above the hype trai…     Cardiogram Premium Conversion         0.81…     

     Choosing between different ema…     Finding good working groups           0.80…     

     Sleep is the best investment        Meetings with Harish                  0.77…     

     Subitize                            DNS retro                             0.76…     

     Climate Crisis                      GDPR                                  0.75…     

@    Technical Due Diligence             Due Diligence                         0.75…     

     Communication and Emotional Im…     Trying Psilocybin                     0.75…     

     Purpose                             Cardiogram Premium Conversion         0.75…     