# Note tagger

In [2]:
%load_ext autoreload
%autoreload 2

In [14]:
import os
import sys
import aiohttp
import json
import re
import os
import time
import llm_notes
from pathlib import Path
import asyncio
import llm_notes
import yaml
import networkx as nx
from llm_notes.note import load_note_directory, Note
from pprint import pprint
from typing import List, Optional, Dict
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import functools

from langchain.llms import OpenAI
from langchain.chat_models import ChatOpenAI
from langchain.document_loaders import (
    TextLoader,
    DirectoryLoader,
    UnstructuredMarkdownLoader,
)
from langchain.indexes import VectorstoreIndexCreator
from langchain.vectorstores.chroma import Chroma
from langchain.vectorstores import FAISS
from langchain.callbacks import StdOutCallbackHandler
from langchain.chains import LLMChain
from langchain.llms import OpenAI
from langchain.prompts import PromptTemplate
from langchain.text_splitter import CharacterTextSplitter
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import AIMessage, HumanMessage, SystemMessage
import logging

api_key = os.environ.get("OPENAI_API_KEY")

In [18]:
# Formatter used for this notebook's log lines: timestamp, then the message.
debug_fmtr = logging.Formatter("%(asctime)s - %(message)s")  # noqa

# Console handler that writes to stdout. The handler accepts DEBUG and above;
# the logger level set below is what actually limits output to INFO.
debug_stream = logging.StreamHandler(sys.stdout)
debug_stream.setFormatter(debug_fmtr)
debug_stream.setLevel(logging.DEBUG)

# Notebook-level logger.
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
# Replace whatever handlers are already attached (e.g. from re-running this
# cell) so the stdout handler is the only one on this logger.
logger.handlers = []
logger.addHandler(debug_stream)

In [19]:
logger.info("test")

2023-07-06 08:09:03,493 - test


### Set up note collection and graph

In [5]:
notes = load_note_directory(Path("./example_notes/"))

In [6]:
n = notes[3]
print(n)

self.title='relational note taking', self.tags=['#notetaking'], self.sources=['https://thesephist.com/posts/inc/']


In [7]:
n.metadata

NoteMetadata(title='relational note taking.md', tags=['#notetaking'], note_links=['My Note taking principles', 'Zettelkasten', 'Evergreen Notes', 'My Note taking principles', 'General Zettelkasten Principles', 'Evergreen Notes', 'Obsidian', 'Roam research', 'Notion'], back_links=[], short_title='rltnl_note_tkng.')

In [8]:
note_graph = llm_notes.note.construct_note_graph(notes)

In [9]:
note_graph.edges

EdgeView([('ldd_qstns_dtls.', 'rtls_play'), ('ldd_qstns_dtls.', 'story_rspns_plyrs_over_just_chrct_rspns'), ('ldd_qstns_dtls.', 'frtfl_void_-_rich_never_exhst_dtl'), ('ldd_qstns_dtls.', 'ldd_qstns_cmbt'), ('ldd_qstns_dtls.', 'ldd_qstns_chrct_crtn'), ('ldd_qstns_dtls.', 'swrds_wtht_mstr'), ('give_vs_take_cnvrs_styls_cnvrs_drknb', 'imprv_thtre_stryt_frmwr'), ('give_vs_take_cnvrs_styls_cnvrs_drknb', 'frtfl_void_-_rich_never_exhst_dtl'), ('give_vs_take_cnvrs_styls_cnvrs_drknb', 'ldd_qstns_dtls'), ('give_vs_take_cnvrs_styls_cnvrs_drknb', 'ldd_sttmn_dtls'), ('give_vs_take_cnvrs_styls_cnvrs_drknb', 'chrch_intrr'), ('imprv_thtre_stryt_frmwr', 'sktch_imprv_is_wrkng_bckwr_build_lrgr_wrldv'), ('imprv_thtre_stryt_frmwr', 'prprn_imprv_rpgs'), ('imprv_thtre_stryt_frmwr', 'blnce_btwn_imprv_prior_scene_rlply'), ('imprv_thtre_stryt_frmwr', 'how_nmss_systm_crts_strs'), ('imprv_thtre_stryt_frmwr', 'how_nmss_systm_crts_strs.'), ('rltnl_note_tkng.', 'my_note_tkng_prncp'), ('rltnl_note_tkng.', 'zttlk'), ('r

#### Tags present in notes

In [None]:
# Every tag that appears on at least one loaded note.
# A comprehension keeps its loop variable local, so running this cell does not
# rebind the notebook-level `n` that an earlier cell assigned.
all_tags = {tag for tagged_note in notes for tag in tagged_note.tags}

# Hand-picked tag vocabulary; suggestions are compared against this set when
# reviewing model output.
curated_tags = {
    "#bayesian",
    "#deeplearning",
    "#gamedesign",
    "#graph",
    "#homebrew",
    "#list",
    "#mythoughts",
    "#notetaking",
    "#pbta",
    "#pluginrules",
    "#probability",
    "#resource",
    "#review",
    "#rpgs",
    "#rules",
    "#stub",
    "#thoughtcollection",
    "#thoughts",
    "#tool",
    "#tutorial",
    "#writing",
}
# One "<tag>: <when it applies>" line per curated tag. These strings are passed
# to the model as the preferred-tag list, so they must read as plain English.
curated_tag_descriptions = {
    "#bayesian: notes with this tag are about bayesian statistics",
    "#deeplearning: notes with this tag are about AI and deep learning",
    "#gamedesign: notes with this tag are about game design in videogames or tabletop games",
    "#graph: notes with this tag are about graphs, graph structures, or graph algorithms",
    "#homebrew: notes with this tag are about my own homebrew rules or fiction for tabletop games",
    "#list: notes with this tag are a collection of links to other notes or related resources",
    "#mythoughts: notes with this tag are my own thoughts and opinions",
    "#notetaking: notes with this tag are about the note taking process",
    "#pbta: notes with this tag are about powered by the apocalypse tabletop games",
    "#pluginrules: notes with this tag are about game rules or systems that can be added to other games",
    "#probability: notes with this tag are about mathematical probability",
    "#resource: notes with this tag are about a specific resource",
    "#review: notes with this tag are a review",
    "#rpgs: notes with this tag are about roleplaying games or tabletop games",
    "#rules: notes with this tag are about game rules or systems",
    "#stub: notes with this tag are stubs that need to be expanded, they have little content of their own",
    "#thoughtcollection: notes with this tag are a collection of thoughts or ideas",
    "#thoughts: notes with this tag are my thoughts",
    "#tool: notes with this tag are about a specific tool",
    "#tutorial: notes with this tag are about tutorials",
    "#writing: notes with this tag are on the topic of writing",
}

### Tagger structure and components
- consume text
- get adjacent notes
- determine if these notes are similar to the current note, and if so
- get similar text in vector store and consider the texts that are similar to the current text
- for both similar and linked notes, use the tags that are present in those notes, or the global set of tags, as the space to tag from.

## Manual openai api use, minimal langchain

### Tagging prompt to analyze tags

In [20]:
# System message: establishes the persona and the overall task.
sys_prompt = """
You are Adrian, an expert in topic classification.
You are tasked with reading a note, as well as any other related notes,
and determining if it has any missing topic tags.
"""
# First user turn: explains what a good tag is and what input will follow.
preamble_user_prompt = """
A tag should be related to the broad content and topic of the note.
It should be one or two words with no spaces between them.
When considering if a tag is missing from the note,
consider if linked or related notes have a tag that might also be relevant to the note,
whether it is appropriate to the original note itself, and whether it is a broad enough topic.

I will give you a note, along with any related notes and their tags,
and you will analyse them, step-by-step to determine if you think any tags should be added to the note.

I will give you a list of tags and short descriptions about when they apply.
Make sure you suggest tags from this list.
"""
# Scripted assistant turn placed between the two user turns.
ai_response = """
Yes, I understand, I am Adrian and I will analyse the note for any missing relevant tags."""

# Second user turn. Filled in with str.format using the keys
# full_text, linked_notes, similar_notes and desired_tags.
query_user_prompt = """
Excellent! Let's begin
note:
    full_text: {full_text}

linked notes:
    {linked_notes}

similar notes:
    {similar_notes}

preferred tags: {desired_tags}
"""


def format_note_and_tags(n):
    """Render a note for the prompt: its tags on one line, then its full text.

    Args:
        n: object exposing ``tags`` (iterable of str) and ``full_text`` (str).

    Returns:
        The tags joined by single spaces, a newline, then ``n.full_text``.
    """
    tag_line = " ".join(n.tags)
    return "\n".join([tag_line, n.full_text])


def async_timer(async_fn):
    """Decorator that logs how long each awaited call of ``async_fn`` takes.

    The duration is measured with ``time.perf_counter`` (a monotonic clock, so
    system clock adjustments cannot skew it) and is logged at INFO level on the
    notebook logger. The log line is emitted even when ``async_fn`` raises; the
    exception then propagates unchanged.

    Args:
        async_fn: coroutine function to wrap.

    Returns:
        A coroutine function with the same signature and return value.
    """

    @functools.wraps(async_fn)
    async def wrapper(*args, **kwargs):
        start = time.perf_counter()
        try:
            return await async_fn(*args, **kwargs)
        finally:
            # Runs on success and on failure, so slow failing calls are visible too.
            elapsed = time.perf_counter() - start
            logger.info(f"Time taken: {elapsed}s")

    return wrapper


@async_timer
async def analyze_tags(
    note: Note,
    linked_notes: List[Note],
    similar_notes: List[Note],
    tag_descriptions=curated_tag_descriptions,
    temperature=0.1,
) -> dict:
    """Ask the chat completions endpoint which tags are missing from ``note``.

    Builds a four-message conversation (system prompt, preamble, scripted
    assistant acknowledgement, then the filled-in query) and posts it to the
    OpenAI chat completions URL.

    Args:
        note: the note to analyse; its title and full text go into the prompt.
        linked_notes: notes linked from ``note``, rendered with their tags.
        similar_notes: notes judged similar to ``note``, rendered with their tags.
        tag_descriptions: either a ready-made string, or an iterable of
            "<tag>: <description>" strings describing the preferred tags.
        temperature: sampling temperature forwarded to the API.

    Returns:
        The decoded JSON response body. The HTTP status is not checked here,
        so an API error payload is returned to the caller as-is.
    """
    # Render the descriptions one per line in sorted order. Interpolating a set
    # directly would embed its repr (braces and quotes) in an order that can
    # differ between interpreter sessions, making the prompt non-reproducible.
    if isinstance(tag_descriptions, str):
        desired_tags = tag_descriptions
    else:
        desired_tags = "\n".join(sorted(tag_descriptions))

    headers = {
        "Content-Type": "application/json",
        "Authorization": f"Bearer {api_key}",
    }

    query = query_user_prompt.format(
        full_text="\n".join([note.title, note.full_text]),
        linked_notes="\n\n".join(format_note_and_tags(n) for n in linked_notes),
        similar_notes="\n\n".join(format_note_and_tags(n) for n in similar_notes),
        desired_tags=desired_tags,
    )

    data = {
        "model": "gpt-3.5-turbo",
        "messages": [
            {"role": "system", "content": sys_prompt},
            {"role": "user", "content": preamble_user_prompt},
            {"role": "assistant", "content": ai_response},
            {"role": "user", "content": query},
        ],
        "temperature": temperature,
    }

    # A fresh session per call; both context managers close their resources
    # before the parsed body is returned.
    async with aiohttp.ClientSession() as session:
        async with session.post(
            "https://api.openai.com/v1/chat/completions",
            headers=headers,
            data=json.dumps(data),
        ) as response:
            response_data = await response.json()

    return response_data

In [21]:
await analyze_tags(notes[3], [], [])

2023-07-06 08:09:18,507 - Time taken: 4.050398111343384s


{'id': 'chatcmpl-7ZDP0QK9Y8la3UHahELAdiuPkn7Wv',
 'object': 'chat.completion',
 'created': 1688627354,
 'model': 'gpt-3.5-turbo-0613',
 'choices': [{'index': 0,
   'message': {'role': 'assistant',
    'content': "Based on the given note and the provided tags, I would suggest adding the following tags to the note:\n\n- #thoughts: The note discusses the author's thoughts and principles on note-taking.\n- #notetaking: The note is specifically about the process of note-taking.\n- #incremental: The note emphasizes the incremental nature of thought and note-taking.\n- #time: The note highlights the importance of time in remembering and organizing notes.\n- #relational: The note mentions relational note organization and its benefits.\n- #principles: The note discusses the principles of incremental note-taking.\n\nThese tags capture the main topics and themes of the note and provide a more comprehensive understanding of its content."},
   'finish_reason': 'stop'}],
 'usage': {'prompt_tokens': 

### Vector similarity using FAISS to KISS

In [23]:
# Load the files under example_notes/ through the unstructured markdown loader.
loader = DirectoryLoader(
    "example_notes/", loader_cls=UnstructuredMarkdownLoader, show_progress=True
)

docs = loader.load()

# Split the documents into chunks (chunk_size=1000, no overlap) before embedding.
# One note can therefore contribute several entries to the index.
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=0)
texts = text_splitter.split_documents(docs)


# NOTE(review): presumably picks up OPENAI_API_KEY from the environment — confirm.
embeddings = OpenAIEmbeddings()


# Build the FAISS index over the chunks; get_similar_notes reads each hit's
# metadata["source"] path to map a chunk back to its note.
vec_db = FAISS.from_documents(texts, embeddings)

  0%|          | 0/23 [00:00<?, ?it/s]

100%|██████████| 23/23 [00:00<00:00, 34.12it/s]


In [24]:
vec_db.similarity_search_with_score("fruit")

[(Document(page_content="These examples strongly link to the idea of a [[A Fruitful Void - rich but never exhaustive detail]] and [[Loaded Questions with details]]. it also adds some symmetry and implies [[Loaded statements with details]] which is an aspect that I haven't thought so deaply of.\n\nAnother article, that was mentioned alongside this is about [[The Church of Interruption]] and takes a much more moralist tone, by a self professed Taker/Interruptor. And instead talks about how you might think about avoiding hurting a giver.", metadata={'source': 'example_notes/give vs take conversation styles and conversational doorknobs.md'}),
  0.4246919),
 (Document(page_content='blog on GCN more specifically', metadata={'source': 'example_notes/GCN graph convolution layers.md'}),
  0.4609743),
 (Document(page_content='I should say ideally, because if we are to attached to things that are already established, then we cant be free with where things are going. And if we are too attached to 

In [25]:
def get_similar_notes(note, vec_db, note_title_dict, score_threshold=0.45):
    """Return the notes whose indexed text is close to ``note``.

    The vector store is queried with the note's title and full text. Each hit
    is mapped back to a note through the file stem of its ``source`` metadata.

    Args:
        note: the note to find neighbours for (needs ``title`` and ``full_text``).
        vec_db: store exposing ``similarity_search_with_score(query)`` and
            returning ``(document, score)`` pairs.
        note_title_dict: mapping from note title to note object.
        score_threshold: hits are kept only when their score is strictly below
            this value (the code treats lower scores as closer matches).

    Returns:
        Matching notes in hit order, without ``note`` itself, without
        duplicates (several chunks of one note count once), and without hits
        whose title is not in ``note_title_dict``.
    """
    query = "\n".join([note.title, note.full_text])
    scored_docs = vec_db.similarity_search_with_score(query)

    similar_notes = []
    # Seeding with the note's own title excludes it from its own neighbours.
    seen_titles = {note.title}
    for doc, score in scored_docs:
        if score >= score_threshold:
            continue
        title = Path(doc.metadata["source"]).stem
        if title in seen_titles:
            continue
        seen_titles.add(title)
        # The index may contain files that were not loaded as notes; skip those
        # instead of raising KeyError.
        match = note_title_dict.get(title)
        if match is not None:
            similar_notes.append(match)
    return similar_notes

In [26]:
vec_db

<langchain.vectorstores.faiss.FAISS at 0x169037110>

### Full function

In [27]:
async def suggest_tags(
    note: Note, note_title_dict: Dict[str, Note], vec_db: FAISS, temperature=0.0
) -> List[str]:
    """Suggest tags for ``note`` using its linked and similar notes as context.

    Args:
        note: the note to tag.
        note_title_dict: mapping from note title to note, used to resolve the
            titles in ``note.metadata.note_links`` and the vector-store hits.
        vec_db: vector store queried for notes similar to ``note``.
        temperature: sampling temperature forwarded to ``analyze_tags``.

    Returns:
        The distinct ``#tag`` tokens found in the model's reply, in order of
        first appearance. Returns an empty list (after printing the note title
        and the raw response) when the response has no usable content or
        contains no tags.
    """
    linked_notes = [
        note_title_dict.get(linked_title) for linked_title in note.metadata.note_links
    ]
    # Query neighbours of the note being tagged, not of a fixed notebook global;
    # otherwise every note would be analysed with the same "similar" context.
    similar_notes = get_similar_notes(note, vec_db, note_title_dict)

    result = await analyze_tags(
        note,
        list(filter(None, linked_notes)),  # drop links that did not resolve
        list(filter(None, similar_notes)),
        temperature=temperature,
    )
    try:
        result_text = result["choices"][0]["message"]["content"]
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # returned list is deterministic for a given reply.
        result_tags = list(dict.fromkeys(re.findall(r"#\w+", result_text)))
    except (KeyError, IndexError, TypeError):
        # Error payloads and malformed replies lack the expected structure.
        result_tags = []

    if not result_tags:
        print(note.title, result)
    return result_tags


# Example run on a single note.
note = notes[3]
# Title -> Note lookup used to resolve linked and similar notes.
note_title_dict = {n.title: n for n in notes}

tags = await suggest_tags(note, note_title_dict, vec_db, temperature=0.1)

tags

2023-07-06 08:10:21,931 - Time taken: 5.612031936645508s


['#incremental', '#relational', '#time']

### Time and temp analysis

In [37]:
linked_notes = [
    note_title_dict.get(linked_title) for linked_title in note.metadata.note_links
]
similar_notes = get_similar_notes(notes[3], vec_db, note_title_dict)

temp_runs = [
    (
        t,
        analyze_tags(
            note,
            list(filter(None, linked_notes)),
            list(filter(None, similar_notes)),
            temperature=t,
        ),
    )
    for t in np.linspace(0, 1, 11)
    for n in range(1)
]
temp_varying_results = await asyncio.gather(*[r[1] for r in temp_runs])

2023-07-06 08:38:48,815 - Time taken: 2.7078630924224854s
2023-07-06 08:38:49,936 - Time taken: 3.8311290740966797s
2023-07-06 08:38:50,196 - Time taken: 4.090193033218384s
2023-07-06 08:38:50,455 - Time taken: 4.3496270179748535s
2023-07-06 08:38:51,319 - Time taken: 5.214974880218506s
2023-07-06 08:38:51,429 - Time taken: 5.324207067489624s
2023-07-06 08:38:51,590 - Time taken: 5.486499071121216s
2023-07-06 08:38:51,867 - Time taken: 5.763485908508301s
2023-07-06 08:38:51,906 - Time taken: 5.799304008483887s
2023-07-06 08:38:52,059 - Time taken: 5.9558069705963135s
2023-07-06 08:38:52,355 - Time taken: 6.250136137008667s


In [39]:
[result["choices"][0]["message"]["content"] for result in temp_varying_results]

['Based on the given note and related notes, I would suggest adding the following tags to the note:\n\n1. #relational: The note discusses the concept of relational note-taking and emphasizes the importance of relationships between ideas. This tag captures the main theme of the note.\n\n2. #incremental: The note highlights the principles of incremental note-taking, which involves capturing ideas as they come and adding new ideas rather than updating old ones. This tag reflects the approach discussed in the note.\n\n3. #time: The note emphasizes the importance of time in the note-taking process and suggests that a good note-taking system should consider time as a first-class concept. This tag captures this aspect of the note.\n\n4. #zettelkasten: The note mentions Zettelkasten, a note organization method that focuses on relational note organization. Although the note does not provide detailed information about Zettelkasten, it is still relevant to include this tag as it is mentioned in t

In [40]:
temp_runs = [
    (t, suggest_tags(note, note_title_dict, vec_db, temperature=t))
    for t in np.linspace(0, 1, 11)
    for n in range(1)
]
temp_varying_tags = await asyncio.gather(*[r[1] for r in temp_runs])

2023-07-06 08:41:27,277 - Time taken: 4.604407072067261s
2023-07-06 08:41:27,447 - Time taken: 4.4439918994903564s
2023-07-06 08:41:27,934 - Time taken: 6.186061859130859s
2023-07-06 08:41:28,410 - Time taken: 7.347073078155518s
2023-07-06 08:41:29,017 - Time taken: 5.8125691413879395s
2023-07-06 08:41:29,117 - Time taken: 7.184546232223511s
2023-07-06 08:41:29,270 - Time taken: 8.647213220596313s
2023-07-06 08:41:29,488 - Time taken: 7.295354843139648s
2023-07-06 08:41:29,613 - Time taken: 8.174010276794434s
2023-07-06 08:41:29,738 - Time taken: 8.873906135559082s
2023-07-06 08:41:30,655 - Time taken: 8.218586206436157s


In [41]:
# Show the suggested tags next to the temperature that produced them; the two
# sequences line up by position. Note: this rebinds the notebook-level `tags`.
for temp, tags in zip([r[0] for r in temp_runs], temp_varying_tags):
    print(f"temp {temp:0.1f}", tags)

temp 0.0 ['#incremental', '#relational', '#time', '#zettelkasten']
temp 0.1 ['#incremental', '#relational', '#time', '#noteorganization']
temp 0.2 ['#incremental', '#relational', '#time', '#noteorganization']
temp 0.3 ['#time', '#notetakingtools', '#incremental', '#zettelkasten', '#relational', '#evergreennotes']
temp 0.4 ['#incremental', '#relational', '#time', '#thoughtcollection']
temp 0.5 ['#incremental', '#relational', '#time', '#Zettelkasten']
temp 0.6 ['#incremental', '#relational', '#time', '#Zettelkasten']
temp 0.7 ['#thoughtcollection', '#zettelkasten', '#notetaking', '#incrementalnotetaking', '#timemanagement', '#noteorganization']
temp 0.8 ['#knowledgeorganizationtools', '#notesystem', '#knowledgeorganization', '#notetaking', '#noteorganization']
temp 0.9 ['#time', '#informationmanagement', '#relational']
temp 1.0 ['#incremental', '#relational', '#time']


In [42]:
note

Note(path=PosixPath('example_notes/relational note taking.md'), title='relational note taking', full_text="relational note taking\n- source: https://thesephist.com/posts/inc/\n- tags: #notetaking \nBlog post codefying their note taking style, principles and tooling. informs much of [[My Note taking principles]].\n\nUses an iceberg analogy, where most note taking accounts for the 10% of structured, easily codefiable notes such as meeting or lecture notes, or notes from reading. But misses the 90% of thoughts that are the majority of our thought process.\n\nHas a strong emphasis on the incremental nature of thought, and the nature of notes over time. My own past positive experience with note taking, and using those notes, was a chronological pen-and-paper lab notebook. a chronological view, with some labeling to spur thought, was the most effective way in which I actually used and referred back to my notes. This was acheived with a paper note taking system, which in this blog they discus

### Run on all notes

In [None]:
# Request suggestions for every note in one batch (suggest_tags' default
# temperature). asyncio.gather returns results in the same order as `notes`.
all_suggested_tags = await asyncio.gather(
    *[suggest_tags(n, note_title_dict, vec_db) for n in notes]
)

In [None]:
all_suggested_tags

[['#rpgs', '#thoughtcollection', '#Ritualsofplay', '#LoadedQuestions'],
 ['#conversationstyle',
  '#affordances',
  '#communication',
  '#interpersonalcommunication',
  '#improvisation'],
 ['#storytelling', '#improv', '#theater'],
 ['#relational', '#notetaking'],
 ['#rpg'],
 ['#tool', '#thoughtcollection', '#notetaking'],
 ['#rpg', '#thoughts', '#gamebalance', '#spellcasting', '#magictradeoffs'],
 ['#worldbuilding', '#storyresponsibility', '#metacharacter', '#failure'],
 ['#deeplearning', '#graph'],
 ['#tutorial', '#writing'],
 ['#thoughtcollection', '#notetaking'],
 ['#deeplearning',
  '#machinelearning',
  '#graph',
  '#neuralnetwork',
  '#transferlearning',
  '#representationlearning'],
 ['#graph', '#bayesian', '#probability'],
 ['#gamedesignanalysis', '#videogames', '#gamestorytelling'],
 ['#roleplaying', '#gmless', '#tropes', '#storyresponsibility', '#npc'],
 ['#deeplearning', '#messagepassing', '#graphstructure'],
 ['#graph', '#machinelearning', '#algorithm'],
 ['#deeplearning', 

In [43]:
# Per-note comparison of existing and suggested tags. Needs `all_suggested_tags`
# from the "Run on all notes" cell to have been executed in the same kernel session.
for n, tags in zip(notes, all_suggested_tags):
    print("---\n", n.metadata.short_title)
    print(n.tags)  # tags already on the note
    print(tags)  # every tag suggested for the note
    print([t for t in tags if t in curated_tags])  # suggestions that are in the curated set

NameError: name 'all_suggested_tags' is not defined

## Langchain approach

In [None]:
Use a chain structure to 