# Storing notes in Elasticsearch using Eland

In [1]:
from pydantic import BaseModel
from typing import List
import pandas as pd
import glob 
import logging
import eland as ed
import elasticsearch
import matplotlib.pyplot as plt
from noteboard import Org

import warnings
from elasticsearch.exceptions import ElasticsearchWarning
warnings.simplefilter('ignore', ElasticsearchWarning)

## Org node elements

OrgElement represents an org-mode subheading.

In [2]:
# Load the org directory with only_root_contents=False and convert it to a DataFrame.
# Presumably this yields one row per org node (heading) rather than one per file — TODO confirm in noteboard.Org.
# NOTE(review): the notes directory is an absolute, machine-specific path.
roam_nodes_df = Org.to_df(Org.load_dir_generator("/home/kuba/Projects/org", only_root_contents=False))



In [3]:
# Load the same org directory with only_root_contents=True and convert it to a DataFrame.
# Presumably this keeps only the root-level contents of each file — TODO confirm in noteboard.Org.
# NOTE(review): the notes directory is an absolute, machine-specific path.
roam_df = Org.to_df(Org.load_dir_generator("/home/kuba/Projects/org", only_root_contents=True))



In [4]:
# Client for the Elasticsearch node at http://localhost:9200; no credentials are passed.
es_client = elasticsearch.Elasticsearch([{'host': 'localhost', 'port': 9200, 'scheme':'http'}])

In [5]:
# Passed as es_if_exists below; with "replace" an already existing destination index
# is presumably dropped and recreated — confirm in the eland documentation.
existing_behavior = "replace"

# Upload roam_nodes_df into the "org_roam_nodes" index and keep an Eland DataFrame handle to it.
# The "text" column is explicitly mapped to the Elasticsearch "text" type.
roam_nodes_elastic_df = ed.pandas_to_eland(
    roam_nodes_df,
    es_client=es_client,
    es_dest_index="org_roam_nodes",
    es_type_overrides={"text": "text"},
    es_if_exists=existing_behavior
)

  elastic_version = es_version(client)


In [9]:
roam_df["text"].apply(type).unique()

array([<class 'str'>], dtype=object)

In [16]:
roam_nodes_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5420 entries, 0 to 5419
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   org_id     5420 non-null   object
 1   file_name  5420 non-null   object
 2   heading    5420 non-null   object
 3   level      5420 non-null   int64 
 4   body       5420 non-null   object
 5   links      5420 non-null   object
 6   text       5420 non-null   object
dtypes: int64(1), object(6)
memory usage: 296.5+ KB


In [15]:
roam_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 918 entries, 0 to 917
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   org_id     918 non-null    object
 1   file_name  918 non-null    object
 2   heading    918 non-null    object
 3   level      918 non-null    int64 
 4   body       918 non-null    object
 5   links      918 non-null    object
 6   text       918 non-null    object
dtypes: int64(1), object(6)
memory usage: 50.3+ KB


In [66]:
# Passed as es_if_exists: an existing "org_roam" index is replaced by this upload.
existing_behavior = "replace"

# Upload roam_df into the "org_roam" index and keep an Eland DataFrame handle to it.
try:
    roam_elastic_df = ed.pandas_to_eland(
        roam_df,
        es_client=es_client,
        es_dest_index="org_roam",
        es_type_overrides={"text": {"type": "text"}},
        es_if_exists=existing_behavior,
        # Refresh the index after the upload so the search cells below see every document.
        es_refresh=True,
    )
except Exception as e:
    # Keep the exception around for inspection in a later cell (e.g. `exc.errors`
    # on bulk-indexing failures), then re-raise: every following cell needs
    # `roam_elastic_df`, so continuing would only fail later or use stale state.
    exc = e
    print(f"failure: {e!r}")
    raise
else:
    print("success")

success


In [80]:
roam_elastic_df.es_match("llms", columns=["text"], fuzziness=1).filter(["_score"])

8
21
33
45
47
...
898
899
912
914
916


In [93]:
# Fuzzy query against the `text` field: matches "llms" with a fuzziness of 1.
llm_query = {
    "fuzzy": {
        "text": {"value": "llms", "fuzziness": 1},
    },
}

# Eland DataFrame restricted to the documents matching the query.
llm_query_results_df = roam_elastic_df.es_query(llm_query)

In [94]:
print(llm_query_results_df.es_info())

es_index_pattern: org_roam
Index:
 es_index_field: _id
 is_source_field: False
Mappings:
 capabilities:
          es_field_name  is_source es_dtype es_date_format pd_dtype  is_searchable  is_aggregatable  is_scripted aggregatable_es_field_name
body               body       True  keyword           None   object           True             True        False                       body
file_name     file_name       True  keyword           None   object           True             True        False                  file_name
heading         heading       True  keyword           None   object           True             True        False                    heading
level             level       True     long           None    int64           True             True        False                      level
links             links       True  keyword           None   object           True             True        False                      links
org_id           org_id       True  keyword           No

In [95]:
#for err in exc.errors:
#    print(err["index"]["data"]["file_name"])

In [96]:
es_client.indices.get_mapping(index="org_roam")

ObjectApiResponse({'org_roam': {'mappings': {'properties': {'body': {'type': 'keyword'}, 'file_name': {'type': 'keyword'}, 'heading': {'type': 'keyword'}, 'level': {'type': 'long'}, 'links': {'type': 'keyword'}, 'org_id': {'type': 'keyword'}, 'text': {'type': 'text'}}}}})

In [101]:
# Run the same fuzzy query through the raw client to inspect scores and sources.
# `query=` replaces the deprecated `body={"query": ...}` form of the 8.x client.
es_client.search(index="org_roam", query=llm_query)["hits"]["hits"]

[{'_index': 'org_roam',
  '_type': '_doc',
  '_id': '381',
  '_score': 7.240573,
  '_source': {'org_id': 'dc4a0d36-8d07-4a6b-87f5-3f7cfb6e283b',
   'file_name': '20230227210357-llms',
   'heading': '',
   'level': 0,
   'body': '#+title: llms\n\n\n',
   'links': ['97379691-82f0-4fcd-a521-2ab3aeacda8a',
    '8b7560f7-9233-41c2-9939-a51497232943',
    '35740f44-515a-4bf6-bfd1-cd949e827913',
    '4d2e3520-1973-4c93-bdb1-80db1a47f065',
    '3d574d6f-f79d-437c-8c95-e361515efe3b',
    '8689333b-0885-4848-9bfe-db464db3fcb2',
    '4f718ddb-fa01-40ca-b76b-ec265f3c8d21',
    '62249719-cbc3-44bd-a6b6-55be09fed98f',
    '1e699344-4440-4347-ac1e-089a04ef0504',
    'fad664b1-7ae9-4c7e-b2b0-87298b88d8e4',
    'c8155236-245e-4025-9db6-74e761012342',
    'ab5ccbbf-5893-4d42-8785-6c8e51d0e9e2'],
   'text': ':PROPERTIES:\n:ID:       dc4a0d36-8d07-4a6b-87f5-3f7cfb6e283b\n:END:\n#+title: llms\n\n\n\n\n* Logit bias\n\nUsed to constrain generation\n\n\n\n* Vocabulary\n\n\n** Grounding\n\nBasing LLM outputs o

In [47]:
es_client.indices.get_mapping()

ObjectApiResponse({'github_search': {'mappings': {'properties': {'refresh': {'type': 'text', 'fields': {'keyword': {'type': 'keyword', 'ignore_above': 256}}}, 'title': {'type': 'text', 'analyzer': 'english'}, 'txt': {'type': 'text', 'analyzer': 'english'}}}}, 'org_roam_nodes': {'mappings': {'properties': {'body': {'type': 'keyword'}, 'file_name': {'type': 'keyword'}, 'heading': {'type': 'keyword'}, 'level': {'type': 'long'}, 'links': {'type': 'keyword'}, 'org_id': {'type': 'keyword'}, 'text': {'type': 'text', 'fields': {'keyword': {'type': 'keyword'}}}}}}, 'org_roam': {'mappings': {'properties': {'body': {'type': 'text'}, 'file_name': {'type': 'keyword'}, 'heading': {'type': 'keyword'}, 'level': {'type': 'long'}, 'links': {'type': 'keyword'}, 'org_id': {'type': 'keyword'}, 'text': {'type': 'text'}}}}})

In [None]:
type(roam_elastic_df)

In [None]:
roam_elastic_df.es_match("pragmatics", columns="text")

## Topic modeling

In [None]:
texts = roam_elastic_df["text"].to_pandas().values

In [None]:
from sklearn import decomposition, feature_extraction, pipeline

# Number of topics (LDA components) for this baseline pipeline.
n_topics = 25
# LDA topic model with a fixed random_state so repeated fits start from the same seed.
lda = decomposition.LatentDirichletAllocation(n_components=n_topics, random_state=0)

# Chain a default CountVectorizer with the LDA model; the pipeline is only built here, not fitted.
lda_pipeline = pipeline.make_pipeline(
    feature_extraction.text.CountVectorizer(),
    lda
)

In [None]:
type(lda_pipeline)

In [None]:
from pydantic import BaseModel, Field
from typing import Callable

class TopicModeler(BaseModel):
    """A vectorizer + topic-model scikit-learn pipeline bundled with its text preprocessing.

    The first pipeline step is treated as the vectorizer and the last step as
    the topic model (see ``features`` and ``topic_loadings``). Every method that
    feeds texts to the pipeline first applies ``preprocess_texts`` to
    ``texts_df[text_col]``, so fitting and inference see the same kind of input.
    """

    # Pipeline whose first step vectorizes texts and whose last step is the topic model.
    tm_pipeline: pipeline.Pipeline
    # Name of the dataframe column that holds the texts.
    text_col: str
    # Applied to the text column before it is handed to the pipeline.
    preprocess_texts: Callable[[pd.Series], pd.Series]

    @classmethod
    def init_unfitted(cls, make_vectorizer, make_topic_modeler, n_topics, text_col, vectorizer_kwargs, tm_kwargs, preprocess_texts):
        """Build a TopicModeler whose pipeline has not been fitted yet.

        Args:
            make_vectorizer: callable returning the vectorizer step; called with ``vectorizer_kwargs``.
            make_topic_modeler: callable returning the topic model step; called with
                ``tm_kwargs`` plus ``n_components=n_topics``.
            n_topics: number of topics; overrides any ``n_components`` in ``tm_kwargs``.
            text_col: name of the text column in the dataframes passed to the other methods.
            vectorizer_kwargs: keyword arguments for ``make_vectorizer`` (``None`` means none).
            tm_kwargs: keyword arguments for ``make_topic_modeler`` (``None`` means none).
            preprocess_texts: function applied to the text column before the pipeline.

        Returns:
            An instance of ``cls`` wrapping the unfitted pipeline.
        """
        # Work on copies: the caller's dicts must not be modified.
        vectorizer_kwargs = dict(vectorizer_kwargs or {})
        tm_kwargs = {**(tm_kwargs or {}), "n_components": n_topics}
        tm_pipeline = pipeline.make_pipeline(make_vectorizer(**vectorizer_kwargs), make_topic_modeler(**tm_kwargs))
        return cls(tm_pipeline=tm_pipeline, text_col=text_col, preprocess_texts=preprocess_texts)

    @classmethod
    def init(cls, texts_df, make_vectorizer, make_topic_modeler, n_topics, text_col="text", vectorizer_kwargs=None, tm_kwargs=None, preprocess_texts=lambda texts: texts):
        """Build a TopicModeler and fit its pipeline on ``texts_df[text_col]``.

        Takes the same arguments as ``init_unfitted`` plus ``texts_df``, the
        dataframe to fit on. The texts are preprocessed before fitting.

        Returns:
            The fitted TopicModeler.
        """
        tm = cls.init_unfitted(make_vectorizer, make_topic_modeler, n_topics, text_col, vectorizer_kwargs, tm_kwargs, preprocess_texts)
        tm.tm_pipeline.fit(preprocess_texts(texts_df[text_col]))
        return tm

    def _topic_distribution(self, texts_df):
        """Return the pipeline's per-document topic scores for the preprocessed text column."""
        return self.tm_pipeline.transform(self.preprocess_texts(texts_df[self.text_col]))

    def get_topic_representatives(self, texts_df, topic_idx, agg_by=None, topk=5):
        """Return the ``topk`` rows of ``texts_df`` scoring highest for topic ``topic_idx``.

        Args:
            texts_df: dataframe containing the text column.
            topic_idx: index of the topic of interest.
            agg_by: currently unused; kept so existing calls keep working.
            topk: maximum number of rows to return.

        Returns:
            The selected rows, best first, with an added ``topic_score`` column.
        """
        # Texts are preprocessed here as well, matching what the pipeline was fitted on.
        topic_scores = self._topic_distribution(texts_df)[:, topic_idx]
        top_idxs = topic_scores.argsort()[::-1][:topk]
        results_df = texts_df.iloc[top_idxs]
        return results_df.assign(topic_score=topic_scores[top_idxs])

    def get_top_topic_words(self, topk=25):
        """Return, for each topic, its ``topk`` features ordered by decreasing loading."""
        top_feature_idxs = self.topic_loadings.argsort(axis=1)[:, ::-1][:, :topk]
        return self.features[top_feature_idxs]

    def get_topics(self, texts_df):
        """Return a Series with the index of the highest-scoring topic for each row of ``texts_df``."""
        return pd.Series(self._topic_distribution(texts_df).argmax(axis=1))

    @property
    def features(self):
        """Feature names produced by the vectorizer (first pipeline step)."""
        (_, vectorizer) = self.tm_pipeline.steps[0]
        return vectorizer.get_feature_names_out()

    @property
    def topic_loadings(self):
        """The ``components_`` array of the topic model (last pipeline step)."""
        (_, tm) = self.tm_pipeline.steps[-1]
        return tm.components_

    @property
    def n_features(self):
        """Number of features produced by the vectorizer."""
        return len(self.features)

    @property
    def n_topics(self):
        """The ``n_components`` setting of the topic model (last pipeline step)."""
        (_, tm) = self.tm_pipeline.steps[-1]
        return tm.n_components

    class Config:
        # Required because the sklearn Pipeline is not a pydantic-native type.
        arbitrary_types_allowed = True

In [None]:
# One string per file: the `text` values of all rows sharing a file_name, joined with newlines.
# Aggregating only the `text` column avoids applying a function to the whole group frame.
org_file_contents = roam_df.groupby("file_name")["text"].agg("\n".join)
# Two-column frame (file_name, text) used to fit the file-level topic models.
org_file_roam_df = org_file_contents.rename("text").reset_index()

In [None]:
# `stop_words` was used below without being imported anywhere in the notebook,
# which raises NameError on a fresh kernel.
import stop_words

n_topics = 25

# English and Polish stop words, concatenated into one list for the vectorizers.
used_stop_words = stop_words.get_stop_words("en") + stop_words.get_stop_words("pl")
vectorizer_kwargs = {"min_df": 5, "binary": True, "stop_words": used_stop_words}
# NOTE(review): not passed to any TopicModeler.init call in this notebook; the Optuna
# objective builds its own NMF kwargs (with "solver": "mu"). Confirm whether these
# settings are still wanted before wiring them in.
nmf_kwargs = {"alpha_W": 1e-4, "l1_ratio": 0.0, "beta_loss": "kullback-leibler"}


def preprocess_snakecase(texts):
    """Split snake_case identifiers and mask numbers in a Series of strings.

    Underscores become spaces; then every space-delimited run of digits and
    every run of three or more digits is replaced by " NUMBER ".

    Args:
        texts: pandas Series of strings.

    Returns:
        A new Series with the replacements applied.
    """
    without_underscores = texts.str.replace("_", " ", regex=False)
    # regex=True must be explicit: pandas >= 2.0 treats the pattern as a literal
    # string by default, which silently disables the number masking.
    return without_underscores.str.replace(r" \d+ |\d{3,}", " NUMBER ", regex=True)


# Both models are fitted on whole-file texts. A fixed random_state (the same seed
# as the baseline LDA pipeline above) makes the fitted topics reproducible across runs.

# Bag-of-words counts + LDA.
lda_tm = TopicModeler.init(
    org_file_roam_df,
    n_topics=n_topics,
    make_vectorizer=feature_extraction.text.CountVectorizer,
    make_topic_modeler=decomposition.LatentDirichletAllocation,
    vectorizer_kwargs=vectorizer_kwargs,
    tm_kwargs={"random_state": 0},
    preprocess_texts=preprocess_snakecase
)

# TF-IDF + NMF.
nmf_tm = TopicModeler.init(
    org_file_roam_df,
    n_topics=n_topics,
    make_vectorizer=feature_extraction.text.TfidfVectorizer,
    vectorizer_kwargs=vectorizer_kwargs,
    make_topic_modeler=decomposition.NMF,
    tm_kwargs={"random_state": 0},
    preprocess_texts=preprocess_snakecase
)


In [None]:
lda_tm.get_topics(roam_df).value_counts()

In [None]:
nmf_tm.get_topics(roam_df).value_counts()

## Guessing good parameters for NMF

We minimize the size of the largest cluster, i.e. we look for the hyperparameters that spread the notes most evenly across topics.

In [None]:
import optuna

def objective(trial):
    """Optuna objective: size of the largest topic cluster for one NMF configuration.

    Samples ``alpha_W`` (log scale) and ``beta_loss``, fits a TF-IDF + NMF
    TopicModeler on the whole-file texts, assigns every row of ``roam_df`` to
    its highest-scoring topic and returns the size of the biggest topic.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        Number of rows assigned to the most populated topic, as a plain int.
    """
    nmf_kwargs = {
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        "alpha_W": trial.suggest_float("alpha_W", 1e-6, 0.1, log=True),
        "beta_loss": trial.suggest_categorical("beta_loss", ["frobenius", "kullback-leibler"]),
        "solver": "mu"
    }

    # Local name chosen so the notebook-level `nmf_tm` is not shadowed.
    trial_tm = TopicModeler.init(
        org_file_roam_df,
        n_topics=n_topics,
        make_vectorizer=feature_extraction.text.TfidfVectorizer,
        vectorizer_kwargs=vectorizer_kwargs,
        make_topic_modeler=decomposition.NMF,
        tm_kwargs=nmf_kwargs,
        preprocess_texts=preprocess_snakecase
    )
    largest_cluster_size = trial_tm.get_topics(roam_df).value_counts().max()
    # Convert the numpy integer to a built-in int before handing it to Optuna.
    return int(largest_cluster_size)
    

In [None]:
study = optuna.create_study()
study.optimize(objective, n_trials=100, n_jobs=8)

In [None]:
trials_df = pd.DataFrame.from_records([{**trial.params, "value": trial.value} for trial in study.get_trials()])

In [None]:
trials_df[(trials_df["value"] < 1000) & (trials_df["beta_loss"] == "frobenius")].plot.scatter("alpha_W", "value")
plt.xscale("log")

In [None]:
trials_df[(trials_df["value"] < 1000) & (trials_df["beta_loss"] == "kullback-leibler")].plot.scatter("alpha_W", "value")
plt.xscale("log")

In [None]:
trials_df.groupby("beta_loss").agg({"value": "mean"})

In [None]:
nmf_tm.tm_pipeline.transform(roam_df["text"]).argmax(axis=1)

## Topic representatives

In [None]:
lda_tm.get_topic_representatives(roam_df, 23, topk=100).groupby(["file_name"]).agg({"topic_score": "mean"}).sort_values("topic_score", ascending=False).iloc[:25]

In [None]:
nmf_tm.get_topic_representatives(roam_df, 1, topk=100).groupby(["file_name"]).agg({"topic_score": "mean"}).sort_values("topic_score", ascending=False).iloc[:25]

## Top words per topic

In [None]:
nmf_tm.get_top_topic_words()[:,:5]

In [None]:
lda_tm.get_top_topic_words()[:,:5]

## BERTopic

- **small**: models fitted on individual org nodes
- **big**: models fitted on whole files

In [None]:
import bertopic

In [None]:
# Two ways of picking the words that represent a topic.
mmr_topic_representation_model = bertopic.representation.MaximalMarginalRelevance(diversity=0.3)
keybert_topic_representation_model = bertopic.representation.KeyBERTInspired()

# "big_*" and "small_*" models differ only in min_topic_size (5 vs 25);
# "*_kw" uses the KeyBERT-inspired representation, "*_mmr" the MMR one.
bertopic_models = {
    "big_kw": bertopic.BERTopic(language="multilingual", representation_model=keybert_topic_representation_model, nr_topics=50, min_topic_size=5),
    "big_mmr": bertopic.BERTopic(language="multilingual", representation_model=mmr_topic_representation_model, nr_topics=50, min_topic_size=5),
    "small_kw": bertopic.BERTopic(language="multilingual", representation_model=keybert_topic_representation_model, nr_topics=50, min_topic_size=25),
    "small_mmr": bertopic.BERTopic(language="multilingual", representation_model=mmr_topic_representation_model, nr_topics=50, min_topic_size=25),
}

In [None]:
def show_bertopic(bertopic_model):
    """Print a short report for every topic of a fitted BERTopic model.

    Topics are visited in ascending id order. For each one the id and document
    count are printed, followed by the topic representation when the topic has
    at least one document, and then a blank line.
    """
    topic_ids = list(bertopic_model.topic_sizes_.keys())
    topic_ids.sort()
    for topic_id in topic_ids:
        print(f"topic {topic_id}")
        # get_topic_info(topic) returns a frame; its first row describes the topic.
        info_row = bertopic_model.get_topic_info(topic_id).iloc[0]
        n_documents = info_row["Count"]
        print(f"number of documents {n_documents}")
        if n_documents > 0:
            print(info_row["Representation"])
        print()

## BERTopic

It seems that BERTopic's cluster merging produces highly imbalanced topics.

In [None]:
#bertopic_models["big_kw"].fit(org_file_contents)
#bertopic_models["big_mmr"].fit(org_file_contents.values)

In [None]:
bertopic_models["small_kw"].fit(preprocess_snakecase(roam_df["text"]).values)
bertopic_models["small_mmr"].fit(preprocess_snakecase(roam_df["text"]).values)

In [None]:
show_bertopic(bertopic_models["small_kw"])

In [None]:
show_bertopic(bertopic_models["small_mmr"])

## Results

BERTopic is too aggressive when it comes to merging clusters.

LDA is the least aggressive, but the NMF pipeline seems to work better (likely because its hyperparameters are easy to optimize).

In [None]:
bertopic_models["small_mmr"].topic_sizes_[-1]

In [None]:
bertopic_models["small_kw"].topic_sizes_[-1]

In [None]:
lda_tm.get_topics(roam_df).value_counts().max()

In [None]:
nmf_tm.get_topics(roam_df).value_counts().max()

NMF results in highly interpretable topics:

- NLP
- general software
- philosophy and logic
- mathematics
- information retrieval

In [None]:
nmf_tm.get_top_topic_words()[:,:5]