In [None]:
# default_exp roam_utils

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#export
import re
import numpy as np
from proseflow.aws_utils import *
from proseflow.load import *
from proseflow.text import *
from proseflow.utils import pipe

from dotenv import load_dotenv
from collections.abc import Iterable

from IPython.display import JSON

In [None]:
#hide
from fastcore.test import *

In [None]:
# Load environment variables from a .env file, if one is present (python-dotenv).
load_dotenv()

True

In [None]:
#export
# Expose each str method named in STRING_FUNCS as a module-level function
# (e.g. a name "lower" becomes lower = str.lower), so the methods can be used
# as plain callables. Binding through globals() avoids exec-ing generated code.
# NOTE(review): STRING_FUNCS comes from one of the wildcard imports; presumably
# it includes the replace/strip/lower names used by clean_sentence - confirm.
for s_func in STRING_FUNCS:
    globals()[s_func] = getattr(str, s_func)

In [None]:
# Name of the S3 bucket passed to read_json_from_s3 in this notebook.
BUCKET = "roam-export"

In [None]:
# Read "scify.json" from the bucket and keep the first 10 entries stored under its "data" key.
# NOTE(review): the same load is repeated as test_roam_graph in a later cell, and
# roam_graph itself is not referenced by the other visible cells.
roam_graph = read_json_from_s3(bucket=BUCKET, key="scify.json").get("data")[:10]

In [None]:
#export
# Pattern for URL-like text: an optional "(", an optional http:// or https://
# scheme, a host/path run that must contain a dot followed by 2-6 letters,
# any trailing path/query characters, and an optional ")".
# Written as a raw string so the regex backslashes are not parsed as (invalid)
# Python string escapes; the pattern text itself is unchanged.
URL_REGEX = r'\(?((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*\)?'

def remove_html_tags(form, content="HTML_TAGS"):
    """Return ``form`` with every ``<...>`` span deleted.

    The match is non-greedy and does not cross newlines, so each tag on a line
    is removed individually. ``content`` is accepted but not used.
    """
    return re.sub('<.*?>', '', form)

def remove_buttons(form):
    """Remove ``{{[[TODO]]}}``, ``{{[[DONE]]}}`` and ``{{[[slider]]}}`` markers from ``form``.

    Only the marker itself is removed; surrounding whitespace is left as is.
    Returns the resulting string.
    """
    # Raw string: the braces and brackets are escaped for the regex engine, and
    # those backslashes must not be read as (invalid) Python string escapes.
    button_elems = re.compile(r'\{\{\[\[(TODO|DONE|slider)\]\]\}\}')
    return button_elems.sub('', form)

def remove_url(form):
    """Return ``form`` with every substring matching ``URL_REGEX`` deleted."""
    return re.sub(URL_REGEX, '', form)

def remove_attr(form):
    """Strip a leading ``name::`` attribute prefix from ``form``.

    Everything from the start of the string up to and including the first
    ``::`` is removed, provided no colon or line break occurs before it
    (``"tag:: #programming"`` -> ``" #programming"``). Text that does not start
    with such a prefix is returned unchanged.
    """
    # The "::" marker is mandatory. With an optional colon run (":*") the
    # pattern also matched colon-free text and wiped out the whole first line.
    attr = re.compile(r'^[^:\r\n]+::')
    return attr.sub('', form)

def replace_block_ref(form, lookup):
    """Replace each ``((uid))`` block reference in ``form`` with its text from ``lookup``.

    Args:
        form: the string to process.
        lookup: mapping from block uid to the text of that block.

    Returns:
        ``form`` with every resolvable reference substituted in place. A
        reference whose uid is missing from ``lookup`` is left untouched, so
        the result is always a string. A string without references is returned
        unchanged.
    """
    def _resolve(match):
        text = lookup.get(match.group(1))
        # Unknown uid: keep the original "((uid))" rather than yielding None.
        return match.group(0) if text is None else text

    # "[^()]+" keeps each match inside a single pair of double parentheses, so
    # several references in one string are resolved independently.
    return re.sub(r'\(\(([^()]+)\)\)', _resolve, form)

def remove_duplicates(form: Iterable):
    """Drop repeated items from a list, keeping the first occurrence of each.

    Args:
        form: any value; only ``list`` instances are de-duplicated.

    Returns:
        A new list with duplicates removed in first-seen order when ``form`` is
        a list; otherwise ``form`` itself, unchanged.

    Raises:
        TypeError: if the list contains unhashable items.
    """
    if isinstance(form, list):
        # dict.fromkeys preserves insertion order, so the result is stable
        # between runs (a set would return the items in arbitrary order).
        return list(dict.fromkeys(form))
    return form

# Text-cleaning pipeline built with proseflow.utils.pipe from the helpers above
# plus the str-method functions replace / strip / lower.
# NOTE(review): presumably pipe applies the steps in the order listed and treats
# a tuple as (function, *extra_args) - confirm against proseflow.utils.pipe.
# NOTE(review): replace_block_ref is declared with a second parameter (lookup);
# how that argument reaches it through pipe is not visible here - confirm.
clean_sentence = pipe(
                      replace_block_ref,
                      remove_buttons,
                      remove_html_tags,
                      remove_url,
#                       remove_attr,  # left out of the pipeline: flagged as buggy
                      # The remaining steps collapse double spaces and delete markup characters.
                      (replace, "  ", " "),
                      (replace, "[", ""),
                      (replace, "]", ""), 
                      (replace, "#", ""),
                      (replace, "`", ""),
                      (replace, "__", ""),
                      (replace, "~~", ""),
                      (replace, "**", ""),
                      (replace, "^^", ""),
                      strip, 
                      lower
                      )

In [None]:
# Checks for the individual cleaning helpers: button markers, URLs (inside a
# markdown link and bare), and a leading "tag::" attribute prefix.
test_eq(remove_buttons("{{[[TODO]]}} watch at least 5 videos of the course on bio"), " watch at least 5 videos of the course on bio")
test_eq(remove_buttons("{{[[DONE]]}} watch at least 5 videos of the course on bio"), " watch at least 5 videos of the course on bio")
test_eq(remove_url("[How to take smart notes video](https://vimeo.com/275530205)"), "[How to take smart notes video]")
test_eq(remove_url("How to take smart notes video https://vimeo.com/275530205"), "How to take smart notes video ")
test_eq(remove_attr("tag:: #programming"), " #programming")
# Disabled manual check; note that it calls replace_block_ref without its lookup argument.
# print(replace_block_ref("First, extract all the relevant sentences using the words that were used in the dataset from ((nC-tI-yaD))"))

In [None]:
#export
def is_too_short(s, length=10):
    """Return True when ``s`` contains fewer than ``length`` words.

    Args:
        s: the text to measure.
        length: minimum number of words for the text to count as long enough.

    Words are runs of non-whitespace characters, so repeated spaces, tabs and
    line breaks between words do not inflate the count.
    """
    # split() with no separator never yields empty tokens, unlike split(" "),
    # which counts each extra space in a run as an additional "word".
    return len(s.split()) < length

# Substrings that disqualify a block: task markers, the "::" attribute marker,
# code fences and image embeds.
stop_symbols = ["TODO", "DONE", "::", "```", "!["]
def has_stop_symbols(s):
    """Return True if ``s`` contains at least one entry of ``stop_symbols``."""
    for marker in stop_symbols:
        if marker in s:
            return True
    return False

In [None]:
# export
def roam_graph_to_blocks(roam_graph):
    """Flatten a nested graph into a ``{uid: string}`` dictionary.

    Each entry of ``roam_graph`` is walked depth-first (parent before its
    children, children in listed order). Every node that has both a non-empty
    "string" and a non-empty "uid" contributes one entry; nodes lacking either
    are skipped but their "children" are still visited.
    """
    uid_to_text = {}

    def _collect(node):
        # A node given as a list is represented by its first element only.
        if type(node) == list:
            node = node[0]
        text = node.get("string")
        if text and node.get("uid"):
            uid_to_text[node.get("uid")] = text
        # A missing or empty "children" value means there is nothing to descend into.
        for child in node.get("children") or ():
            _collect(child)

    for top_level in roam_graph:
        _collect(top_level)

    return uid_to_text

In [None]:
# Fixture: the first 10 entries under "data" in scify.json, flattened to a uid -> text mapping.
test_roam_graph = read_json_from_s3(bucket=BUCKET, key="scify.json").get("data")[:10]
roam_blocks = roam_graph_to_blocks(test_roam_graph)
# Stand-in vectors: one integer per block, so position i corresponds to the i-th block.
test_vectors = np.arange(len(roam_blocks))

# First uid / text pair of the mapping.
roam_block_uid = list(roam_blocks.keys())[0]
roam_block_sentence = list(roam_blocks.values())[0]

# Both the uid and the text are expected to be of type str.
test_eq_type(type(roam_block_uid), str)
test_eq_type(type(roam_block_sentence), str)

In [None]:
# export
def roam_blocks_to_embeddings_index(roam_blocks, vectors):
    """Build the list of index records for the blocks worth keeping.

    Args:
        roam_blocks: mapping of block uid to block text.
        vectors: indexable collection whose i-th item is the vector for the
            i-th entry of ``roam_blocks``; each item must provide ``.tolist()``.

    Returns:
        A list of ``{"uid", "sentence", "embedding"}`` dicts, one per block that
        is neither too short nor contains a stop symbol. The sentence is stored
        as given (it is not passed through ``clean_sentence``).
    """
    # The position is taken over all blocks, including rejected ones, so that
    # it keeps pointing at the matching row of ``vectors``.
    return [
        {
            "uid": uid,
            "sentence": sentence,
            "embedding": vectors[position].tolist(),
        }
        for position, (uid, sentence) in enumerate(roam_blocks.items())
        if not (is_too_short(sentence) or has_stop_symbols(sentence))
    ]

In [None]:
# Build the index from the fixture and check the keys (and their order) of the first record.
index_list = roam_blocks_to_embeddings_index(roam_blocks, test_vectors)
test_eq(list(index_list[0].keys()), ["uid", "sentence", "embedding"])

In [None]:
# Run nbdev's notebook2script; the recorded output lists the notebooks it converted.
from nbdev.export import notebook2script; notebook2script()

Converted aws_utils.ipynb.
Converted index.ipynb.
Converted load.ipynb.
Converted roam_utils.ipynb.
Converted semanticscholar_api.ipynb.
Converted spec.ipynb.
Converted text.ipynb.
Converted utils.ipynb.
