In [None]:
# default_exp utils

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#export
from typing import List
from fastcore.basics import typed
from fastcore.test import *
from toolz import thread_first, thread_last
import proseflow.text as txt

In [2]:
#export
from tabulate import tabulate
def show_tabs(doc):
    """Print a table with one row of linguistic attributes per token in ``doc``.

    ``doc`` is any iterable of token objects exposing ``text``, ``lemma_``,
    ``pos_``, ``tag_``, ``dep_``, ``shape_``, ``is_alpha`` and ``is_stop``
    (presumably a spaCy ``Doc`` -- confirm against callers).
    """
    column_titles = ["token", "lemma", "POS", "Tag", "DEP", "shape", "is_alpha", "is_stop"]
    rows = []
    for tok in doc:
        rows.append([
            tok.text, tok.lemma_, tok.pos_, tok.tag_, tok.dep_,
            tok.shape_, tok.is_alpha, tok.is_stop,
        ])
    print(tabulate(rows, headers=column_titles))


In [3]:
#export

def take_while(fn, coll):
    """Lazily yield the leading items of ``coll`` for which ``fn`` is truthy.

    Iteration stops at the first item for which ``fn(item)`` is falsy; that
    item and everything after it are not yielded.
    """
    for item in coll:
        if not fn(item):
            break
        yield item

def partition(n, coll, step=None):
    """Lazily split the sequence ``coll`` into slices of exactly ``n`` items.

    A slice starts every ``step`` positions; when ``step`` is ``None`` (or 0)
    the stride is ``n``, so slices do not overlap. Output ends at the first
    slice shorter than ``n``, which drops an incomplete trailing slice.
    ``coll`` must support ``len()`` and slicing.
    """
    stride = step or n
    starts = range(0, len(coll), stride)
    windows = (coll[begin:begin + n] for begin in starts)
    return take_while(lambda window: len(window) == n, windows)

def partition_all(n, coll, step=None):
    """Lazily split the sequence ``coll`` into slices of at most ``n`` items.

    A slice starts every ``step`` positions; when ``step`` is ``None`` (or 0)
    the stride is ``n``. Unlike ``partition``, shorter slices at the end are
    kept. ``coll`` must support ``len()`` and slicing.
    """
    stride = step or n
    starts = range(0, len(coll), stride)
    return (coll[begin:begin + n] for begin in starts)

# Sliding windows of size 2 advancing by 1; the short trailing window [7] is dropped.
[*partition(2, [1, 2,3 ,4,5, 5,6,7], 1)]

[[1, 2], [2, 3], [3, 4], [4, 5], [5, 5], [5, 6], [6, 7]]

In [None]:
#export
@typed
def create_embedding_files_for_visualization(metadata, vectors, metadata_headers=None, output_dir="results"):
    """Write embedding vectors and their metadata as two row-aligned TSV files.

    ``vectors.tsv`` receives one line per vector with its components
    tab-separated; ``metadata.tsv`` receives the matching metadata line,
    preceded by a header line when ``metadata_headers`` is given.

    metadata: iterable with one entry per vector. A plain string (e.g. a
        sentence) is written as a single column; any other entry is treated
        as an iterable of fields, each converted with ``str``.
    vectors: sized, integer-indexable collection; each item is an iterable of
        components, each converted with ``str``.
    metadata_headers: optional iterable of column-name strings.
    output_dir: directory that receives both files; created if missing.

    Raises ``AssertionError`` when ``metadata`` and ``vectors`` differ in length.

    NOTE(review): the vectors file used to go to the absolute path
    ``/results/vectors.tsv`` while the metadata went to the relative
    ``results/metadata.tsv``. Both are now written under ``output_dir`` so the
    pair stays together -- confirm no caller relied on the absolute location.
    NOTE(review): parameters are deliberately left unannotated because the
    ``@typed`` decorator acts on annotations at runtime.
    """
    import os  # function-scope import keeps this block self-contained

    metadata = [*metadata]  # materialise so generators can be measured and indexed
    assert len(metadata) == len(vectors), "metadata and vectors must have the same length"

    os.makedirs(output_dir, exist_ok=True)
    vectors_filepath = os.path.join(output_dir, "vectors.tsv")
    metadata_filepath = os.path.join(output_dir, "metadata.tsv")

    # `with` guarantees both handles are closed even if a write raises part-way.
    with open(vectors_filepath, "w", encoding="utf-8") as out_vectors, \
            open(metadata_filepath, "w", encoding="utf-8") as out_metadata:
        # Meta file header
        if metadata_headers:
            out_metadata.write("\t".join(metadata_headers) + "\n")

        for i, row in enumerate(metadata):
            # Joining a bare string would put a tab between every character.
            fields = [row] if isinstance(row, str) else [str(field) for field in row]
            out_metadata.write("\t".join(fields) + "\n")
            out_vectors.write("\t".join([str(x) for x in vectors[i]]) + "\n")

In [None]:
#export
def pipe(*funcs: callable, thread="first"):
    """Build a one-argument function that threads its input through ``funcs``.

    ``funcs`` are forwarded unchanged and in order to toolz's ``thread_first``
    (when ``thread == "first"``) or ``thread_last`` (for any other value of
    ``thread``). The returned function takes the initial ``data`` and returns
    whatever the selected threading function returns.

    The ``*funcs`` annotation describes each positional argument, so it is
    ``callable`` rather than a list of callables.
    """
    # Use a separate local name instead of rebinding the `thread` argument.
    threader = thread_first if thread == "first" else thread_last
    return lambda data: threader(data, *funcs)

In [None]:
# Expose every name listed in txt.STRING_FUNCS as a module-level alias of the
# unbound ``str`` method with the same name, so it can be used as a plain function.
# A direct namespace assignment replaces the former string-built ``exec``.
for s_func in txt.STRING_FUNCS:
    globals()[s_func] = getattr(str, s_func)

In [None]:
# Default threading is "first": the input goes through `strip`, then `lower`.
clean_sentence = pipe(strip, lower)

# Surrounding whitespace is removed and the text is lower-cased.
test_eq("this is a test", clean_sentence("   THIS iS a TEsT  "))

In [None]:
#export
def dedupe_conseq(coll):
    """
    Lazily yield the items of coll, collapsing each run of consecutive equal
    items into its first item. Only adjacent repeats are dropped (items are
    compared with ``!=``); a value that reappears later is yielded again.
    """
    items = iter(coll)
    try:
        last_seen = next(items)
    except StopIteration:
        # Empty input: nothing to yield.
        return
    # The first item is always emitted, without any comparison.
    yield last_seen
    for current in items:
        if current != last_seen:
            yield current
        last_seen = current

In [None]:
# Only adjacent repeats collapse: the run of 5s becomes a single 5, the later 3 is kept.
test_eq(list(dedupe_conseq([1, 2, 3, 4, 5, 5, 5, 3])), [1, 2, 3, 4, 5, 3])


In [None]:
def tree_seq(has_branch, get_children, root):
    """
    Lazily walk a tree depth-first, yielding each node before its descendants.
    ``has_branch(node)`` returns true for nodes that may have children (they
    need not actually have any). ``get_children(node)`` returns an iterable of
    a node's children and is only called on nodes for which ``has_branch``
    returned true. ``root`` is the node the walk starts from; it is always
    yielded first.
    """
    yield root
    if not has_branch(root):
        return
    for child in get_children(root):
        yield from tree_seq(has_branch, get_children, child)

In [None]:
# Iterating a dict yields its keys, so with the identity function as get_children
# the walk emits the root dict followed by its keys; the nested dict under "e"
# is never visited as a node.
d = {"a": 1, "b": 3,"e" :{"f": 6, "g": 7}}
[*tree_seq(lambda n: type(n) == dict, lambda x : x, d)]

[{'a': 1, 'b': 3, 'e': {'f': 6, 'g': 7}}, 'a', 'b', 'e']

In [None]:
# nbdev export step; the output below lists the notebooks it converted.
from nbdev.export import notebook2script; notebook2script()

Converted aws_utils.ipynb.
Converted index.ipynb.
Converted load.ipynb.
Converted roam_utils.ipynb.
Converted semanticscholar_api.ipynb.
Converted spec.ipynb.
Converted text.ipynb.
Converted utils.ipynb.
