In [1]:
# default_exp spec

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
#export
import re
from typing import List, Tuple
import numpy as np
from varname import varname, nameof

from numpy import ndarray
from pandas import DataFrame, Series
from sentence_transformers import SentenceTransformer, models
from spacy_stanza import StanzaLanguage
from fastcore.test import test_eq

from functools import wraps
import copy

In [4]:
#export
def gsheet_to_df(worksheet) -> DataFrame:
    df = DataFrame(worksheet.get_all_values())
    df.columns = df.iloc[0]
    df = df.iloc[1:]
    return df

## TODOS
* use marshmallow python lib for serialization?
* more testing on auto_coerce!!! ... it's only working for some right now
* conform instead of typed now


In [22]:
#SPECS inspired by Malli for Clojure 

#export

TRANSFORMER = {
    "name": "TRANSFORMER"
}
SENTENCE_TRANSFORMER = {
    "name":"SENTENCE_TRANSFORMER"
}
PUBMED_IDS = {
    "name": "PUBMED_IDS"
}
PUBMED_CONTENT = {
    "name": "PUBMED_CONTENT"
}
EMAIL = {
    "name": "EMAIL"
}

# JSON = {
#     "re": "(.*?)\.(json)$"
# }

# TEXT = {
#     "re": "(.*?)\.(txt)$"
# }

SPACY_MODEL = {
    "re": "(zh|da|nl|en|fr|de|el|it|ja|lt|nb|pl|pt|ro|es|xx)[_(core|ent|ner)_(web|news|wiki|sci|craft|jnlpba|bc5cdr|bionlp13cg)_(sm|md|lg)]*$"
}
STANZA_MODEL = {
    "re": "stanza",
    "doc": "Stanford's Stanza Model",
    "options": ["stanza1", "stanza2"]
}


PUBMED_IDS = {"re": ".*pubmed.ncbi.nlm.nih.gov.*"}
GSHEET = {"re": ".*docs.google.com\/spreadsheets.*"}
    
TRANSFORMER = {"re": "TRANSFORMER:.*"}
HTML_TAG = {
    "name": "HTML_TAG"
}
URL = {
 "re": '\(?((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*\)?'   
}


# for t in TYPES:  # PYTHON MAGIC
#     exec("%s=str('%s')" % (t, t))

"""
? Dynamically create Types in NameSpace aka TRANSFORMER = "TRANSFORMER"
"""
SPEC = [GSHEET, JSON, TEXT, TRANSFORMER, SENTENCE_TRANSFORMER, PUBMED_CONTENT, PUBMED_IDS, SPACY_MODEL, STANZA_MODEL, EMAIL, HTML_TAG]

In [17]:
if {}:
    print("aa")

In [18]:
test_eq([True for s in range(len(SPEC))], [type(schema) == dict and bool(schema) for schema in SPEC])

In [19]:
#export
def infer_type(form, SPEC=SPEC):
    """ What types match this shape? """
    
    schemas_with_re = [schema for schema in SPEC if schema.get("re")]
    schemas_with_validate = [schema for schema in SPEC if schema.get("validate")]
    
    match=[]
    
    if (type(form) == str):
        match = [schema for schema in schemas_with_re if re.compile(schema["re"]).match(form)]

#     else:
#         match = [schema for schema in SPEC if form == schema] #direct {} compare 

    if len(match) > 1:
        raise Exception(
            "Found multiple inferences for the shape you put in. Please put the input_type =EMAIL or something as a keyword argument. MAKE SURE that all schemas are UNIQUELY IDENTIFYABLE"
        )

    if len(match) == 0:
        print("No Match found in type inference, returning None")
        return None
    return match[0]



In [20]:
infer_type("somejason.json")

{'re': '(.*?)\\.(json)$'}

In [24]:
test_eq(infer_type("docs.google.com/spreadsheets.2"), GSHEET)
test_eq(infer_type("en_ner_bionlp13cg_md"), SPACY_MODEL)
test_eq(infer_type("somejason.json"), JSON)
#test_eq(infer_type("distilbert-base-nli-mean-tokens"), SENTENCE_TRANSFORMER)
#test_eq(infer_type(SENTENCE_TRANSFORMER), SENTENCE_TRANSFORMER)


In [None]:
#export
def _transformer_to_sentence_transformer(transformer_model):
    pooling_model = models.Pooling(
        transformer_model.get_word_embedding_dimension(),
        pooling_mode_mean_tokens=True,
        pooling_mode_cls_token=True,
        pooling_mode_max_tokens=False,
    )

    return SentenceTransformer(modules=[transformer_model, pooling_model])

In [None]:

# for name, value in globals().items():
#     if value is STANZA_MODEL:
#         print(name)



In [None]:
#varname(STANZA_MODEL)

In [None]:
# (Source, Target) -> Transform Function
#export
TRANSFORMS = {
    #!!! DO NOT USE Tuple[str] or any of these inferior BS pythonista types. They don't work and are an abomination
    ("SPACY_MODEL", "STANZA_MODEL"): StanzaLanguage,
    (str, list): lambda string: [string],
    (tuple, list): list,
    ("GSHEET", DataFrame): gsheet_to_df,
    (Series, list): Series.to_list,
    (ndarray, list): list,
    ("TRANSFORMER", "SENTENCE_TRANSFORMER"): _transformer_to_sentence_transformer,
}

In [None]:
#export
def convert(shape, source, target):
    """
    Converts an object from source type to target type
    """
    s = source if source==str or list or tuple or dict else varname(source)
    t = target if target==str or list or tuple or dict else varname(target)
    convert_func = TRANSFORMS[(s, t)]
    return convert_func(shape)


In [None]:
list == str or list or tuple or dict

list

In [None]:
test_eq(convert(("aa", "bb"), source=tuple, target=list), ["aa", "bb"])
test_eq(convert(("aa", "bb"), source=tuple, target=list), ["aa", "bb"])
#test_eq(type(convert(load("en"), source="SPACY_MODEL", target="STANZA_MODEL")), StanzaLanguage)

In [None]:
def identity(x):
    return x
def auto_coerce(func, TRANSFORMS=TRANSFORMS):
    """
    DECORATOR: Right now we only do kwargs.
    Use this solution to get away from it: https://docs.python.org/3/library/inspect.html#introspecting-callables-with-the-signature-object
    """

    
    #def a(r:list): return r
    #a.__annotations__ == {'r': list}
    #args, kwargs == () {'r': [1, 23]}
    
    @wraps(func) 
    def wrapped(*args, **kwargs):
        #print(args, kwargs)
        annotations = func.__annotations__
        updated_kwargs = copy.deepcopy(kwargs)
        if not len(args):
            for k, v in updated_kwargs.items():
                
                #if there's no schema match
                spec = infer_type(v)
                inputdata_type = nameof(spec) if spec else type(v)
                
                anno_type = annotations.get(k)
                print(inputdata_type,"aaasd", anno_type)
                #look for convert function or do nothing
                convert_func = TRANSFORMS.get((inputdata_type, anno_type), identity)
                
                updated_kwargs[k] = convert_func(v)

        return_value = func(*args, **updated_kwargs)
        TRANSFORMS_function = TRANSFORMS.get(
            (type(return_value), annotations.get("return")), identity
        )
        return TRANSFORMS_function(return_value)

    return wrapped

In [None]:

def a(x:list): return x
def b(x:SENTENCE_TRANSFORMER): return x

test_eq(type(auto_coerce(a)(x=np.zeros(5))), list)
test_eq(type(auto_coerce(a)(x=("a", "b"))), list)
#test_eq(type(auto_coerce(b)(x=("a", "b"))), list)

#auto_coerce(b)(x=("a", "b"))

No Match found in type inference, returning None
<class 'numpy.ndarray'> aaasd <class 'list'>
No Match found in type inference, returning None
<class 'tuple'> aaasd <class 'list'>


In [None]:
from nbdev.export import notebook2script; notebook2script()

Converted aws_utils.ipynb.
Converted index.ipynb.
Converted load.ipynb.
Converted roam_utils.ipynb.
Converted semanticscholar_api.ipynb.
Converted spec.ipynb.
Converted text.ipynb.
Converted utils.ipynb.
