In [None]:
# default_exp load

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
#export
import json
import os
import re
from collections.abc import Iterable
from io import BytesIO
from typing import Dict

import gspread
import pandas as pd
import requests
import spacy
import stanza
from dotenv import load_dotenv
from multipledispatch import dispatch
from pandas import DataFrame
from sentence_transformers import SentenceTransformer, models
from spacy_stanza import StanzaLanguage
from textacy.corpus import Corpus
from typeguard import typechecked

from proseflow.spec import *

In [None]:
#hide
from nbdev.showdoc import *
from fastcore.test import *

In [None]:
    # TODO: [Markus -> use func.signature()]
    # gspreadsheet
    # csv
    # tsv
    # pubmed articles
    # wikipedia
    # url
    # load spacy_corpus
    # annotations
    # BRAT
    # Resource = Union[URL, str, email]
# ? @typecheck is pointless here

# Load

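> This module loads resources (CSV/JSON/text files, Google Sheets, stored corpora, spaCy and Stanza pipelines, and transformer models) through a single type-dispatched `load` function.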

In [None]:
gsheet_to_df

<function proseflow.spec.gsheet_to_df(worksheet) -> pandas.core.frame.DataFrame>

In [None]:
#DIR_PATH = os.path.dirname(os.path.realpath(__file__))
load_dotenv()
env_debug = True

In [None]:
# TODO: -> converter
# Example: https://docs.google.com/spreadsheets/d/1N_aANmDaosjAlodJ5nMNVPfe6REsDtsNYHj_ltH3Q_0/edit?usp=drive_web&ouid=112317186249575590696
#export
@typechecked
def _load_gsheet(
    url: str,
    sheet_number: int = 0,
    credential_path: str = os.getenv("GSHEET_CREDENTIALS"),
    **kwargs,
) -> GSHEET:
    """Open a Google Sheets workbook by URL and return one of its worksheets.

    Args:
        url: Address of the spreadsheet, handed to ``open_by_url``.
        sheet_number: Index handed to ``get_worksheet``; defaults to 0.
        credential_path: Service-account credentials file handed to
            ``gspread.service_account``. Defaults to ``$GSHEET_CREDENTIALS``.
        **kwargs: Accepted and ignored, so callers can forward their own
            options without filtering them first.

    Returns:
        Whatever ``get_worksheet`` returns for ``sheet_number``.

    Raises:
        Exception: If no credential path was passed and
            ``$GSHEET_CREDENTIALS`` is not set.
    """
    # The signature default is evaluated only once, when the function is
    # defined. Look the variable up again at call time so that a `.env` file
    # loaded (or a variable exported) after definition is still honoured.
    credential_path = credential_path or os.getenv("GSHEET_CREDENTIALS")
    if not credential_path:
        raise Exception("Add the $GSHEET_CREDENTIALS variable to your .env file.")

    client = gspread.service_account(filename=credential_path)
    workbook = client.open_by_url(url)
    return workbook.get_worksheet(sheet_number)

In [None]:
#export
def _load_corpus(nlp, path):
    """Load a textacy ``Corpus`` from ``path`` and add each label to the vocab's string store.

    NOTE(review): ``labels`` is not defined in this function nor in any cell
    visible in this notebook; presumably it is provided by the
    ``from proseflow.spec import *`` star import -- confirm, otherwise this
    raises ``NameError`` right after the corpus has been loaded.
    """
    loaded = Corpus(nlp).load(nlp, path)

    # Register every label string with the pipeline's vocabulary.
    for name in labels:
        nlp.vocab.strings.add(name)

    return loaded

In [None]:
import json

def _load_json(path: str, **kwargs):
    with open(path, "r") as file:
        return json.load(file)

def _load_txt(path: str, **kwargs):
    with open(path, "r") as file:
        return file.read()

In [None]:
txt = _load_txt("../data/cancer_abstracts.txt")
txt[:500]

'1. Respirology. 2016 Jul;21(5):821-33. doi: 10.1111/resp.12789. Epub 2016 Apr 21.\n\nImmunotherapy for lung cancer.\n\nSteven A(1)(2), Fisher SA(1)(2), Robinson BW(1)(2).\n\nAuthor information:\n(1)School of Medicine and Pharmacology, University of Western Australia, \nCrawley, Western Australia, Australia.\n(2)National Centre for Asbestos Related Diseases (NCARD), Perth, Western \nAustralia, Australia.\n\nTreatment of lung cancer remains a challenge, and lung cancer is still the \nleading cause of cancer-re'

In [None]:
# TODO: [Rico] make it work with "stanza" or "sci-md" strings
#export
@dispatch((spacy.language.Language, StanzaLanguage), str)
def load(nlp, path):
    """Load the corpus stored at ``path`` with the given pipeline.

    Registered for a spaCy ``Language`` or a ``StanzaLanguage`` as first
    argument and a string as second; the work is done by ``_load_corpus``.
    """
    corpus = _load_corpus(nlp, path)
    return corpus

In [None]:
#export
@dispatch(Iterable)
def load(resource, **kwargs):
    """Turn an iterable resource into a list and hand it back to ``load``.

    The conversion goes through ``convert`` with the resource's own type as
    source and ``list`` as target; keyword arguments are passed on untouched.
    #TODO: [Rico] -- a job for autoconvert?
    """
    return load(
        convert(resource, source=type(resource), target=list),
        **kwargs,
    )

In [None]:
# TODO [Rico] cache all sane things
#export
@dispatch(list)
def load(resource, **kwargs):
    """Load the records that a list of identifiers refers to.

    The kind of identifier is taken from the ``input_type`` keyword or, when
    that is missing, inferred from the FIRST element only (an actual pmid,
    not the list of pmids). Only ``PUBMED_IDS`` is handled; the value stored
    under the ``PUBMED_CONTENT`` key selects what is fetched:
    ``"ABSTRACT"``, ``"INFO"``, or anything else (default ``"ALL"``) for the
    full records.

    Returns:
        The result of the matching ``_get_pubmed_*`` helper, or ``None`` when
        the shape is not recognised or cannot be inferred.

    NOTE(review): ``infer_type``, ``PUBMED_IDS``, ``PUBMED_CONTENT`` and the
    ``_get_pubmed_*`` helpers are not defined in the visible cells;
    presumably they come from ``proseflow.spec`` -- confirm.
    """
    shape = kwargs.get("input_type")
    if not shape:
        if not resource:
            # An empty list has no first element to infer a type from; treat
            # it like any other unrecognised input instead of raising
            # IndexError.
            return None
        shape = infer_type(resource[0])

    if shape == PUBMED_IDS:
        content = kwargs.get(PUBMED_CONTENT) or "ALL"
        if content == "ABSTRACT":
            return _get_pubmed_abstracts(pmids=resource)
        if content == "INFO":
            return _get_pubmed_info(pmids=resource)
        return _get_pubmed_records(pmids=resource)

    return None

NameError: name '_get_pubmed_abstracts' is not defined

In [None]:
#export
def _load_transformer(model_name):
    # ! TODO: abstract so that it also works for Tensorflow, etc..; right now its only PyTorch
    # TODO: make sure it actually loads a huggingface transformer and not the sentence transformer version
    model_name = model_name.split(":")[1]

    return models.Transformer(model_name)

In [None]:
#export
def _load_spacy(model_name: str = "en_core_web_sm", **kwargs) -> spacy.language.Language:
    """Load a spaCy pipeline, downloading the model once if loading fails with ``OSError``.

    Args:
        model_name: Name handed to ``spacy.load`` (and to
            ``spacy.cli.download`` on the retry path).
        **kwargs: Forwarded unchanged to ``spacy.load``.

    Returns:
        The loaded pipeline, or ``None`` when the download or the second
        load attempt fails (a hint is printed in that case).
    """
    print("Loading SpaCy...")
    try:
        return spacy.load(model_name, **kwargs)
    except OSError:
        # First attempt failed; fall through to the download-and-retry path.
        pass

    try:
        spacy.cli.download(model_name)
        return spacy.load(model_name, **kwargs)
    except (Exception, SystemExit):
        # Best effort: report and return None instead of raising.
        # SystemExit is listed because the CLI helper may terminate via
        # sys.exit on failure; KeyboardInterrupt is deliberately NOT caught
        # so the user can still abort a long download.
        print("Download the SpaCy model before trying to import it.")
        return None

In [None]:
#export
def _load_stanza(
    stanza_setup: Dict[str, object] = {
        "lang": "en",
        "package": "genia",
        "processors": {"ner": "bionlp13cg"},
    },
    use_gpu: bool = True,
) -> stanza.Pipeline:
    """Create a Stanza pipeline, downloading its resources if the first attempt fails.

    Args:
        stanza_setup: Keyword arguments unpacked into both ``stanza.Pipeline``
            and ``stanza.download``. Values are strings or, for
            ``processors``, a nested dict. The default dict is shared between
            calls; this function only unpacks it and never modifies it.
        use_gpu: Passed to ``stanza.Pipeline``.

    Returns:
        The constructed ``stanza.Pipeline``.

    Raises:
        Whatever the second ``stanza.Pipeline`` call (or ``stanza.download``)
        raises; only the first failure is recovered from.
    """
    # TODO: [RICO -> put use_gpu inside one config]
    print("loading stanza", stanza_setup)
    try:
        snlp = stanza.Pipeline(**stanza_setup, use_gpu=use_gpu)
    except Exception:
        # Any ordinary failure triggers one download-and-retry; a
        # KeyboardInterrupt must not start a download, hence no bare except.
        stanza.download(**stanza_setup)
        snlp = stanza.Pipeline(**stanza_setup, use_gpu=use_gpu)

    return snlp

In [None]:
#export
@dispatch(str)  # dispatch decides if the load gets executed; the type level is more expressive
def load(resource, *args, **kwargs):
    """Load whatever a string refers to: a file path, a sheet URL, or a model name.

    This names the important args like config and credentials, but leaves
    options open.

    Resolution order:
      1. By file extension: ``.csv`` and ``.tsv`` become DataFrames, ``.json``
         is parsed, ``.txt`` is returned as a string.
      2. Otherwise by shape, taken from ``input_type`` or ``infer_type``:
         ``GSHEET``, ``SPACY_MODEL``, ``STANZA_MODEL``,
         ``SENTENCE_TRANSFORMER`` or ``TRANSFORMER``.

    Keyword Args:
        input_type: Shape to use instead of inferring it from ``resource``.
        as_type: Optional target type; triggers a ``convert`` call for sheets,
            Stanza pipelines and transformers.
        columns: With a sheet converted to ``DataFrame``, the columns to keep.
        Remaining keywords are forwarded to the selected loader.

    Returns:
        The loaded object, or the string ``"None found"`` when nothing matched.
    """
    if resource.endswith(".csv"):
        return pd.read_csv(resource)
    if resource.endswith(".tsv"):
        # Same reader as CSV, with a tab as the field separator.
        return pd.read_csv(resource, sep="\t")
    if resource == "some url":
        pass  # scrape (params:)
    if resource.endswith(".json"):
        return _load_json(resource)
    if resource.endswith(".txt"):
        return _load_txt(resource)

    shape = kwargs.get("input_type") or infer_type(resource)
    print(shape, "shape", kwargs)
    as_type = kwargs.get("as_type")
    should_convert = as_type is not None

    # `input_type` and `as_type` steer this dispatcher only. Strip them before
    # forwarding to model loaders, whose signatures do not accept them
    # (`_load_stanza` would raise TypeError on an unexpected keyword).
    loader_kwargs = {
        key: value
        for key, value in kwargs.items()
        if key not in ("input_type", "as_type")
    }

    if shape == GSHEET:
        # `_load_gsheet` takes **kwargs and ignores what it does not know.
        gs = _load_gsheet(resource, **kwargs)

        # ! Don't try to be smart here and use (should_convert and convert(...)) -- there are
        # problems with boolean operators and some types.
        if should_convert:
            gs = convert(gs, source=GSHEET, target=as_type)
            if as_type == DataFrame and kwargs.get("columns"):
                gs = gs[kwargs.get("columns")]
        return gs
    if shape == SPACY_MODEL:
        return _load_spacy(resource, **loader_kwargs)
    if shape == STANZA_MODEL:
        snlp = _load_stanza(**loader_kwargs)
        if as_type:
            # Any truthy `as_type` converts to a spaCy model; the requested
            # target itself is not consulted here.
            return convert(snlp, source=STANZA_MODEL, target=SPACY_MODEL)
        return snlp
    if shape == SENTENCE_TRANSFORMER:
        return SentenceTransformer(resource)
    if shape == TRANSFORMER:
        transformer_model = _load_transformer(resource)
        if as_type:
            return convert(
                transformer_model, source=TRANSFORMER, target=SENTENCE_TRANSFORMER
            )
        return transformer_model

    return "None found"

In [None]:
load("../data/cancer_abstracts.txt")[:100]

'1. Respirology. 2016 Jul;21(5):821-33. doi: 10.1111/resp.12789. Epub 2016 Apr 21.\n\nImmunotherapy for'

In [None]:
test_eq(type(load("en")), spacy.lang.en.English)

{'re': '(zh|da|nl|en|fr|de|el|it|ja|lt|nb|pl|pt|ro|es|xx)[_(core|ent|ner)_(web|news|wiki|sci|craft|jnlpba|bc5cdr|bionlp13cg)_(sm|md|lg)]*$'} shape {}
Loading SpaCy...
[38;5;2m✔ Download and installation successful[0m
You can now load the model via spacy.load('en_core_web_sm')
[38;5;2m✔ Linking successful[0m
/Users/markus/Library/Caches/pypoetry/virtualenvs/proseflow-GKtXBSGs-py3.8/lib/python3.8/site-packages/en_core_web_sm
-->
/Users/markus/Library/Caches/pypoetry/virtualenvs/proseflow-GKtXBSGs-py3.8/lib/python3.8/site-packages/spacy/data/en
You can now load the model via spacy.load('en')


In [None]:
test_eq(type(load("en_core_web_sm", disable=["tagger", "ner", "parser"])), spacy.lang.en.English)

{'re': '(zh|da|nl|en|fr|de|el|it|ja|lt|nb|pl|pt|ro|es|xx)[_(core|ent|ner)_(web|news|wiki|sci|craft|jnlpba|bc5cdr|bionlp13cg)_(sm|md|lg)]*$'} shape {'disable': ['tagger', 'ner', 'parser']}
Loading SpaCy...


In [None]:
SENTENCE_TRANSFORMER


{'name': 'SENTENCE_TRANSFORMER'}

In [None]:
load("distilbert-base-nli-mean-tokens", input_type=SENTENCE_TRANSFORMER)

{'name': 'SENTENCE_TRANSFORMER'} shape {'input_type': {'name': 'SENTENCE_TRANSFORMER'}}


SentenceTransformer(
  (0): Transformer(
    (auto_model): DistilBertModel(
      (embeddings): Embeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (transformer): Transformer(
        (layer): ModuleList(
          (0): TransformerBlock(
            (attention): MultiHeadSelfAttention(
              (dropout): Dropout(p=0.1, inplace=False)
              (q_lin): Linear(in_features=768, out_features=768, bias=True)
              (k_lin): Linear(in_features=768, out_features=768, bias=True)
              (v_lin): Linear(in_features=768, out_features=768, bias=True)
              (out_lin): Linear(in_features=768, out_features=768, bias=True)
            )
            (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (ffn): FFN(
              (dro

In [None]:
#test_eq(type(load("distilbert-base-nli-mean-tokens", input_type="SENTENCE_TRANSFORMER")), SentenceTransformer)

In [None]:
#TODO
@dispatch(int)
def save(what, where):
    """Unimplemented stub: the body is only ``pass``.

    The comments below list the targets that are planned.

    NOTE(review): the function is registered with ``@dispatch(int)`` although
    it takes two parameters -- presumably a placeholder signature; confirm
    before implementing.
    """
    # spacy_docs_to_corpus -> annotation
    # csv
    # tsv
    # to_local (Binary, String, List[str], List[json], json, dict)
    pass