# TfIdf-based lexical embedding pipeline

In this example notebook we will illustrate how Tf-Idf encoding based on character n-grams of aliases from the [NCIt](https://ncithesaurus.nci.nih.gov/ncitbrowser/) ontology can be used to construct embeddings of words and perform lexical similarity search using BlueGraph's `EmbeddingPipeline`.

In [None]:
import getpass
import math
import os
import random
import time
import json
import jwt
import sys

from collections import defaultdict

import rdflib
from rdflib import RDFS, XSD

import numpy as np
import pandas as pd
import zipfile

from joblib import parallel_backend

from sklearn.decomposition import PCA, TruncatedSVD
from scipy.sparse import vstack

from kgforge.core import KnowledgeGraphForge
from kgforge.specializations.resources import Dataset

from bluegraph import version as bg_version
from bluegraph import PandasPGFrame
from bluegraph.core.utils import Preprocessor
from bluegraph.downstream import EmbeddingPipeline
from bluegraph.downstream.similarity import FaissSimilarityIndex, SimilarityProcessor
from bluegraph.preprocess.utils import TfIdfEncoder

## Helpers

In [None]:
def find_uri_by_label(graph, label, lang="en", dtype=XSD.string):
    """Return the URI of the first resource whose ``rdfs:label`` is ``label``.

    The label is looked up as a plain literal first, then as a
    language-tagged literal and finally as a datatyped literal; the first
    subject found is returned.

    Parameters
    ----------
    graph : rdflib.Graph
        Graph to search.
    label : str
        Lexical form of the label.
    lang : str, optional
        Language tag used for the language-tagged attempt; the attempt is
        skipped when ``None``.
    dtype : rdflib.URIRef, optional
        Datatype used for the typed attempt; the attempt is skipped when
        ``None``.

    Returns
    -------
    str or None
        The subject URI as a string, or ``None`` if no literal form matches.
    """
    # An RDF literal carries either a language tag or a datatype, never both
    # (rdflib raises TypeError for the combination), so only three literal
    # forms are ever tried.
    literal_kwargs = [{}]
    if lang is not None:
        literal_kwargs.append({"lang": lang})
    if dtype is not None:
        literal_kwargs.append({"datatype": dtype})

    for kwargs in literal_kwargs:
        candidate = rdflib.Literal(label, **kwargs)
        for subject in graph.subjects(RDFS.label, candidate):
            # Only the first match matters
            return str(subject)

    return None

def get_agent(token):
    """Build a ``Person`` agent resource from the claims of an access token.

    Relies on the module-level ``forge`` session, which therefore has to be
    created before this function is called.

    Parameters
    ----------
    token : str
        Encoded JWT.

    Returns
    -------
    agent
        Forge resource keeping only the ``name``, ``email``, ``sub`` and
        ``preferred_username`` claims, with ``id`` set to ``sub`` and
        ``type`` set to ``"Person"``.
    """
    # The token is only read for its claims, so nothing is verified.
    # The legacy `verify=False` keyword was removed in PyJWT 2.0; the
    # `options` form below is accepted by both PyJWT 1.x and 2.x.
    agent_data = jwt.decode(
        token,
        options={
            "verify_signature": False,
            "verify_exp": False,
            "verify_nbf": False,
            "verify_iat": False,
            "verify_aud": False,
        })
    agent = forge.reshape(
        forge.from_json(agent_data), keep=[
            "name", "email", "sub", "preferred_username"])
    agent.id = agent.sub
    agent.type = "Person"
    return agent


def register_model(forge, agent, name, description, label, distribution, similarity, dimension):
    """Create an ``EmbeddingModel`` dataset and register it through ``forge``.

    Parameters
    ----------
    forge
        Forge session used to build and register the resource.
    agent
        Resource recorded as the contributor.
    name, description : str
        Passed to the ``Dataset`` constructor.
    label : str
        Stored as ``prefLabel``.
    distribution : str or None
        Path of the file to attach; nothing is attached when ``None``.
    similarity
        Stored as ``similarity``.
    dimension
        Stored as ``vectorDimension``.

    Returns
    -------
    The ``id`` attribute of the resource after registration.
    """
    model = Dataset(forge, name=name, description=description)
    model.type = "EmbeddingModel"
    model.prefLabel = label
    model.similarity = similarity
    model.vectorDimension = dimension

    # Attach the distribution only when a file was provided
    if distribution is not None:
        model.add_distribution(
            distribution, content_type="application/octet-stream")

    # Record who contributed the model and in which role.
    # NOTE(review): the JSON below already nests the role under "hadRole",
    # so this produces contribution.hadRole.hadRole — confirm this is intended.
    model.add_contribution(agent, versioned=False)
    model.contribution.hadRole = forge.from_json({
        "hadRole": {
            "id": "http://purl.obolibrary.org/obo/CRO_0000064",
            "label": "software engineering role"
        }
    })

    # Describe the software (BlueGraph) the model is associated with
    python_version = f"{sys.version_info.major}.{sys.version_info.minor}"
    source_code = {
        "type": "SoftwareSourceCode",
        "codeRepository": "https://github.com/BlueBrain/BlueGraph",
        "programmingLanguage": "Python",
        "runtimePlatform": python_version,
        "version": bg_version.__version__
    }
    model.wasAssociatedWith = {
        "type": "SoftwareAgent",
        "description": "Unifying Python framework for graph analytics and co-occurrence analysis.",
        "name": "BlueGraph",
        "softwareSourceCode": source_code
    }

    forge.register(model)
    return model.id


def update_model_distribution(forge, model_resource, new_distribution, vector_dim=None):
    """Replace the distribution of an existing model resource and update it.

    Parameters
    ----------
    forge
        Forge session used to attach the file and push the update.
    model_resource
        Resource to modify in place.
    new_distribution : str
        Path of the file to attach as the new distribution.
    vector_dim : optional
        New value for ``vectorDimension``; left untouched when ``None``.
    """
    if vector_dim is not None:
        model_resource.vectorDimension = vector_dim

    attachment = forge.attach(
        new_distribution, content_type="application/octet-stream")
    model_resource.distribution = attachment

    forge.update(model_resource)
    

def push_model(forge, agent, name, description, label, distribution, similarity, dimension):
    """Register a model, or update it when one with the same name exists.

    A search by ``name`` decides which path is taken. On update, only the
    distribution and the vector dimension of the first search hit are
    changed; ``description``, ``label`` and ``similarity`` are used only
    when a new model is registered.

    Parameters
    ----------
    forge
        Forge session.
    agent
        Contributor recorded on a newly registered model.
    name : str
        Model name, also used as the search key.
    description, label, similarity
        Metadata of a newly registered model.
    distribution : str
        Path of the model file to attach.
    dimension
        Dimension of the embedding vectors.

    Returns
    -------
    The ``id`` of the updated or newly registered model resource.
    """
    result = forge.search({"name": name})
    if result:
        print("Model exists, updating...")
        model_resource = result[0]
        update_model_distribution(forge, model_resource, distribution, dimension)
        return model_resource.id

    print("Registering new model...")
    return register_model(
        forge, agent, name, description, label, distribution, similarity, dimension)

Load the ontology

In [None]:
# Merge the individual Turtle files into a single RDF graph
ontology_graph = rdflib.Graph()
for ontology_path in (
    "../../ontologies/bbp/bmo.ttl",
    "../../ontologies/bbp/molecular-systems.ttl",
    "../../ontologies/bbp/etypes.ttl",
    "../../ontologies/bbp/mtypes.ttl",
    # "../../ontologies/external/allen_MBA_ontology_ccfv3.ttl",
):
    ontology_graph.parse(ontology_path, format="ttl")

In [None]:
# Build a BlueGraph frame from the RDF graph.
# NOTE(review): `remove_prop_uris=True` presumably shortens property URIs to
# plain names such as "label" (the names used in ALIAS_PROPS) — confirm.
frame = PandasPGFrame.from_ontology(rdf_graph=ontology_graph, remove_prop_uris=True)

In [None]:
# Node properties whose values are collected as aliases of an ontology term
ALIAS_PROPS = ["label", "prefLabel", "synonym", "altLabel"]

Get all unique aliases (all lower case)

In [None]:
# Map every lower-cased alias to the URI of the ontology term it belongs to.
# If two terms share an alias, the term processed last wins.
alias_mapping = {}
for node in frame.nodes():
    record = frame._nodes.loc[node].to_dict()

    # Collect the aliases of this node across all alias properties
    node_aliases = []
    for prop in ALIAS_PROPS:
        # `.get` tolerates frames that lack one of the alias columns
        value = record.get(prop)
        # Missing values show up as float NaN (or None) and are skipped
        if value is None or isinstance(value, float):
            continue
        if isinstance(value, str):
            node_aliases.append(value)
        else:
            # Multi-valued property: an iterable of strings
            node_aliases.extend(value)

    if not node_aliases:
        continue

    # The URI depends only on the node, so resolve it once per node
    # instead of once per alias
    node_uri = find_uri_by_label(ontology_graph, node)
    for alias in node_aliases:
        alias_mapping[alias.lower()] = node_uri

In [None]:
# Iterating a dict yields its keys, i.e. the unique lower-cased aliases
aliases = list(alias_mapping)

In [None]:
# Number of unique lower-cased aliases
len(aliases)

Specify Tf-Idf model parameters

In [None]:
# Settings handed to `TfIdfEncoder` when the pipeline is created.
# NOTE(review): the keys look like scikit-learn `TfidfVectorizer` arguments
# (character 3-grams, at most 1024 features) — confirm against TfIdfEncoder.
# `max_features` has the same value as the index dimension `d` (1024).
params = {
    "analyzer": "char",
    "dtype": np.float32,
    "max_df": 1.0,
    "min_df": 0.0001,
    "ngram_range": (3, 3),
    "max_features": 1024
}

Create an instance of `EmbeddingPipeline` using:

- `TfIdfEncoder` as a preprocessor,
- No embedder
- BlueGraph `SimilarityProcessor` with Euclidean distance based on an index segmented into 200 Voronoi cells (more details can be found [here](https://github.com/facebookresearch/faiss/wiki/Faster-search)).

In [None]:
# Dimension of the similarity index; same value as `max_features` in `params`
d = 1024

In [None]:
# Similarity index over d-dimensional vectors, using Euclidean distance
# and 200 segments
index = FaissSimilarityIndex(
    dimension=d, similarity="euclidean", n_segments=200)

In [None]:
# Pipeline made of the Tf-Idf preprocessor and the similarity processor;
# no embedder is used (`embedder=None`)
pipeline = EmbeddingPipeline(
    preprocessor=TfIdfEncoder(params),
    embedder=None,
    similarity_processor=SimilarityProcessor(index))

Run fitting of the pipeline on the aliases.

In [None]:
# Each alias is both the input text and the ID of its point
pipeline.run_fitting(aliases, point_ids=aliases)

Save the pipeline.

In [None]:
# NOTE(review): with `compress=True` this presumably writes
# "../data/BMO-linking.zip", the file pushed to Nexus further down — confirm.
pipeline.save("../data/BMO-linking", compress=True)

Push model into Nexus

In [None]:
# Prompt for the access token without echoing it
TOKEN = getpass.getpass()

In [None]:
# Forge session for the "dke/embedding-pipelines" bucket, authenticated
# with the token entered above; `get_agent` reads this global
forge = KnowledgeGraphForge(
    "../../config/forge-config.yml",
    endpoint="https://bbp.epfl.ch/nexus/v1",
    token=TOKEN,
    bucket="dke/embedding-pipelines")

In [None]:
# Agent resource built from the claims of the token
agent = get_agent(TOKEN)

In [None]:
# Register the saved pipeline archive, or update it if a model with this
# name already exists.
# NOTE(review): the variable name mentions "axon" although this is the BMO
# model, and the description contains a duplicated "on" — confirm and fix.
axon_model_resource = push_model(
    forge, agent, "BMO term embedding with Tf-Idf",
    "Embedding of BMO terms using a simple Tf-Idf-based model on on character n-grams",
    "BMO Tf-Idf Embedding",
    "../data/BMO-linking.zip", "euclidean", d)

In [None]:
# NOTE(review): presumably a table of the fitted points and their
# embedding vectors — confirm
embedding_table = pipeline.generate_embedding_table()

In [None]:
# Display 5 sampled rows (assuming a pandas-like `sample`)
embedding_table.sample(5)

Retrieve embedding vectors for the terms of interest.

In [None]:
# Query terms to look up among the indexed aliases.
# "lalala not in index" is presumably a deliberate miss that exercises the
# "not in index" branch of the printouts below.
terms = [
    "l5_lbc",
    "layer 5 bipolar cell",
    "burst non-accommodating electrical type",
    "lalala not in index",
    "emodel building workflow",
#     "primary somatosensory area"
]

In [None]:
# One entry per term; the printout below handles `None` entries
vectors = pipeline.retrieve_embeddings(terms)

In [None]:
# Report the vector length per term (None when no vector was retrieved)
print("Vector sizes: ")
for i, v in enumerate(vectors):
    term = terms[i]
    size = None if v is None else len(v)
    print(f"\t'{term}': {size}")

Get similar points to the query terms

In [None]:
# Three nearest neighbours for each query term, looked up by point ID;
# the printout below handles `None` entries in `points`
distances, points = pipeline.get_neighbors(
    existing_points=terms, k=3)

In [None]:
# For every query term, list its neighbouring aliases with their ontology term
for i, el in enumerate(terms):
    print(f"Similar terms to '{el}': ")
    neighbours = points[i]
    if neighbours is None:
        print(f"\t {el} is not in index")
    else:
        for p in neighbours:
            print(f"\t- {p} (ontology term {alias_mapping[p]})")
    print()

Predict vectors for potentially unseen points

In [None]:
# Free-text terms passed to `run_prediction`; they need not be indexed aliases
terms_to_predict = [
    "bipolar cell",
    "burst non-accommodating neuron",
    "mariotti cell",
    "e-model reconstruction workflow",
    "burst electrical type",
#     "primary somatosensory cortex"
]

In [None]:
# Predict vectors for the new terms (rebinds `vectors`)
vectors = pipeline.run_prediction(terms_to_predict)

Get similar points for these vectors

In [None]:
# Three nearest neighbours for each predicted vector
distances, points = pipeline.get_neighbors(vectors=vectors, k=3)

In [None]:
# For every predicted term, list its neighbouring aliases with their ontology term
for i, el in enumerate(terms_to_predict):
    print(f"Similar terms to '{el}': ")
    neighbours = points[i]
    if neighbours is None:
        print(f"\t {el} is not in index")
    else:
        for p in neighbours:
            print(f"\t- {p} (ontology term {alias_mapping[p]})")
    print()