# Demo 3 - Mistyped philosophers

In [None]:
!pip install --quiet "astrapy>=1.0.0" "python-dotenv>=1.0.0" "unidecode>=1.3.8"

In [None]:
import getpass
import os

from dotenv import load_dotenv
from unidecode import unidecode

from astrapy import DataAPIClient
from astrapy.constants import VectorMetric

## Setup DB

In [None]:
# Load settings via python-dotenv before falling back to interactive prompts.
load_dotenv()

# Prompt only for settings that are still missing from the environment.
if os.environ.get("ASTRA_DB_APPLICATION_TOKEN") is None:
    # The token is read with getpass rather than input().
    os.environ["ASTRA_DB_APPLICATION_TOKEN"] = getpass.getpass("Please input your Astra DB Token:")

if os.environ.get("ASTRA_DB_API_ENDPOINT") is None:
    os.environ["ASTRA_DB_API_ENDPOINT"] = input("Please input your Astra DB API Endpoint:")

# The keyspace is optional: a blank answer leaves the variable unset.
if os.environ.get("ASTRA_DB_KEYSPACE") is None:
    _namespace = input("(Optional) Input your Astra DB namespace if desired, or leave blank:")
    if _namespace:
        os.environ["ASTRA_DB_KEYSPACE"] = _namespace

ASTRA_DB_APPLICATION_TOKEN = os.environ["ASTRA_DB_APPLICATION_TOKEN"]
ASTRA_DB_API_ENDPOINT = os.environ["ASTRA_DB_API_ENDPOINT"]
# None when no keyspace was provided.
ASTRA_DB_KEYSPACE = os.getenv("ASTRA_DB_KEYSPACE")

In [None]:
# Build a client from the application token and get a handle to the database
# at the configured API endpoint. ASTRA_DB_KEYSPACE may be None (it is read
# with os.environ.get), in which case no explicit keyspace is passed.
db = DataAPIClient(ASTRA_DB_APPLICATION_TOKEN).get_database_by_api_endpoint(ASTRA_DB_API_ENDPOINT, namespace=ASTRA_DB_KEYSPACE)

## Digram-vector from a philosopher name

In [None]:
# Lowercase ASCII letters 'a'..'z', built from consecutive code points.
alphabet = list(map(chr, range(ord("a"), ord("z") + 1)))
# Every ordered pair of letters, in lexicographic order ('aa', 'ab', ..., 'zz').
digrams = [first + second for first in alphabet for second in alphabet]
num_digrams = len(digrams)

# Show the alphabet and a few sample index -> digram associations.
print(f"Alphabet: {''.join(alphabet)}")
print("Digrams:")
print(f"      0 -> '{digrams[0]}'")
print(f"      1 -> '{digrams[1]}'")
print(f"    326 -> '{digrams[326]}'")
print(f"    {num_digrams - 1} -> '{digrams[num_digrams - 1]}'")

In [None]:
def normalize_text(text):
    """Reduce *text* to a string of lowercase ASCII letters.

    The input is passed through ``unidecode`` (ASCII transliteration) and
    lowercased; every character that is not in ``alphabet`` (spaces,
    hyphens, digits, ...) is dropped.

    Args:
        text: the string to normalize.

    Returns:
        The normalized string, possibly empty.
    """
    base = unidecode(text).lower()
    # Join once instead of growing a string with += inside a loop.
    return "".join(c for c in base if c in alphabet)

# Sample names with capitals, a hyphen and a non-ASCII letter.
for src_name in ("Quine", "Martin-Löf", "Merleau-Ponty"):
    print(
        f"normalize_text('{src_name}') ==> '{normalize_text(src_name)}'"
    )

In [None]:
def digrams_from_string(text):
    """List the known digrams found in *text*, in order of appearance.

    Each pair of adjacent characters is considered; pairs that are not in
    ``digrams`` are skipped. Repeated digrams are kept as repeated entries.
    """
    # zip() stops at the shorter input, so this pairs each char with its successor.
    candidates = (f"{left}{right}" for left, right in zip(text, text[1:]))
    return [pair for pair in candidates if pair in digrams]

# Already-normalized sample names.
for src_name in ("quine", "martinlof", "merleauponty"):
    print(
        f"digrams_from_string('{src_name}') ==> '{digrams_from_string(src_name)}'"
    )

In [None]:
# Reverse lookup: digram -> its position in the `digrams` list.
digram_to_index = dict(zip(digrams, range(len(digrams))))

def normalize_vector(v):
    """Scale *v* to unit Euclidean length.

    Args:
        v: a sequence of numbers.

    Returns:
        A new list with every component divided by the Euclidean norm of
        *v*. If the norm is not positive (all-zero or empty input), *v*
        itself is returned unchanged.
    """
    length = sum(component * component for component in v) ** 0.5
    if length > 0:
        return [component / length for component in v]
    # Nothing to scale: hand back the input as is.
    return v

def name_to_vector(text):
    """Turn a name into its normalized digram-count vector.

    The name is normalized with ``normalize_text``, split into digrams with
    ``digrams_from_string``, and each digram occurrence adds one to the
    component at that digram's index. The counts are then passed through
    ``normalize_vector``.

    Returns:
        A list of ``num_digrams`` numbers.
    """
    counts = [0] * num_digrams
    for dg in digrams_from_string(normalize_text(text)):
        counts[digram_to_index[dg]] += 1
    return normalize_vector(counts)

# Preview only: print the first 130 characters of the vector's string form.
print(f"name_to_vector('Kierkegaard') =\n   {str(name_to_vector('Kierkegaard'))[:130]}...")

In [None]:
# Collection for the philosopher names. Its vectors have one component per
# digram (num_digrams) and are compared by dot product.
# NOTE(review): check_exists=False presumably lets this call succeed when the
# collection already exists (e.g. on a re-run) — confirm against the astrapy docs.
ph_names_collection = db.create_collection(
    "craftdemo_philonames",
    dimension=num_digrams,
    metric=VectorMetric.DOT_PRODUCT,
    check_exists=False,
)
# Clear out any previous contents, just in case this demo is re-run.
ph_names_collection.delete_all()

## Data input

In [None]:
# Correctly spelled names to store in the collection; note the hyphen and
# the non-ASCII letter in "Martin-Löf".
philosophers = [
    "Kierkegaard",
    "Martin-Löf",
    "Wittgenstein",
    "Sartre",
    "Heidegger",
    "Spinoza",
]

In [None]:
# Store each name as a document together with its digram vector, and print
# the id reported by the insertion result.
for ph_name in philosophers:
    ph_vector = name_to_vector(ph_name)
    result = ph_names_collection.insert_one({"name": ph_name}, vector=ph_vector)
    print(f"Inserted {ph_name} (_id = {result.inserted_id})")

## Approximate search by digram-vector

In [None]:
def find_philosopher(name, threshold=0.8):
    """Look up a philosopher by name, tolerating misspellings.

    An exact match on the "name" field is tried first. If there is none,
    a vector search with the digram vector of *name* is run, and its result
    is returned only when its "$similarity" is at least *threshold*.

    Args:
        name: the (possibly misspelled) name to look for.
        threshold: minimum "$similarity" accepted for a vector match.

    Returns:
        The matching document, or None if nothing qualifies. A document
        found through the vector search also carries "$similarity".
    """
    exact_hit = ph_names_collection.find_one({"name": name})
    if exact_hit:
        return exact_hit

    # No exact hit: try with digram-based misspelling corrections.
    candidate = ph_names_collection.find_one(vector=name_to_vector(name), include_similarity=True)
    if candidate is not None and candidate["$similarity"] >= threshold:
        return candidate
    return None

In [None]:
# Correctly spelled name that is in the `philosophers` list.
found = find_philosopher("Spinoza")
print(f"Found: {found['name']} ({found['_id']})" if found else "No match.")

In [None]:
# Misspelled, lowercase query: differs from every name in the `philosophers` list.
found = find_philosopher("wiggenstein")
print(f"Found: {found['name']} ({found['_id']})" if found else "No match.")

In [None]:
# Query without hyphen or diacritic and with a wrong final letter.
found = find_philosopher("martinlov")
print(f"Found: {found['name']} ({found['_id']})" if found else "No match.")

In [None]:
# A name that is not in the `philosophers` list; a result is printed only if
# some stored name reaches the similarity threshold.
found = find_philosopher("Miocuggino")
print(f"Found: {found['name']} ({found['_id']})" if found else "No match.")