Semantic Search: 

"Natural Language Processing (NLP): Semantic search uses NLP to process queries as a human would, focusing on the intent behind the words.

Vector embeddings: Semantic search represents words, sentences, and data as numerical vectors, placing similar concepts close together in a multi-dimensional space.

Similarity search: When a query is made, it is converted into a vector and compared to the vectors of the available data. The system then returns the content vectors that are most similar (closest) to the query vector.

Knowledge graphs: These connect entities and concepts, helping the system understand the relationships between them."

# Import Packages

In [14]:
import os
import math
import re
from pathlib import Path
import re, math
import pandas as pd
import numpy as np
import time

!pip install chromadb
!pip install --upgrade pip
import chromadb
from chromadb.config import Settings
from chromadb.utils import embedding_functions
# NOTE(review): this in-memory client is not referenced again in the visible
# notebook; the collection is later created from a separate PersistentClient.
# Confirm whether it is still needed.
chroma_client = chromadb.Client()




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




# Import Data

In [2]:
# Folder that holds the input files (relative to the notebook's working directory).
DATA_DIR = Path("data_directory")

# Load the cleaned pipeline CSV into a DataFrame.
pipeline_csv = DATA_DIR / "cleaned_pipeline.csv"
df = pd.read_csv(pipeline_csv)

# Create Documents, Metadata, Chroma Collection

In [3]:
# Helper: binary label derived from the deal stage.
def is_won(stage: str) -> int:
    """Return 1 when ``stage`` reads "won", otherwise 0.

    The value is converted with ``str()`` first, then compared after
    stripping surrounding whitespace and lower-casing, so non-string
    inputs (e.g. ``None``) are handled and simply yield 0.
    """
    normalized = str(stage).strip().lower()
    return int(normalized == "won")

In [4]:
# Render one row as a single sentence; these sentences are the documents that get embedded.
def row_to_doc(r) -> str:
    """Build the text document for one row.

    ``r`` only needs a dict-style ``.get(key, default)``. A missing
    ``sales_agent`` is rendered as ``unknown``; any other missing field
    is rendered as ``?``.
    """
    agent = r.get('sales_agent', 'unknown')
    revenue = r.get('revenue', '?')
    employees = r.get('employees', '?')
    close_value = r.get('close_value', '?')
    cycle_days = r.get('sales_cycle_days', '?')
    return (
        f"Sales agent {agent}, revenue {revenue}, employees {employees}, "
        f"close value {close_value}, sales cycle {cycle_days} days."
    )

# One text document per DataFrame row, in row order.
row_docs = df.apply(row_to_doc, axis=1)
documents = row_docs.astype(str).tolist()


In [8]:
def _to_float_or_none(value):
    """Return ``value`` as a float, or None when it is missing (None/NaN).

    Values that cannot be parsed are coerced to NaN by ``pd.to_numeric``;
    those NaNs are removed later by the ``v == v`` filter.
    """
    if not pd.notna(value):
        return None
    return float(pd.to_numeric(value, errors="coerce"))


# Build one metadata dict per row, in the same order as ``documents``.
metadatas = []
for _, r in df.iterrows():
    meta = {
        "sales_agent": str(r.get("sales_agent", "")),   # keep as text
        "revenue": _to_float_or_none(r.get("revenue")),
        "employees": _to_float_or_none(r.get("employees")),
        "close_value": _to_float_or_none(r.get("close_value")),
        "sales_cycle_days": _to_float_or_none(r.get("sales_cycle_days")),
        "deal_stage": str(r.get("deal_stage", "")).strip().lower(),   # keep as text
        "is_won": is_won(r.get("deal_stage", "")),   # 1 or 0
    }
    # Drop keys where the value is None or NaN (NaN is the only value with v != v).
    clean_meta = {k: v for k, v in meta.items() if v is not None and v == v}
    metadatas.append(clean_meta)

# Built once, after the loop. Previously this line sat inside the loop body,
# so the whole id list was rebuilt for every row and `ids` was never defined
# when the DataFrame was empty.
ids = [f"row-{i:06d}" for i in range(len(documents))]


In [9]:
# Create (or reopen) the Chroma collection that the row documents are added to.

# Embedding function attached to the collection.
emb_fn = embedding_functions.DefaultEmbeddingFunction()

# On-disk client: the database under ./chroma_db persists across runs.
client = chromadb.PersistentClient(path="./chroma_db")

collection = client.get_or_create_collection(name="pipeline_semantic_min", embedding_function=emb_fn)


In [15]:
# Precompute embeddings in batches, then add to Chroma in batches.
# NOTE(review): emb_fn was already created with this same constructor in the
# collection-setup cell; this re-creation looks redundant — confirm before removing.
emb_fn = embedding_functions.DefaultEmbeddingFunction()  # local CPU

def chunked(seq, size):
    """Yield ``(start_index, chunk)`` pairs covering ``seq`` in order.

    Each chunk is ``seq[start:start + size]``; the final chunk may be
    shorter than ``size``. An empty ``seq`` yields nothing.
    """
    starts = range(0, len(seq), size)
    for begin in starts:
        stop = begin + size
        yield begin, seq[begin:stop]

# 1) Precompute embeddings with small batches + progress
EMB_BATCH = 128  # smaller is smoother for CPU
all_vecs = []
t0 = time.perf_counter()
for start, docs in chunked(documents, EMB_BATCH):
    vecs = emb_fn(docs)                     # returns one vector per document in the batch
    all_vecs.extend(vecs)
    print(f"[embed] {start+len(docs)}/{len(documents)} | {time.perf_counter()-t0:.1f}s")

# Fail early if the parallel lists drifted apart (e.g. stale kernel state);
# the batched slices below assume they are aligned by position.
if not (len(all_vecs) == len(documents) == len(metadatas) == len(ids)):
    raise ValueError(
        "embeddings/documents/metadatas/ids have different lengths: "
        f"{len(all_vecs)}/{len(documents)}/{len(metadatas)}/{len(ids)}"
    )

# 2) Write to Chroma using the precomputed embeddings (no embedding work inside the call).
# upsert is used instead of add so that re-running this cell against the persistent
# collection refreshes existing ids rather than depending on how add treats duplicates.
ADD_BATCH = 256
t1 = time.perf_counter()
for start in range(0, len(documents), ADD_BATCH):
    end = min(start + ADD_BATCH, len(documents))
    collection.upsert(
        embeddings=all_vecs[start:end],
        documents=documents[start:end],
        metadatas=metadatas[start:end],
        ids=ids[start:end],
    )
    print(f"[add] {end}/{len(documents)} | {time.perf_counter()-t1:.1f}s")


[embed] 128/6711 | 18.0s
[embed] 256/6711 | 30.6s
[embed] 384/6711 | 42.9s
[embed] 512/6711 | 61.1s
[embed] 640/6711 | 75.3s
[embed] 768/6711 | 91.3s
[embed] 896/6711 | 105.7s
[embed] 1024/6711 | 116.3s
[embed] 1152/6711 | 128.2s
[embed] 1280/6711 | 140.7s
[embed] 1408/6711 | 152.6s
[embed] 1536/6711 | 164.4s
[embed] 1664/6711 | 177.7s
[embed] 1792/6711 | 216.0s
[embed] 1920/6711 | 231.4s
[embed] 2048/6711 | 262.0s
[embed] 2176/6711 | 273.1s
[embed] 2304/6711 | 286.0s
[embed] 2432/6711 | 308.5s
[embed] 2560/6711 | 328.8s
[embed] 2688/6711 | 339.9s
[embed] 2816/6711 | 350.2s
[embed] 2944/6711 | 362.5s
[embed] 3072/6711 | 381.3s
[embed] 3200/6711 | 397.2s
[embed] 3328/6711 | 408.9s
[embed] 3456/6711 | 420.4s
[embed] 3584/6711 | 436.8s
[embed] 3712/6711 | 450.4s
[embed] 3840/6711 | 466.9s
[embed] 3968/6711 | 479.7s
[embed] 4096/6711 | 493.5s
[embed] 4224/6711 | 505.6s
[embed] 4352/6711 | 518.7s
[embed] 4480/6711 | 530.1s
[embed] 4608/6711 | 543.5s
[embed] 4736/6711 | 556.2s
[embed] 4864/6

# Query: Won/Not Won (neighbor-based prediction)

In [16]:
def predict_won_from_text(collection, query_text: str, k: int = 10, threshold: float = 0.5):
    """Estimate a "won" propensity for a free-text query from its nearest neighbors.

    The ``k`` nearest records are fetched from ``collection``. Every neighbor
    whose metadata carries an integer ``is_won`` label votes with weight
    ``1 / (distance + eps)``; the propensity is the weighted share of
    ``is_won == 1`` votes.

    Args:
        collection: Object exposing ``query(query_texts=..., n_results=..., include=...)``
            and returning a dict with ``"metadatas"`` and ``"distances"`` lists.
        query_text: Text to search with.
        k: Number of neighbors to request.
        threshold: Propensity at or above which the decision is ``"WON"``.

    Returns:
        dict with ``propensity_won``, ``neighbors``, ``decision``, ``threshold``
        and ``top_examples`` (up to three nearest labeled neighbors). When no
        neighbor has a label, ``propensity_won`` and ``decision`` are ``None``
        and a ``reason`` string is included.
    """
    # Documents are not used below, so only metadata and distances are requested.
    res = collection.query(
        query_texts=[query_text],
        n_results=k,
        include=["metadatas", "distances"],
    )
    metas = res["metadatas"][0]
    dists = res["distances"][0]

    # Keep neighbors that have a label; skip records stored without metadata (None).
    labeled = [
        (m, d)
        for m, d in zip(metas, dists)
        if m is not None and isinstance(m.get("is_won"), (int, bool))
    ]
    if not labeled:
        return {
            "propensity_won": None,
            "neighbors": 0,
            "decision": None,
            "threshold": threshold,
            "top_examples": [],
            "reason": "No labeled neighbors.",
        }

    eps = 1e-6  # avoids division by zero for exact matches (distance 0)
    weights = [1.0 / (d + eps) if d is not None else 1.0 for _, d in labeled]
    votes = [int(m["is_won"]) for m, _ in labeled]
    prop = sum(w * v for w, v in zip(weights, votes)) / sum(weights)

    return {
        "propensity_won": float(prop),
        "neighbors": len(labeled),
        "decision": "WON" if prop >= threshold else "NOT_WON",
        "threshold": threshold,
        # Quick peek at the nearest labeled neighbors; .get() keeps this from
        # raising when a record lacks one of the optional fields.
        "top_examples": [
            {"deal_stage": m.get("deal_stage"), "sales_agent": m.get("sales_agent")}
            for m, _ in labeled[:3]
        ],
    }

# Example free-text prospect, phrased in the same sentence format as the row documents.
example_prospect = "Sales agent Jamie, revenue 1200, employees 80, close value 600, sales cycle 30 days."
example_prediction = predict_won_from_text(collection, example_prospect, k=10, threshold=0.55)
print(example_prediction)


{'propensity_won': 0.9009872144879337, 'neighbors': 10, 'decision': 'WON', 'threshold': 0.55, 'top_examples': [{'deal_stage': 'won', 'sales_agent': 'James Ascencio'}, {'deal_stage': 'won', 'sales_agent': 'Donn Cantrell'}, {'deal_stage': 'won', 'sales_agent': 'James Ascencio'}]}


# Predict Pre-Existing Row

In [17]:
# NOTE(review): this redefines row_to_doc, which is already defined earlier in
# the notebook and produces the same text; consider keeping only one definition.
def row_to_doc(r) -> str:
    """Format one row as a comma-separated sentence.

    ``r`` only needs a dict-style ``.get(key, default)``. A missing
    ``sales_agent`` becomes ``unknown``; other missing fields become ``?``.
    """
    fields = [
        f"Sales agent {r.get('sales_agent','unknown')}",
        f"revenue {r.get('revenue','?')}",
        f"employees {r.get('employees','?')}",
        f"close value {r.get('close_value','?')}",
        f"sales cycle {r.get('sales_cycle_days','?')} days.",
    ]
    return ", ".join(fields)

def predict_won_from_row(collection, df, row_index: int, k: int = 10, threshold: float = 0.5):
    """Score the row at position ``row_index`` of ``df``.

    The row is rendered with ``row_to_doc`` and the resulting text is passed to
    ``predict_won_from_text``; that function's result dict is returned unchanged.

    NOTE(review): if this row's document is already stored in ``collection``,
    it will come back as a neighbor at (near) zero distance and dominate the
    inverse-distance vote, so the score largely echoes the row's own label.
    Confirm whether the self-match should be excluded.
    """
    row = df.iloc[row_index]
    query_text = row_to_doc(row)
    return predict_won_from_text(collection, query_text, k=k, threshold=threshold)

# Example: score the row at position 12
example_row_index = 12
print(predict_won_from_row(collection, df, example_row_index))


{'propensity_won': 0.9998911848247066, 'neighbors': 10, 'decision': 'WON', 'threshold': 0.5, 'top_examples': [{'deal_stage': 'won', 'sales_agent': 'Maureen Marcano'}, {'deal_stage': 'won', 'sales_agent': 'Maureen Marcano'}, {'deal_stage': 'won', 'sales_agent': 'Maureen Marcano'}]}


# Batch Score All Rows

In [18]:
def score_all_rows(collection, df, k=10, threshold=0.5):
    """Return the ``propensity_won`` value for every row of ``df``, in row order.

    Each row is rendered with ``row_to_doc`` and scored by its own call to
    ``predict_won_from_text`` (one collection query per row). Entries are
    ``None`` for rows where that function found no labeled neighbors.
    """
    return [
        predict_won_from_text(collection, row_to_doc(row), k=k, threshold=threshold)["propensity_won"]
        for _, row in df.iterrows()
    ]

# Score every row, then label it "won" when the propensity reaches 0.5.
row_scores = score_all_rows(collection, df, k=10, threshold=0.5)
df["propensity_won"] = row_scores
reaches_cutoff = df["propensity_won"] >= 0.5
df["pred_label"] = reaches_cutoff.map({True:"won", False:"not_won"})


KeyboardInterrupt: 