In [1]:
%pip install rapidfuzz

Collecting rapidfuzz
  Downloading rapidfuzz-3.14.3-cp311-cp311-win_amd64.whl.metadata (12 kB)
Downloading rapidfuzz-3.14.3-cp311-cp311-win_amd64.whl (1.5 MB)
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
   ---------------------------------------- 0.0/1.5 MB ? eta -:--:--
    --------------------------------------- 0.0/1.5 MB ? eta -:--:--
    --------------------------------------- 0.0/1.5 MB ? eta -:--:--
   - -------------------------------------- 0.0/1.5 MB 279.3 kB/s eta 0:00:06
   - -------------------------------------- 0.0/1.5 MB 279.3 kB/s eta 0:00:06
   -- ------------------------------------- 0.1/1.5 MB 403.5 kB/s eta 0:00:04
   -- ------------------------------------- 0.1/1.5 MB 435.7 kB/s eta 0:00:04
   ------ --------------------------------- 0.2/1.5 MB 752.5 kB/s eta 0:00:02
   ------ --------------------------------- 0.2/1.5 MB 752.5 kB/s eta 0:00:02
   ------------ ------------

In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
Provider Specialty → NUCC Taxonomy Ensemble Mapper
- No external API calls. Optional local embeddings if installed (sentence-transformers).
- Combines: Synonyms → Fuzzy → TF-IDF → (optional) Embeddings, with calibrated ensemble scoring.
- Handles multi-specialty strings (e.g., "Cardio / Diab", "ENT & Allergy").
- Returns JUNK if confidence below threshold.

Usage:
  Import this module (or run this cell in a notebook) and call main() with the NUCC
  taxonomy CSV path, the input CSV path (must contain a 'raw_specialty' column) and the
  output CSV path.
  The argparse-based command-line entry point (--nucc, --input, --out, ...) at the bottom
  of this file is currently commented out so the code can run inside a notebook;
  re-enable it to use those flags from a shell.
"""

import argparse
import math
import os
import re
import sys
from typing import Dict, List, Tuple

import numpy as np
import pandas as pd
from rapidfuzz import fuzz, process
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


# --------------------------- Text Utilities ---------------------------

# Matches any single character that is NOT a lowercase letter, digit, '&', '/',
# space, '+' or '-'. normalize_text() lowercases first and then replaces every
# match with a space.
PUNCT_RE = re.compile(r"[^a-z0-9&/ +\-]")
# Matches a run of one or more whitespace characters (used to collapse spacing).
SPACE_RE = re.compile(r"\s+")
# Separators between specialties in one string: ';', '/', ',', '&', '+', or the
# words "and" / "with" surrounded by single spaces (case-insensitive).
SPLIT_MULTI = re.compile(r"[;/,&+]| and | with ", flags=re.IGNORECASE)
# Whole-word administrative / location terms. The pattern is case-sensitive
# (no IGNORECASE flag), so callers must lowercase the text before searching.
JUNK_HINTS = re.compile(r"\b(dept|department|clinic|hospital|desk|admin|billing|front|room|floor|block)\b")

def normalize_text(t: str) -> str:
    """Return a lowercased, punctuation-stripped, single-spaced copy of ``t``.

    The value is coerced with ``str()``, lowercased and stripped. Every
    character matched by ``PUNCT_RE`` is replaced by a space, and runs of
    whitespace are then collapsed to one space with the ends trimmed.
    """
    lowered = str(t).lower().strip()
    without_punct = PUNCT_RE.sub(" ", lowered)
    return SPACE_RE.sub(" ", without_punct).strip()

def expand_synonyms(text: str, syn_map: Dict[str, str]) -> str:
    """Replace each whitespace-separated token of ``text`` found in ``syn_map``.

    Args:
        text: Input string; tokens are obtained with ``str.split()``.
        syn_map: Mapping of token -> replacement. When empty or ``None`` the
            text is returned untouched (original spacing included).

    Returns:
        The tokens, substituted where a mapping exists, joined by single spaces.
    """
    if syn_map:
        return " ".join(syn_map.get(token, token) for token in text.split())
    return text

def split_multi_specialty(raw: str) -> List[str]:
    """Split a possibly multi-specialty string into trimmed, non-empty parts.

    The input is coerced with ``str()`` and split on ``SPLIT_MULTI``. If no
    non-blank part survives, a one-element list holding the stringified input
    is returned instead.
    """
    text = str(raw)
    pieces = []
    for fragment in SPLIT_MULTI.split(text):
        fragment = fragment.strip()
        if fragment:
            pieces.append(fragment)
    if not pieces:
        return [text]
    return pieces


# --------------------------- Embeddings (Optional) ---------------------------

def try_load_embedder(use_embeddings: bool, model_name: str):
    """Best-effort load of a SentenceTransformer model.

    Args:
        use_embeddings: When false, nothing is imported or loaded.
        model_name: Name passed to ``SentenceTransformer``.

    Returns:
        ``(model, embedding_dimension)`` on success, otherwise ``(None, None)``.
        Any exception raised while importing or constructing the model is
        reported on stderr and treated as "embeddings disabled".
    """
    if use_embeddings:
        try:
            # Imported lazily so the dependency stays optional.
            from sentence_transformers import SentenceTransformer
            encoder = SentenceTransformer(model_name)
            return encoder, encoder.get_sentence_embedding_dimension()
        except Exception as e:
            print(f"[INFO] Embeddings disabled (could not init '{model_name}'): {e}", file=sys.stderr)
    return None, None


# --------------------------- Ensemble Scoring ---------------------------

def rescale_0_1(x, lo, hi):
    """Clamp ``x`` to ``[lo, hi]`` and map it linearly onto ``[0, 1]``.

    Returns 0.0 when the interval is empty or degenerate (``hi`` not greater
    than ``lo``).
    """
    if not hi > lo:
        return 0.0
    clamped = min(hi, x)
    clamped = max(lo, clamped)
    return (clamped - lo) / (hi - lo)

def combine_scores(fuzzy, tfidf, embed, w_fuzzy=0.45, w_tfidf=0.35, w_embed=0.20) -> float:
    """Weighted sum of the per-signal scores.

    Args:
        fuzzy: Fuzzy-match score.
        tfidf: TF-IDF similarity score.
        embed: Embedding similarity score, or ``None`` when embeddings are
            unavailable.
        w_fuzzy, w_tfidf, w_embed: Weights of the three signals.

    Returns:
        ``w_fuzzy*fuzzy + w_tfidf*tfidf + w_embed*embed`` when ``embed`` is
        given. When ``embed`` is ``None`` the fuzzy/TF-IDF weights are
        renormalised to sum to 1 and only those two signals are combined; if
        those two weights sum to zero (or less) there is nothing to
        renormalise, so 0.0 is returned instead of dividing by zero.
    """
    if embed is None:
        s = w_fuzzy + w_tfidf
        if s <= 0:
            # e.g. an embedding-only weighting whose model failed to load
            return 0.0
        w_fuzzy /= s
        w_tfidf /= s
        return w_fuzzy * fuzzy + w_tfidf * tfidf
    return w_fuzzy * fuzzy + w_tfidf * tfidf + w_embed * embed


# --------------------------- Main Pipeline ---------------------------

def load_synonyms(path: str) -> Dict[str, str]:
    """Load the shorthand -> standard-term map used for synonym expansion.

    Args:
        path: CSV path. The first column is read as the short form and the
            second as the standard term (expected header: short_form,
            standard_term); additional columns are ignored. When ``path`` is
            empty or does not exist, a built-in default map is returned.

    Returns:
        Dict of lowercased, stripped short form -> lowercased, stripped
        standard term. Rows missing either of the first two values are skipped;
        for duplicate short forms the last row wins.

    Raises:
        ValueError: If the CSV has fewer than two columns.
    """
    if not path or not os.path.exists(path):
        # Default medical shorthand expansions
        return {
            "ent": "otolaryngology",
            "obgyn": "obstetrics gynecology",
            "gyn": "gynecology",
            "cardio": "cardiology",
            "cv": "cardiovascular",
            "endo": "endocrinology",
            "diab": "diabetes",
            "neuro": "neurology",
            "derma": "dermatology",
            "ortho": "orthopedic",
            "psych": "psychiatry",
            "ped": "pediatrics",
            "peds": "pediatrics",
            "heent": "otolaryngology",
            "audiologist": "audiology",
            "path": "pathology",
            "im": "internal medicine",
            "fp": "family medicine",
            "pmr": "physical medicine rehabilitation",
        }
    df = pd.read_csv(path)
    if df.shape[1] < 2:
        raise ValueError(
            f"Synonyms CSV '{path}' must have at least two columns (short_form, standard_term)"
        )
    # Only the two columns we read decide whether a row is usable; a blank in
    # an unrelated extra column must not discard a valid pair.
    pairs = df.iloc[:, :2].dropna()
    # Explicit positional access: integer keys on a label-indexed row Series
    # (r[0], r[1]) are deprecated in pandas 2.x.
    short_forms = pairs.iloc[:, 0].astype(str).str.strip().str.lower()
    standard_terms = pairs.iloc[:, 1].astype(str).str.strip().str.lower()
    return dict(zip(short_forms, standard_terms))


def build_nucc_corpus(nucc: pd.DataFrame) -> Tuple[pd.DataFrame, List[str]]:
    """Prepare the NUCC taxonomy frame and its searchable text corpus.

    Args:
        nucc: Taxonomy table. Any of the expected columns that are missing are
            added as empty strings. The caller's frame is not modified.

    Returns:
        ``(frame, corpus)`` where ``frame`` is a copy of ``nucc`` with missing
        values replaced by "" and a new ``combined`` column, and ``corpus`` is
        that column as a list (one normalized string per taxonomy row, in row
        order). ``combined`` is the normalized concatenation of
        Classification, Specialization and Display_Name; rows where that is
        empty fall back to Grouping + Definition.
    """
    # Work on a copy so adding columns does not leak into the caller's frame.
    nucc = nucc.copy()
    for col in ["Code", "Grouping", "Classification", "Specialization", "Display_Name", "Definition", "Notes", "Section"]:
        if col not in nucc.columns:
            nucc[col] = ""
    nucc = nucc.fillna("")
    # Concatenation below needs strings even if a column was parsed as numeric.
    text_cols = ["Classification", "Specialization", "Display_Name", "Grouping", "Definition"]
    nucc[text_cols] = nucc[text_cols].astype(str)
    nucc["combined"] = (nucc["Classification"] + " " +
                        nucc["Specialization"] + " " +
                        nucc["Display_Name"]).map(normalize_text)
    # Fallback: if combined is empty, use grouping/definition
    mask_empty = nucc["combined"].str.len() == 0
    nucc.loc[mask_empty, "combined"] = (nucc.loc[mask_empty, "Grouping"] + " " +
                                        nucc.loc[mask_empty, "Definition"]).map(normalize_text)
    return nucc, nucc["combined"].tolist()


def precompute_models(corpus: List[str], use_embeddings: bool, model_name: str):
    """Fit the TF-IDF model on ``corpus`` and, optionally, embed the corpus.

    Returns:
        ``(vectorizer, tfidf_matrix, embedder, emb_matrix)``. ``embedder`` and
        ``emb_matrix`` are both ``None`` when ``try_load_embedder`` does not
        return a model.
    """
    vectorizer = TfidfVectorizer(stop_words="english")
    doc_term_matrix = vectorizer.fit_transform(corpus)

    encoder, _dim = try_load_embedder(use_embeddings, model_name)
    corpus_vectors = None
    if encoder is not None:
        corpus_vectors = encoder.encode(corpus, show_progress_bar=False, normalize_embeddings=True)

    return vectorizer, doc_term_matrix, encoder, corpus_vectors


def shortlist_candidates(query: str, corpus: List[str], topk: int) -> List[int]:
    """Return corpus indices of the ``topk`` best fuzzy (WRatio) matches for ``query``.

    ``process.extract`` yields ``(match_str, score, index)`` tuples; only the
    index is kept, in the order the matches are returned.
    """
    matches = process.extract(query, corpus, scorer=fuzz.WRatio, limit=topk)
    return [position for _text, _score, position in matches]


def score_against_candidates(query: str,
                             nucc: pd.DataFrame,
                             corpus: List[str],
                             cand_idx: List[int],
                             tfidf, tfidf_matrix,
                             embedder, emb_matrix):
    """Score ``query`` against the shortlisted corpus rows and rank them.

    For each index in ``cand_idx`` three signals are computed and rescaled to
    0..1: a blend of fuzzy ratios, TF-IDF cosine similarity, and (only when
    both ``embedder`` and ``emb_matrix`` are given) embedding similarity. They
    are merged with ``combine_scores``.

    Returns:
        List of ``(corpus_index, combined, fuzzy, tfidf, embedding)`` tuples
        sorted by combined score, highest first. ``embedding`` is ``-1.0`` when
        embeddings are not in use. ``nucc`` is accepted but not read here.
    """
    def _blended_fuzzy(candidate: str) -> float:
        # Blend several fuzzy scorers (each 0..100) to reduce idiosyncrasies,
        # then soften the scale so anything at or below 40 counts as 0.
        w = fuzz.WRatio(query, candidate)
        ts = fuzz.token_sort_ratio(query, candidate)
        pr = fuzz.partial_ratio(query, candidate)
        return rescale_0_1(0.5 * w + 0.3 * ts + 0.2 * pr, 40, 100)

    fuzzy_scores = [_blended_fuzzy(corpus[i]) for i in cand_idx]

    # TF-IDF cosine similarity, with empirical rescaling bounds
    query_vec = tfidf.transform([query])
    cosines = cosine_similarity(query_vec, tfidf_matrix[cand_idx]).ravel()
    tfidf_scores = [rescale_0_1(float(c), 0.15, 0.75) for c in cosines]

    # Embedding similarity (optional)
    have_embeddings = embedder is not None and emb_matrix is not None
    if have_embeddings:
        query_emb = embedder.encode([query], show_progress_bar=False, normalize_embeddings=True)[0]
        raw_sims = np.dot(emb_matrix[cand_idx], query_emb).tolist()   # cosine in [-1,1]
        # shift to 0..1, then rescale with a floor
        emb_scores = [rescale_0_1((sim + 1) / 2.0, 0.55, 0.95) for sim in raw_sims]
    else:
        emb_scores = [None] * len(cand_idx)

    ranked = []
    for i, fz, tf, em in zip(cand_idx, fuzzy_scores, tfidf_scores, emb_scores):
        conf = combine_scores(fz, tf, em)
        ranked.append((i, conf, fz, tf, -1.0 if em is None else em))
    # Highest combined confidence first
    return sorted(ranked, key=lambda entry: entry[1], reverse=True)


def map_one(raw: str,
            nucc: pd.DataFrame,
            corpus: List[str],
            tfidf, tfidf_matrix,
            embedder, emb_matrix,
            syn_map: Dict[str, str],
            topk: int,
            threshold: float) -> Tuple[str, float, str]:
    """Map a single specialty string to NUCC code(s).

    Args:
        raw: The raw specialty text. ``None``, NaN/NA, falsy and blank values
            are treated as empty input.
        nucc: Taxonomy frame; ``Code``, ``Display_Name`` and ``Classification``
            are read for the selected rows.
        corpus: Searchable strings, positionally aligned with ``nucc``.
        tfidf, tfidf_matrix, embedder, emb_matrix: Models passed through to
            ``score_against_candidates``.
        syn_map: Token -> replacement map applied after normalization.
        topk: Shortlist size for the fuzzy pre-filter.
        threshold: Minimum ensemble confidence for a non-JUNK result.

    Returns:
        ``(codes, confidence, explanation)``. ``codes`` is ``"JUNK"`` or the
        matched codes joined by ``" | "`` (the best match plus any runner-up
        within 0.03 of it that also clears the threshold). ``confidence`` is
        the best ensemble score rounded to two decimals.
    """
    # NaN / pd.NA cells are truthy (or un-boolable), so the plain `not raw`
    # check would let them through and the text "nan" would get matched.
    try:
        is_missing = raw is None or bool(pd.isna(raw))
    except (TypeError, ValueError):
        is_missing = False
    if is_missing or not raw or str(raw).strip() == "":
        return "JUNK", 0.0, "Empty input"

    original = str(raw)
    # Administrative/location wording does not block matching, but is
    # reported when the input ends up below the confidence threshold.
    looks_administrative = JUNK_HINTS.search(original.lower()) is not None
    junk_note = " ; input contains administrative/location wording" if looks_administrative else ""

    # Normalize & expand
    cleaned = normalize_text(original)
    cleaned = expand_synonyms(cleaned, syn_map)
    if not cleaned:
        # Nothing but punctuation: there is no text to match against.
        return "JUNK", 0.0, "Empty input after normalization"

    # Shortlist
    cand_idx = shortlist_candidates(cleaned, corpus, topk=topk)
    if not cand_idx:
        return "JUNK", 0.0, "No candidates"

    # Score candidates with ensemble
    combined = score_against_candidates(cleaned, nucc, corpus, cand_idx,
                                        tfidf, tfidf_matrix, embedder, emb_matrix)
    if not combined:
        return "JUNK", 0.0, "No scores"

    best_conf = combined[0][1]
    if best_conf < threshold:
        return "JUNK", round(float(best_conf), 2), f"No confident match (score={best_conf:.2f})" + junk_note

    # Keep runners-up within 0.03 of the best score to surface ambiguity.
    # `combined` is sorted descending, so stop at the first one that falls out.
    tie_delta = 0.03
    picks = [combined[0]]
    for item in combined[1:]:
        if best_conf - item[1] <= tie_delta and item[1] >= threshold:
            picks.append(item)
        else:
            break

    # Convert to codes and explanation
    codes = []
    expl_parts = []
    for idx, conf, fz, tf, em in picks:
        row = nucc.iloc[idx]
        codes.append(str(row["Code"]))
        name = row["Display_Name"] if "Display_Name" in row and str(row["Display_Name"]).strip() else row["Classification"]
        # A negative embedding score is the "embeddings not used" sentinel.
        em_txt = f"{em:.2f}" if em >= 0 else "NA"
        expl_parts.append(f"{name} [ens={conf:.2f}, fuzzy={fz:.2f}, tfidf={tf:.2f}, emb={em_txt}]")

    nucc_codes = " | ".join(codes)
    explain = f"Matched '{original}' → " + " ; ".join(expl_parts)
    return nucc_codes, round(float(best_conf), 2), explain


def map_row_multispecialty(raw, **kwargs):
    """Map a possibly multi-specialty cell to the union of its parts' codes.

    Args:
        raw: The cell value. ``None`` and NaN/NA are reported as empty input
            rather than being stringified and matched as text.
        **kwargs: Forwarded unchanged to ``map_one`` for every part.

    Returns:
        ``(codes, confidence, explanation)``. ``codes`` is the order-preserving,
        de-duplicated union of all parts' codes joined by ``" | "``, or
        ``"JUNK"`` if no part mapped. ``confidence`` is the highest part
        confidence. ``explanation`` joins the per-part explanations with
        ``" + "`` (or reports the best part when nothing mapped).
    """
    try:
        is_missing = raw is None or bool(pd.isna(raw))
    except (TypeError, ValueError):
        is_missing = False
    if is_missing:
        # str(None) / str(nan) would otherwise be fuzzy-matched as "None" / "nan".
        return "JUNK", 0.0, "Empty input"

    # Split multi-specialty strings and union the results
    parts = split_multi_specialty(raw)
    all_codes = []
    explains = []
    confs = []

    for p in parts:
        codes, conf, exp = map_one(p, **kwargs)
        confs.append(conf)
        explains.append(exp)
        if codes != "JUNK":
            all_codes.extend(c.strip() for c in codes.split("|"))

    if not all_codes:
        # if none mapped, return the best single part explanation
        best_i = int(np.argmax(confs))
        return "JUNK", round(float(confs[best_i]), 2), f"Multi-part unresolved; best part: {explains[best_i]}"

    # Deduplicate while preserving first-seen order (dicts keep insertion order)
    uniq_codes = [c for c in dict.fromkeys(all_codes) if c != ""]

    # Confidence for multi-specialty = max part confidence
    final_conf = float(np.max(confs))
    return " | ".join(uniq_codes), round(final_conf, 2), " + ".join(explains)


def main(nucc_path: str, input_path: str, out_path: str, syn_path: str = "", threshold: float = 0.50, topk: int = 25, use_embeddings: bool = False, model_name: str = "all-MiniLM-L6-v2"):
    """Run the full mapping pipeline from CSV inputs to a CSV output.

    Args:
        nucc_path: Path to the NUCC taxonomy CSV.
        input_path: Path to the input CSV; must contain a ``raw_specialty`` column.
        out_path: Path the result CSV is written to.
        syn_path: Optional synonyms CSV passed to ``load_synonyms``.
        threshold: Minimum confidence for a non-JUNK result.
        topk: Candidate shortlist size.
        use_embeddings: Whether to try loading a sentence-embedding model.
        model_name: Embedding model name.

    Side effects:
        Writes ``out_path`` with columns raw_specialty, nucc_codes, confidence,
        explain, and prints a summary line. Exits with status 2 (after printing
        to stderr) if the input CSV lacks the ``raw_specialty`` column.
    """
    # Load data (the input file is parsed once and reused for the column check)
    nucc = pd.read_csv(nucc_path)
    input_df = pd.read_csv(input_path)
    if "raw_specialty" not in input_df.columns:
        print("[ERROR] 'raw_specialty' column not found in input CSV.", file=sys.stderr)
        sys.exit(2)

    # Build NUCC corpus
    nucc, corpus = build_nucc_corpus(nucc)

    # Prepare models
    tfidf, tfidf_matrix, embedder, emb_matrix = precompute_models(
        corpus, use_embeddings=use_embeddings, model_name=model_name
    )

    # Synonyms
    syn_map = load_synonyms(syn_path)

    # Arguments that are identical for every row
    shared = dict(
        nucc=nucc,
        corpus=corpus,
        tfidf=tfidf,
        tfidf_matrix=tfidf_matrix,
        embedder=embedder,
        emb_matrix=emb_matrix,
        syn_map=syn_map,
        topk=topk,
        threshold=threshold,
    )

    # Map rows
    out_rows = []
    for raw in input_df["raw_specialty"].tolist():
        codes, conf, explain = map_row_multispecialty(raw=raw, **shared)
        out_rows.append((raw, codes if codes else "JUNK", conf, explain))

    out_df = pd.DataFrame(out_rows, columns=["raw_specialty", "nucc_codes", "confidence", "explain"])
    out_df.to_csv(out_path, index=False)
    print(f"[OK] Wrote {len(out_df)} rows to {out_path}")


# if __name__ == "__main__":
#     # This part is commented out to avoid argparse errors when running in the notebook
#     # ap = argparse.ArgumentParser()
#     # ap.add_argument("--nucc", required=True, help="Path to nucc_taxonomy_master.csv")
#     # ap.add_argument("--input", required=True, help="Path to input_specialties.csv with column 'raw_specialty'")
#     # ap.add_argument("--out", required=True, help="Path to write output CSV")
#     # ap.add_argument("--syn", default="", help="Optional synonyms CSV (short_form,standard_term)")
#     # ap.add_argument("--threshold", type=float, default=0.50, help="Confidence threshold for non-JUNK")
#     # ap.add_argument("--topk", type=int, default=25, help="Candidate shortlist size")
#     # ap.add_argument("--use-embeddings", action="store_true", help="Enable sentence embeddings if available")
#     # ap.add_argument("--model", default="all-MiniLM-L6-v2", help="SentenceTransformer model name")
#     # args = ap.parse_args()
#     # main(args.nucc, args.input, args.out, args.syn, args.threshold, args.topk, args.use_embeddings, args.model)

In [None]:
import os
import sys
import pandas as pd

# Import from your preprocessing script.
# load_synonyms is aliased on purpose: importing it under its own name would
# rebind the notebook-level load_synonyms that main() looks up when it runs.
from preprocessing import PreprocessSpecialty
from preprocessing import load_synonyms as load_preprocessing_synonyms

# --- File paths ---
nucc_path = "nucc_taxonomy_master.csv"
input_path = "input_specialties.csv"
output_path = "output_notebook.csv"
synonyms_path = "synonyms.csv"

# --- Step 1: Load input ---
df_input = pd.read_csv(input_path)

# --- Step 2: Initialize preprocessor ---
synonyms = load_preprocessing_synonyms(synonyms_path)
pre = PreprocessSpecialty(synonyms_map=synonyms)

# --- Step 3: Apply preprocessing ---
# The first column is treated as the specialty column.
processed_records = []
for raw in df_input.iloc[:, 0].astype(str):
    processed, is_junk = pre.process_one(raw)
    # If junk → mark for removal
    processed_records.append(None if is_junk else processed)

# Replace original specialties with processed ones
df_input.iloc[:, 0] = processed_records
# Drop only the rows flagged as junk above; a blank value in some other
# column must not remove an otherwise valid specialty.
df_input = df_input.loc[df_input.iloc[:, 0].notna()].reset_index(drop=True)

# --- Step 4: Save preprocessed file ---
preprocessed_path = "preprocessed_input.csv"
df_input.to_csv(preprocessed_path, index=False)

# --- Step 5: Run classification on preprocessed specialties ---
print(f"Running classification for {preprocessed_path} ...\n")

main(
    nucc_path=nucc_path,
    input_path=preprocessed_path,
    out_path=output_path,
    threshold=0.5,
    topk=25,
)

# --- Step 6: Verify output ---
if os.path.exists(output_path):
    print(f"\n✅ Classified CSV successfully generated → {output_path}")
    df = pd.read_csv(output_path)
    print(df.head(10))
else:
    print("❌ Output file not found. Please check logs.")


Running classification for preprocessed_input.csv ...

[OK] Wrote 9689 rows to classified_output_full.csv

✅ Classified CSV successfully generated → classified_output_full.csv
                     raw_specialty                            nucc_codes  \
0                      acupuncture                                  JUNK   
1              adolescent medicine  2080A0000X | 207QA0000X | 207RA0000X   
2               allergy/immunology               207KA0200X | 207K00000X   
3      anatomic clinical pathology               207ZP0101X | 207ZP0102X   
4                   anesthesiology                            207L00000X   
5  applied behavioral analysis aba                            2080P0006X   
6                        audiology                            2355A2700X   
7                bariatric surgery                            2086S0120X   
8        cardiac electrophysiology                            207RC0001X   
9                  cardiac surgery                            20