## STEP 1: Install and import libraries

In [1]:
import os
import pandas as pd
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.config import Settings

## STEP 2: Load data


In [6]:
# Load the cleaned CRM tables. All four are read the same way so that each
# name refers to a DataFrame (previously `pipeline` held only the path string).
pipeline = pd.read_csv("../data_directory/sales_pipeline_clean.csv")
accounts = pd.read_csv("../data_directory/accounts_clean.csv")
teams = pd.read_csv("../data_directory/sales_teams_clean.csv")
products = pd.read_csv("../data_directory/products_clean.csv")

# Work on a copy so the raw `accounts` frame stays untouched, then build one
# descriptive string per account ("account - sector - office_location") to embed.
df = accounts.copy()
df["combined_text"] = (
    df["account"].astype(str) + " - " +
    df["sector"].astype(str) + " - " +
    df["office_location"].astype(str)
)

print("\n Sample of combined text data for embeddings:")
print(df["combined_text"].head())




 Sample of combined text data for embeddings:
0    Acme Corporation - technology - United States
1             Betasoloin - medical - United States
2                       Betatech - medical - Kenya
3               Bioholding - medical - Philippines
4                Bioplex - medical - United States
Name: combined_text, dtype: object


## STEP 3: Load SentenceTransformer model

In [7]:
# The model acts as a "meaning converter": it turns text into lists of numbers
# (embeddings), where each vector represents the semantic meaning of a sentence.
model_name = "all-MiniLM-L6-v2"
model = SentenceTransformer(model_name)

print("\n SentenceTransformer model loaded successfully")


 SentenceTransformer model loaded successfully


## STEP 4: Generate embeddings for the text

In [8]:
# Collect the combined text of every account into a plain list, then convert
# the whole list to embeddings in a single encode() call.
account_texts = df["combined_text"].tolist()
embeddings = model.encode(account_texts)

print(f"\n Created embeddings. Length: {len(embeddings)}")
print("Each row = one account, each column = a feature number that captures meaning.")



 Created embeddings. Length: 85
Each row = one account, each column = a feature number that captures meaning.


## STEP 5: Initialize ChromaDB (mini vector database)

In [9]:
# This database will store the text, IDs, and embeddings together.
# get_or_create_collection is used instead of create_collection so the cell can
# be re-run in a live kernel: create_collection raises when "crm_accounts"
# already exists, whereas this call returns the existing collection.
client = chromadb.Client()
collection = client.get_or_create_collection(name="crm_accounts")

## STEP 6: Add data + embeddings to the ChromaDB collection

In [11]:
# Store every account (id, embedding, text, metadata) so semantic searches can
# be run against the collection.
# upsert is used instead of add so that re-running this cell overwrites the
# records with the same ids rather than attempting to insert duplicates.

collection.upsert(
    ids=[str(i) for i in range(len(df))],  # unique ID for each record (row position)
    embeddings=embeddings,                 # our vector representations
    documents=df["combined_text"].tolist(),  # original text (for reference)
    metadatas=[{"account": acc} for acc in df["account"]]  # extra info
)

print("\n Added CRM data to ChromaDB!")


 Added CRM data to ChromaDB!


## STEP 7: Testing


In [12]:
# Search by meaning rather than by keywords.
# The stored vectors were produced by `model`, so the query is encoded with the
# same model and passed as `query_embeddings`. Passing `query_texts` instead
# would make Chroma embed the query with the collection's own embedding
# function, which is not the model that produced the stored vectors.

query = "AI data company"
query_embedding = model.encode([query]).tolist()  # one-row batch -> list of one vector

results = collection.query(
    query_embeddings=query_embedding,
    n_results=2  # returns top 2 most similar
)

print("\n Query:", query)
print("\n Top matching results:")
# results["documents"] / results["metadatas"] hold one list per query; index 0
# is the single query issued above.
for doc, meta in zip(results["documents"][0], results["metadatas"][0]):
    print(f"- {meta['account']}: {doc}")


/Users/zainabahmed/.cache/chroma/onnx_models/all-MiniLM-L6-v2/onnx.tar.gz: 100%|██████████| 79.3M/79.3M [00:02<00:00, 36.1MiB/s]




 Query: AI data company

 Top matching results:
- Dontechi: Dontechi - software - United States
- dambase: dambase - marketing - United States
