In [8]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import os
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions


In [2]:
curr_dir  = os.getcwd()
data_path = os.path.join(curr_dir, "..", "data")
df = pd.read_csv(os.path.join(data_path, "processed", "original_merged_data.csv"))

In [13]:
df

Unnamed: 0,opportunity_id,sales_agent,product,account,deal_stage,engage_date,close_date,close_value,manager,regional_office,series,sales_price,sector,year_established,revenue,employees,office_location,subsidiary_of
0,1C1I7A6R,Moses Frase,GTX Plus Basic,Cancity,Won,2016-10-20,2017-03-01,1054.0,Dustin Brinkmann,Central,GTX,1096,retail,2001.0,718.62,2448.0,United States,
1,Z063OYW0,Darcel Schlecht,GTX Pro,Isdom,Won,2016-10-25,2017-03-11,4514.0,Melvin Marxen,Central,GTX,4821,medical,2002.0,3178.24,4540.0,United States,
2,EC4QE1BX,Darcel Schlecht,MG Special,Cancity,Won,2016-10-25,2017-03-07,50.0,Melvin Marxen,Central,MG,55,retail,2001.0,718.62,2448.0,United States,
3,MV1LWRNH,Moses Frase,GTX Basic,Codehow,Won,2016-10-25,2017-03-09,588.0,Dustin Brinkmann,Central,GTX,550,software,1998.0,2714.90,2641.0,United States,Acme Corporation
4,PE84CX4O,Zane Levy,GTX Basic,Hatfan,Won,2016-10-25,2017-03-02,517.0,Summer Sewald,West,GTX,550,services,1982.0,792.46,1299.0,United States,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8795,9MIWFW5J,Versie Hillebrand,MG Advanced,,Prospecting,,,,Dustin Brinkmann,Central,MG,3393,,,,,,
8796,6SLKZ8FI,Versie Hillebrand,MG Advanced,,Prospecting,,,,Dustin Brinkmann,Central,MG,3393,,,,,,
8797,LIB4KUZJ,Versie Hillebrand,MG Advanced,,Prospecting,,,,Dustin Brinkmann,Central,MG,3393,,,,,,
8798,18IUIUK0,Versie Hillebrand,MG Advanced,,Prospecting,,,,Dustin Brinkmann,Central,MG,3393,,,,,,


In [3]:
# replace every row with a sentence describing the row so that LLMs can understand it, change the sentence structure if missing values
def row_to_sentence(row):
    if pd.isna(row["deal_stage"]):
        deal_text = "The deal was at an unknown stage"
    elif row["deal_stage"] in ["Won", "Lost"]:
        deal_text = (
            f"The deal was {row['deal_stage']} "
            f"with a close value of "
            f"{'an unknown value' if pd.isna(row['close_value']) else row['close_value']}, "
            f"engaged on "
            f"{'an unknown date' if pd.isna(row['engage_date']) else pd.to_datetime(row['engage_date']).strftime('%B %d, %Y')} "
            f"and closed on "
            f"{'an unknown date' if pd.isna(row['close_date']) else pd.to_datetime(row['close_date']).strftime('%B %d, %Y')}"
        )
    elif row["deal_stage"] in ["Prospecting", "Engaging"]:
        deal_text = f"The deal is currently in the {row['deal_stage']} stage"
    else:
        deal_text = f"The deal was {row['deal_stage']}"

    return (
        f"Opportunity {'with unknown ID' if pd.isna(row['opportunity_id']) else f"with ID {row['opportunity_id']}"} for "
        f"{'an unknown account or account ID' if pd.isna(row['account']) else f"account {row['account']}"} "
        f"in {'an unknown sector' if pd.isna(row['sector']) else f"sector {row['sector']}"}, which was"
        f" established in {'an unknown year' if pd.isna(row['year_established']) else int(row['year_established'])} "
        f"with {'an unknown number of' if pd.isna(row['employees']) else int(row['employees'])} employees "
        f"and has {'an unknown revenue' if pd.isna(row['revenue']) else f'revenue of {row["revenue"]}'}."
        f" {'An unknown sales agent' if pd.isna(row['sales_agent']) else f'The sales agent {row["sales_agent"]}'}"
        f", managed by {'an unknown manager' if pd.isna(row['manager']) else row['manager']}, "
        f"handled {'an unknown product' if pd.isna(row['product']) else f'product {row['product']}'} "
        f"({'an unknown series' if pd.isna(row['series']) else f'series {row['series']}'}) "
        f"{'with an unknown price' if pd.isna(row['sales_price']) else f'priced at {row['sales_price']}'}. "
        f"{deal_text} "
        f"through {'an unknown Salesforce office' if pd.isna(row['regional_office']) else f"the {row['regional_office']} regional Salesforce office" } "
        f"located in {'an unknown location' if pd.isna(row['office_location']) else row['office_location']}."
    )

# chromaDB requires a list of IDS
def get_ids(df):
    return df['opportunity_id'].unique().tolist()


In [4]:
# obtain every sentence
sentences = [row_to_sentence(row) for _, row in df.iterrows()]


In [5]:
# prepare metadata for chromaDB
metadata = df.to_dict(orient="records")


In [6]:
ids = get_ids(df)


In [7]:
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(sentences)
# print(embeddings.shape)
# print(embeddings)
# similarities = model.similarity(embeddings, embeddings)
# print(similarities)

In [9]:
print(f"Embedding size: {len(embeddings)}, metadata size: {len(metadata)}, sentence size: {len(sentences)}, id size: {len(ids)}")

Embedding size: 8800, metadata size: 8800, sentence size: 8800, id size: 8800


In [None]:
# create chromadb client
client = chromadb.Client()
collection = client.get_or_create_collection("sales_opportunities")

In [12]:
# batch them because chromadb has limit on how many you can add at once
batch_size = 5000
for i in range(0, len(embeddings), batch_size):
    batch_ids = ids[i:i+batch_size]
    batch_embeddings = embeddings[i:i+batch_size].tolist()
    batch_sentences = sentences[i:i+batch_size]
    batch_metadata = metadata[i:i+batch_size]

collection.add(
        ids=batch_ids,
        embeddings=batch_embeddings,
        documents=batch_sentences,
        metadatas=batch_metadata
)

In [42]:
query = "Deals engaged in November won by sales agent Darcel Schlecht"
query_embedding = model.encode([query])

results = collection.query(
    query_embeddings=query_embedding.tolist(),
    n_results=10
)


In [43]:
df[df['opportunity_id'].isin(results['ids'][0])]

Unnamed: 0,opportunity_id,sales_agent,product,account,deal_stage,engage_date,close_date,close_value,manager,regional_office,series,sales_price,sector,year_established,revenue,employees,office_location,subsidiary_of
5075,JAH8NGC2,Vicki Laflamme,MG Advanced,Xx-holding,Won,2017-07-21,2017-09-24,2851.0,Celia Rouche,West,MG,3393,finance,1993.0,7537.24,20293.0,United States,
5348,2K52EH11,Cassey Cress,MG Advanced,Vehement Capital Partners,Lost,2017-07-27,2017-12-12,0.0,Rocco Neubert,East,MG,3393,finance,1993.0,646.1,883.0,United States,Golddex
5693,7CC4DUSQ,Elease Gluck,MG Advanced,Fasehatice,Lost,2017-08-05,2017-11-29,0.0,Celia Rouche,West,MG,3393,retail,1990.0,4968.91,7523.0,United States,
5868,A7YPY8KI,Elease Gluck,MG Special,Fasehatice,Won,2017-08-12,2017-08-21,53.0,Celia Rouche,West,MG,55,retail,1990.0,4968.91,7523.0,United States,
6793,8W35TVOQ,Hayden Neloms,MG Advanced,Faxquote,Won,2017-09-18,2017-12-02,2918.0,Celia Rouche,West,MG,3393,telecommunications,1995.0,1825.82,5595.0,United States,Sonron
7410,VB26KWQ6,Kary Hendrixson,GTX Basic,Faxquote,Won,2017-10-10,2017-11-20,511.0,Summer Sewald,West,GTX,550,telecommunications,1995.0,1825.82,5595.0,United States,Sonron
7638,DXAO9R8N,Vicki Laflamme,MG Advanced,Rantouch,Won,2017-10-18,2017-10-27,3363.0,Celia Rouche,West,MG,3393,telecommunications,1994.0,1188.42,3015.0,United States,
8111,L11T0DYF,Vicki Laflamme,MG Advanced,Faxquote,Won,2017-11-21,2017-12-01,2590.0,Celia Rouche,West,MG,3393,telecommunications,1995.0,1825.82,5595.0,United States,Sonron
8233,7IAHPFT9,Vicki Laflamme,MG Advanced,Faxquote,Won,2017-12-10,2017-12-18,3078.0,Celia Rouche,West,MG,3393,telecommunications,1995.0,1825.82,5595.0,United States,Sonron
8250,HNCSYU4T,Markita Hansen,MG Advanced,Faxquote,Won,2017-12-13,2017-12-27,3442.0,Celia Rouche,West,MG,3393,telecommunications,1995.0,1825.82,5595.0,United States,Sonron


In [24]:
df[(df['deal_stage'] == 'Won') & (df['sales_agent'] == 'Darcel Schlecht')]


Unnamed: 0,opportunity_id,sales_agent,product,account,deal_stage,engage_date,close_date,close_value,manager,regional_office,series,sales_price,sector,year_established,revenue,employees,office_location,subsidiary_of
1,Z063OYW0,Darcel Schlecht,GTX Pro,Isdom,Won,2016-10-25,2017-03-11,4514.0,Melvin Marxen,Central,GTX,4821,medical,2002.0,3178.24,4540.0,United States,
2,EC4QE1BX,Darcel Schlecht,MG Special,Cancity,Won,2016-10-25,2017-03-07,50.0,Melvin Marxen,Central,MG,55,retail,2001.0,718.62,2448.0,United States,
23,ADRB8OMB,Darcel Schlecht,GTX Basic,Warephase,Won,2016-11-08,2017-03-26,561.0,Melvin Marxen,Central,GTX,550,services,1997.0,2041.73,5276.0,United States,
41,CZVN09WN,Darcel Schlecht,GTX Plus Basic,Konmatfix,Won,2016-11-14,2017-03-20,1170.0,Melvin Marxen,Central,GTX,1096,marketing,1985.0,375.43,1190.0,United States,
45,97UN20YY,Darcel Schlecht,MG Advanced,Conecom,Won,2016-11-16,2017-03-14,3725.0,Melvin Marxen,Central,MG,3393,technolgy,2005.0,1520.66,1806.0,United States,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8129,K4N74V6U,Darcel Schlecht,GTX Pro,Codehow,Won,2017-11-24,2017-12-04,5110.0,Melvin Marxen,Central,GTX,4821,software,1998.0,2714.90,2641.0,United States,Acme Corporation
8139,RDZ2FC3A,Darcel Schlecht,GTX Basic,Warephase,Won,2017-11-25,2017-12-11,537.0,Melvin Marxen,Central,GTX,550,services,1997.0,2041.73,5276.0,United States,
8155,348COZ5O,Darcel Schlecht,GTX Pro,Genco Pura Olive Oil Company,Won,2017-11-27,2017-11-30,4888.0,Melvin Marxen,Central,GTX,4821,retail,2007.0,894.33,1635.0,Italy,
8231,7GXUCFW5,Darcel Schlecht,GTX Pro,Inity,Won,2017-12-10,2017-12-17,4755.0,Melvin Marxen,Central,GTX,4821,marketing,1986.0,2403.58,8801.0,United States,
