In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import os
from sentence_transformers import SentenceTransformer
import chromadb
from chromadb.utils import embedding_functions


In [2]:
curr_dir  = os.getcwd()
data_path = os.path.join(curr_dir, "..", "data")
df = pd.read_csv(os.path.join(data_path, "processed", "original_merged_data.csv"))

In [3]:
df

Unnamed: 0,opportunity_id,sales_agent,product,account,deal_stage,engage_date,close_date,close_value,manager,regional_office,series,sales_price,sector,year_established,revenue,employees,office_location,subsidiary_of
0,1C1I7A6R,Moses Frase,GTX Plus Basic,Cancity,Won,2016-10-20,2017-03-01,1054.0,Dustin Brinkmann,Central,GTX,1096,retail,2001.0,718.62,2448.0,United States,
1,Z063OYW0,Darcel Schlecht,GTX Pro,Isdom,Won,2016-10-25,2017-03-11,4514.0,Melvin Marxen,Central,GTX,4821,medical,2002.0,3178.24,4540.0,United States,
2,EC4QE1BX,Darcel Schlecht,MG Special,Cancity,Won,2016-10-25,2017-03-07,50.0,Melvin Marxen,Central,MG,55,retail,2001.0,718.62,2448.0,United States,
3,MV1LWRNH,Moses Frase,GTX Basic,Codehow,Won,2016-10-25,2017-03-09,588.0,Dustin Brinkmann,Central,GTX,550,software,1998.0,2714.90,2641.0,United States,Acme Corporation
4,PE84CX4O,Zane Levy,GTX Basic,Hatfan,Won,2016-10-25,2017-03-02,517.0,Summer Sewald,West,GTX,550,services,1982.0,792.46,1299.0,United States,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8795,9MIWFW5J,Versie Hillebrand,MG Advanced,,Prospecting,,,,Dustin Brinkmann,Central,MG,3393,,,,,,
8796,6SLKZ8FI,Versie Hillebrand,MG Advanced,,Prospecting,,,,Dustin Brinkmann,Central,MG,3393,,,,,,
8797,LIB4KUZJ,Versie Hillebrand,MG Advanced,,Prospecting,,,,Dustin Brinkmann,Central,MG,3393,,,,,,
8798,18IUIUK0,Versie Hillebrand,MG Advanced,,Prospecting,,,,Dustin Brinkmann,Central,MG,3393,,,,,,


In [4]:
# replace every row with a sentence describing the row so that LLMs can understand it, change the sentence structure if missing values
def combined_to_sentence(row):
    if pd.isna(row["deal_stage"]):
        deal_text = "The deal was at an unknown stage"
    elif row["deal_stage"] in ["Won", "Lost"]:
        deal_text = (
            f"The deal was {row['deal_stage']} "
            f"with a close value of "
            f"{'an unknown value' if pd.isna(row['close_value']) else row['close_value']}, "
            f"engaged on "
            f"{'an unknown date' if pd.isna(row['engage_date']) else pd.to_datetime(row['engage_date']).strftime('%B %d, %Y')} "
            f"and closed on "
            f"{'an unknown date' if pd.isna(row['close_date']) else pd.to_datetime(row['close_date']).strftime('%B %d, %Y')}"
        )
    elif row["deal_stage"] in ["Prospecting", "Engaging"]:
        deal_text = f"The deal is currently in the {row['deal_stage']} stage"
    else:
        deal_text = f"The deal was {row['deal_stage']}"

    return (
        f"Opportunity {'with unknown ID' if pd.isna(row['opportunity_id']) else f"with ID {row['opportunity_id']}"} for "
        f"{'an unknown account or account ID' if pd.isna(row['account']) else f"account {row['account']}"} "
        f"in {'an unknown sector' if pd.isna(row['sector']) else f"sector {row['sector']}"}, which was"
        f" established in {'an unknown year' if pd.isna(row['year_established']) else int(row['year_established'])} "
        f"with {'an unknown number of' if pd.isna(row['employees']) else int(row['employees'])} employees "
        f"and has {'an unknown revenue' if pd.isna(row['revenue']) else f'revenue of {row["revenue"]}'}."
        f" {'An unknown sales agent' if pd.isna(row['sales_agent']) else f'The sales agent {row["sales_agent"]}'}"
        f", managed by {'an unknown manager' if pd.isna(row['manager']) else row['manager']}, "
        f"handled {'an unknown product' if pd.isna(row['product']) else f'product {row['product']}'} "
        f"({'an unknown series' if pd.isna(row['series']) else f'series {row['series']}'}) "
        f"{'with an unknown price' if pd.isna(row['sales_price']) else f'priced at {row['sales_price']}'}. "
        f"{deal_text} "
        f"through {'an unknown Salesforce office' if pd.isna(row['regional_office']) else f"the {row['regional_office']} regional Salesforce office" } "
        f"located in {'an unknown location' if pd.isna(row['office_location']) else row['office_location']}."
    )

# chromaDB requires a list of IDS as unique indentifiers for each embedding
def get_ids(df):
    return df['opportunity_id'].unique().tolist()


def account_to_sentence(row):
    return (
        f"Account {'with unknown name' if pd.isna(row['account']) else row['account']} "
        f"operates in {'an unknown sector' if pd.isna(row['sector']) else f'the {row['sector']} sector'}. "
        f"It was established in "
        f"{'an unknown year' if pd.isna(row['year_established']) else int(row['year_established'])} "
        f"and employs "
        f"{'an unknown number of employees' if pd.isna(row['employees']) else int(row['employees'])}. "
        f"The company has "
        f"{'an unknown revenue' if pd.isna(row['revenue']) else f'an annual revenue of {row['revenue']}'} "
        f"and is managed by "
        f"{'an unknown manager' if pd.isna(row['manager']) else row['manager']}. "
        f"The primary Salesforce office handling this account is "
        f"{'unknown' if pd.isna(row['regional_office']) else f'the {row['regional_office']} regional office'} "
        f"located in "
        f"{'an unknown location' if pd.isna(row['office_location']) else row['office_location']}."
    )


def opportunity_to_sentence(row):
    close_value = "unknown" if pd.isna(row["close_value"]) else f"{row['close_value']:,}"
    sector = "unknown sector" if pd.isna(row["sector"]) else row["sector"]
    deal_stage = row["deal_stage"] if not pd.isna(row["deal_stage"]) else "unknown stage"

    return (
        f"This is a {sector} industry sales opportunity handled by "
        f"{'an unknown agent' if pd.isna(row['sales_agent']) else row['sales_agent']}. "
        f"The deal is currently in the {deal_stage} stage "
        f"with a closing value of {close_value}. "
        f"It involves the product {row.get('product', 'unknown product')} "
        f"and was managed through the {row.get('regional_office', 'unknown')} office. "
        f"The account associated with this deal is {row.get('account', 'unknown account')}."
    )

def add_embeddings_to_chromadb(model, raw_sentences, meta, ids, db, batch_size=5000):
    vector_embeddings = model.encode(raw_sentences)

    for i in range(0, len(vector_embeddings), batch_size):
        batch_ids = ids[i:i+batch_size]
        batch_embeddings = vector_embeddings[i:i+batch_size].tolist()
        batch_sentences = raw_sentences[i:i+batch_size]
        batch_metadata = meta[i:i+batch_size]

    db.add(
            ids=batch_ids,
            embeddings=batch_embeddings,
            documents=batch_sentences,
            metadatas=batch_metadata
    )

def get_result_from_query(query, collection, model, n_results=5):
    query_embedding = model.encode([query]).tolist()
    results = collection.query(
        query_embeddings=query_embedding,
        n_results=n_results
    )
    return results


In [5]:
# initialize the sentence transformer model
sentence_transformer = SentenceTransformer("all-MiniLM-L6-v2")

# prepare metadata for chromaDB
metadata = df.to_dict(orient="records")

# get unique identifier for chromaDB
ids = get_ids(df)

# create chromadb client
client = chromadb.Client()

In [6]:
# print(f"Embedding size: {len(embeddings)}, metadata size: {len(metadata)}, sentence size: {len(sentences)}, id size: {len(ids)}")

In [7]:
# create all collections
combined_collection = client.get_or_create_collection("combined_opportunities")
account_collection = client.get_or_create_collection("sales_accounts")
opportunity_collection = client.get_or_create_collection("sales_opportunities")

# obtain every sentence
combined_sentences = [combined_to_sentence(row) for _, row in df.iterrows()]
account_sentences = [account_to_sentence(row) for _, row in df.iterrows()]
opportunity_sentences = [opportunity_to_sentence(row) for _, row in df.iterrows()]


In [8]:
add_embeddings_to_chromadb(sentence_transformer, combined_sentences, metadata, ids, combined_collection, batch_size=5000)
add_embeddings_to_chromadb(sentence_transformer, account_sentences, metadata, ids, account_collection, batch_size=5000)
add_embeddings_to_chromadb(sentence_transformer, opportunity_sentences, metadata, ids, opportunity_collection, batch_size=5000)


In [9]:
res = get_result_from_query(
    "Deals closed in the finance sector and handled by Darcel Schlect",
    combined_collection,
    sentence_transformer,
    n_results=3)

In [10]:
df[df['opportunity_id'].isin(res['ids'][0])]

Unnamed: 0,opportunity_id,sales_agent,product,account,deal_stage,engage_date,close_date,close_value,manager,regional_office,series,sales_price,sector,year_established,revenue,employees,office_location,subsidiary_of
6046,ZN5CVY5V,Darcel Schlecht,GTX Basic,Finhigh,Lost,2017-08-19,2017-08-22,0.0,Melvin Marxen,Central,GTX,550,finance,2006.0,1102.43,1759.0,United States,
7830,7TOPAQ9Y,Darcel Schlecht,MG Special,Finhigh,Won,2017-10-27,2017-12-29,53.0,Melvin Marxen,Central,MG,55,finance,2006.0,1102.43,1759.0,United States,
8173,GUDTJY0I,Darcel Schlecht,MG Advanced,Finhigh,Lost,2017-11-30,2017-12-14,0.0,Melvin Marxen,Central,MG,3393,finance,2006.0,1102.43,1759.0,United States,


In [11]:
res = get_result_from_query(
    "Accounts with more than 100 employees and located in Romania",
    account_collection,
    sentence_transformer,
    n_results=10)

In [12]:
df[df['opportunity_id'].isin(res['ids'][0])]

Unnamed: 0,opportunity_id,sales_agent,product,account,deal_stage,engage_date,close_date,close_value,manager,regional_office,series,sales_price,sector,year_established,revenue,employees,office_location,subsidiary_of
5011,7439NGIG,Marty Freudenburg,GTX Pro,Sumace,Engaging,2017-07-20,,,Melvin Marxen,Central,GTX,4821,retail,2000.0,167.89,493.0,Romania,
5306,L6223VHA,Darcel Schlecht,GTX Pro,Sumace,Engaging,2017-07-26,,,Melvin Marxen,Central,GTX,4821,retail,2000.0,167.89,493.0,Romania,
5536,ICG3THQK,Gladys Colclough,GTX Basic,Sumace,Engaging,2017-07-31,,,Melvin Marxen,Central,GTX,550,retail,2000.0,167.89,493.0,Romania,
5537,WOO9JXOQ,Gladys Colclough,MG Advanced,Sumace,Engaging,2017-07-31,,,Melvin Marxen,Central,MG,3393,retail,2000.0,167.89,493.0,Romania,
5823,J31SUXQJ,Marty Freudenburg,GTX Plus Pro,Sumace,Engaging,2017-08-10,,,Melvin Marxen,Central,GTX,5482,retail,2000.0,167.89,493.0,Romania,
7152,TU58PHME,Garret Kinder,GTX Plus Pro,Sumace,Lost,2017-10-01,2017-11-17,0.0,Cara Losch,East,GTX,5482,retail,2000.0,167.89,493.0,Romania,
8146,MAUIC3DD,Corliss Cosme,GTX Plus Pro,Sumace,Lost,2017-11-26,2017-12-10,0.0,Cara Losch,East,GTX,5482,retail,2000.0,167.89,493.0,Romania,
8312,9P9ISECL,Anna Snelling,GTX Plus Pro,Sumace,Prospecting,,,,Dustin Brinkmann,Central,GTX,5482,retail,2000.0,167.89,493.0,Romania,
8367,9BM1CGBV,Cecily Lampkin,MG Special,Sumace,Prospecting,,,,Dustin Brinkmann,Central,MG,55,retail,2000.0,167.89,493.0,Romania,
8753,C5B5WBY9,Versie Hillebrand,GTX Plus Basic,Sumace,Prospecting,,,,Dustin Brinkmann,Central,GTX,1096,retail,2000.0,167.89,493.0,Romania,


In [None]:
res = get_result_from_query(
    "Marty Freudenburg, Won, GTX", # better with key words isntead of sentences? having a full sentence seems to dilute the important parts
    opportunity_collection,
    sentence_transformer,
    n_results=5) 

In [48]:
df[df['opportunity_id'].isin(res['ids'][0])]

Unnamed: 0,opportunity_id,sales_agent,product,account,deal_stage,engage_date,close_date,close_value,manager,regional_office,series,sales_price,sector,year_established,revenue,employees,office_location,subsidiary_of
5177,BNIU5YZ2,Marty Freudenburg,GTX Plus Basic,Toughzap,Won,2017-07-23,2017-08-07,1006.0,Melvin Marxen,Central,GTX,1096,retail,1995.0,332.43,799.0,United States,
5379,8IOAX81S,Marty Freudenburg,GTX Plus Basic,Domzoom,Won,2017-07-27,2017-08-07,965.0,Melvin Marxen,Central,GTX,1096,entertainment,1998.0,217.87,551.0,United States,
6039,E3L02XMN,Marty Freudenburg,GTX Basic,Ron-tech,Won,2017-08-18,2017-12-30,580.0,Melvin Marxen,Central,GTX,550,medical,1992.0,3922.42,6837.0,United States,
8038,N3YVDYY2,Marty Freudenburg,GTX Plus Basic,Ron-tech,Won,2017-11-14,2017-11-23,947.0,Melvin Marxen,Central,GTX,1096,medical,1992.0,3922.42,6837.0,United States,
8210,EJYFAV40,Marty Freudenburg,GTX Plus Pro,Toughzap,Won,2017-12-07,2017-12-28,5862.0,Melvin Marxen,Central,GTX,5482,retail,1995.0,332.43,799.0,United States,
