In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import os

In [2]:
# Resolve the data directory relative to the notebook's working directory,
# then read the merged opportunities CSV from its "processed" subfolder.
curr_dir = os.getcwd()
data_path = os.path.join(curr_dir, "..", "data")
merged_csv_path = os.path.join(data_path, "processed", "original_merged_data.csv")
df = pd.read_csv(merged_csv_path)

In [3]:
# Preview the loaded frame; pandas elides the middle rows of a long table.
df

Unnamed: 0,opportunity_id,sales_agent,product,account,deal_stage,engage_date,close_date,close_value,manager,regional_office,series,sales_price,sector,year_established,revenue,employees,office_location,subsidiary_of
0,1C1I7A6R,Moses Frase,GTX Plus Basic,Cancity,Won,2016-10-20,2017-03-01,1054.0,Dustin Brinkmann,Central,GTX,1096,retail,2001.0,718.62,2448.0,United States,
1,Z063OYW0,Darcel Schlecht,GTX Pro,Isdom,Won,2016-10-25,2017-03-11,4514.0,Melvin Marxen,Central,GTX,4821,medical,2002.0,3178.24,4540.0,United States,
2,EC4QE1BX,Darcel Schlecht,MG Special,Cancity,Won,2016-10-25,2017-03-07,50.0,Melvin Marxen,Central,MG,55,retail,2001.0,718.62,2448.0,United States,
3,MV1LWRNH,Moses Frase,GTX Basic,Codehow,Won,2016-10-25,2017-03-09,588.0,Dustin Brinkmann,Central,GTX,550,software,1998.0,2714.90,2641.0,United States,Acme Corporation
4,PE84CX4O,Zane Levy,GTX Basic,Hatfan,Won,2016-10-25,2017-03-02,517.0,Summer Sewald,West,GTX,550,services,1982.0,792.46,1299.0,United States,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8795,9MIWFW5J,Versie Hillebrand,MG Advanced,,Prospecting,,,,Dustin Brinkmann,Central,MG,3393,,,,,,
8796,6SLKZ8FI,Versie Hillebrand,MG Advanced,,Prospecting,,,,Dustin Brinkmann,Central,MG,3393,,,,,,
8797,LIB4KUZJ,Versie Hillebrand,MG Advanced,,Prospecting,,,,Dustin Brinkmann,Central,MG,3393,,,,,,
8798,18IUIUK0,Versie Hillebrand,MG Advanced,,Prospecting,,,,Dustin Brinkmann,Central,MG,3393,,,,,,


In [4]:
# Convert each row into a natural-language sentence that an LLM can understand; the wording adapts when values are missing.
def _phrase(value, known, unknown, convert=None):
    """Return ``unknown`` if ``value`` is missing, else ``known`` filled with it.

    Args:
        value: Scalar cell value; missing-ness is decided by ``pd.isna``.
        known: ``str.format`` template with one ``{}`` placeholder.
        unknown: Text to use when the value is missing.
        convert: Optional callable applied to a present value before formatting
            (e.g. ``int`` to drop the ``.0`` of a float-typed year).
    """
    if pd.isna(value):
        return unknown
    if convert is not None:
        value = convert(value)
    return known.format(value)


def _long_date(value):
    """Format a date-like value as e.g. ``March 01, 2017`` (zero-padded day)."""
    return pd.to_datetime(value).strftime("%B %d, %Y")


def _deal_text(row):
    """Build the clause describing the deal stage of ``row`` (no trailing period)."""
    stage = row["deal_stage"]
    if pd.isna(stage):
        return "The deal was at an unknown stage"
    if stage in ("Won", "Lost"):
        # Only closed deals mention the close value and the two dates.
        close_value = _phrase(row["close_value"], "{}", "an unknown value")
        engaged = _phrase(row["engage_date"], "{}", "an unknown date", _long_date)
        closed = _phrase(row["close_date"], "{}", "an unknown date", _long_date)
        return (
            f"The deal was {stage} with a close value of {close_value}, "
            f"engaged on {engaged} and closed on {closed}"
        )
    if stage in ("Prospecting", "Engaging"):
        return f"The deal is currently in the {stage} stage"
    return f"The deal was {stage}"


def row_to_sentence(row):
    """Describe one opportunity row as an English sentence.

    Every field is looked up with ``row[<column>]``; a value for which
    ``pd.isna`` is true is replaced by an "unknown ..." wording so the
    sentence stays grammatical.

    The f-strings deliberately avoid reusing the enclosing quote character
    inside replacement fields, because that syntax is only accepted from
    Python 3.12 onwards.

    Args:
        row: A mapping-like row (e.g. a ``pd.Series`` from ``df.iterrows()`` or
            a plain ``dict``) providing the keys ``opportunity_id``,
            ``account``, ``sector``, ``year_established``, ``employees``,
            ``revenue``, ``sales_agent``, ``manager``, ``product``, ``series``,
            ``sales_price``, ``regional_office``, ``office_location`` and
            ``deal_stage``; ``close_value``, ``engage_date`` and ``close_date``
            are read only when the stage is ``"Won"`` or ``"Lost"``.

    Returns:
        str: The sentence describing the row.

    Raises:
        KeyError: If ``row`` lacks one of the keys that is read.
    """
    deal_text = _deal_text(row)

    opportunity = _phrase(row["opportunity_id"], "with ID {}", "with unknown ID")
    account = _phrase(row["account"], "account {}", "an unknown account or account ID")
    sector = _phrase(row["sector"], "sector {}", "an unknown sector")
    # Year and head-count are rendered as integers (2001, not 2001.0).
    established = _phrase(row["year_established"], "{}", "an unknown year", int)
    employees = _phrase(row["employees"], "{}", "an unknown number of", int)
    revenue = _phrase(row["revenue"], "revenue of {}", "an unknown revenue")
    agent = _phrase(row["sales_agent"], "The sales agent {}", "An unknown sales agent")
    manager = _phrase(row["manager"], "{}", "an unknown manager")
    product = _phrase(row["product"], "product {}", "an unknown product")
    series = _phrase(row["series"], "series {}", "an unknown series")
    price = _phrase(row["sales_price"], "priced at {}", "with an unknown price")
    office = _phrase(
        row["regional_office"],
        "the {} regional Salesforce office",
        "an unknown Salesforce office",
    )
    location = _phrase(row["office_location"], "{}", "an unknown location")

    return (
        f"Opportunity {opportunity} for {account} in {sector}, "
        f"which was established in {established} with {employees} employees "
        f"and has {revenue}. "
        f"{agent}, managed by {manager}, handled {product} ({series}) {price}. "
        f"{deal_text} through {office} located in {location}."
    )

# ChromaDB requires a list of IDs
def get_ids(df):
    """Return the distinct ``opportunity_id`` values of ``df`` as a list.

    Values keep their order of first appearance. Duplicates are collapsed,
    so the result can be shorter than ``len(df)``.

    NOTE(review): if these IDs are meant to pair one-to-one with per-row
    documents/metadata, confirm the column has no duplicates or missing values.
    """
    id_column = df["opportunity_id"]
    distinct_ids = id_column.unique()
    return distinct_ids.tolist()


In [5]:
# Build one descriptive sentence per DataFrame row, in row order.
sentences = list(
    map(row_to_sentence, (record for _index, record in df.iterrows()))
)
sentences

['Opportunity with ID 1C1I7A6R for account Cancity in sector retail, which was established in 2001 with 2448 employees and has revenue of 718.62. The sales agent Moses Frase, managed by Dustin Brinkmann, handled product GTX Plus Basic (series GTX) priced at 1096. The deal was Won with a close value of 1054.0, engaged on October 20, 2016 and closed on March 01, 2017 through the Central regional Salesforce office located in United States.',
 'Opportunity with ID Z063OYW0 for account Isdom in sector medical, which was established in 2002 with 4540 employees and has revenue of 3178.24. The sales agent Darcel Schlecht, managed by Melvin Marxen, handled product GTX Pro (series GTX) priced at 4821. The deal was Won with a close value of 4514.0, engaged on October 25, 2016 and closed on March 11, 2017 through the Central regional Salesforce office located in United States.',
 'Opportunity with ID EC4QE1BX for account Cancity in sector retail, which was established in 2001 with 2448 employees a

In [6]:
# Prepare the per-row metadata for ChromaDB: one {column: value} dict per row.
# Missing cells come through as NaN (see 'subsidiary_of' in the output).
metadata = df.to_dict("records")
metadata

[{'opportunity_id': '1C1I7A6R',
  'sales_agent': 'Moses Frase',
  'product': 'GTX Plus Basic',
  'account': 'Cancity',
  'deal_stage': 'Won',
  'engage_date': '2016-10-20',
  'close_date': '2017-03-01',
  'close_value': 1054.0,
  'manager': 'Dustin Brinkmann',
  'regional_office': 'Central',
  'series': 'GTX',
  'sales_price': 1096,
  'sector': 'retail',
  'year_established': 2001.0,
  'revenue': 718.62,
  'employees': 2448.0,
  'office_location': 'United States',
  'subsidiary_of': nan},
 {'opportunity_id': 'Z063OYW0',
  'sales_agent': 'Darcel Schlecht',
  'product': 'GTX Pro',
  'account': 'Isdom',
  'deal_stage': 'Won',
  'engage_date': '2016-10-25',
  'close_date': '2017-03-11',
  'close_value': 4514.0,
  'manager': 'Melvin Marxen',
  'regional_office': 'Central',
  'series': 'GTX',
  'sales_price': 4821,
  'sector': 'medical',
  'year_established': 2002.0,
  'revenue': 3178.24,
  'employees': 4540.0,
  'office_location': 'United States',
  'subsidiary_of': nan},
 {'opportunity_id'

In [7]:
# Echo the ID list built from the opportunity_id column (the full list is printed).
get_ids(df)

['1C1I7A6R',
 'Z063OYW0',
 'EC4QE1BX',
 'MV1LWRNH',
 'PE84CX4O',
 'ZNBS69V1',
 '9ME3374G',
 '7GN8Q4LL',
 'OLK9LKZB',
 'HAXMC4IX',
 'NL3JZH1Z',
 'KWVA7VR1',
 'S8DX3XOU',
 'ENB2XD8G',
 '09YE9QOV',
 '3F5MZNEH',
 'M6WEJXC0',
 '6PTR7VBR',
 '902REDPA',
 '5J9CMGDV',
 'JJXRR8R6',
 'WF4HA5NW',
 'C5K2JP1H',
 'ADRB8OMB',
 'SBCR987L',
 'UP409DSB',
 'JSD4APT2',
 'AO9Z2D17',
 '5M58DTJK',
 'KNY1OSAB',
 'EAZDUUM9',
 '2STUSOFE',
 'JYKM0B00',
 'KU28360J',
 'N4SD17JR',
 'E67P9Y3Q',
 'AT3MMVIS',
 'REJ11LRY',
 'ERV0CAZ7',
 '8SOQADK7',
 'TCHFT25B',
 'CZVN09WN',
 'EG7OFLFR',
 '30UQWUKB',
 'OLVI7L8M',
 '97UN20YY',
 'JXLERZ9O',
 '6ROE69W5',
 '0DFXFKT7',
 'XKMZVSN4',
 'IU8V0BZK',
 'XY42936P',
 'XRN54MBM',
 '2V848WZD',
 'ONYNTUCG',
 'HIOHX80Y',
 'F5U1ACDD',
 'LPKT07PV',
 'WPB2SLIG',
 'XUSUEAV7',
 'ZZY4516R',
 '3TYPII47',
 '7WAX8Z8O',
 'MYDUMR3R',
 '0DRC1U9Q',
 '37JFKD4I',
 '25YKPHX8',
 'BXXMA7F3',
 'GIUUTBXM',
 'MFX2LR1Q',
 'DUHE9FLY',
 '96BSG7R1',
 '7FQMSWIX',
 'C20AVXN7',
 'GS1QVWCR',
 'ZWH8FXY3',
 '2U94Y3Q9',