In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
import os

In [3]:
# Resolve the data directory one level above the current working directory.
curr_dir = os.getcwd()
data_path = os.path.join(curr_dir, "..", "data")

# Read merged_df.csv from the "processed" sub-folder into the working frame.
merged_csv_path = os.path.join(data_path, "processed", "merged_df.csv")
df = pd.read_csv(merged_csv_path)

In [4]:
# Display the loaded frame as the cell output to eyeball its columns and rows.
df

Unnamed: 0,opportunity_id,sales_agent,product,account,deal_stage,engage_date,close_date,close_value,manager,regional_office,series,sales_price,sector,year_established,revenue,employees,office_location,subsidiary_of
0,1C1I7A6R,Moses Frase,GTX Plus Basic,Cancity,Won,2016-10-20,2017-03-01,1054.0,Dustin Brinkmann,Central,GTX,1096,retail,2001.0,718.62,2448.0,United States,
1,Z063OYW0,Darcel Schlecht,GTX Pro,Isdom,Won,2016-10-25,2017-03-11,4514.0,Melvin Marxen,Central,GTX,4821,medical,2002.0,3178.24,4540.0,United States,
2,EC4QE1BX,Darcel Schlecht,MG Special,Cancity,Won,2016-10-25,2017-03-07,50.0,Melvin Marxen,Central,MG,55,retail,2001.0,718.62,2448.0,United States,
3,MV1LWRNH,Moses Frase,GTX Basic,Codehow,Won,2016-10-25,2017-03-09,588.0,Dustin Brinkmann,Central,GTX,550,software,1998.0,2714.90,2641.0,United States,Acme Corporation
4,PE84CX4O,Zane Levy,GTX Basic,Hatfan,Won,2016-10-25,2017-03-02,517.0,Summer Sewald,West,GTX,550,services,1982.0,792.46,1299.0,United States,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8795,9MIWFW5J,Versie Hillebrand,MG Advanced,,Prospecting,,,,Dustin Brinkmann,Central,MG,3393,,,,,,
8796,6SLKZ8FI,Versie Hillebrand,MG Advanced,,Prospecting,,,,Dustin Brinkmann,Central,MG,3393,,,,,,
8797,LIB4KUZJ,Versie Hillebrand,MG Advanced,,Prospecting,,,,Dustin Brinkmann,Central,MG,3393,,,,,,
8798,18IUIUK0,Versie Hillebrand,MG Advanced,,Prospecting,,,,Dustin Brinkmann,Central,MG,3393,,,,,,


In [5]:
# Per-column non-null counts for rows whose account is the literal string 'Unknown'.
df.loc[df['account'].eq('Unknown')].count()

opportunity_id      0
sales_agent         0
product             0
account             0
deal_stage          0
engage_date         0
close_date          0
close_value         0
manager             0
regional_office     0
series              0
sales_price         0
sector              0
year_established    0
revenue             0
employees           0
office_location     0
subsidiary_of       0
dtype: int64

In [6]:
# Spot check: is the opportunity ID missing for the row at position 400?
pd.isna(df['opportunity_id'].iloc[400])


False

In [7]:
# Distinct deal stages present in the data, in order of first appearance.
pd.unique(df['deal_stage'])

array(['Won', 'Engaging', 'Lost', 'Prospecting'], dtype=object)

In [8]:
# Rows whose deal is still in the 'Prospecting' stage.
df.loc[df['deal_stage'].eq('Prospecting')]

Unnamed: 0,opportunity_id,sales_agent,product,account,deal_stage,engage_date,close_date,close_value,manager,regional_office,series,sales_price,sector,year_established,revenue,employees,office_location,subsidiary_of
8300,6CWZFOHJ,Anna Snelling,GTX Basic,Green-Plus,Prospecting,,,,Dustin Brinkmann,Central,GTX,550,services,2003.0,692.19,1922.0,United States,
8301,3LCLVRVV,Anna Snelling,GTX Basic,,Prospecting,,,,Dustin Brinkmann,Central,GTX,550,,,,,,
8302,YIU1B39V,Anna Snelling,GTX Basic,,Prospecting,,,,Dustin Brinkmann,Central,GTX,550,,,,,,
8303,8E0VRCLW,Anna Snelling,GTX Basic,,Prospecting,,,,Dustin Brinkmann,Central,GTX,550,,,,,,
8304,G99CS23F,Anna Snelling,GTX Basic,,Prospecting,,,,Dustin Brinkmann,Central,GTX,550,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8795,9MIWFW5J,Versie Hillebrand,MG Advanced,,Prospecting,,,,Dustin Brinkmann,Central,MG,3393,,,,,,
8796,6SLKZ8FI,Versie Hillebrand,MG Advanced,,Prospecting,,,,Dustin Brinkmann,Central,MG,3393,,,,,,
8797,LIB4KUZJ,Versie Hillebrand,MG Advanced,,Prospecting,,,,Dustin Brinkmann,Central,MG,3393,,,,,,
8798,18IUIUK0,Versie Hillebrand,MG Advanced,,Prospecting,,,,Dustin Brinkmann,Central,MG,3393,,,,,,


In [15]:

def _or_unknown(value, unknown_text, template="{}", convert=None):
    """Return ``unknown_text`` for a missing value, else ``template`` filled with it.

    Args:
        value: Scalar field value; "missing" is whatever ``pd.isna`` reports.
        unknown_text: Text to use when the value is missing.
        template: ``str.format`` template with one ``{}`` slot for the value.
        convert: Optional callable (e.g. ``int``) applied to a present value
            before it is formatted.
    """
    if pd.isna(value):
        return unknown_text
    if convert is not None:
        value = convert(value)
    return template.format(value)


def _format_date(value):
    """Render a date like 'March 01, 2017', or 'an unknown date' when missing."""
    if pd.isna(value):
        return "an unknown date"
    return pd.to_datetime(value).strftime("%B %d, %Y")


def _describe_deal(row):
    """Build the clause describing the deal stage (and closing details) of ``row``."""
    stage = row["deal_stage"]
    if pd.isna(stage):
        return "The deal was at an unknown stage"
    if stage in ("Won", "Lost"):
        # Closed deals also report their value and both dates.
        close_value = _or_unknown(row["close_value"], "an unknown value")
        return (
            f"The deal was {stage} "
            f"with a close value of {close_value}, "
            f"engaged on {_format_date(row['engage_date'])} "
            f"and closed on {_format_date(row['close_date'])}"
        )
    if stage in ("Prospecting", "Engaging"):
        return f"The deal is currently in the {stage} stage"
    # Any other stage label is reported verbatim.
    return f"The deal was {stage}"


def row_to_sentence(row):
    """Describe one opportunity record as a natural-language paragraph.

    Args:
        row: Mapping-like record (a ``pd.Series`` row or a ``dict``) exposing
            the keys ``opportunity_id``, ``account``, ``sector``,
            ``year_established``, ``employees``, ``revenue``, ``sales_agent``,
            ``manager``, ``product``, ``series``, ``sales_price``,
            ``deal_stage``, ``close_value``, ``engage_date``, ``close_date``,
            ``regional_office`` and ``office_location``.

    Returns:
        str: The description. Every field for which ``pd.isna`` is true is
        replaced by an "unknown ..." phrase.

    Raises:
        KeyError: If ``row`` lacks one of the keys that is read.
    """
    # Each fragment is built up front so the final f-string needs no nested
    # f-strings: nesting that reuses the enclosing quote character only parses
    # on Python 3.12+ (PEP 701) and is a SyntaxError on older interpreters.
    deal_text = _describe_deal(row)
    opportunity = _or_unknown(row["opportunity_id"], "with unknown ID", "with ID {}")
    account = _or_unknown(row["account"], "an unknown account or account ID", "account {}")
    sector = _or_unknown(row["sector"], "an unknown sector", "sector {}")
    # Year and head-count are cast to int so 2001.0 prints as 2001.
    year = _or_unknown(row["year_established"], "an unknown year", convert=int)
    employees = _or_unknown(row["employees"], "an unknown number of", convert=int)
    revenue = _or_unknown(row["revenue"], "an unknown revenue", "revenue of {}")
    agent = _or_unknown(row["sales_agent"], "An unknown sales agent", "The sales agent {}")
    manager = _or_unknown(row["manager"], "an unknown manager")
    product = _or_unknown(row["product"], "an unknown product", "product {}")
    series = _or_unknown(row["series"], "an unknown series", "series {}")
    price = _or_unknown(row["sales_price"], "with an unknown price", "priced at {}")
    office = _or_unknown(
        row["regional_office"],
        "an unknown Salesforce office",
        "the {} regional Salesforce office",
    )
    # NOTE(review): office_location appears to travel with the account columns
    # (it is missing whenever account is missing), so "located in" may describe
    # the account rather than the regional office -- confirm intended wording.
    location = _or_unknown(row["office_location"], "an unknown location")

    return (
        f"Opportunity {opportunity} for {account} in {sector}, which was"
        f" established in {year} with {employees} employees"
        f" and has {revenue}."
        f" {agent}, managed by {manager}, handled {product} ({series}) {price}. "
        f"{deal_text} through {office} located in {location}."
    )


def get_ids(df):
    """Return the distinct values of the ``opportunity_id`` column as a list.

    Values keep the order in which they first appear in ``df``.
    """
    id_column = df["opportunity_id"]
    distinct_ids = id_column.unique()
    return distinct_ids.tolist()


In [13]:
# One descriptive sentence per record, in frame order.
sentences = list(map(row_to_sentence, (record for _idx, record in df.iterrows())))

In [17]:
# One dict per row, keyed by column name.
metadata = df.to_dict("records")