In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os

In [None]:
# Locate the shared data folder: one level above the current working directory.
curr_dir = os.getcwd()
data_path = os.path.join(curr_dir, os.pardir, "data")

In [3]:
# Echo the resolved data directory as a quick sanity check of the path built above.
data_path

'c:\\Users\\mrsai\\Documents\\Team2A\\src\\..\\data'

In [None]:
# Load the accounts table; the bare name on the last line renders the frame.
# NOTE(review): this is the only input read from the "original" subfolder -- the
# other CSVs in this notebook are read directly from data_path. Confirm the
# folder layout is intended.
accounts_csv = os.path.join(data_path, "original", "accounts.csv")
acc_df = pd.read_csv(accounts_csv)
acc_df

Unnamed: 0,account,sector,year_established,revenue,employees,office_location,subsidiary_of
0,Acme Corporation,technolgy,1996,1100.04,2822,United States,
1,Betasoloin,medical,1999,251.41,495,United States,
2,Betatech,medical,1986,647.18,1185,Kenya,
3,Bioholding,medical,2012,587.34,1356,Philipines,
4,Bioplex,medical,1991,326.82,1016,United States,
...,...,...,...,...,...,...,...
80,Zathunicon,retail,2010,71.12,144,United States,
81,Zencorporation,technolgy,2011,40.79,142,China,
82,Zoomit,entertainment,1992,324.19,978,United States,
83,Zotware,software,1979,4478.47,13809,United States,


In [7]:
# Load the products table (product, series, sales_price) and display it.
products_csv = os.path.join(data_path, "products.csv")
products_df = pd.read_csv(products_csv)
products_df

Unnamed: 0,product,series,sales_price
0,GTX Basic,GTX,550
1,GTX Pro,GTX,4821
2,MG Special,MG,55
3,MG Advanced,MG,3393
4,GTX Plus Pro,GTX,5482
5,GTX Plus Basic,GTX,1096
6,GTK 500,GTK,26768


In [8]:
# Load the sales pipeline (one row per opportunity) and display it.
sales_pipeline_csv = os.path.join(data_path, "sales_pipeline.csv")
sales_pipeline_df = pd.read_csv(sales_pipeline_csv)
sales_pipeline_df

Unnamed: 0,opportunity_id,sales_agent,product,account,deal_stage,engage_date,close_date,close_value
0,1C1I7A6R,Moses Frase,GTX Plus Basic,Cancity,Won,2016-10-20,2017-03-01,1054.0
1,Z063OYW0,Darcel Schlecht,GTXPro,Isdom,Won,2016-10-25,2017-03-11,4514.0
2,EC4QE1BX,Darcel Schlecht,MG Special,Cancity,Won,2016-10-25,2017-03-07,50.0
3,MV1LWRNH,Moses Frase,GTX Basic,Codehow,Won,2016-10-25,2017-03-09,588.0
4,PE84CX4O,Zane Levy,GTX Basic,Hatfan,Won,2016-10-25,2017-03-02,517.0
...,...,...,...,...,...,...,...,...
8795,9MIWFW5J,Versie Hillebrand,MG Advanced,,Prospecting,,,
8796,6SLKZ8FI,Versie Hillebrand,MG Advanced,,Prospecting,,,
8797,LIB4KUZJ,Versie Hillebrand,MG Advanced,,Prospecting,,,
8798,18IUIUK0,Versie Hillebrand,MG Advanced,,Prospecting,,,


In [13]:
# Count missing values per column.
# "account" contains NaNs: they get replaced with "Unknown" instead of dropping
# the rows, so the remaining fields stay available to the SentenceTransformer.
sales_pipeline_df.isnull().sum()


opportunity_id       0
sales_agent          0
product              0
account           1425
deal_stage           0
engage_date        500
close_date        2089
close_value       2089
dtype: int64

In [14]:
# Replace missing account names with the placeholder "Unknown" (rows are kept).
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the column selection: that form is chained assignment
# through an inplace method, which raises a FutureWarning on recent pandas and
# does not modify the DataFrame once Copy-on-Write is active.
sales_pipeline_df['account'] = sales_pipeline_df['account'].fillna("Unknown")

# Re-print the per-column NaN counts; "account" should now be 0.
print(sales_pipeline_df.isna().sum())

opportunity_id       0
sales_agent          0
product              0
account              0
deal_stage           0
engage_date        500
close_date        2089
close_value       2089
dtype: int64


In [None]:
# The sales pipeline spells one product "GTXPro" (no space) while the products
# table spells it "GTX Pro". Left as-is, that mismatch would make the inner join
# on "product" drop every "GTXPro" row.
# Print the distinct product names of both tables to compare them.
for frame in (sales_pipeline_df, products_df):
    print(frame['product'].unique())


['GTX Plus Basic' 'GTXPro' 'MG Special' 'GTX Basic' 'MG Advanced'
 'GTX Plus Pro' 'GTK 500']
['GTX Basic' 'GTX Pro' 'MG Special' 'MG Advanced' 'GTX Plus Pro'
 'GTX Plus Basic' 'GTK 500']


In [None]:
# Normalise the product spelling so it matches the products table.
# Assign the result back rather than using replace(inplace=True) on the column
# selection (chained inplace assignment is unreliable under Copy-on-Write).
sales_pipeline_df['product'] = sales_pipeline_df['product'].replace("GTXPro", "GTX Pro")

# Verify that every product in the pipeline now exists in the products table,
# i.e. the inner join on "product" will not drop rows.
# (The earlier check compared the column with itself via ndarray.sort(), which
# sorts in place and returns None, so it was always True.)
set(sales_pipeline_df['product'].unique()) <= set(products_df['product'].unique())

True

In [15]:

# Load the sales teams table (agent -> manager / regional office) and display it.
sales_teams_csv = os.path.join(data_path, "sales_teams.csv")
sales_teams_df = pd.read_csv(sales_teams_csv)
sales_teams_df

Unnamed: 0,sales_agent,manager,regional_office
0,Anna Snelling,Dustin Brinkmann,Central
1,Cecily Lampkin,Dustin Brinkmann,Central
2,Versie Hillebrand,Dustin Brinkmann,Central
3,Lajuana Vencill,Dustin Brinkmann,Central
4,Moses Frase,Dustin Brinkmann,Central
5,Jonathan Berthelot,Melvin Marxen,Central
6,Marty Freudenburg,Melvin Marxen,Central
7,Gladys Colclough,Melvin Marxen,Central
8,Niesha Huffines,Melvin Marxen,Central
9,Darcel Schlecht,Melvin Marxen,Central


In [None]:
# Check that every sales agent in the sales pipeline also appears in the sales
# teams table, so the inner join on "sales_agent" keeps every pipeline row.
# A set subset test is used because ndarray.sort() sorts in place and returns
# None -- comparing two .sort() results is always `None == None`, i.e. True.
set(sales_pipeline_df['sales_agent'].unique()) <= set(sales_teams_df['sales_agent'].unique())

In [None]:
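# Claim: every non-null account in the sales pipeline maps to an account in acc_df.
# NOTE: the disabled check below cannot confirm this if re-enabled -- ndarray.sort()
# returns None, so the expression compares None == None and is always True.
# sales_pipeline_df['account'].dropna().unique().sort() == acc_df['account'].dropna().unique().sort()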

In [None]:
# Show the number of missing values in each sales pipeline column.
sales_pipeline_df.isna().sum()

In [None]:
# Build the final dataset by enriching each pipeline row with its agent's team,
# its product details and its account details.
# validate="many_to_one" makes pandas raise a MergeError if a lookup table has
# duplicate keys, which would otherwise silently duplicate pipeline rows.

# 1) Attach manager / regional_office for each sales agent.
agent_merged = pd.merge(
    sales_pipeline_df,
    sales_teams_df,
    on="sales_agent",
    how="inner",
    validate="many_to_one",
)

# 2) Attach series / sales_price for each product.
product_merged = pd.merge(
    agent_merged,
    products_df,
    on="product",
    how="inner",
    validate="many_to_one",
)

# 3) Attach account details. This is a left join so that rows whose account is
#    the placeholder "Unknown" (no match in acc_df) are kept, not dropped.
final_df = pd.merge(
    product_merged,
    acc_df,
    on="account",
    how="left",
    validate="many_to_one",
)

# The inner joins must not have dropped any opportunity (e.g. because of a
# misspelled product or an agent missing from the sales teams table).
assert len(final_df) == len(sales_pipeline_df), (
    f"merge changed the row count: {len(sales_pipeline_df)} -> {len(final_df)}"
)

final_df

Unnamed: 0,opportunity_id,sales_agent,product,account,deal_stage,engage_date,close_date,close_value,manager,regional_office,series,sales_price,sector,year_established,revenue,employees,office_location,subsidiary_of
0,1C1I7A6R,Moses Frase,GTX Plus Basic,Cancity,Won,2016-10-20,2017-03-01,1054.0,Dustin Brinkmann,Central,GTX,1096,retail,2001.0,718.62,2448.0,United States,
1,I043RXJV,Moses Frase,GTX Plus Basic,Dontechi,Lost,2016-12-03,2017-03-17,0.0,Dustin Brinkmann,Central,GTX,1096,software,1982.0,4618.00,10083.0,United States,
2,SRFVQ0HG,Moses Frase,GTX Plus Basic,Codehow,Lost,2017-01-20,2017-03-24,0.0,Dustin Brinkmann,Central,GTX,1096,software,1998.0,2714.90,2641.0,United States,Acme Corporation
3,L33YUX9V,Moses Frase,GTX Plus Basic,Rangreen,Won,2017-02-06,2017-05-15,1068.0,Dustin Brinkmann,Central,GTX,1096,technolgy,1987.0,2938.67,8775.0,Panama,
4,QKVUGRN9,Moses Frase,GTX Plus Basic,Rangreen,Lost,2017-02-22,2017-05-30,0.0,Dustin Brinkmann,Central,GTX,1096,technolgy,1987.0,2938.67,8775.0,Panama,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8795,6WIQG92N,Elease Gluck,GTK 500,Konex,Lost,2017-10-05,2017-10-18,0.0,Celia Rouche,West,GTK,26768,technolgy,1980.0,7708.38,13756.0,United States,
8796,H3K2E35I,Elease Gluck,GTK 500,Cheers,Won,2017-10-28,2017-10-29,27971.0,Celia Rouche,West,GTK,26768,entertainment,1993.0,4269.90,6472.0,United States,Massive Dynamic
8797,GB6C2UK5,Elease Gluck,GTK 500,Xx-holding,Won,2017-11-18,2017-12-04,29220.0,Celia Rouche,West,GTK,26768,finance,1993.0,7537.24,20293.0,United States,
8798,DO6VKC2G,Marty Freudenburg,GTK 500,Unknown,Engaging,2017-08-01,,,Melvin Marxen,Central,GTK,26768,,,,,,


In [None]:
# Count missing values per column of the merged frame.
final_df.isna().sum()


opportunity_id         0
sales_agent            0
product                0
account                0
deal_stage             0
engage_date          500
close_date          2089
close_value         2089
manager                0
regional_office        0
series                 0
sales_price            0
sector              1425
year_established    1425
revenue             1425
employees           1425
office_location     1425
subsidiary_of       7508
dtype: int64

In [None]:
# Show the dtype of every column in the merged frame.
final_df.dtypes

opportunity_id       object
sales_agent          object
product              object
account              object
deal_stage           object
engage_date          object
close_date           object
close_value         float64
manager              object
regional_office      object
series               object
sales_price           int64
sector               object
year_established    float64
revenue             float64
employees           float64
office_location      object
subsidiary_of        object
dtype: object

In [25]:
# The left join leaves NaN in the account-derived columns for rows whose account
# is "Unknown". Label the text columns "Unknown"; the numeric columns
# (year_established, revenue, employees) are intentionally left as NaN.
text_fill_values = dict.fromkeys(["office_location", "subsidiary_of", "sector"], "Unknown")
final_df.fillna(value=text_fill_values, inplace=True)

In [26]:
final_df.isna().sum()


opportunity_id         0
sales_agent            0
product                0
account                0
deal_stage             0
engage_date          500
close_date          2089
close_value         2089
manager                0
regional_office        0
series                 0
sales_price            0
sector                 0
year_established    1425
revenue             1425
employees           1425
office_location        0
subsidiary_of          0
dtype: int64

In [27]:
# Convert the date strings to datetime64 for easier handling; missing or
# unparseable values become NaT (errors="coerce") instead of raising.
for date_col in ("close_date", "engage_date"):
    final_df[date_col] = pd.to_datetime(final_df[date_col], errors="coerce")


In [None]:
# Write the merged dataset to <data_path>/processed/sentence_tm_dataset.csv.
# Create the output folder first: DataFrame.to_csv raises OSError when the
# target directory does not exist (e.g. on a fresh checkout).
processed_dir = os.path.join(data_path, "processed")
os.makedirs(processed_dir, exist_ok=True)
final_df.to_csv(os.path.join(processed_dir, "sentence_tm_dataset.csv"), index=False)