In [1]:
#import python libraries
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import os

In [2]:
#load datasets into Pandas DataFrames
accounts = pd.read_csv("accounts.csv")
products = pd.read_csv("products.csv")
sales_pipelines = pd.read_csv("sales_pipeline.csv")
sales_teams = pd.read_csv("sales_teams.csv")

In [3]:
#print the columns for each dataset
print("Accounts:", accounts.columns.tolist())
print("Products:", products.columns.tolist())
print("Sales Pipelines:", sales_pipelines.columns.tolist())
print("Sales Teams:", sales_teams.columns.tolist())

Accounts: ['account', 'sector', 'year_established', 'revenue', 'employees', 'office_location', 'subsidiary_of']
Products: ['product', 'series', 'sales_price']
Sales Pipelines: ['opportunity_id', 'sales_agent', 'product', 'account', 'deal_stage', 'engage_date', 'close_date', 'close_value']
Sales Teams: ['sales_agent', 'manager', 'regional_office']


In [4]:
#merge the datasets
merged = pd.merge(sales_pipelines, accounts, on="account", how="left")
merged = pd.merge(merged, products, on="product", how="left")
merged = pd.merge(merged, sales_teams, on="sales_agent", how="left")
merged.head()

Unnamed: 0,opportunity_id,sales_agent,product,account,deal_stage,engage_date,close_date,close_value,sector,year_established,revenue,employees,office_location,subsidiary_of,series,sales_price,manager,regional_office
0,1C1I7A6R,Moses Frase,GTX Plus Basic,Cancity,Won,2016-10-20,2017-03-01,1054.0,retail,2001.0,718.62,2448.0,United States,,GTX,1096.0,Dustin Brinkmann,Central
1,Z063OYW0,Darcel Schlecht,GTXPro,Isdom,Won,2016-10-25,2017-03-11,4514.0,medical,2002.0,3178.24,4540.0,United States,,,,Melvin Marxen,Central
2,EC4QE1BX,Darcel Schlecht,MG Special,Cancity,Won,2016-10-25,2017-03-07,50.0,retail,2001.0,718.62,2448.0,United States,,MG,55.0,Melvin Marxen,Central
3,MV1LWRNH,Moses Frase,GTX Basic,Codehow,Won,2016-10-25,2017-03-09,588.0,software,1998.0,2714.9,2641.0,United States,Acme Corporation,GTX,550.0,Dustin Brinkmann,Central
4,PE84CX4O,Zane Levy,GTX Basic,Hatfan,Won,2016-10-25,2017-03-02,517.0,services,1982.0,792.46,1299.0,United States,,GTX,550.0,Summer Sewald,West


In [5]:
#clean the dataset
merged.drop(columns=["opportunity_id", "year_established", "subsidiary_of"], inplace=True) #drop uneccessary columns
merged.drop_duplicates(inplace=True) #drop duplicated columns
merged['product'] = merged['product'].replace("GTXPro", "GTX Pro")
merged.head()

Unnamed: 0,sales_agent,product,account,deal_stage,engage_date,close_date,close_value,sector,revenue,employees,office_location,series,sales_price,manager,regional_office
0,Moses Frase,GTX Plus Basic,Cancity,Won,2016-10-20,2017-03-01,1054.0,retail,718.62,2448.0,United States,GTX,1096.0,Dustin Brinkmann,Central
1,Darcel Schlecht,GTX Pro,Isdom,Won,2016-10-25,2017-03-11,4514.0,medical,3178.24,4540.0,United States,,,Melvin Marxen,Central
2,Darcel Schlecht,MG Special,Cancity,Won,2016-10-25,2017-03-07,50.0,retail,718.62,2448.0,United States,MG,55.0,Melvin Marxen,Central
3,Moses Frase,GTX Basic,Codehow,Won,2016-10-25,2017-03-09,588.0,software,2714.9,2641.0,United States,GTX,550.0,Dustin Brinkmann,Central
4,Zane Levy,GTX Basic,Hatfan,Won,2016-10-25,2017-03-02,517.0,services,792.46,1299.0,United States,GTX,550.0,Summer Sewald,West


In [6]:
#view columns for potential missing values
print(merged.isna().sum())

sales_agent           0
product               0
account            1079
deal_stage            0
engage_date         213
close_date         1737
close_value        1737
sector             1079
revenue            1079
employees          1079
office_location    1079
series             1421
sales_price        1421
manager               0
regional_office       0
dtype: int64


In [7]:
#convert date columns to datetime
merged["engage_date"] = pd.to_datetime(merged["engage_date"], errors="coerce")
merged["close_date"]  = pd.to_datetime(merged["close_date"], errors="coerce")

#identify categorical and numeric columns
categorical_columns = merged.select_dtypes(include="object").columns
numerical_columns   = merged.select_dtypes(include="number").columns

#handle missing values for categorical variables with mode imputation
for col in categorical_columns:
    if merged[col].isna().sum() > 0:
        merged[col] = merged[col].fillna(merged[col].mode()[0])

#handle missing values for numerical variables with median imputation
for col in numerical_columns:
    if col not in ["close_value", "engage_date", "close_date"]:  #not including null values since the deal has not been closed
        merged[col] = merged[col].fillna(merged[col].median())
        
#standardize categorical text columns
for col in categorical_columns:
    merged[col] = merged[col].astype(str).str.strip().str.upper()

In [8]:
# Handle outliers with IQR clipping (except close_value)
for col in numerical_columns:
    if col != "close_value":  # close_value should not be altered
        Q1 = merged[col].quantile(0.25)
        Q3 = merged[col].quantile(0.75)
        IQR = Q3 - Q1

        lower_side = Q1 - 1.5 * IQR
        upper_side = Q3 + 1.5 * IQR

        merged[col] = merged[col].clip(lower=lower_side, upper=upper_side)

#handle outliers (except close_value)
    for col in numerical_columns:
        if col != "close_value": # close_value should not be included
            Q1 = merged[col].quantile(0.25)
            Q3 = merged[col].quantile(0.75)
            IQR = Q3 - Q1

            lower_side = Q1 - 1.5 * IQR
            upper_side = Q3 + 1.5 * IQR

            merged[col] = merged[col].clip(lower=lower_side, upper=upper_side)

#apply log transformation to skewed numerical columns
cols_to_log = ["close_value", "revenue", "employees"]

for col in cols_to_log:
    merged[col + "_log"] = np.log1p(merged[col])

#quick check of transformations
merged[["close_value", "close_value_log",
"revenue", "revenue_log",
"employees", "employees_log"]].head()

Unnamed: 0,close_value,close_value_log,revenue,revenue_log,employees,employees_log
0,1054.0,6.961296,718.62,6.578723,2448.0,7.803435
1,4514.0,8.41516,3178.24,8.064397,4540.0,8.420903
2,50.0,3.931826,718.62,6.578723,2448.0,7.803435
3,588.0,6.378426,2714.9,7.906879,2641.0,7.879291
4,517.0,6.249975,792.46,6.676403,1299.0,7.17012


In [9]:
#encode all categorical columns
label_enc = LabelEncoder()
for col in categorical_columns:
    merged[col] = label_enc.fit_transform(merged[col])

In [10]:
#re-view columns for potential missing values
merged.isnull().sum()

sales_agent           0
product               0
account               0
deal_stage            0
engage_date         213
close_date         1737
close_value        1737
sector                0
revenue               0
employees             0
office_location       0
series                0
sales_price           0
manager               0
regional_office       0
close_value_log    1737
revenue_log           0
employees_log         0
dtype: int64

In [12]:
merged.shape

(8448, 18)

In [13]:
merged.head(10)

Unnamed: 0,sales_agent,product,account,deal_stage,engage_date,close_date,close_value,sector,revenue,employees,office_location,series,sales_price,manager,regional_office,close_value_log,revenue_log,employees_log
0,20,2,8,3,2016-10-20,2017-03-01,1054.0,5,718.62,2448.0,14,1,1096.0,2,0,6.961296,6.578723,7.803435
1,6,4,38,3,2016-10-25,2017-03-11,4514.0,4,3178.24,4540.0,14,1,1096.0,3,0,8.41516,8.064397,8.420903
2,6,6,8,3,2016-10-25,2017-03-07,50.0,5,718.62,2448.0,14,2,55.0,3,0,3.931826,6.578723,7.803435
3,20,1,10,3,2016-10-25,2017-03-09,588.0,7,2714.9,2641.0,14,1,550.0,2,0,6.378426,7.906879,7.879291
4,29,1,34,3,2016-10-25,2017-03-02,517.0,6,792.46,1299.0,14,1,550.0,5,2,6.249975,6.676403,7.17012
5,0,6,58,3,2016-10-29,2017-03-01,49.0,4,3922.42,6837.0,14,2,55.0,2,0,3.912023,8.274719,8.830251
6,26,6,40,3,2016-10-30,2017-03-02,57.0,5,1388.67,3583.0,14,2,55.0,1,2,4.060443,7.236822,8.184235
7,17,1,9,3,2016-11-01,2017-03-07,601.0,1,4269.9,6472.0,14,1,550.0,1,2,6.400257,8.35958,8.775395
8,21,2,84,3,2016-11-01,2017-03-03,1026.0,4,441.08,1210.0,14,1,1096.0,3,0,6.934397,6.091491,7.099202
9,12,5,35,0,2016-11-03,NaT,,5,1698.2,3492.0,14,2,3393.0,5,2,,7.437913,8.158516
