### Imports

In [69]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
import os

# **Data Cleaning**

### Load datasets

In [70]:
data_files = os.path.join(os.getcwd(), "data") #Alter accordingly if needed

accounts_df = pd.read_csv(os.path.join(data_files, "accounts.csv"))
products_df = pd.read_csv(os.path.join(data_files, "products.csv"))
sales_pipeline_df = pd.read_csv(os.path.join(data_files, "sales_pipeline.csv"))
sales_teams_df = pd.read_csv(os.path.join(data_files, "sales_teams.csv"))

### Remove duplicates



In [71]:
accounts_df.drop_duplicates(inplace=True)
products_df.drop_duplicates(inplace=True)
sales_pipeline_df.drop_duplicates(inplace=True)
sales_teams_df.drop_duplicates(inplace=True)

Fixing this issue: Sales pipeline product column is "GTXPro" without a space, while product's product has "GTX Pro" with space.

In [72]:
sales_pipeline_df['product'] = sales_pipeline_df['product'].replace("GTXPro", "GTX Pro")

In [73]:
#Verifying that worked as intended
print(sales_pipeline_df['product'].unique())

['GTX Plus Basic' 'GTX Pro' 'MG Special' 'GTX Basic' 'MG Advanced'
 'GTX Plus Pro' 'GTK 500']


### Merge datasets

In [74]:
merged_df = (
  sales_pipeline_df
  .merge(sales_teams_df, on="sales_agent", how="left")
  .merge(products_df, on="product", how="left")
  .merge(accounts_df, on="account", how="left")
)
print(merged_df.shape)
merged_df.head(10)

(8800, 18)


Unnamed: 0,opportunity_id,sales_agent,product,account,deal_stage,engage_date,close_date,close_value,manager,regional_office,series,sales_price,sector,year_established,revenue,employees,office_location,subsidiary_of
0,1C1I7A6R,Moses Frase,GTX Plus Basic,Cancity,Won,2016-10-20,2017-03-01,1054.0,Dustin Brinkmann,Central,GTX,1096,retail,2001.0,718.62,2448.0,United States,
1,Z063OYW0,Darcel Schlecht,GTX Pro,Isdom,Won,2016-10-25,2017-03-11,4514.0,Melvin Marxen,Central,GTX,4821,medical,2002.0,3178.24,4540.0,United States,
2,EC4QE1BX,Darcel Schlecht,MG Special,Cancity,Won,2016-10-25,2017-03-07,50.0,Melvin Marxen,Central,MG,55,retail,2001.0,718.62,2448.0,United States,
3,MV1LWRNH,Moses Frase,GTX Basic,Codehow,Won,2016-10-25,2017-03-09,588.0,Dustin Brinkmann,Central,GTX,550,software,1998.0,2714.9,2641.0,United States,Acme Corporation
4,PE84CX4O,Zane Levy,GTX Basic,Hatfan,Won,2016-10-25,2017-03-02,517.0,Summer Sewald,West,GTX,550,services,1982.0,792.46,1299.0,United States,
5,ZNBS69V1,Anna Snelling,MG Special,Ron-tech,Won,2016-10-29,2017-03-01,49.0,Dustin Brinkmann,Central,MG,55,medical,1992.0,3922.42,6837.0,United States,
6,9ME3374G,Vicki Laflamme,MG Special,J-Texon,Won,2016-10-30,2017-03-02,57.0,Celia Rouche,West,MG,55,retail,1989.0,1388.67,3583.0,United States,
7,7GN8Q4LL,Markita Hansen,GTX Basic,Cheers,Won,2016-11-01,2017-03-07,601.0,Celia Rouche,West,GTX,550,entertainment,1993.0,4269.9,6472.0,United States,Massive Dynamic
8,OLK9LKZB,Niesha Huffines,GTX Plus Basic,Zumgoity,Won,2016-11-01,2017-03-03,1026.0,Melvin Marxen,Central,GTX,1096,medical,1984.0,441.08,1210.0,United States,
9,HAXMC4IX,James Ascencio,MG Advanced,,Engaging,2016-11-03,,,Summer Sewald,West,MG,3393,,,,,,


### Handle missing values with mode/median imputation

In [75]:
categorical_columns = merged_df.select_dtypes(include="object").columns
numerical_columns = merged_df.select_dtypes(include="float64").columns

#Handling missing values in categorical variables with mode imputation
for col in categorical_columns:
  merged_df[col]=merged_df[col].fillna(merged_df[col].mode()[0])

#Handling missing values in numerical variables with median imputation
for col in numerical_columns:
  if col != "close_value" : # close_value should not be replaced with fake values, since a null value means the deal hasn't been closed yet.
    merged_df[col] = merged_df[col].fillna(merged_df[col].median())

### Convert date columns to proper datetime format

In [76]:
merged_df["engage_date"]=pd.to_datetime(merged_df["engage_date"], errors="coerce")
merged_df["close_date"]=pd.to_datetime(merged_df["close_date"], errors="coerce")

### Standardize categorical values

In [77]:
for col in categorical_columns:
    merged_df[col] = merged_df[col].astype(str).str.strip().str.upper()

### Handling outliers

In [78]:
for col in numerical_columns:
  if col != "close_value": #close_value should not be replaced with fake values, since a null value means the deal hasn't been closed yet.
    Q1 = merged_df[col].quantile(0.25)
    Q3 = merged_df[col].quantile(0.75)
    IQR = Q3-Q1

    lower_side = Q1 -1.5 *IQR
    upper_side = Q3 +1.5 * IQR

    merged_df[col]=merged_df[col].clip(lower=lower_side, upper=upper_side)

In [79]:
cols_to_log = ["close_value", "revenue", "employees"]

for col in cols_to_log:
    merged_df[col + "_log"] = np.log1p(merged_df[col])

merged_df[["close_value", "close_value_log",
           "revenue", "revenue_log",
           "employees", "employees_log"]].head()

Unnamed: 0,close_value,close_value_log,revenue,revenue_log,employees,employees_log
0,1054.0,6.961296,718.62,6.578723,2448.0,7.803435
1,4514.0,8.41516,3178.24,8.064397,4540.0,8.420903
2,50.0,3.931826,718.62,6.578723,2448.0,7.803435
3,588.0,6.378426,2714.9,7.906879,2641.0,7.879291
4,517.0,6.249975,792.46,6.676403,1299.0,7.17012


# **Data Transformation**

Encode categorical variables

In [80]:
label_enc = LabelEncoder()
for col in categorical_columns:
  merged_df[col] = label_enc.fit_transform(merged_df[col])

Standardize numerical variables

In [81]:
#We can uncomment this if we would like to standardize numerical variables
# scaler = StandardScaler()
# numeric_to_scale = ["year_established", "sales_price"]
# merged_df[numeric_to_scale] = scaler.fit_transform(merged_df[numeric_to_scale])

Filtering/dropping unnecessary columns

In [82]:
# opportunity_id is the only column safe to remove because it is a unique identifier and has no predictive or explanatory value for the model.
merged_df.drop(columns=["opportunity_id"], inplace=True)

# **Verifying**

In [83]:
merged_df.isnull().sum()

Unnamed: 0,0
sales_agent,0
product,0
account,0
deal_stage,0
engage_date,0
close_date,0
close_value,2089
manager,0
regional_office,0
series,0


In [84]:
merged_df.shape

(8800, 20)

In [85]:
merged_df.head(10)

Unnamed: 0,sales_agent,product,account,deal_stage,engage_date,close_date,close_value,manager,regional_office,series,sales_price,sector,year_established,revenue,employees,office_location,subsidiary_of,close_value_log,revenue_log,employees_log
0,20,2,8,3,0,0,1054.0,2,0,1,1096,5,2001.0,718.62,2448.0,14,0,6.961296,6.578723,7.803435
1,6,4,38,3,1,10,4514.0,3,0,1,4821,4,2002.0,3178.24,4540.0,14,0,8.41516,8.064397,8.420903
2,6,6,8,3,1,6,50.0,3,0,2,55,5,2001.0,718.62,2448.0,14,0,3.931826,6.578723,7.803435
3,20,1,10,3,1,8,588.0,2,0,1,550,7,1998.0,2714.9,2641.0,14,0,6.378426,7.906879,7.879291
4,29,1,34,3,1,1,517.0,5,2,1,550,6,1982.0,792.46,1299.0,14,0,6.249975,6.676403,7.17012
5,0,6,58,3,2,0,49.0,2,0,2,55,4,1992.0,3922.42,6837.0,14,0,3.912023,8.274719,8.830251
6,26,6,40,3,3,1,57.0,1,2,2,55,5,1989.0,1388.67,3583.0,14,0,4.060443,7.236822,8.184235
7,17,1,9,3,4,6,601.0,1,2,1,550,1,1993.0,4269.9,6472.0,14,4,6.400257,8.35958,8.775395
8,21,2,84,3,4,2,1026.0,3,0,1,1096,4,1984.0,441.08,1210.0,14,0,6.934397,6.091491,7.099202
9,12,5,35,0,5,82,,5,2,2,3393,5,1995.0,1698.2,3492.0,14,0,,7.437913,8.158516
