## Reading in the datasets

In [94]:
import pandas as pd
import numpy as np
import os
from sklearn.preprocessing import LabelEncoder, StandardScaler

data_files = os.path.join(os.getcwd(), "data")

accounts= pd.read_csv(os.path.join(data_files, "accounts.csv")) ## company-level info
products = pd.read_csv(os.path.join(data_files, "products.csv")) ## product catalog
pipeline = pd.read_csv(os.path.join(data_files, "sales_pipeline.csv")) ## main dataset, as this links to company info from accounts.csv, product info, and sales agent info
teams = pd.read_csv(os.path.join(data_files, "sales_teams.csv")) ## sales agents, manager, office

pipeline.head(10)


Unnamed: 0,opportunity_id,sales_agent,product,account,deal_stage,engage_date,close_date,close_value
0,1C1I7A6R,Moses Frase,GTX Plus Basic,Cancity,Won,2016-10-20,2017-03-01,1054.0
1,Z063OYW0,Darcel Schlecht,GTXPro,Isdom,Won,2016-10-25,2017-03-11,4514.0
2,EC4QE1BX,Darcel Schlecht,MG Special,Cancity,Won,2016-10-25,2017-03-07,50.0
3,MV1LWRNH,Moses Frase,GTX Basic,Codehow,Won,2016-10-25,2017-03-09,588.0
4,PE84CX4O,Zane Levy,GTX Basic,Hatfan,Won,2016-10-25,2017-03-02,517.0
5,ZNBS69V1,Anna Snelling,MG Special,Ron-tech,Won,2016-10-29,2017-03-01,49.0
6,9ME3374G,Vicki Laflamme,MG Special,J-Texon,Won,2016-10-30,2017-03-02,57.0
7,7GN8Q4LL,Markita Hansen,GTX Basic,Cheers,Won,2016-11-01,2017-03-07,601.0
8,OLK9LKZB,Niesha Huffines,GTX Plus Basic,Zumgoity,Won,2016-11-01,2017-03-03,1026.0
9,HAXMC4IX,James Ascencio,MG Advanced,,Engaging,2016-11-03,,


# **Merging datasets**


In [95]:
df = pipeline.merge(accounts, on="account", how="left")
print(df.shape)
df = df.merge(products, on="product", how="left")
print(df.shape)
df = df.merge(teams, on="sales_agent", how="left")
print(df.shape)
df.head(10)

(8800, 14)
(8800, 16)
(8800, 18)


Unnamed: 0,opportunity_id,sales_agent,product,account,deal_stage,engage_date,close_date,close_value,sector,year_established,revenue,employees,office_location,subsidiary_of,series,sales_price,manager,regional_office
0,1C1I7A6R,Moses Frase,GTX Plus Basic,Cancity,Won,2016-10-20,2017-03-01,1054.0,retail,2001.0,718.62,2448.0,United States,,GTX,1096.0,Dustin Brinkmann,Central
1,Z063OYW0,Darcel Schlecht,GTXPro,Isdom,Won,2016-10-25,2017-03-11,4514.0,medical,2002.0,3178.24,4540.0,United States,,,,Melvin Marxen,Central
2,EC4QE1BX,Darcel Schlecht,MG Special,Cancity,Won,2016-10-25,2017-03-07,50.0,retail,2001.0,718.62,2448.0,United States,,MG,55.0,Melvin Marxen,Central
3,MV1LWRNH,Moses Frase,GTX Basic,Codehow,Won,2016-10-25,2017-03-09,588.0,software,1998.0,2714.9,2641.0,United States,Acme Corporation,GTX,550.0,Dustin Brinkmann,Central
4,PE84CX4O,Zane Levy,GTX Basic,Hatfan,Won,2016-10-25,2017-03-02,517.0,services,1982.0,792.46,1299.0,United States,,GTX,550.0,Summer Sewald,West
5,ZNBS69V1,Anna Snelling,MG Special,Ron-tech,Won,2016-10-29,2017-03-01,49.0,medical,1992.0,3922.42,6837.0,United States,,MG,55.0,Dustin Brinkmann,Central
6,9ME3374G,Vicki Laflamme,MG Special,J-Texon,Won,2016-10-30,2017-03-02,57.0,retail,1989.0,1388.67,3583.0,United States,,MG,55.0,Celia Rouche,West
7,7GN8Q4LL,Markita Hansen,GTX Basic,Cheers,Won,2016-11-01,2017-03-07,601.0,entertainment,1993.0,4269.9,6472.0,United States,Massive Dynamic,GTX,550.0,Celia Rouche,West
8,OLK9LKZB,Niesha Huffines,GTX Plus Basic,Zumgoity,Won,2016-11-01,2017-03-03,1026.0,medical,1984.0,441.08,1210.0,United States,,GTX,1096.0,Melvin Marxen,Central
9,HAXMC4IX,James Ascencio,MG Advanced,,Engaging,2016-11-03,,,,,,,,,MG,3393.0,Summer Sewald,West


# **Data cleaning**

### Remove duplicates

In [96]:
df.drop_duplicates(inplace=True)
print(df.shape)

(8800, 18)


### Handle missing values

In [97]:
print(df.isnull().sum())

opportunity_id         0
sales_agent            0
product                0
account             1425
deal_stage             0
engage_date          500
close_date          2089
close_value         2089
sector              1425
year_established    1425
revenue             1425
employees           1425
office_location     1425
subsidiary_of       7508
series              1480
sales_price         1480
manager                0
regional_office        0
dtype: int64


In [98]:
df.dtypes

Unnamed: 0,0
opportunity_id,object
sales_agent,object
product,object
account,object
deal_stage,object
engage_date,object
close_date,object
close_value,float64
sector,object
year_established,float64


In [99]:
categorical_columns = df.select_dtypes(include="object").columns
numerical_columns = df.select_dtypes(include="float64").columns

#Replacing missing values in categorical columns with the mode
for col in categorical_columns:
  df[col]= df[col].fillna(df[col].mode()[0])

#Replacing missing values in numerical columns with the median (not the mean, because mean can be skewed)
for col in numerical_columns:
  df[col] = df[col].fillna(df[col].median())

In [100]:
df.isnull().sum()

Unnamed: 0,0
opportunity_id,0
sales_agent,0
product,0
account,0
deal_stage,0
engage_date,0
close_date,0
close_value,0
sector,0
year_established,0


In [101]:
df.shape

(8800, 18)

### Converting date columns

In [102]:
date_columns = ["engage_date", "close_date"]
for col in date_columns:
  df[col]=pd.to_datetime(df[col], errors="coerce")

### Standardizing string categorical columns

In [103]:
for col in categorical_columns:
  df[col] = df[col].astype(str).str.strip().str.upper()

### Handling outliers in numerical columns

In [104]:
for col in numerical_columns:
  Q1 = df[col].quantile(0.25)
  Q3 = df[col].quantile(0.75)
  IQR = Q3-Q1

  lower_side = Q1 -1.5 *IQR
  upper_side = Q3 +1.5 * IQR

  df[col]=df[col].clip(lower=lower_side, upper=upper_side)

In [105]:
df.shape

(8800, 18)

In [106]:
df.describe()

Unnamed: 0,close_value,year_established,revenue,employees,sales_price
count,8800.0,8800.0,8800.0,8800.0,8800.0
mean,832.453395,1995.366705,2106.799017,4640.427045,1665.766477
std,1023.851474,8.31702,1740.756718,4089.152878,1760.8532
min,0.0,1979.0,4.54,9.0,55.0
25%,0.0,1990.0,718.62,1588.0,550.0
50%,472.0,1995.0,1698.2,3492.0,1096.0
75%,1085.25,2000.0,2819.5,6472.0,3393.0
max,2713.125,2015.0,5970.82,13798.0,7657.5


# **Data transformation**

### Encode categorical variables

In [107]:
label_enc = LabelEncoder()
for col in categorical_columns:
  df[col] = label_enc.fit_transform(df[col])

### Standardize numerical variables

In [108]:
scaler = StandardScaler()
df[numerical_columns] = scaler.fit_transform(df[numerical_columns])

# **Verifying**

In [109]:
print(df.shape)

#Missing values
print(df.isnull().sum())

#Data types
print(df.dtypes)

print(df.describe())

(8800, 18)
opportunity_id      0
sales_agent         0
product             0
account             0
deal_stage          0
engage_date         0
close_date          0
close_value         0
sector              0
year_established    0
revenue             0
employees           0
office_location     0
subsidiary_of       0
series              0
sales_price         0
manager             0
regional_office     0
dtype: int64
opportunity_id        int64
sales_agent           int64
product               int64
account               int64
deal_stage            int64
engage_date           int64
close_date            int64
close_value         float64
sector                int64
year_established    float64
revenue             float64
employees           float64
office_location       int64
subsidiary_of         int64
series                int64
sales_price         float64
manager               int64
regional_office       int64
dtype: object
       opportunity_id  sales_agent      product      account  

In [110]:
#Categorical columns after encoding
for col in df.select_dtypes(include="int64").columns:
  print(col, df[col].nunique())

opportunity_id 8800
sales_agent 30
product 7
account 85
deal_stage 4
engage_date 421
close_date 306
sector 10
office_location 15
subsidiary_of 7
series 3
manager 6
regional_office 3
