# Data Preprocessing Tools

## Importing the libraries

In [2]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


## Importing the dataset

In [3]:
# Read Salesforce.csv from the raw-data folder into a DataFrame
df = pd.read_csv(filepath_or_buffer="0_salesforce_raw_data/Salesforce.csv")

In [4]:
# Plain-text preview of the loaded frame (pandas abbreviates long/wide frames)
print(str(df))

    Sales Order Opportunity_ID                             Opportunity_Name  \
0          5708  OPP-2024-0091              Valley Medical Group EHR System   
1          5709  OPP-2024-0092             Mountain State Government Portal   
2          5710  OPP-2024-0093               Western Financial Services CRM   
3          5711  OPP-2024-0094               Lakeside Mall Retail Analytics   
4          5712  OPP-2024-0095       Precision Manufacturing Quality System   
..          ...            ...                                          ...   
91         5799  OPP-2024-0026    Western Regional Hospital Network Upgrade   
92         5800  OPP-2024-0027  Dynamic Marketing Agency CRM Implementation   
93         5801  OPP-2024-0028         Seaside Restaurant Chain POS Upgrade   
94         5802  OPP-2024-0029        Heritage Museum Virtual Tour Platform   
95         5803  OPP-2024-0030       Valley Agricultural Cooperative System   

                   Stage  Amount  Close_Date  Proba

In [5]:
# Dimensions of the loaded frame as a (rows, columns) tuple
print((len(df.index), len(df.columns)))

(96, 24)


In [6]:
# Column labels of the loaded frame
column_labels = df.columns
print(column_labels)

Index(['Sales Order', 'Opportunity_ID', 'Opportunity_Name', 'Stage', 'Amount',
       'Close_Date', 'Probability', 'Sales_Person', 'Sales_Person_Email',
       'Company_Name', 'Company_Industry', 'Contact_Name', 'Contact_Title',
       'Contact_Email', 'Contact_Phone', 'Created_Date', 'Last_Modified_Date',
       'Next_Step', 'Description', 'Type', 'Lead_Source', 'State', 'Zip_Code',
       'Payment_Terms'],
      dtype='object')


## Feature Engineering

In [30]:
# Parse the date columns so datetime arithmetic and the .dt accessor work.
df_merged['Last_Promoted'] = pd.to_datetime(df_merged['Last_Promoted'])
df_merged['Birthdate'] = pd.to_datetime(df_merged['Birthdate'])

# One twelfth of Fully_Loaded_Cost (presumably an annual figure -- TODO confirm).
df_merged['Monthly_FLC'] = df_merged['Fully_Loaded_Cost'] / 12

# Reference date for the "time since" features below.
# NOTE: this is the run date, so the derived values (and the printed preview)
# change whenever the notebook is re-executed on a different day.
today = pd.Timestamp.today()

# Fractional years since the last promotion: whole elapsed days / 365.25.
df_merged['Years_Since_Last_Promotion'] = (today - df_merged['Last_Promoted']).dt.days / 365.25

# Age in completed calendar years: the year difference, minus one when this
# year's birthday has not happened yet. Truncating days / 365.25 instead
# under-counts by a year on some birthdays (e.g. 2001-01-01 -> 2002-01-01 is
# 365 days, and 365 / 365.25 < 1 truncates to 0 rather than 1).
birthdate = df_merged['Birthdate']
birthday_pending = (birthdate.dt.month > today.month) | (
    (birthdate.dt.month == today.month) & (birthdate.dt.day > today.day)
)
# astype(int) raises if any Birthdate is missing (NaT), so gaps are not
# silently turned into a bogus age.
df_merged['age'] = (
    today.year - birthdate.dt.year - birthday_pending.astype(int)
).astype(int)

# Feature-engineered frame: an independent copy of df_merged without the raw
# date columns (drop returns a new frame, so no inplace mutation is needed).
df_featured = df_merged.drop(columns=['Last_Promoted', 'Birthdate']).copy()

print(df_featured[['Years_Since_Last_Promotion', 'age']].head())

   Years_Since_Last_Promotion  age
0                    2.814511   34
1                    1.577002   29
2                    1.259411   31
3                    2.310746   31
4                    5.796030   41


In [31]:
# First five rows of the feature-engineered frame
print(df_featured.iloc[:5])

                Current_Role  Employee_ID  Years_Of_Service    Department  \
0   Senior Software Engineer           57               5.2   Engineering   
1   Associate Data Scientist           58               1.7  Data Science   
2  Associate Product Manager           59               1.3       Product   
3           Business Analyst           60               3.9       Finance   
4    Chief Operating Officer           61              15.2     Executive   

   Employee_HR_rate  Hours_per_week  Fully_Loaded_Cost   Monthly_FLC  \
0                60              40          163355.20  13612.933333   
1               117              40           92926.13   7743.844167   
2               147              40          160081.53  13340.127500   
3               153              40          105775.38   8814.615000   
4                79              40          412283.65  34356.970833   

   Years_Since_Last_Promotion  age  
0                    2.814511   34  
1                    1.577002 

In [32]:
# Column labels remaining after feature engineering
featured_columns = df_featured.columns
print(featured_columns)

Index(['Current_Role', 'Employee_ID', 'Years_Of_Service', 'Department',
       'Employee_HR_rate', 'Hours_per_week', 'Fully_Loaded_Cost',
       'Monthly_FLC', 'Years_Since_Last_Promotion', 'age'],
      dtype='object')


In [33]:
# Dimensions of the feature-engineered frame as a (rows, columns) tuple
print((len(df_featured.index), len(df_featured.columns)))

(100, 10)


## Create Custom Dependent Column

In [34]:
# Boolean mask: True where more than 5 years have passed since the last promotion
condition = df_featured['Years_Since_Last_Promotion'].gt(5)

# Split the rows into the True / False buckets of that mask
grouped = df_featured.groupby(by=condition)


In [35]:
# Row count in each bucket of the > 5 years mask
group_sizes = grouped.size()
print(group_sizes)

Years_Since_Last_Promotion
False    89
True     11
dtype: int64


In [36]:
# Binary label: 1 when the last promotion is more than 5 years back, otherwise 0.
# NOTE(review): 'left' is derived purely from Years_Since_Last_Promotion; if that
# column is later used as a predictor of 'left' it determines the label exactly
# -- confirm this is intended.
df_featured['left'] = df_featured['Years_Since_Last_Promotion'].gt(5).astype(int)

# Snapshot the labelled frame under its own name
df_updated = df_featured.copy()

In [37]:
# First five rows, restricted to the source feature and the derived label
print(df_updated.head()[['Years_Since_Last_Promotion', 'left']])

   Years_Since_Last_Promotion  left
0                    2.814511     0
1                    1.577002     0
2                    1.259411     0
3                    2.310746     0
4                    5.796030     1


## Export Updated CSV

In [38]:
# Make sure the output folder is present (no error if it already exists)
os.makedirs(name="1_am_workday_updated_data", exist_ok=True)

# Write the labelled frame into that folder as a CSV file, without the row index
df_updated.to_csv(path_or_buf="1_am_workday_updated_data/1_am_workday_updated_data.csv", index=False)
