# Data Preprocessing Tools

## Importing the libraries

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


## Importing the dataset

In [2]:
# Load csv file
df = pd.read_csv("0_salesforce_raw_data/Salesforce.csv")

In [3]:
print(df)

    Sales Order Opportunity_ID                             Opportunity_Name  \
0          5708  OPP-2024-0091              Valley Medical Group EHR System   
1          5709  OPP-2024-0092             Mountain State Government Portal   
2          5710  OPP-2024-0093               Western Financial Services CRM   
3          5711  OPP-2024-0094               Lakeside Mall Retail Analytics   
4          5712  OPP-2024-0095       Precision Manufacturing Quality System   
..          ...            ...                                          ...   
91         5799  OPP-2024-0026    Western Regional Hospital Network Upgrade   
92         5800  OPP-2024-0027  Dynamic Marketing Agency CRM Implementation   
93         5801  OPP-2024-0028         Seaside Restaurant Chain POS Upgrade   
94         5802  OPP-2024-0029        Heritage Museum Virtual Tour Platform   
95         5803  OPP-2024-0030       Valley Agricultural Cooperative System   

                   Stage  Amount  Close_Date  Proba

In [4]:
print(df.shape)

(96, 24)


In [5]:
print(df.columns)

Index(['Sales Order', 'Opportunity_ID', 'Opportunity_Name', 'Stage', 'Amount',
       'Close_Date', 'Probability', 'Sales_Person', 'Sales_Person_Email',
       'Company_Name', 'Company_Industry', 'Contact_Name', 'Contact_Title',
       'Contact_Email', 'Contact_Phone', 'Created_Date', 'Last_Modified_Date',
       'Next_Step', 'Description', 'Type', 'Lead_Source', 'State', 'Zip_Code',
       'Payment_Terms'],
      dtype='object')


In [6]:
# Selective renaming for df
df_rename = df.rename(columns={
    'Sales Order': 'Sales_Order'
})


In [7]:
print(df_rename.columns)

Index(['Sales_Order', 'Opportunity_ID', 'Opportunity_Name', 'Stage', 'Amount',
       'Close_Date', 'Probability', 'Sales_Person', 'Sales_Person_Email',
       'Company_Name', 'Company_Industry', 'Contact_Name', 'Contact_Title',
       'Contact_Email', 'Contact_Phone', 'Created_Date', 'Last_Modified_Date',
       'Next_Step', 'Description', 'Type', 'Lead_Source', 'State', 'Zip_Code',
       'Payment_Terms'],
      dtype='object')


In [8]:
# Your selected columns
req_cols = [
            'Sales_Order',
            'Stage',
            'Amount',
            'Close_Date',
            'Probability',
            'Sales_Person',
            'Company_Industry',
            'Contact_Title',
            'Created_Date',
            'Type',
            'Lead_Source',
            'State'
           ]
          
# Load only required columns
df_salesforce = df_rename[req_cols]

In [9]:
print(df_salesforce.columns)

Index(['Sales_Order', 'Stage', 'Amount', 'Close_Date', 'Probability',
       'Sales_Person', 'Company_Industry', 'Contact_Title', 'Created_Date',
       'Type', 'Lead_Source', 'State'],
      dtype='object')


In [10]:
print(id(df))
print(id(df_rename))
print(id(df_salesforce))

2595521652656
2595522130384
2595238583008


## Feature Engineering

In [11]:
df_salesforce_new = df_salesforce.copy()

df_salesforce_new['Close_Date'] = pd.to_datetime(df_salesforce_new['Close_Date'])
df_salesforce_new['Created_Date'] = pd.to_datetime(df_salesforce_new['Created_Date'])

# Calculate days_to_close
df_salesforce_new['days_to_close'] = (df_salesforce_new['Close_Date'] - df_salesforce_new['Created_Date']).dt.days

# Drop the original date columns
df_salesforce_new.drop(['Close_Date', 'Created_Date'], axis=1, inplace=True)


In [12]:
print(df_salesforce_new.head())

   Sales_Order                 Stage  Amount  Probability    Sales_Person  \
0         5708           Closed Lost  255000            0  Alex Rodriguez   
1         5709       Decision Makers  315000           70   Sarah Johnson   
2         5710  Proposal/Price Quote  275000           65   Michael Chang   
3         5711            Closed Won  135000          100     Emma Wilson   
4         5712    Negotiation/Review  325000           85  David Martinez   

     Company_Industry              Contact_Title          Type  \
0          Healthcare           Medical Director  New Business   
1          Government                IT Director  New Business   
2  Financial Services  Client Relations Director  New Business   
3              Retail         Marketing Director  New Business   
4       Manufacturing            Quality Manager  New Business   

             Lead_Source State  days_to_close  
0  Healthcare Conference    WA             45  
1                    RFP    MT             9

In [13]:
print(df_salesforce_new.columns)

Index(['Sales_Order', 'Stage', 'Amount', 'Probability', 'Sales_Person',
       'Company_Industry', 'Contact_Title', 'Type', 'Lead_Source', 'State',
       'days_to_close'],
      dtype='object')


## Export Updated Excel

In [14]:
# Create the folder if it doesn't exist
os.makedirs("1_salesforce_updated_data", exist_ok=True)

# Save the Excel file inside the folder
df_salesforce_new.to_csv("1_salesforce_updated_data/1_salesforce_updated_data.csv", index=False)
