# XGBoost Regressor

## Importing the libraries

In [34]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


## Importing the dataset

In [35]:
# Load the Salesforce CSV export into a DataFrame.
# The path is relative, so it is resolved against the kernel's working directory.
df = pd.read_csv("1_salesforce_updated_data/1_salesforce_updated_data.csv")

In [36]:
# Print the loaded frame as plain text (pandas abbreviates long frames).
print(df)

    Sales_Order                 Stage  Amount  Probability    Sales_Person  \
0          5708           Closed Lost  255000            0  Alex Rodriguez   
1          5709       Decision Makers  315000           70   Sarah Johnson   
2          5710  Proposal/Price Quote  275000           65   Michael Chang   
3          5711            Closed Won  135000          100     Emma Wilson   
4          5712    Negotiation/Review  325000           85  David Martinez   
..          ...                   ...     ...          ...             ...   
91         5799   Perception Analysis  475000           25  Alex Rodriguez   
92         5800     Value Proposition  115000           40   Sarah Johnson   
93         5801         Qualification   95000           20   Michael Chang   
94         5802            Closed Won   68000          100     Emma Wilson   
95         5803        Needs Analysis  185000           35  David Martinez   

      Company_Industry              Contact_Title          Type

In [37]:
# Dimensions of the raw frame as (rows, columns).
print(df.shape)

(96, 11)


In [38]:
# Column labels of the raw frame.
print(df.columns)

Index(['Sales_Order', 'Stage', 'Amount', 'Probability', 'Sales_Person',
       'Company_Industry', 'Contact_Title', 'Type', 'Lead_Source', 'State',
       'days_to_close'],
      dtype='object')


## Encoding categorical data

### Encoding the Independent Variables

In [39]:
from sklearn.preprocessing import LabelEncoder

# Text-valued feature columns that are replaced by integer codes.
categorical_cols = [
    'Sales_Person',
    'Company_Industry',
    'Contact_Title',
    'Type',
    'Lead_Source',
    'State',
]

# Fit one encoder per column and keep it, keyed by column name, so the
# fitted label-to-code mapping stays available for later use.
label_encoders = {name: LabelEncoder().fit(df[name]) for name in categorical_cols}

# Encode into a copy so the raw frame `df` is left untouched.
df_encoded = df.copy()
for name, encoder in label_encoders.items():
    df_encoded[name] = encoder.transform(df[name])


In [40]:
# Preview the first rows after encoding; the categorical columns now hold integer codes.
print(df_encoded.head())

   Sales_Order                 Stage  Amount  Probability  Sales_Person  \
0         5708           Closed Lost  255000            0             0   
1         5709       Decision Makers  315000           70             4   
2         5710  Proposal/Price Quote  275000           65             3   
3         5711            Closed Won  135000          100             2   
4         5712    Negotiation/Review  325000           85             1   

   Company_Industry  Contact_Title  Type  Lead_Source  State  days_to_close  
0                12             29     0            5     42             45  
1                11             20     0           14     23             97  
2                 8              8     0            6     39            117  
3                19             28     0           16     31             36  
4                15             41     0           10     29             92  


In [41]:
# Dimensions of the encoded frame as (rows, columns).
print(df_encoded.shape)

(96, 11)


In [42]:
# Column labels of the encoded frame.
print(df_encoded.columns)

Index(['Sales_Order', 'Stage', 'Amount', 'Probability', 'Sales_Person',
       'Company_Industry', 'Contact_Title', 'Type', 'Lead_Source', 'State',
       'days_to_close'],
      dtype='object')


## Splitting the Data Frame into Independent Features & the Dependent Column

In [43]:
# The target is kept as a one-column DataFrame; every other column is a feature.
df_dependent = df_encoded.loc[:, ['Probability']]
df_independent = df_encoded.drop(columns='Probability')

# Short aliases used by the rest of the notebook.
X, y = df_independent, df_dependent

In [44]:
# Preview the feature frame (every column except 'Probability').
print(df_independent.head())

   Sales_Order                 Stage  Amount  Sales_Person  Company_Industry  \
0         5708           Closed Lost  255000             0                12   
1         5709       Decision Makers  315000             4                11   
2         5710  Proposal/Price Quote  275000             3                 8   
3         5711            Closed Won  135000             2                19   
4         5712    Negotiation/Review  325000             1                15   

   Contact_Title  Type  Lead_Source  State  days_to_close  
0             29     0            5     42             45  
1             20     0           14     23             97  
2              8     0            6     39            117  
3             28     0           16     31             36  
4             41     0           10     29             92  


In [45]:
# Dimensions of the feature frame as (rows, columns).
print(df_independent.shape)

(96, 10)


In [46]:
# Preview the target frame (the 'Probability' column only).
print(df_dependent.head())

   Probability
0            0
1           70
2           65
3          100
4           85


In [47]:
# Dimensions of the target frame as (rows, columns).
print(df_dependent.shape)

(96, 1)


## Splitting the dataset into the Training set and Test set (`Closed Won` & `Closed Lost` = `train`)

In [48]:
# Rows whose Stage is 'Closed Won' or 'Closed Lost' form the training set;
# all remaining rows form the test set.
closed_stages = ['Closed Won', 'Closed Lost']
is_train = df_encoded['Stage'].isin(closed_stages)

# Apply the same boolean mask (and its negation) to features and target
# so the rows of X_* and y_* stay aligned.
X_train, X_test = X.loc[is_train], X.loc[~is_train]
y_train, y_test = y.loc[is_train], y.loc[~is_train]


In [49]:
# Preview the training features (still including 'Sales_Order' and 'Stage').
print(X_train.head())

    Sales_Order        Stage  Amount  Sales_Person  Company_Industry  \
0          5708  Closed Lost  255000             0                12   
3          5711   Closed Won  135000             2                19   
8          5716   Closed Won  235000             2                12   
12         5720   Closed Won  225000             2                 8   
17         5725   Closed Won  175000             2                12   

    Contact_Title  Type  Lead_Source  State  days_to_close  
0              29     0            5     42             45  
3              28     0           16     31             36  
8               4     2            5     38             36  
12             11     2            4     24             38  
17             48     2            5     21             40  


In [50]:
# Dimensions of the training features as (rows, columns).
print(X_train.shape)

(25, 10)


In [51]:
# Preview the test features (still including 'Sales_Order' and 'Stage').
print(X_test.head())

   Sales_Order                 Stage  Amount  Sales_Person  Company_Industry  \
1         5709       Decision Makers  315000             4                11   
2         5710  Proposal/Price Quote  275000             3                 8   
4         5712    Negotiation/Review  325000             1                15   
5         5713   Perception Analysis  345000             0                23   
6         5714     Value Proposition  195000             4                 6   

   Contact_Title  Type  Lead_Source  State  days_to_close  
1             20     0           14     23             97  
2              8     0            6     39            117  
4             41     0           10     29             92  
5             32     0           14     33            204  
6             20     0            2     41            158  


In [52]:
# Dimensions of the test features as (rows, columns).
print(X_test.shape)

(71, 10)


In [53]:
# Preview the training targets.
print(y_train.head())

    Probability
0             0
3           100
8           100
12          100
17          100


In [54]:
# Dimensions of the training targets as (rows, columns).
print(y_train.shape)

(25, 1)


In [55]:
# Preview the test targets.
print(y_test.head())

   Probability
1           70
2           65
4           85
5           20
6           45


In [56]:
# Dimensions of the test targets as (rows, columns).
print(y_test.shape)

(71, 1)


## Feature Scaling
- As this is a tree-based model, there is no need for feature scaling.

## Removing `Sales_Order` & `Stage` from `X_train` & `X_test`

In [57]:
# Model input for training: the features without the order id and the stage label.
# X_train itself is left unchanged.
X_train_new = X_train.drop(['Sales_Order', 'Stage'], axis='columns')


In [58]:
# Preview the training features that are passed to the model.
print(X_train_new.head())

    Amount  Sales_Person  Company_Industry  Contact_Title  Type  Lead_Source  \
0   255000             0                12             29     0            5   
3   135000             2                19             28     0           16   
8   235000             2                12              4     2            5   
12  225000             2                 8             11     2            4   
17  175000             2                12             48     2            5   

    State  days_to_close  
0      42             45  
3      31             36  
8      38             36  
12     24             38  
17     21             40  


In [59]:
# Dimensions of the model's training input as (rows, columns).
print(X_train_new.shape)

(25, 8)


In [60]:
# Model input for prediction: the features without the order id and the stage label.
# X_test itself is left unchanged, so both columns remain available on it.
X_test_new = X_test.drop(['Sales_Order', 'Stage'], axis='columns')


In [61]:
# Preview the test features that are passed to the model.
print(X_test_new.head())

   Amount  Sales_Person  Company_Industry  Contact_Title  Type  Lead_Source  \
1  315000             4                11             20     0           14   
2  275000             3                 8              8     0            6   
4  325000             1                15             41     0           10   
5  345000             0                23             32     0           14   
6  195000             4                 6             20     0            2   

   State  days_to_close  
1     23             97  
2     39            117  
4     29             92  
5     33            204  
6     41            158  


In [62]:
# Dimensions of the model's test input as (rows, columns).
print(X_test_new.shape)

(71, 8)


## Training the Model

In [63]:
from xgboost import XGBRegressor

# Hyperparameters gathered in one place; random_state is fixed at 42.
xgb_params = {
    'n_estimators': 100,
    'max_depth': 4,
    'learning_rate': 0.1,
    'random_state': 42,
}

# Initialize the (still unfitted) model
xgb_model = XGBRegressor(**xgb_params)


In [64]:
# Flatten the single-column target frame to a 1-D array before fitting.
train_target = y_train.to_numpy().ravel()
xgb_model.fit(X_train_new, train_target)


## Prediction

In [65]:
# Predict the target for the held-out rows (those whose Stage is not closed).
y_pred = xgb_model.predict(X_test_new)

In [66]:
# Print the raw predictions for the test rows.
print(y_pred)

[9.999910e+01 9.999910e+01 1.331171e-02 1.331171e-02 9.999910e+01
 9.999910e+01 1.331171e-02 9.999910e+01 9.999910e+01 1.331171e-02
 1.331171e-02 9.999910e+01 9.999910e+01 1.331171e-02 9.999910e+01
 9.999910e+01 1.331171e-02 1.331171e-02 9.999910e+01 9.999910e+01
 1.331171e-02 9.999910e+01 9.999910e+01 1.331171e-02 1.331171e-02
 9.999910e+01 9.999910e+01 1.331171e-02 9.999910e+01 9.999910e+01
 1.331171e-02 1.331171e-02 9.999910e+01 9.999910e+01 1.331171e-02
 9.999910e+01 9.999910e+01 1.331171e-02 1.331171e-02 9.999910e+01
 9.999910e+01 1.331171e-02 9.999910e+01 9.999910e+01 1.331171e-02
 1.331171e-02 9.999910e+01 9.999910e+01 1.331171e-02 1.331171e-02
 9.999910e+01 9.999910e+01 1.331171e-02 9.999910e+01 9.999910e+01
 1.331171e-02 1.331171e-02 9.999910e+01 9.999910e+01 1.331171e-02
 1.331171e-02 9.999910e+01 9.999910e+01 1.331171e-02 9.999910e+01
 9.999910e+01 1.331171e-02 1.331171e-02 9.999910e+01 9.999910e+01
 1.331171e-02]


### Saving the output as a new table

In [67]:
# Take the identifier columns from the unreduced test frame as plain arrays;
# X_test_new no longer has them.
sales_order = X_test['Sales_Order'].to_numpy()
stage = X_test['Stage'].to_numpy()

In [68]:
# Compare the shapes of the pieces that will become columns of the results table.
print("sales_order shape:", sales_order.shape)
print("stage shape:", stage.shape)
print("y_test shape:", y_test.shape)
print("y_pred shape:", y_pred.shape)


sales_order shape: (71,)
stage shape: (71,)
y_test shape: (71, 1)
y_pred shape: (71,)


In [69]:
# Print the test targets (still a one-column DataFrame at this point).
print(y_test)

    Probability
1            70
2            65
4            85
5            20
6            45
..          ...
90           85
91           25
92           40
93           20
95           35

[71 rows x 1 columns]


In [70]:
# Flatten y_test to 1D.
# np.asarray accepts both the one-column DataFrame and an already flattened
# array, so this cell can be re-run safely. The previous `.values` access
# exists only on the DataFrame and raised AttributeError on a second run,
# because this cell rebinds y_test to a NumPy array.
y_test = np.asarray(y_test).ravel()

In [71]:
# Re-check the shapes now that y_test has been flattened; all four should match.
print("sales_order shape:", sales_order.shape)
print("stage shape:", stage.shape)
print("y_test shape:", y_test.shape)
print("y_pred shape:", y_pred.shape)


sales_order shape: (71,)
stage shape: (71,)
y_test shape: (71,)
y_pred shape: (71,)


In [72]:
# Print the flattened test targets.
print(y_test)

[70 65 85 20 45 25 30 75 60 85 25 40 30 30 70 65 80 20 45 25 35 75 60 90
 20 40 30 35 70 65 80 15 45 25 30 75 60 85 20 40 30 35 70 65 85 15 45 25
 30 30 60 20 80 40 25 15 70 90 65 75 20 35 45 30 70 60 85 25 40 20 35]


In [73]:
# Assemble identifiers, actual targets and model predictions side by side,
# one row per test record.
results_df = pd.DataFrame(
    dict(
        sales_order=sales_order,
        stage=stage,
        Actual=y_test,
        Predicted=y_pred,
    )
)


In [74]:
# Output location for the prediction table; the directory name is defined once
# and reused for both the folder and the file path.
output_dir = "2_salesforce_xgboost_prediction_data"
output_csv = output_dir + "/2_salesforce_xgboost_prediction_data.csv"

# Create the folder if it doesn't exist
os.makedirs(output_dir, exist_ok=True)

# Save the results as a CSV file inside the folder (the row index is not written)
results_df.to_csv(output_csv, index=False)


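## Accuracy

> **Note:** the values below are regression metrics (R², MAE, RMSE), not classification accuracy. A negative R² means the predictions fit `y_test` worse than a constant prediction of its mean: every predicted value printed above is ≈ 0 or ≈ 100, whereas the actual test values range from 15 to 90.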

In [75]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Compute each metric once, then report them.
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))  # RMSE is the square root of the MSE

print("r2:\n", r2)
print("MAE:\n", mae)
print("RMSE:\n", rmse)

r2:
 -4.306451797485352
MAE:
 48.0218391418457
RMSE:
 53.300729576972024
