# XG Boost Regressor

## Importing the libraries

In [217]:
import os
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd


## Importing the dataset

In [218]:
# pyspark_df = spark.table("sd_bdc_demo.xgboost_regression.1_2_salesforce_xgboost_updated_data")
# pyspark_df.display()
# df = pyspark_df.toPandas()

In [219]:
# Load csv file
df = pd.read_csv("1_2_salesforce_xgboost_updated_data/1_2_salesforce_xgboost_updated_data.csv")

In [220]:
print(df)

      Sales_Order                 Stage  Amount  Probability    Sales_Person  \
0            5708           Closed Lost  255000            0  Alex Rodriguez   
1            5709       Decision Makers  315000           70   Sarah Johnson   
2            5710  Proposal/Price Quote  275000           65   Michael Chang   
3            5711            Closed Won  135000          100     Emma Wilson   
4            5712    Negotiation/Review  325000           85  David Martinez   
...           ...                   ...     ...          ...             ...   
1091         6799  Proposal/Price Quote  210000           60   Michael Chang   
1092         6800       Decision Makers  185000           75   Sarah Johnson   
1093         6801         Qualification   85000           25   Michael Chang   
1094         6802        Needs Analysis  355000           30  David Martinez   
1095         6803   Perception Analysis  175000           15  Alex Rodriguez   

        Company_Industry              C

In [221]:
print(df.shape)

(1096, 11)


In [222]:
print(df.columns)

Index(['Sales_Order', 'Stage', 'Amount', 'Probability', 'Sales_Person',
       'Company_Industry', 'Contact_Title', 'Type', 'Lead_Source', 'State',
       'days_to_close'],
      dtype='object')


In [223]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1096 entries, 0 to 1095
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Sales_Order       1096 non-null   int64 
 1   Stage             1096 non-null   object
 2   Amount            1096 non-null   int64 
 3   Probability       1096 non-null   int64 
 4   Sales_Person      1096 non-null   object
 5   Company_Industry  1096 non-null   object
 6   Contact_Title     1096 non-null   object
 7   Type              1096 non-null   object
 8   Lead_Source       1096 non-null   object
 9   State             1096 non-null   object
 10  days_to_close     1096 non-null   int64 
dtypes: int64(4), object(7)
memory usage: 94.3+ KB
None


## Encoding categorical data

### Encoding the Independent Variable

In [224]:
from sklearn.preprocessing import LabelEncoder

df_encoded = df.copy()
categorical_cols = [
    'Sales_Person',
    'Company_Industry',
    'Contact_Title',
    'Type',
    'Lead_Source',
    'State'
]

# Save encoder for later use
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df_encoded[col] = le.fit_transform(df_encoded[col])
    label_encoders[col] = le  


In [225]:
print(df_encoded.head())

   Sales_Order                 Stage  Amount  Probability  Sales_Person  \
0         5708           Closed Lost  255000            0             0   
1         5709       Decision Makers  315000           70             4   
2         5710  Proposal/Price Quote  275000           65             3   
3         5711            Closed Won  135000          100             2   
4         5712    Negotiation/Review  325000           85             1   

   Company_Industry  Contact_Title  Type  Lead_Source  State  days_to_close  
0                12             29     0            5     42             45  
1                11             20     0           14     23             97  
2                 8              8     0            6     39            117  
3                19             28     0           16     31             36  
4                15             41     0           10     29             92  


In [226]:
print(df_encoded.shape)

(1096, 11)


In [227]:
print(df_encoded.columns)

Index(['Sales_Order', 'Stage', 'Amount', 'Probability', 'Sales_Person',
       'Company_Industry', 'Contact_Title', 'Type', 'Lead_Source', 'State',
       'days_to_close'],
      dtype='object')


## Split two Data Frame for Independent Features & Dependent Columns

In [228]:
df_independent = df_encoded.drop(columns=['Probability'])
X = df_independent
df_dependent = df_encoded[['Probability']]
y = df_dependent

In [229]:
print(df_independent.head())

   Sales_Order                 Stage  Amount  Sales_Person  Company_Industry  \
0         5708           Closed Lost  255000             0                12   
1         5709       Decision Makers  315000             4                11   
2         5710  Proposal/Price Quote  275000             3                 8   
3         5711            Closed Won  135000             2                19   
4         5712    Negotiation/Review  325000             1                15   

   Contact_Title  Type  Lead_Source  State  days_to_close  
0             29     0            5     42             45  
1             20     0           14     23             97  
2              8     0            6     39            117  
3             28     0           16     31             36  
4             41     0           10     29             92  


In [230]:
print(df_independent.shape)

(1096, 10)


In [231]:
print(df_dependent.head())

   Probability
0            0
1           70
2           65
3          100
4           85


In [232]:
print(df_dependent.shape)

(1096, 1)


## Splitting the dataset into the Training set and Test set (`Closed Won` & `Closed Lost` = `train`)

In [233]:
# Define which rows go into train and test based on 'Stage'
is_train = df_encoded['Stage'].isin(['Closed Won', 'Closed Lost'])

# Split X
X_train = X[is_train] 
X_test = X[~is_train]

# Split y
y_train = y[is_train]
y_test = y[~is_train]  


In [234]:
print(X_train)

    Sales_Order        Stage  Amount  Sales_Person  Company_Industry  \
0          5708  Closed Lost  255000             0                12   
3          5711   Closed Won  135000             2                19   
8          5716   Closed Won  235000             2                12   
12         5720   Closed Won  225000             2                 8   
17         5725   Closed Won  175000             2                12   
21         5729   Closed Won   85000             2                 4   
26         5734   Closed Won  195000             2                 8   
30         5738   Closed Won  145000             2                 1   
35         5743   Closed Won  265000             2                12   
39         5747   Closed Won  115000             2                 8   
44         5752   Closed Won  295000             2                23   
46         5754  Closed Lost  145000             0                13   
49         5757   Closed Won   72000             2              

In [235]:
print(y_train)

    Probability
0             0
3           100
8           100
12          100
17          100
21          100
26          100
30          100
35          100
39          100
44          100
46            0
49          100
54          100
56            0
59          100
64          100
67          100
71            0
72          100
79          100
84          100
86            0
89          100
94          100


### Splitting part of test data to train data

In [236]:
from sklearn.model_selection import train_test_split
X_train_ren, X_test_ren, y_train_ren, y_test_ren = train_test_split(X_test, y_test, test_size = 0.2, random_state = 1)

In [237]:
X_train = pd.concat([X_train, X_train_ren])
y_train = pd.concat([y_train, y_train_ren])

X_test = X_test_ren
y_test = y_test_ren

In [238]:
print(X_train.head())

    Sales_Order        Stage  Amount  Sales_Person  Company_Industry  \
0          5708  Closed Lost  255000             0                12   
3          5711   Closed Won  135000             2                19   
8          5716   Closed Won  235000             2                12   
12         5720   Closed Won  225000             2                 8   
17         5725   Closed Won  175000             2                12   

    Contact_Title  Type  Lead_Source  State  days_to_close  
0              29     0            5     42             45  
3              28     0           16     31             36  
8               4     2            5     38             36  
12             11     2            4     24             38  
17             48     2            5     21             40  


In [239]:
print(X_train.shape)

(881, 10)


In [240]:
print(X_test.head())

     Sales_Order                 Stage  Amount  Sales_Person  \
145         5853        Needs Analysis  125000             0   
843         6551   Perception Analysis  345000             0   
115         5823  Proposal/Price Quote  195000             3   
253         5961  Proposal/Price Quote  195000             3   
871         6579        Needs Analysis  125000             0   

     Company_Industry  Contact_Title  Type  Lead_Source  State  days_to_close  
145                21             20     0           16     27             15  
843                23             32     0           14     33            204  
115                 7             32     0           18     16            101  
253                 7             32     0           18     16            101  
871                21             20     0           16     27             15  


In [241]:
print(X_test.shape)

(215, 10)


In [242]:
print(y_train.head())

    Probability
0             0
3           100
8           100
12          100
17          100


In [243]:
print(y_train.shape)

(881, 1)


In [244]:
print(y_test.head())

     Probability
145           30
843           20
115           65
253           65
871           30


In [245]:
print(y_test.shape)

(215, 1)


## Feature Scaling
- As this is a tree based model, no need to do feature scaling

## Removing `Sales_Order` & `Stage` in `X_train` & `X_test`

In [246]:
X_train_new = X_train.drop(columns=['Sales_Order', 'Stage', 'Sales_Person'])


In [247]:
print(X_train_new.head())

    Amount  Company_Industry  Contact_Title  Type  Lead_Source  State  \
0   255000                12             29     0            5     42   
3   135000                19             28     0           16     31   
8   235000                12              4     2            5     38   
12  225000                 8             11     2            4     24   
17  175000                12             48     2            5     21   

    days_to_close  
0              45  
3              36  
8              36  
12             38  
17             40  


In [248]:
print(X_train_new.shape)

(881, 7)


In [249]:
X_test_new = X_test.drop(columns=['Sales_Order', 'Stage', 'Sales_Person'])


In [250]:
print(X_test_new.head())

     Amount  Company_Industry  Contact_Title  Type  Lead_Source  State  \
145  125000                21             20     0           16     27   
843  345000                23             32     0           14     33   
115  195000                 7             32     0           18     16   
253  195000                 7             32     0           18     16   
871  125000                21             20     0           16     27   

     days_to_close  
145             15  
843            204  
115            101  
253            101  
871             15  


In [251]:
print(X_test_new.shape)

(215, 7)


## Training the Model

In [252]:
from xgboost import XGBRegressor

# Initialize the model
xgb_model = XGBRegressor(
    n_estimators=100,
    max_depth=4,
    learning_rate=0.1,
    random_state=42
)


### RFE

In [253]:
from sklearn.feature_selection import RFE

rfe = RFE(estimator=xgb_model, n_features_to_select=5)
rfe.fit(X_train_new, y_train)

# selected features after RFE
selected_features = X_train_new.columns[rfe.support_]
print(selected_features)

Index(['Amount', 'Company_Industry', 'Lead_Source', 'State', 'days_to_close'], dtype='object')


In [254]:
# take only selected columns for both X_train_new & X_test_new
X_train_new = X_train_new[selected_features]
X_test_new = X_test_new[selected_features]
xgb_model_train = xgb_model.fit(X_train_new, y_train.values.ravel())


## Prediction

In [255]:
y_pred = xgb_model.predict(X_test_new)

In [256]:
print(y_pred)

[31.85007  20.080666 62.24568  62.24568  31.85007  35.627983 47.454525
 73.13888  31.85007  84.17819  20.029076 69.18998  33.481136 21.397505
 89.45466  29.166641 29.809128 25.639996 44.253544 62.05004  84.51393
 89.45466  64.675186 31.056934 64.675186 33.481136 62.24568  64.675186
 44.607574 62.05004  36.507935 29.809128 74.63363  36.507935 42.25295
 28.354792 47.454525 85.55771  83.308464 56.236645 41.75324  28.354792
 26.874718 84.51393  40.20785  33.27116  62.24568  31.85007  73.7895
 60.304497 69.54027  20.422338 20.080666 30.45172  42.25295  85.55771
 73.13888  20.080666 89.45466  84.17819  20.029076 28.354792 27.351017
 21.081709 69.18998  42.25295  43.973106 40.20785  64.675186 62.05004
 31.436354 79.14603  18.310768 21.397505 83.308464 36.0685   36.0685
 69.54027  25.520807 60.65057  44.607574 29.166641 83.308464 74.63363
 25.639996 44.607574 28.5654   85.55771  44.607574 73.7895   62.24568
 69.54027  44.22354  90.55687  36.0685   26.874718 30.45172  23.657413
 20.080666 60.65

### Saving the output as new table

In [257]:
sales_order = X_test['Sales_Order'].values
stage = X_test['Stage'].values

In [258]:
print("sales_order shape:", sales_order.shape)
print("stage shape:", stage.shape)
print("y_test shape:", y_test.shape)
print("y_pred shape:", y_pred.shape)


sales_order shape: (215,)
stage shape: (215,)
y_test shape: (215, 1)
y_pred shape: (215,)


In [259]:
print(y_test)

      Probability
145            30
843            20
115            65
253            65
871            30
...           ...
499            70
579            60
1031           20
136            80
7              25

[215 rows x 1 columns]


In [260]:
# Flatten y_test to 1D
y_test = y_test.values.ravel()

In [261]:
print("sales_order shape:", sales_order.shape)
print("stage shape:", stage.shape)
print("y_test shape:", y_test.shape)
print("y_pred shape:", y_pred.shape)


sales_order shape: (215,)
stage shape: (215,)
y_test shape: (215,)
y_pred shape: (215,)


In [262]:
print(y_test)

[30 20 65 65 30 35 45 75 30 85 20 70 35 20 90 30 30 25 45 65 85 90 65 30
 65 35 65 65 45 65 35 30 75 35 40 25 45 85 85 60 40 25 25 85 40 35 65 30
 70 60 70 20 20 30 40 85 75 20 90 85 20 25 25 20 70 40 45 40 65 65 30 80
 15 20 85 35 35 70 25 60 45 30 85 75 25 45 30 85 45 70 65 70 45 90 35 25
 30 25 20 60 80 65 20 30 60 70 90 45 35 15 85 85 65 65 35 60 80 90 30 35
 25 25 30 40 20 35 20 30 30 25 70 30 45 65 80 85 20 70 30 20 25 65 65 35
 45 45 40 40 45 15 75 60 45 35 25 35 85 40 15 45 30 80 75 30 65 20 25 90
 40 65 35 40 75 40 20 25 70 40 70 20 75 40 45 20 45 70 85 85 20 60 85 75
 70 30 40 75 15 70 65 60 75 85 45 35 85 30 40 20 45 85 70 60 20 80 25]


In [263]:
print(y_pred)

[31.85007  20.080666 62.24568  62.24568  31.85007  35.627983 47.454525
 73.13888  31.85007  84.17819  20.029076 69.18998  33.481136 21.397505
 89.45466  29.166641 29.809128 25.639996 44.253544 62.05004  84.51393
 89.45466  64.675186 31.056934 64.675186 33.481136 62.24568  64.675186
 44.607574 62.05004  36.507935 29.809128 74.63363  36.507935 42.25295
 28.354792 47.454525 85.55771  83.308464 56.236645 41.75324  28.354792
 26.874718 84.51393  40.20785  33.27116  62.24568  31.85007  73.7895
 60.304497 69.54027  20.422338 20.080666 30.45172  42.25295  85.55771
 73.13888  20.080666 89.45466  84.17819  20.029076 28.354792 27.351017
 21.081709 69.18998  42.25295  43.973106 40.20785  64.675186 62.05004
 31.436354 79.14603  18.310768 21.397505 83.308464 36.0685   36.0685
 69.54027  25.520807 60.65057  44.607574 29.166641 83.308464 74.63363
 25.639996 44.607574 28.5654   85.55771  44.607574 73.7895   62.24568
 69.54027  44.22354  90.55687  36.0685   26.874718 30.45172  23.657413
 20.080666 60.65

In [264]:
results_df = pd.DataFrame({
    'sales_order': sales_order,
    'stage': stage,
    'Actual': y_test,
    'Predicted': y_pred
})


In [265]:
# Create the folder if it doesn't exist
os.makedirs("2_1_salesforce_xgboost_prediction_data", exist_ok=True)

# Save the Excel file inside the folder
results_df.to_csv("2_1_salesforce_xgboost_prediction_data/2_1_salesforce_xgboost_prediction_data.csv", index=False)


In [266]:
# spark_df = spark.createDataFrame(results_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.xgboost_regression.2_1_salesforce_xgboost_prediction_data")

## Accuracy

In [267]:
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, mean_absolute_percentage_error

print("r2:\n", r2_score(y_test, y_pred))
print("MAE:\n", mean_absolute_error(y_test, y_pred))
print("RMSE:\n", np.sqrt(mean_squared_error(y_test, y_pred)))
print("MAPE:\n", mean_absolute_percentage_error(y_test, y_pred))

r2:
 0.9949851036071777
MAE:
 1.2929610013961792
RMSE:
 1.623780379741642
MAPE:
 0.033611804246902466


In [268]:
mape = mean_absolute_percentage_error(y_test, y_pred) * 100

accuracy = 100 - mape

print(f"MAPE: {mape:.2f}%")
print(f"Accuracy: {accuracy:.2f}%")


MAPE: 3.36%
Accuracy: 96.64%


### saving the metrics

In [269]:
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = mean_absolute_percentage_error(y_test, y_pred)
accuracy = 100 - (mape*100)

# Create a DataFrame
metrics_df = pd.DataFrame({
    "Metric": ["R² Score", "MAE", "RMSE", "MAPE", "accuracy (100-mape)"],
    "Value": [r2, mae, rmse, mape, accuracy]
})


In [270]:
# Create the folder if it doesn't exist
os.makedirs("2_3_salesforce_xgboost_metrics_data", exist_ok=True)

# Save the Excel file inside the folder
metrics_df.to_csv("2_3_salesforce_xgboost_metrics_data/2_3_salesforce_xgboost_metrics_data.csv", index=False)


In [271]:
# spark_df = spark.createDataFrame(metrics_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.xgboost_regression.2_3_salesforce_xgboost_metrics_data")

## Applying k-Fold Cross Validation

### Shuffled

In [272]:
from sklearn.model_selection import cross_val_score, KFold
num_folds = 5
kf = KFold(n_splits=num_folds, shuffle=True, random_state=42)
cross_val_results = cross_val_score(xgb_model, X=X_train_new, y=y_train, cv=kf)

In [273]:
print("Cross-Validation Results (Accuracy):")
for i, result in enumerate(cross_val_results, 1):
    print(f"  Fold {i}: {result * 100:.2f}%")

print("Kfold Mean ccuracy: {:.2f} %".format(cross_val_results.mean()*100))
print("Standard Deviation: {:.2f} %".format(cross_val_results.std()*100))

Cross-Validation Results (Accuracy):
  Fold 1: 97.77%
  Fold 2: 88.06%
  Fold 3: 82.69%
  Fold 4: 97.65%
  Fold 5: 86.19%
Kfold Mean ccuracy: 90.47 %
Standard Deviation: 6.16 %


### Unshuffled

In [274]:

accuracies = cross_val_score(estimator = xgb_model, X = X_train_new, y = y_train, cv = 5)
print("Kfold Accuracy: {:.2f} %".format(accuracies.mean()*100))
print("Standard Deviation: {:.2f} %".format(accuracies.std()*100))

Kfold Accuracy: 88.85 %
Standard Deviation: 21.15 %


## Feature Importance with `shap`

In [275]:
print(xgb_model_train.feature_importances_)


[0.13880102 0.12153075 0.229406   0.09693804 0.4133242 ]


In [276]:
import shap 

explainer = shap.Explainer(xgb_model_train)
shap_values = explainer(X_train_new)
# for i in range(0,len(X_train)):
#     print(shap_values[i])
#     shap.plots.waterfall(shap_values[i])

In [277]:
# print(shap_values)

In [278]:
import pandas as pd
importance_df = pd.DataFrame({
    "Feature": X_train_new.columns,
    "Mean SHAP Value": np.abs(shap_values.values).mean(axis=0)
}).sort_values(by="Mean SHAP Value", ascending=False)

print(importance_df)


            Feature  Mean SHAP Value
4     days_to_close        15.287538
0            Amount         6.016329
2       Lead_Source         2.701881
1  Company_Industry         2.370403
3             State         1.334266


### Export Feature Importance

In [279]:
# Create the folder if it doesn't exist
os.makedirs("2_2_salesforce_feature_importance_data", exist_ok=True)

# Save the Excel file inside the folder
importance_df.to_csv("2_2_salesforce_feature_importance_data/2_2_salesforce_feature_importance_data.csv", index=False)


In [280]:
# spark_df = spark.createDataFrame(importance_df)
# spark_df.write.mode("overwrite").saveAsTable("sd_bdc_demo.xgboost_regression.2_2_salesforce_feature_importance_data")