In [18]:
# Import pandas and the scikit-learn components used in this notebook
import os
from pathlib import Path

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.tree import export_text

In [3]:
# Location of the CSV exports. The default is the original local checkout, but
# it can be redirected without editing the notebook by setting the
# EG_SALES_DATA_DIR environment variable before starting the kernel.
path = os.environ.get(
    'EG_SALES_DATA_DIR',
    '/Users/josephzhuo/anaconda_projects/eg-online-sales-analysis',
)
data_dir = Path(path)

# Details.csv and Orders.csv are read into separate frames from the same directory.
df_details = pd.read_csv(data_dir / 'Details.csv')
df_orders = pd.read_csv(data_dir / 'Orders.csv')

In [5]:
# Summary statistics of df_details, printed under a heading.
details_summary = df_details.describe()
print("\nDescriptive Statistics:", details_summary, sep="\n")


Descriptive Statistics:
            Amount      Profit     Quantity
count  1500.000000  1500.00000  1500.000000
mean    291.847333    24.64200     3.743333
std     461.924620   168.55881     2.184942
min       4.000000 -1981.00000     1.000000
25%      47.750000   -12.00000     2.000000
50%     122.000000     8.00000     3.000000
75%     326.250000    38.00000     5.000000
max    5729.000000  1864.00000    14.000000


In [6]:
# Summary of df_orders, printed under a heading. The printed result reports
# count / unique / top / freq, i.e. the columns are summarised as
# non-numeric values rather than with mean/std.
orders_summary = df_orders.describe()
print("\nDescriptive Statistics:", orders_summary, sep="\n")


Descriptive Statistics:
       Order ID  Order Date CustomerName        State    City
count       500         500          500          500     500
unique      500         307          336           19      25
top     B-25696  24-11-2018       Shreya  Maharashtra  Indore
freq          1           7            6           94      71


In [9]:
# Convert categorical columns to integer codes (label encoding).
#
# Each column gets its OWN fitted LabelEncoder, kept in `label_encoders`, so the
# text -> code mapping of every column stays available afterwards, e.g.
#   dict(zip(label_encoders['Category'].classes_,
#            range(len(label_encoders['Category'].classes_))))
# Reusing one encoder and refitting it per column would keep only the mapping
# of the last column fitted.
categorical_columns = ['Category', 'Sub-Category', 'PaymentMode']
label_encoders = {}
for column in categorical_columns:
    encoder = LabelEncoder()
    # Overwrites the text column in df_details with its integer codes.
    df_details[column] = encoder.fit_transform(df_details[column])
    label_encoders[column] = encoder

# Backward compatibility: `label_encoder` used to end up fitted on PaymentMode
# (the last column encoded), so keep that name bound to the same encoder.
label_encoder = label_encoders['PaymentMode']

In [10]:
# Feature matrix: the two numeric columns plus the three label-encoded
# categorical columns. Target vector: Profit.
feature_columns = ['Amount', 'Quantity', 'Category', 'Sub-Category', 'PaymentMode']
X = df_details[feature_columns]
y = df_details['Profit']

In [12]:
# Train/test split: hold out 20% of the rows for testing; the fixed
# random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
# Feature scaling: the scaler is fitted on the training rows only, then the
# same fitted scaler is applied to both the training and the test rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [16]:
# Base estimator: a decision tree regressor with a fixed random_state.
model = DecisionTreeRegressor(random_state=42)

# Hyperparameter candidates explored by the grid search (4 x 3 x 3 = 36 combinations):
#   max_depth          - maximum depth of the tree
#   min_samples_split  - minimum samples required to split an internal node
#   min_samples_leaf   - minimum samples required at a leaf node
param_grid = {
    "max_depth": [3, 5, 7, 10],
    "min_samples_split": [2, 5, 10],
    "min_samples_leaf": [1, 2, 4],
}

# Grid search over param_grid with 5-fold cross-validation, scored by negative
# MSE, with n_jobs=-1 to use all available CPU cores.
gs = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="neg_mean_squared_error",
    cv=5,
    n_jobs=-1,
)

# Search on the scaled training data, then keep the best estimator found.
gs.fit(X_train_scaled, y_train)
best_model = gs.best_estimator_

In [17]:
# Make predictions on the test set
y_pred = best_model.predict(X_test_scaled)

# Score the held-out predictions so the quality of the tree is known before its
# rules are reused elsewhere. MSE matches the metric used for the grid search;
# RMSE is in the same units as Profit; R^2 is 1.0 for a perfect fit.
test_mse = mean_squared_error(y_test, y_pred)
test_r2 = r2_score(y_test, y_pred)
print(f"Test MSE:  {test_mse:.2f}")
print(f"Test RMSE: {test_mse ** 0.5:.2f}")
print(f"Test R^2:  {test_r2:.3f}")

In [19]:
# Print the fitted tree as text rules so it can be reproduced in Power BI.
# The feature names are taken from X itself, so the labels always match the
# column order the model was trained on, rather than a second hand-typed list.
# The model was fitted on the scaled training data, so the thresholds printed
# below are in standardized (scaled) units, not in the original column units.
feature_names = list(X.columns)
tree_rules = export_text(best_model, feature_names=feature_names)
print(tree_rules)

|--- Amount <= 0.91
|   |--- Amount <= -0.04
|   |   |--- Category <= -0.04
|   |   |   |--- value: [4.63]
|   |   |--- Category >  -0.04
|   |   |   |--- value: [-15.68]
|   |--- Amount >  -0.04
|   |   |--- PaymentMode <= 1.23
|   |   |   |--- value: [47.60]
|   |   |--- PaymentMode >  1.23
|   |   |   |--- value: [-40.61]
|--- Amount >  0.91
|   |--- Amount <= 2.78
|   |   |--- Amount <= 2.66
|   |   |   |--- value: [158.93]
|   |   |--- Amount >  2.66
|   |   |   |--- value: [-235.71]
|   |--- Amount >  2.78
|   |   |--- Quantity <= 0.79
|   |   |   |--- value: [565.21]
|   |   |--- Quantity >  0.79
|   |   |   |--- value: [15.08]



In [20]:
# Per-feature mean_ and scale_ learned by the fitted scaler; these are the
# numbers needed to reproduce the scaling outside Python.
for label, values in (("MEAN:", scaler.mean_), ("SCALE:", scaler.scale_)):
    print(label, values)

MEAN: [297.73166667   3.76         0.53333333   7.83583333   1.50833333]
SCALE: [479.04500449   2.20243804   0.75755455   4.46081636   1.62529912]


# Translate the model into Power BI (PBI) DAX measures
// Measures that rebuild the notebook's StandardScaler step in DAX, grouped per
// feature: the selected input value, the scaler mean (µ) and scale (σ) copied
// from the notebook's printed scaler.mean_ / scaler.scale_, and the resulting
// z value = (input - µ) / σ.
// Category, SubCat and PayMode inputs must be the integer label codes used in
// the notebook, because their µ / σ were fitted on the encoded columns.
// NOTE(review): the 'p…' tables are presumably single-value parameter tables
// driven by slicers - confirm in the data model.

// ---------- Amount ----------
Amount_input = SELECTEDVALUE('pAmount'[pAmount Value])
µ_Amount = 297.73166667
σ_Amount = 479.04500449
Amount_z =
DIVIDE( [Amount_input] - [µ_Amount], [σ_Amount] )

// ---------- Quantity ----------
Quantity_input = SELECTEDVALUE('pQuantity'[pQuantity Value])
µ_Quantity = 3.76
σ_Quantity = 2.20243804
Quantity_z =
DIVIDE( [Quantity_input] - [µ_Quantity], [σ_Quantity] )

// ---------- Category ----------
Category_input = SELECTEDVALUE('pCategory'[pCategory Value])
µ_Category = 0.53333333
σ_Category = 0.75755455
Category_z =
DIVIDE( [Category_input] - [µ_Category], [σ_Category] )

// ---------- Sub-Category ----------
SubCat_input = SELECTEDVALUE('pSubCategory'[pSubCategory Value])
µ_SubCat = 7.83583333
σ_SubCat = 4.46081636
SubCat_z =
DIVIDE( [SubCat_input] - [µ_SubCat], [σ_SubCat] )

// ---------- PaymentMode ----------
PayMode_input = SELECTEDVALUE('pPaymentMode'[pPaymentMode Value])
µ_PayMode = 1.50833333
σ_PayMode = 1.62529912
PayMode_z =
DIVIDE( [PayMode_input] - [µ_PayMode], [σ_PayMode] )


# DAX 
Predicted Profit =
// Hand translation of the fitted decision tree (depth 3, 8 leaves) printed by
// export_text in the notebook. All thresholds are in standardized (z) units,
// so the measure reads the *_z measures rather than the raw inputs.
// Returns the leaf value for the selected inputs, or BLANK when any input the
// tree uses has no single selected value.
VAR A = [Amount_z]
VAR Q = [Quantity_z]
VAR C = [Category_z]
VAR P = [PayMode_z]

// SELECTEDVALUE returns BLANK when a parameter has no single selection, and
// BLANK is treated as 0 in arithmetic and comparisons. Without this guard a
// missing input would silently be scored as if the raw value were 0.
// ISBLANK is used (not "= BLANK()") so a legitimate 0, e.g. label code 0,
// still counts as a real input. Sub-Category is not checked because the tree
// never splits on it.
VAR InputsMissing =
    ISBLANK( [Amount_input] )
        || ISBLANK( [Quantity_input] )
        || ISBLANK( [Category_input] )
        || ISBLANK( [PayMode_input] )

RETURN
IF(
    InputsMissing,
    BLANK(),

    // Nested IFs mirror the tree one-to-one: each IF is one split, the first
    // branch is the "<=" side and the second branch is the ">" side.
    IF(
        // |--- Amount <= 0.91
        A <= 0.91,
        IF(
            // |   |--- Amount <= -0.04
            A <= -0.04,
            // |   |   |--- Category <= -0.04 -> 4.63, else -15.68
            IF( C <= -0.04, 4.63, -15.68 ),
            // |   |--- Amount > -0.04
            // |   |   |--- PaymentMode <= 1.23 -> 47.60, else -40.61
            IF( P <= 1.23, 47.60, -40.61 )
        ),
        // |--- Amount > 0.91
        IF(
            // |   |--- Amount <= 2.78
            A <= 2.78,
            // |   |   |--- Amount <= 2.66 -> 158.93, else -235.71
            IF( A <= 2.66, 158.93, -235.71 ),
            // |   |--- Amount > 2.78
            // |   |   |--- Quantity <= 0.79 -> 565.21, else 15.08
            IF( Q <= 0.79, 565.21, 15.08 )
        )
    )
)
