In [1]:
import pandas as pd
import numpy as np

import seaborn as sb
import matplotlib.pyplot as plt
import shap

from econml.dml import DML
from econml.dr import DRLearner
from econml.metalearners import TLearner, SLearner, XLearner, DomainAdaptationLearner
from econml.inference import BootstrapInference

from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, GradientBoostingRegressor, GradientBoostingClassifier, HistGradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.linear_model import ElasticNetCV
from sklearn.model_selection import GridSearchCV

In [2]:
sales_df_train = pd.read_csv("/Users/jaydeepchakraborty/JC/git-projects/model_util/DataSets/walmart_sale_challenge/train.csv")
sales_df_test = pd.read_csv("/Users/jaydeepchakraborty/JC/git-projects/model_util/DataSets/walmart_sale_challenge/test.csv")

In [3]:
print(sales_df_train.shape, sales_df_test.shape)

(282451, 16) (139119, 15)


In [4]:
sales_df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 282451 entries, 0 to 282450
Data columns (total 16 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   Store         282451 non-null  int64  
 1   Dept          282451 non-null  int64  
 2   Date          282451 non-null  object 
 3   Weekly_Sales  282451 non-null  float64
 4   IsHoliday     282451 non-null  bool   
 5   Temperature   282451 non-null  float64
 6   Fuel_Price    282451 non-null  float64
 7   MarkDown1     100520 non-null  float64
 8   MarkDown2     74232 non-null   float64
 9   MarkDown3     91521 non-null   float64
 10  MarkDown4     90031 non-null   float64
 11  MarkDown5     101029 non-null  float64
 12  CPI           282451 non-null  float64
 13  Unemployment  282451 non-null  float64
 14  Type          282451 non-null  object 
 15  Size          282451 non-null  int64  
dtypes: bool(1), float64(10), int64(3), object(2)
memory usage: 32.6+ MB


In [5]:
# Applying the condition
sales_df_train['Type'].mask(sales_df_train['Type'] == 'A', 0, inplace=True)
sales_df_train['Type'].mask(sales_df_train['Type'] == 'B', 1, inplace=True)
sales_df_train['Type'].mask(sales_df_train['Type'] == 'C', 2, inplace=True)

sales_df_test['Type'].mask(sales_df_test['Type'] == 'A', 0, inplace=True)
sales_df_test['Type'].mask(sales_df_test['Type'] == 'B', 1, inplace=True)
sales_df_test['Type'].mask(sales_df_test['Type'] == 'C', 2, inplace=True)


# Applying the condition
sales_df_train['IsHoliday'].mask(sales_df_train['IsHoliday'] == True, 0, inplace=True)
sales_df_train['IsHoliday'].mask(sales_df_train['IsHoliday'] == False, 1, inplace=True)

sales_df_test['IsHoliday'].mask(sales_df_test['IsHoliday'] == True, 0, inplace=True)
sales_df_test['IsHoliday'].mask(sales_df_test['IsHoliday'] == False, 1, inplace=True)

In [6]:
# checking which columns are datatype ~ Object
for col_nm in sales_df_train.columns:
    print(col_nm, sales_df_train[col_nm].dtype)

Store int64
Dept int64
Date object
Weekly_Sales float64
IsHoliday object
Temperature float64
Fuel_Price float64
MarkDown1 float64
MarkDown2 float64
MarkDown3 float64
MarkDown4 float64
MarkDown5 float64
CPI float64
Unemployment float64
Type object
Size int64


In [7]:
dml_train_Y = sales_df_train['Weekly_Sales']
dml_train_T = sales_df_train['Type']
dml_train_W = None
dml_train_Z = None
dml_train_X = sales_df_train[['Dept', 'IsHoliday', 'Temperature', 'Fuel_Price', 'CPI']]
cat_cols_name = [0, 1] # ['Dept', 'IsHoliday']

In [8]:
dml_est = DML(
                model_y = HistGradientBoostingRegressor(random_state=42, categorical_features=cat_cols_name),
                model_t = HistGradientBoostingClassifier(random_state=42, categorical_features=cat_cols_name),
                model_final = ElasticNetCV(fit_intercept=False),
                discrete_treatment=True
            )

In [9]:
dml_est.fit(dml_train_Y, dml_train_T, X=dml_train_X, W=dml_train_W)

<econml.dml.dml.DML at 0x7ff05ce20350>

In [10]:
dml_est.summary()

Final model doesn't have a `coef_stderr_` and `intercept_stderr_` attributes, only point estimates will be available.
Final model doesn't have a `coef_stderr_` and `intercept_stderr_` attributes, only point estimates will be available.


0,1
,point_estimate
Dept|Type_1,-118.879
Dept|Type_2,-18.045
IsHoliday|Type_1,0.0
IsHoliday|Type_2,-0.0
Temperature|Type_1,36.501
Temperature|Type_2,-37.59
Fuel_Price|Type_1,1.084
Fuel_Price|Type_2,-1.852
CPI|Type_1,-37.185

0,1
,point_estimate
cate_intercept|Type_1,0.0
cate_intercept|Type_2,-0.0


In [11]:
print(f"features: {dml_est.cate_feature_names()}, output: {dml_est.cate_output_names()}, treatment: {dml_est.cate_treatment_names()}")

features: ['Dept', 'IsHoliday', 'Temperature', 'Fuel_Price', 'CPI'], output: ['Weekly_Sales'], treatment: ['Type_1', 'Type_2']


In [12]:
dml_test_T = sales_df_test['Type'].copy()
dml_test_W = None
dml_test_Z = None
dml_test_X = sales_df_test[['Dept', 'IsHoliday', 'Temperature', 'Fuel_Price', 'CPI']].copy()

In [13]:
dml_test_seg = dml_test_X.iloc[[2, 4, 5, 8, 13, 25, 0, 19, 90, 43]].copy()
# dml_test_seg.reset_index(inplace = True)
print(dml_test_seg)

    Dept IsHoliday  Temperature  Fuel_Price         CPI
2      4         1        76.67       4.087  129.062355
4     52         1        45.12       2.841  126.436419
5     49         1        77.30       3.563  129.112500
8     33         1        86.13       3.594  218.450940
13    97         1        42.74       3.619  130.645793
25    35         1        77.16       3.570  129.066300
0     40         1        50.43       3.599  130.157516
19    97         1        81.76       3.311  220.614875
90    32         1        71.17       3.617  223.774444
43    19         1        79.14       3.263  130.701290


In [14]:
'''
# Calculate the heterogeneous treatment effect (.effect() result same)
# The treatment effect is the estimated average effect on Y from moving from T=0 to T=1, given X.
# The treatment effect is the estimated average effect on Y from moving from T=0 to T=2, given X.
# The treatment effect is the estimated average effect on Y from moving from T=1 to T=2, given X.
'''
dml_est_effect_01 = dml_est.effect_inference(dml_test_seg, T0=0, T1=1).summary_frame()
dml_est_effect_02 = dml_est.effect_inference(dml_test_seg, T0=0, T1=2).summary_frame()
dml_est_effect_12 = dml_est.effect_inference(dml_test_seg, T0=1, T1=2).summary_frame()
pd.concat([dml_est_effect_01, dml_est_effect_02, dml_est_effect_12], axis=1)

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''