In [87]:
import sys

import pandas as pd # type: ignore
import numpy as np # type: ignore

import matplotlib.pyplot as plt # type: ignore
import seaborn as sns # type: ignore


import networkx as nx # type: ignore
import graphviz # type: ignore

import statsmodels.formula.api as smf # type: ignore
import statsmodels.api # type: ignore
from statsmodels.miscmodels.ordinal_model import OrderedModel # type: ignore

from dowhy import CausalModel, gcm # type: ignore

from sklearn.linear_model import LogisticRegression # type: ignore
from sklearn.neighbors import NearestNeighbors # type: ignore

In [103]:
# Load the processed dataset (path is relative to the notebook's directory)
df = pd.read_csv(filepath_or_buffer="../data/processed/data.csv")

In [104]:
# Dealing with missing values for now (simple mean / mode imputation).
#
# The raw `Product_price` column is renamed to `revenue`, and `Product_price` is
# then redefined as revenue + freight. Both steps are guarded: without the guard,
# re-running this cell would rename the *derived* `Product_price` into a second
# `revenue` column and break everything that follows.
if 'revenue' not in df.columns:
    df = df.rename(columns={'Product_price': 'revenue'})
    # Derived before imputation, so rows missing either input are NaN here and
    # receive the column mean in the loop below.
    df['Product_price'] = df['revenue'] + df['freight_value']

# Numeric columns: replace NaN with the column mean.
mean_imputed_columns = [
    'Product_weight_kg',
    'distance_km',
    'Product_size',
    'customer_avg_score',
    'No_photos',
    'Product_price',
    'freight_value',
    'revenue',
]
for column in mean_imputed_columns:
    df[column] = df[column].fillna(df[column].mean())

# Round ratings to whole numbers.
df['Rating'] = df['Rating'].round()

# Categorical column: replace NaN with the most frequent category.
df['Product_category'] = df['Product_category'].fillna(df['Product_category'].mode()[0])

# Regression
Naive regression of `Rating` on `is_delivery_late` alone; with no controls, this estimate is confounded.

In [69]:
# Baseline OLS: Rating regressed on the late-delivery indicator only (no controls)
formula = 'Rating ~ is_delivery_late'
model = smf.ols(formula, df)
results = model.fit()
print(results.summary())


                            OLS Regression Results                            
Dep. Variable:                 Rating   R-squared:                       0.103
Model:                            OLS   Adj. R-squared:                  0.103
Method:                 Least Squares   F-statistic:                 1.365e+04
Date:                Tue, 01 Oct 2024   Prob (F-statistic):               0.00
Time:                        22:31:37   Log-Likelihood:            -2.0222e+05
No. Observations:              119143   AIC:                         4.044e+05
Df Residuals:                  119141   BIC:                         4.045e+05
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            4.1318      0.004  

In [70]:
# Ordered logit of Rating on the late-delivery indicator, fitted with BFGS.
# Coefficients in the summary are on the log-odds scale.
model = OrderedModel(
    endog=df['Rating'],
    exog=df['is_delivery_late'],
    distr='logit',
)
results = model.fit(method='bfgs')
print(results.summary())


Optimization terminated successfully.
         Current function value: 1.193346
         Iterations: 19
         Function evaluations: 22
         Gradient evaluations: 22
                             OrderedModel Results                             
Dep. Variable:                 Rating   Log-Likelihood:            -1.4218e+05
Model:                   OrderedModel   AIC:                         2.844e+05
Method:            Maximum Likelihood   BIC:                         2.844e+05
Date:                Tue, 01 Oct 2024                                         
Time:                        22:31:39                                         
No. Observations:              119143                                         
Df Residuals:                  119138                                         
Df Model:                           1                                         
                       coef    std err          z      P>|z|      [0.025      0.975]
--------------------------------

\begin{align*}
    \text{logit}(P(Y \leq 1.0/2.0)) &= -2.1580 - 2.2398x_1 \\
    \text{logit}(P(Y \leq 2.0/3.0)) &= -1.1719 -2.2398x_1
\end{align*}


---
# Effect Inference

### **DoWhy**: Potential Outcomes

In [111]:
# Assumed causal DAG for the effect of late delivery on the review rating.
# nodes
nodes_list = [
    'Rating',
    'Product_category_encoded',
    'freight_value',
    'distance_km',
    'season',
    'is_delivery_late',
]
# edges, written as (cause, effect); each edge is listed exactly once
edges_list = [
    ('season', 'Rating'),
    ('season', 'is_delivery_late'),
    ('season', 'Product_category_encoded'),
    ('is_delivery_late', 'Rating'),
    ('Product_category_encoded', 'is_delivery_late'),
    ('Product_category_encoded', 'Rating'),
    ('Product_category_encoded', 'freight_value'),
    ('freight_value', 'is_delivery_late'),
    ('freight_value', 'Rating'),
    ('distance_km', 'freight_value'),
    ('distance_km', 'Rating'),
]

# Graph
G = nx.DiGraph()
G.add_nodes_from(nodes_list)
G.add_edges_from(edges_list)

In [65]:
# Human-readable estimator label -> DoWhy `method_name` string
methods_dict = dict([
    ("Propensity Score Matching", "backdoor.propensity_score_matching"),
    ("Propensity Score Stratification", "backdoor.propensity_score_stratification"),
    ("Propensity Score-based Inverse Weighting", "backdoor.propensity_score_weighting"),
    ("Linear Regression", "backdoor.linear_regression"),
    ("Generalized Linear Models", "backdoor.generalized_linear_model"),
    ("Instrumental Variables", "iv.instrumental_variable"),
    ("Regression Discontinuity", "iv.regression_discontinuity"),
    ("Two Stage Regression", "frontdoor.two_stage_regression"),
])


In [114]:
# Causal graph parameters: which column is the treatment and which is the outcome
treatment = 'is_delivery_late'
outcome = 'Rating'

# Estimation parameters
method_name_estimation = 'backdoor.propensity_score_weighting'
target_units = 'ate'

# Causal model built from the data frame and the DAG `G`
model = CausalModel(
    data=df,
    graph=G,
    treatment=treatment,
    outcome=outcome,
)

# Identification (continue even if the effect is flagged as unidentifiable)
identified_estimand = model.identify_effect(proceed_when_unidentifiable=True)

# Estimation
causal_estimate = model.estimate_effect(
    identified_estimand,
    method_name=method_name_estimation,
    target_units=target_units,
)
print(causal_estimate)

*** Causal Estimate ***

## Identified estimand
Estimand type: EstimandType.NONPARAMETRIC_ATE

### Estimand : 1
Estimand name: backdoor
Estimand expression:
         d                                                                  
───────────────────(E[Rating|freight_value,season,Product_category_encoded])
d[is_delivery_late]                                                         
Estimand assumption 1, Unconfoundedness: If U→{is_delivery_late} and U→Rating then P(Rating|is_delivery_late,freight_value,season,Product_category_encoded,U) = P(Rating|is_delivery_late,freight_value,season,Product_category_encoded)

## Realized estimand
b: Rating~is_delivery_late+freight_value+season+Product_category_encoded
Target units: ate

## Estimate
Mean value: -1.817011277817044



In [115]:
# Refutation: add a randomly generated common cause and re-estimate.
# A robust estimate should stay essentially unchanged.
method_name = 'random_common_cause'
# NOTE(review): `placebo_type` is an option of the placebo-treatment refuter and
# has no effect on `random_common_cause`, so it is no longer passed below. The
# variable is kept only in case a later cell still reads it.
placebo_type = 'permute'
# Fixed seed so the simulated common causes (and the p-value) are reproducible.
REFUTATION_SEED = 42

refutation = model.refute_estimate(
    identified_estimand,
    causal_estimate,
    method_name=method_name,
    num_simulations=20,
    random_state=REFUTATION_SEED,
    show_progress_bar=True,
)

print(refutation)

Refuting Estimates: 100%|[32m██████████[0m| 20/20 [00:01<00:00, 15.63it/s]

Refute: Add a random common cause
Estimated effect:-1.817011277817044
New effect:-1.8170112778170442
p value:0.15865525393145707




