In [None]:
"""
Inverse Probability Score Weighting
---

🔍 **Situation**:


📌 **Task**:


✨ **Action**: 


📈 **Result**:


🚀 **Next Steps**:


✍ **Author**: Justin Wall
📅 **Updated**: 04/05/2025
"""

'\nHidden Markov Model (HMM) for Customer Journey Analysis\n---\n\n🔍 **Situation**:\nCustomer journey data consists of observed actions like browsing,\nemail engagement, and purchases, but the underlying behavioral states\n(e.g., "Exploring" vs. "Highly Engaged") are unknown.\nUnderstanding these hidden states could help businesses personalize marketing efforts and predict conversions.\n\n📌 **Task**:\nI aimed to build a Hidden Markov Model (HMM) to infer hidden customer behavior states from observed actions.\nThe goal was to segment users based on their engagement and predict their likelihood of purchasing or becoming inactive.\n\n✨ **Action**: \n    Created Synthetic Customer Journey Data\n        Simulated user interactions across five observed actions (browse, email engagement, app engagement, engaged browse, and purchase).\n        Defined hidden states representing behavioral groups: Exploring, Engaged, Highly Engaged, Buyers, and Dormant.\n        Modeled state transitions and ob

In [9]:
# Generate fake dataset
import numpy as np
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LogisticRegression
import statsmodels.formula.api as smf

# Set random seed for reproducibility
np.random.seed(42)

# Define total counts
treated_count = 533_518
control_count = 7_638_613

# Generate treatment indicator (1 = saw ad, 0 = did not)
treatment = np.concatenate([np.ones(treated_count), np.zeros(control_count)])

# Generate purchases (based on given conversion rates)
treated_purchases = np.random.choice([1, 0], size=treated_count, p=[4_393 / treated_count, 1 - (4_393 / treated_count)])
control_purchases = np.random.choice([1, 0], size=control_count, p=[599 / control_count, 1 - (599 / control_count)])

# Create dataframe
df = pd.DataFrame({
    "treatment": treatment,
    "purchase": np.concatenate([treated_purchases, control_purchases])
})

# Check dataset
df.head()


Unnamed: 0,treatment,purchase
0,1.0,0
1,1.0,0
2,1.0,0
3,1.0,0
4,1.0,0


In [10]:
df['treatment'].value_counts(), df['purchase'].value_counts()

(treatment
 0.0    7638613
 1.0     533518
 Name: count, dtype: int64,
 purchase
 0    8167200
 1       4931
 Name: count, dtype: int64)

In [11]:
# Generate some synthetic user features (e.g., past behavior)
df["past_purchases"] = np.random.poisson(0.1, size=len(df))  # Simulating past purchases
df["site_visits"] = np.random.poisson(5, size=len(df))  # Simulating site visit frequency

# Logistic regression to estimate propensity scores
X = df[["past_purchases", "site_visits"]]
y = df["treatment"]

prop_model = LogisticRegression()
prop_model.fit(X, y)

# Get propensity scores (probability of treatment)
df["propensity_score"] = prop_model.predict_proba(X)[:, 1]

df.head()

Unnamed: 0,treatment,purchase,past_purchases,site_visits,propensity_score
0,1.0,0,1,10,0.065864
1,1.0,0,0,5,0.065251
2,1.0,0,1,6,0.065555
3,1.0,0,0,5,0.065251
4,1.0,0,0,3,0.065097


In [12]:
# Compute inverse probability weights
df["weight"] = np.where(df["treatment"] == 1, 
                        1 / df["propensity_score"], 
                        1 / (1 - df["propensity_score"]))

# Check summary
df.head()

Unnamed: 0,treatment,purchase,past_purchases,site_visits,propensity_score,weight
0,1.0,0,1,10,0.065864,15.182761
1,1.0,0,0,5,0.065251,15.325435
2,1.0,0,1,6,0.065555,15.254434
3,1.0,0,0,5,0.065251,15.325435
4,1.0,0,0,3,0.065097,15.361586


In [13]:
# Fit weighted regression model
model = smf.logit("purchase ~ treatment", data=df, weights=df["weight"]).fit()

# Show results
print(model.summary())



Optimization terminated successfully.
         Current function value: 0.003841
         Iterations 13




                           Logit Regression Results                           
Dep. Variable:               purchase   No. Observations:              8172131
Model:                          Logit   Df Residuals:                  8172129
Method:                           MLE   Df Model:                            1
Date:                Sat, 05 Apr 2025   Pseudo R-squ.:                  0.2433
Time:                        06:14:29   Log-Likelihood:                -31391.
converged:                       True   LL-Null:                       -41483.
Covariance Type:            nonrobust   LLR p-value:                     0.000
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -9.4668      0.041   -230.134      0.000      -9.547      -9.386
treatment      4.6634      0.044    106.303      0.000       4.577       4.749

Possibly complete quasi-separation: A fraction 0.93