# In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import statsmodels.api as sm

# Seed NumPy's global RNG so every run reproduces the same sample.
np.random.seed(42)

# Number of simulated observations; every variable below has this length.
n = 1000

# Data-generating process: U -> A -> C -> B, plus a direct U -> B edge.
# Each variable adds independent standard-normal noise. The draws must stay
# in this order because they all consume the same global random stream.

# Step 1: latent User Interest (U) ~ N(0, 1).
U = np.random.normal(size=n)

# Step 2: Posting Frequency (A) is driven by User Interest.
A = U * 0.7 + np.random.normal(size=n)

# Step 3: Platform Boost (C) is driven by Posting Frequency.
C = A * 0.6 + np.random.normal(size=n)

# Step 4: Engagement (B) depends on User Interest directly and on the Boost.
B = U * 0.5 + C * 0.7 + np.random.normal(size=n)

# Collect the four series into one table (column order matters for output).
df = pd.DataFrame(
    dict(
        UserInterest=U,
        PostingFrequency=A,
        PlatformBoost=C,
        Engagement=B,
    )
)

# Step 5: pairwise correlations between all simulated variables.
correlations = df.corr()
print("Correlation matrix:\n", correlations)

# Step 6: Naive model - Engagement regressed on PostingFrequency only.
# UserInterest is left out, so the slope absorbs the backdoor path
# PostingFrequency <- UserInterest -> Engagement (confounded estimate).
X1 = sm.add_constant(df["PostingFrequency"])
model1 = sm.OLS(endog=df["Engagement"], exog=X1).fit()
print("\nRegression without controlling for User Interest:")
print(model1.summary())

# Step 7: Adjusted model - UserInterest added as a covariate, which blocks
# the backdoor path. PlatformBoost is deliberately not included: it lies on
# the causal path from PostingFrequency to Engagement.
X2 = sm.add_constant(df.loc[:, ["PostingFrequency", "UserInterest"]])
model2 = sm.OLS(endog=df["Engagement"], exog=X2).fit()
print("\nRegression controlling for User Interest:")
print(model2.summary())
