# Causal Inference with CEM and Weighted Regression

In [None]:
import os
from datetime import datetime, timedelta

import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from cem import CEM
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

In [None]:
# Open the MongoDB connection used by the rest of the notebook.
# The connection string is read from the MONGODB_URI environment variable;
# a missing variable raises KeyError here rather than failing later.
uri = os.environ["MONGODB_URI"]
client = MongoClient(uri, server_api=ServerApi("1"))
# Issue a ping so connection problems surface in this cell.
client.admin.command("ping")
# Handles to the "real-estate" database and its "listings" collection.
db = client["real-estate"]
collection = db["listings"]

In [None]:
# Load every rental listing from the last 30 days into a DataFrame indexed by
# the MongoDB document id.
from datetime import timezone

# Use a timezone-aware UTC timestamp for the cutoff. PyMongo interprets naive
# datetimes as UTC, so a naive local-time `datetime.now()` would shift the
# 30-day window by this machine's UTC offset.
# NOTE(review): assumes the stored `datetime` field holds true UTC instants --
# confirm against the code that writes the listings.
since = datetime.now(timezone.utc) - timedelta(days=30)

pipeline = [
    {
        "$match": {"rental": True, "datetime": {"$gte": since}},
    },
]

results = collection.aggregate(pipeline)

df = pd.DataFrame.from_records(results).set_index("_id")
df.head()

In [None]:
# Narrow the frame to the three columns the analysis below works with.
keep_columns = ["price", "bed", "bath"]
df = df[keep_columns]
df.head()

In [None]:
# Build the CEM object on df with "bed" as the treatment column and "price"
# as the outcome column, then call imbalance() with no coarsening schema.
# NOTE(review): "bed" is a count rather than a binary indicator -- confirm the
# cem package handles a multi-valued treatment the way this analysis expects.
c = CEM(df, treatment="bed", outcome="price")
c.imbalance()

In [None]:
# Coarsening schema: each column is passed to "cut" with as many bins as it
# has distinct values, then the imbalance is recomputed under that schema.
schema = {
    column: ("cut", {"bins": df[column].nunique()})
    for column in ("bed", "bath")
}
c.imbalance(schema)

In [None]:
# Run the matching under the schema and report how many rows ended up with a
# weight that is numerically zero.
weights = c.match(schema)
n_zero_weight = np.isclose(weights, 0).sum()
print(n_zero_weight, f"/{len(weights)} examples effectively discarded")

In [None]:
# Scatter plot of bed against bath for every row in df, with grid lines
# switched on for the current axes before plotting.
plt.grid()
sns.scatterplot(data=df, x="bed", y="bath", legend=False)

In [None]:
# Store the matching weights as a column on df, then redraw the bed/bath
# scatter with marker size driven by that weight column.
df["weight"] = weights
plt.grid()
sns.scatterplot(data=df, x="bed", y="bath", size="weight", legend=False)

In [None]:
# TODO: coarsening for string columns, then one-hot encoding for regression

In [None]:
# Weighted least-squares regression of price on bed and bath, using the
# matching weights as observation weights.

# Rows whose weight is numerically zero do not influence the coefficient
# estimates, but if handed to WLS they still count towards nobs and the
# residual degrees of freedom, and they drive the log-likelihood to -inf
# (it sums log(weights)). Fit on the rows with non-zero weight only.
# NOTE(review): assumes `weights` is positionally aligned with the rows of
# df, as the original positional WLS call did -- confirm.
w = np.asarray(weights, dtype=float)
matched = ~np.isclose(w, 0)

y = df.loc[matched, "price"]

# Keep the design matrix as a DataFrame so statsmodels takes the coefficient
# labels from the column names instead of having them patched in afterwards.
X = sm.add_constant(df.loc[matched, ["bed", "bath"]])
X = X.rename(columns={"const": "constant"})

model = sm.WLS(y, X, weights=w[matched], hasconst=True)
results = model.fit()

In [None]:
# Display the summary table of the fitted WLS model.
results.summary()

In [None]:
# residuals