# Causal Inference with CEM and Weighted Regression

In [88]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import os
from datetime import datetime, timedelta

import pandas as pd
import numpy as np
import statsmodels.api as sm
import seaborn as sns
import matplotlib.pyplot as plt
from cem import match
from cem import coarsen
from cem.imbalance import L1
from pymongo.mongo_client import MongoClient
from pymongo.server_api import ServerApi

In [89]:
uri = os.environ["MONGODB_URI"]
client = MongoClient(uri, server_api=ServerApi("1"))
client.admin.command("ping")
db = client["real-estate"]
collection = db["listings"]

In [90]:
since = datetime.now() - timedelta(days=30)

pipeline = [
        {
            "$match": {"rental": True, "datetime": {"$gte": since}, "bed": {"$lte": 4}},
        },
    ]

results = collection.aggregate(pipeline)

df = pd.DataFrame.from_records(results).set_index("_id")
df.head()

Unnamed: 0_level_0,datetime,provider,rental,price,address,suburb,state,postcode,council,bed,bath,parking,area,dwelling,version
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
listing-16678011,2023-09-30,domain,True,600,"Tennyson Avenue, Plympton Park, Adelaide, City...",plympton park,sa,5038,city of marion,3,1,2,,house,0.1.0
listing-16677966,2023-09-30,domain,True,595,"Waikiki Court, West Lakes, Adelaide, City of C...",west lakes,sa,5021,city of charles sturt,3,1,2,,house,0.1.0
listing-16675969,2023-09-30,domain,True,650,"Bimini Crescent, Mawson Lakes, Adelaide, City ...",mawson lakes,sa,5095,city of salisbury,3,2,1,,townhouse,0.1.0
listing-16677999,2023-09-30,domain,True,480,"Pierson Street, Campbelltown City Council, Ade...",hectorville,sa,5073,city of campbelltown,3,1,2,,house,0.1.0
listing-16677277,2023-09-30,domain,True,610,"Grote Street, Adelaide, Adelaide City Council,...",adelaide,sa,5000,corporation of the city of adelaide,2,1,0,,apartment / unit / flat,0.1.0


In [91]:
df.describe()

Unnamed: 0,datetime,price,bed,bath,parking,area
count,297,297.0,297.0,297.0,297.0,40.0
mean,2023-10-02 19:43:01.818181888,554.380471,2.771044,1.461279,1.707071,434.175
min,2023-09-30 00:00:00,6.0,1.0,1.0,0.0,71.0
25%,2023-09-30 00:00:00,460.0,2.0,1.0,1.0,300.0
50%,2023-10-04 00:00:00,540.0,3.0,1.0,2.0,406.0
75%,2023-10-04 00:00:00,600.0,3.0,2.0,2.0,514.75
max,2023-10-07 00:00:00,1500.0,4.0,3.0,8.0,990.0
std,,158.767019,0.745325,0.544644,1.204601,214.346579


In [92]:
df[["bed", "bath", "parking"]].cov()

Unnamed: 0,bed,bath,parking
bed,0.55551,0.16678,0.42257
bath,0.16678,0.296638,0.084903
parking,0.42257,0.084903,1.451065


In [93]:
df["council"].value_counts()

council
city of playford                                          34
city of port adelaide enfield                             33
city of charles sturt                                     28
city of salisbury                                         24
corporation of the city of adelaide                       24
corporation of the city of unley                          19
city of marion                                            17
city of onkaparinga                                       16
city of campbelltown                                      12
city of norwood payneham & st peters                      12
city of west torrens                                      11
city of tea tree gully                                    10
city of holdfast bay                                      10
city of burnside                                           9
mount barker district council                              9
city of prospect & city of port adelaide enfield           8
corporation of t

In [94]:
councils = {
    "city of playford": "north",
    "city of port adelaide enfield": "north",
    "city of charles sturt": "west",
    "city of salisbury": "north",
    "corporation of the city of adelaide": "inner",
    "corporation of the city of unley": "inner",
    "city of marion": "south",
    "city of onkaparinga": "south",
    "city of campelltown": "east",
    "city of norwood payneham & st peters": "inner",
    "city of west torrens": "west",
    "city of tea tree gully": "north",
    "city of burnside": "east",
    "city of holdfast bay": "west",
    "mount barker district council": "hills",
    "city of prospect & city of port adelaide enfield": "north",
    "corporation of the town of walkerville": "inner",
    "city of mitcham": "south",
    "city of prospect": "inner",
    "adelaide hills council": "hills",
    "town of gawler": "outer",
    "the barossa council": "outer",
    "city of port adelaide enfield & city of tea tree gully": "north"
}

In [95]:
y = df["price"]
X = df.drop(columns="price")[["bed", "bath", "parking", "council", "dwelling"]]

In [96]:
# no matching
L1(X, "bed")
# very imbalanced

Unnamed: 0,bed_level_a,bed_level_b,imbalance
0,1,2,0.870588
1,1,3,0.99359
2,1,4,1.0
3,2,3,0.84819
4,2,4,0.976471
5,3,4,0.822898


In [97]:
# exact matching
# throw away examples from strata (defined by council, dwelling, bath and parking) that do not contain all levels of the treatment (number of beds)
weights = match(X, "bed")

print(f"{(weights > 0).sum()} observations remain. Threw away {(weights == 0).sum()}")

L1(X, "bed", weights)
# no examples left..

0 observations remain. Threw away 297


  return np.sum(np.abs(tensor_a / np.sum(tensor_a) - tensor_b / np.sum(tensor_b))) / 2
  return np.sum(np.abs(tensor_a / np.sum(tensor_a) - tensor_b / np.sum(tensor_b))) / 2


Unnamed: 0,bed_level_a,bed_level_b,imbalance
0,1,2,
1,1,3,
2,1,4,
3,2,3,
4,2,4,
5,3,4,


In [120]:
# coarsened exact matching
# throw away examples from strata (defined by COARSENED council, dwelling, bath and parking) that do not contain all levels of the treatment (number of beds)
X_coarse = X.copy()

X_coarse["council"] = X_coarse["council"].map(councils)  # north, south, east, west, inner, hills, outer
X_coarse["parking"] = X_coarse["parking"] > 0  # yes or no
X_coarse["bath"] = pd.cut(X_coarse["bath"], bins=[-1, 1, 2, 100])  # 1, 2, 3+

weights = match(X_coarse, "bed")
X_coarse = X_coarse.loc[weights.index]  # not necessary, but it supresses that warning

print(f"{(weights > 0).sum()} observations remain. Threw away {(weights == 0).sum()}")

L1(X_coarse, "bed", weights)

47 observations remain. Threw away 250


Unnamed: 0,bed_level_a,bed_level_b,imbalance
0,1,2,0.0
1,1,3,5.5511150000000004e-17
2,1,4,0.0
3,2,3,5.5511150000000004e-17
4,2,4,0.0
5,3,4,5.5511150000000004e-17


In [121]:
X_coarse[weights > 0].sort_values(["council", "dwelling", "bath", "parking", "bed"]).head(20)

Unnamed: 0_level_0,bed,bath,parking,council,dwelling
_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
listing-15824497,1,"(-1, 1]",True,inner,house
listing-16669147,1,"(-1, 1]",True,inner,house
listing-16323766,2,"(-1, 1]",True,inner,house
listing-16670501,3,"(-1, 1]",True,inner,house
listing-16677326,3,"(-1, 1]",True,inner,house
listing-16680618,3,"(-1, 1]",True,inner,house
listing-16687722,3,"(-1, 1]",True,inner,house
listing-16680648,4,"(-1, 1]",True,inner,house
listing-16682898,4,"(-1, 1]",True,inner,house
listing-16680778,1,"(-1, 1]",True,north,house


In [122]:
print(X_coarse[weights > 0]["dwelling"].value_counts())
print(X_coarse[weights > 0]["council"].value_counts())
print(X_coarse[weights > 0]["parking"].value_counts())
print(X_coarse[weights > 0]["bath"].value_counts())

dwelling
house    47
Name: count, dtype: int64
council
north    38
inner     9
Name: count, dtype: int64
parking
True    47
Name: count, dtype: int64
bath
(-1, 1]     47
(1, 2]       0
(2, 100]     0
Name: count, dtype: int64


In [123]:
# after coarsened matching, there is very little imbalance, so i'm happy not to control for council and dwelling
model = sm.WLS(y, sm.add_constant(X[["bed"]]), weights=weights, hasconst=True)
model.exog_names[:] = ["constant", "bed"]
results = model.fit()

In [124]:
results.summary()

  llf += 0.5 * np.sum(np.log(self.weights))


0,1,2,3
Dep. Variable:,price,R-squared:,0.086
Model:,WLS,Adj. R-squared:,0.083
Method:,Least Squares,F-statistic:,27.68
Date:,"Sat, 07 Oct 2023",Prob (F-statistic):,2.76e-07
Time:,17:27:42,Log-Likelihood:,-inf
No. Observations:,297,AIC:,inf
Df Residuals:,295,BIC:,inf
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
constant,370.5619,39.460,9.391,0.000,292.904,448.220
bed,71.3951,13.570,5.261,0.000,44.688,98.102

0,1,2,3
Omnibus:,408.078,Durbin-Watson:,1.982
Prob(Omnibus):,0.0,Jarque-Bera (JB):,68528.996
Skew:,6.288,Prob(JB):,0.0
Kurtosis:,76.345,Cond. No.,14.4


In [102]:
# residuals