In [73]:
# imports
import pandas as pd
import numpy as np
import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = (12, 8)

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.sandbox.regression.predstd import wls_prediction_std

from pscore_match.pscore import PropensityScore
from pscore_match.match import Match, whichMatched
from scipy.stats import gaussian_kde

# this allows plots to appear directly in the notebook
%matplotlib inline

limit = 10000

In [74]:
# read in data
sold = pd.read_csv('../CSV_backups/ALL-sales.csv',nrows=limit, index_col=['property_id','transaction_id']).drop_duplicates()

In [75]:
# only show properties that were actually sold (and a check to exclude those with wrong close dates e.g. >10000 days)
df = sold[(sold.date_closed != 0) & (sold.date_closed < 10000 )] \
        [["latitude","longitude", "date_closed","price","sqft","bedrooms", "bathrooms","dist_to_lightrail_station"]]
df.rename(columns = {'date_closed':'date'}, inplace = True)

# convert days since Y2K to a nice looking date
# df['date'] = df['date'].apply(lambda x: dt.date(2000, 1, 1) + dt.timedelta(days=x)) \
#                        .astype("datetime64[ns]")

df = df[df.date >= dt.date(2008,12,31)]
df['near_rail'] = (df['dist_to_lightrail_station'] < 5).astype("int64")

df['ppsf'] = (df.price / df.sqft)
df.head(6)

Unnamed: 0_level_0,Unnamed: 1_level_0,latitude,longitude,date,price,sqft,bedrooms,bathrooms,dist_to_lightrail_station,near_rail,ppsf
property_id,transaction_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
346200,23951313,33.5674,-112.094,2014-10-24,400000,2388,4,3,0.7,1,167.504188
9020277,23951289,33.5954,-112.153,2015-05-19,320000,2120,4,2,5.90593,0,150.943396
336902,23951266,33.5442,-112.105,2018-03-13,285000,1723,3,2,0.844038,1,165.40917
990354,23951257,33.2849,-111.867,2017-12-05,367000,2286,3,2,13.0062,0,160.542432
9000471,23951250,33.5853,-111.827,2018-01-13,369000,1502,3,2,17.0247,0,245.672437
9035951,23951029,33.4017,-111.883,2017-11-30,182500,1721,3,2,1.40218,1,106.042998


In [76]:
df.describe()

Unnamed: 0,latitude,longitude,price,sqft,bedrooms,bathrooms,dist_to_lightrail_station,near_rail,ppsf
count,2389.0,2389.0,2389.0,2389.0,2389.0,2389.0,2389.0,2389.0,2389.0
mean,33.549903,-112.090599,199509.432398,1764.598577,3.315613,2.061113,9.731042,0.228548,115.129151
std,0.092477,0.129049,86035.21804,521.019761,0.701168,0.515699,5.065701,0.419985,44.433293
min,33.2849,-112.274,50000.0,725.0,1.0,1.0,0.0,0.0,21.052632
25%,33.4932,-112.207,134000.0,1402.0,3.0,2.0,5.63028,0.0,83.456973
50%,33.5727,-112.118,188477.0,1695.0,3.0,2.0,10.384,0.0,111.830743
75%,33.6222,-111.99,259900.0,2031.0,4.0,2.0,13.6536,0.0,143.084261
max,33.6652,-111.827,400000.0,3976.0,6.0,4.0,22.9746,1.0,332.777778


In [77]:
treatment = np.array(df.near_rail)

cov_list = ['date', 'latitude', 'longitude','sqft','bedrooms','bathrooms']
covariates = df[cov_list]
pscore = PropensityScore(treatment, covariates).compute()

pairs = Match(treatment, pscore)
pairs.create(method='many-to-one', many_method='knn', k=5, replace=True)
data_matched = whichMatched(pairs, pd.DataFrame({'pscore': pscore, 'treatment' :treatment, 'ppsf':df.ppsf}))

ValueError: Pandas data cast to numpy dtype of object. Check input data with np.asarray(data).

In [None]:
plt.figure(1)
plt.subplot(121)
density0 = gaussian_kde(pscore[treatment==0])
density1 = gaussian_kde(pscore[treatment==1])
xs = np.linspace(0,2,200)
plt.plot(xs,density0(xs),color='black')
plt.fill_between(xs,density1(xs),color='gray')
plt.text(0.5, 3, 'Control Group')
plt.text(0.06, 6, 'Treatment Group')
plt.title('Phone: Before Matching')
plt.axis([0,1,0,6])
plt.xlabel('Propensity Score')
plt.ylabel('Density')

plt.subplot(122)
density0_post = gaussian_kde(data_matched.pscore[data_matched.treatment==0])
density1_post = gaussian_kde(data_matched.pscore[data_matched.treatment==1])
xs = np.linspace(0,1,200)
plt.plot(xs,density0_post(xs),color='black')
plt.fill_between(xs,density1_post(xs),color='gray')
plt.title('Phone: After Matching')
plt.axis([0,1,0,6])
plt.xlabel('Propensity Score')
plt.ylabel('Density')
plt.show()

In [None]:
import plotly
pairs.plot_balance(covariates)

In [None]:
treated_ppsf = df.ppsf[treatment == 1].mean()
control_ppsf = df.ppsf[treatment==0].mean()
matched_control_ppsf = data_matched.ppsf[data_matched.treatment==0].mean()
ATT = treated_ppsf - control_ppsf
matched_ATT = treated_ppsf - matched_control_ppsf
print(str("Premium: " + str(ATT)))
print(str("Premium after matching: " + str(matched_ATT)))

In [None]:
treated_ppsf


In [None]:
control_ppsf

In [None]:
matched_control_ppsf