In [93]:
# imports
import pandas as pd
import numpy as np
import datetime as dt

import matplotlib.pyplot as plt
import seaborn as sns
plt.rcParams['figure.figsize'] = (16, 8)

import statsmodels.api as sm
from statsmodels.formula.api import ols
from statsmodels.sandbox.regression.predstd import wls_prediction_std

from pscore_match.pscore import PropensityScore
from pscore_match.match import Match, whichMatched
from scipy.stats import gaussian_kde

# this allows plots to appear directly in the notebook
%matplotlib inline

# Upper bound on the number of CSV rows to load; passed as `nrows` to
# pd.read_csv in the data-loading cell.
limit = 1000000

In [94]:
# read in data
# `low_memory=False` makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. Chunked inference is what triggers the
# "Columns (12,13,14,16) have mixed types" DtypeWarning and can leave a column
# with values parsed differently from one chunk to the next.
# NOTE(review): drop_duplicates() compares column values only; the
# (property_id, transaction_id) index is not part of the comparison, so two
# distinct transactions with identical column values collapse into one row.
# Confirm that this is intended.
sold = pd.read_csv(
    '../CSV_backups/ALL-sales.csv',
    nrows=limit,
    index_col=['property_id', 'transaction_id'],
    low_memory=False,
).drop_duplicates()


Columns (12,13,14,16) have mixed types. Specify dtype option on import or set low_memory=False.



In [95]:
# Only keep properties that were actually sold: `date_closed` must be above 2920
# and below 10000 (the upper bound excludes records with wrong close dates,
# e.g. > 10000 days).
# NOTE(review): the reference date for the day counts and the reason for the
# 2920 lower bound are not visible here -- confirm.
was_sold = (sold.date_closed > 2920) & (sold.date_closed < 10000)
keep_cols = ["latitude", "longitude", "date_closed", "price", "sqft",
             "bedrooms", "bathrooms", "dist_to_lightrail_station"]

# Select rows and columns in a single .loc call and take an explicit copy, so
# the column assignments below write to `df` itself and not to a slice of
# `sold` (the chained-indexing + inplace-rename form risks SettingWithCopyWarning
# and makes the cell non-idempotent).
df = (
    sold.loc[was_sold, keep_cols]
        .rename(columns={'date_closed': 'date'})
        .copy()
)

# Proximity flags at three distance thresholds, expressed in whatever unit
# `dist_to_lightrail_station` uses (unit not shown here).
df['near_rail5'] = df['dist_to_lightrail_station'] < 5
df['near_rail3'] = df['dist_to_lightrail_station'] < 3
df['near_rail1'] = df['dist_to_lightrail_station'] < 1

# Price per square foot.
df['ppsf'] = df.price / df.sqft
df.head(6)

Unnamed: 0_level_0,Unnamed: 1_level_0,latitude,longitude,date,price,sqft,bedrooms,bathrooms,dist_to_lightrail_station,near_rail,ppsf
property_id,transaction_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
346200,23951313,33.5674,-112.094,5410,400000,2388,4,3,0.7,True,167.504188
9020277,23951289,33.5954,-112.153,5617,320000,2120,4,2,5.90593,False,150.943396
336902,23951266,33.5442,-112.105,6646,285000,1723,3,2,0.844038,True,165.40917
990354,23951257,33.2849,-111.867,6548,367000,2286,3,2,13.0062,False,160.542432
9000471,23951250,33.5853,-111.827,6587,369000,1502,3,2,17.0247,False,245.672437
9035951,23951029,33.4017,-111.883,6543,182500,1721,3,2,1.40218,True,106.042998


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for df's columns,
# as a quick sanity check of the filtered data.
df.describe()

Unnamed: 0,latitude,longitude,date,price,sqft,bedrooms,bathrooms,dist_to_lightrail_station,ppsf
count,197877.0,197877.0,197877.0,197877.0,197877.0,197877.0,197877.0,197877.0,197877.0
mean,33.532659,-112.089942,4888.827014,175362.159983,1647.194252,2.980518,1.944157,7.567827,108.325504
std,0.083722,0.112063,1098.951686,85227.855711,460.356915,1.094127,0.589461,4.717428,48.627678
min,33.2832,-112.275,2921.0,50000.0,500.0,0.0,0.0,0.0,14.184997
25%,33.4775,-112.171,3929.0,108000.0,1329.0,3.0,2.0,3.80005,71.14486
50%,33.5317,-112.107,4881.0,160000.0,1590.0,3.0,2.0,6.90652,103.030303
75%,33.6002,-112.024,5894.0,229000.0,1892.0,4.0,2.0,10.6231,137.698113
max,33.6653,-111.811,6709.0,400000.0,4000.0,6.0,6.0,23.3682,641.666667


In [None]:
# Treatment indicator: True where `near_rail5` is set, i.e. the sale's
# dist_to_lightrail_station is below 5.
treatment = np.array(df.near_rail5)

# Covariates used to model the probability of being in the treatment group.
cov_list = ['date', 'latitude', 'longitude', 'sqft', 'bedrooms', 'bathrooms']
covariates = df[cov_list]
pscore = PropensityScore(treatment, covariates).compute()

# Many-to-one k-nearest-neighbour matching on the propensity score (k=5, with
# replacement).
pairs = Match(treatment, pscore)
pairs.create(method='many-to-one', many_method='knn', k=5, replace=True)

# `treatment` (and presumably `pscore`) are positional arrays, while `df.ppsf`
# is a Series carrying df's (property_id, transaction_id) MultiIndex. Building
# the frame from the Series directly makes it inherit that MultiIndex.
# Presumably whichMatched selects rows by observation position (TODO confirm
# against pscore_match); with plain arrays the frame gets a default RangeIndex,
# so position and label coincide and the row lookup is unambiguous.
matched_input = pd.DataFrame({
    'pscore': np.asarray(pscore),
    'treatment': treatment,
    'ppsf': df.ppsf.values,
})
data_matched = whichMatched(pairs, matched_input)

In [None]:
# Compare the propensity-score distributions of the control and treatment
# groups before and after matching. Good matching should make the two
# distributions in the right-hand panel overlap much more closely.
density0 = gaussian_kde(pscore[treatment == 0])
density1 = gaussian_kde(pscore[treatment == 1])
density0_post = gaussian_kde(data_matched.pscore[data_matched.treatment == 0])
density1_post = gaussian_kde(data_matched.pscore[data_matched.treatment == 1])

# Propensity scores are probabilities, so evaluate the densities on [0, 1].
xs = np.linspace(0, 1, 200)
panels = [
    ('Near Light Rail: Before Matching', density0(xs), density1(xs)),
    ('Near Light Rail: After Matching', density0_post(xs), density1_post(xs)),
]

# Derive a common y-limit from the data (with 5% headroom) instead of a
# hard-coded range, so no curve is clipped and both panels stay comparable.
y_top = 1.05 * max(max(ctrl.max(), trt.max()) for _, ctrl, trt in panels)

fig, axes = plt.subplots(1, 2, sharey=True)
for ax, (title, control_density, treated_density) in zip(axes, panels):
    ax.plot(xs, control_density, color='black', label='Control Group')
    ax.fill_between(xs, treated_density, color='gray', label='Treatment Group')
    ax.set_title(title)
    ax.axis([0, 1, 0, y_top])
    ax.set_xlabel('Propensity Score')
    ax.set_ylabel('Density')
    # A legend replaces the fixed-coordinate text labels, which could land on
    # the axis edge or on top of the curves.
    ax.legend()
plt.show()

In [None]:
# Preview the matched sample (treated observations plus their matched controls)
# rather than dumping the whole frame into the notebook output.
data_matched.head(10)

In [None]:
# Mean price per square foot for the treated sales, for all control sales, and
# for the matched control sales only.
treated_ppsf = df.loc[treatment == 1, 'ppsf'].mean()
control_ppsf = df.loc[treatment == 0, 'ppsf'].mean()
matched_controls = data_matched[data_matched.treatment == 0]
matched_control_ppsf = matched_controls.ppsf.mean()

# Difference in means against all controls, then against matched controls.
ATT = treated_ppsf - control_ppsf
matched_ATT = treated_ppsf - matched_control_ppsf

for label, premium in (("Premium: ", ATT),
                       ("Premium after matching: ", matched_ATT)):
    print(label + str(premium))