# Difference in Differences analysis for TCC Craigslist rent data



In [1]:
import pandas as pd
import numpy as np
import geopandas as gpd

import statsmodels.api as sm
import statsmodels.formula.api as smf

In [2]:
# 2020 Craigslist listings, one GeoJSON file per city (Fresno, Ontario, LA).
# Per the note inside join_prepare_data, these were already joined to tracts
# in a previous notebook, so they are not spatially joined again here.
fres_listings_20 = gpd.read_file('craigslist_data/2020/fresno_listings.geojson')
ont_listings_20 = gpd.read_file('craigslist_data/2020/ontario_listings.geojson')
la_listings_20 = gpd.read_file('craigslist_data/2020/la_listings.geojson')

In [3]:
# Tract layers, one GeoJSON file per city. join_prepare_data uses them as the
# right-hand side of the spatial join for the 2014 listings.
fres_tracts = gpd.read_file('craigslist_data/tracts/fresno_tracts.geojson')
la_tracts = gpd.read_file('craigslist_data/tracts/la_tracts.geojson')
ont_tracts = gpd.read_file('craigslist_data/tracts/ontario_tracts.geojson')

In [4]:
def read_2014_data(city):
    """Load the 2014 Craigslist listings for one city as a GeoDataFrame.

    Parameters
    ----------
    city : str
        One of ``'la'``, ``'ontario'`` or ``'fresno'``; selects which
        ``craigslist_data/2014/subset_*.csv`` file is read.

    Returns
    -------
    GeoDataFrame
        Listings de-duplicated on (rent, sqft, date), with point geometry built
        from the longitude/latitude columns, rows lacking ``rent`` removed, and
        a ``time`` column set to 0. No CRS is assigned here.

    Raises
    ------
    AssertionError
        If ``city`` is not one of the three supported keys.
    """
    assert city in ['la', 'ontario', 'fresno']

    # the 2014 files are named after the Craigslist region, not the city key
    region_by_city = {'la':'losangeles', 'ontario':'inlandempire', 'fresno':'fresno'}
    listings = pd.read_csv(f'craigslist_data/2014/subset_{region_by_city[city]}.csv')
    listings = listings.drop_duplicates(subset=['rent', 'sqft', 'date'])

    # coordinates are cast to float before being turned into points
    lon = listings['longitude'].astype('float64')
    lat = listings['latitude'].astype('float64')
    listings = gpd.GeoDataFrame(listings, geometry=gpd.points_from_xy(lon, lat))

    listings = listings.dropna(subset=['rent'])

    # DiD period indicator: every 2014 row is the pre-period (time = 0)
    listings['time'] = 0
    return listings

In [5]:
# read_2014_data builds point geometry from longitude/latitude without a CRS,
# so the frame is tagged as EPSG:4326 here.
fres_14 = read_2014_data('fresno')
fres_14.crs = 'EPSG:4326'

In [6]:
# Ontario 2014 listings (read from the 'inlandempire' subset file); tag the
# longitude/latitude points as EPSG:4326 because read_2014_data sets no CRS.
ont_14 = read_2014_data('ontario')
ont_14.crs = 'EPSG:4326'

In [7]:
# LA 2014 listings (read from the 'losangeles' subset file); tag the
# longitude/latitude points as EPSG:4326 because read_2014_data sets no CRS.
la_14 = read_2014_data('la')
la_14.crs = 'EPSG:4326'

In [14]:
# Inspect the 2020 Fresno listings. The rendered columns include `index_right`
# (tract id), `group` and `time`, which join_prepare_data relies on.
fres_listings_20

Unnamed: 0,index,pid,dt,rent,bedrooms,sqft,latitude,longitude,region,domain,time,date,type_code,index_right,group,geometry
0,0,7115564477,2020-05-01T00:01:00,715,2,445,36.811967,-119.760020,Fresno,https://fresno.craigslist.org/search/apa,1,2020-05-01,apa,06019005403,0.0,POINT (-119.76002 36.81197)
1,21,7117200471,2020-05-01T17:38:00,715,2,500,36.811967,-119.760020,Fresno,https://fresno.craigslist.org/search/apa,1,2020-05-01,apa,06019005403,0.0,POINT (-119.76002 36.81197)
2,29,7117194970,2020-05-01T17:08:00,1369,2,821,36.737700,-119.784300,Fresno,https://fresno.craigslist.org/search/apa,1,2020-05-01,apa,06019000600,1.0,POINT (-119.78430 36.73770)
3,30,7117194329,2020-05-01T17:07:00,1415,2,954,36.737700,-119.784300,Fresno,https://fresno.craigslist.org/search/apa,1,2020-05-01,apa,06019000600,1.0,POINT (-119.78430 36.73770)
4,62,7111068398,2020-05-01T16:38:00,1250,1,1097,36.737700,-119.784300,Fresno,https://fresno.craigslist.org/search/apa,1,2020-05-01,apa,06019000600,1.0,POINT (-119.78430 36.73770)
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
753,40854,7169455856,2020-07-31T17:34:00,1200,1,772,36.737700,-119.784300,Fresno,https://fresno.craigslist.org/search/apa,1,2020-07-31,apa,06019000600,1.0,POINT (-119.78430 36.73770)
754,40869,7169426515,2020-07-31T16:41:00,862,2,840,36.786300,-119.828600,Fresno,https://fresno.craigslist.org/search/apa,1,2020-07-31,apa,06019004704,0.0,POINT (-119.82860 36.78630)
755,40886,7169407197,2020-07-31T15:49:00,1399,3,1100,36.786300,-119.828600,Fresno,https://fresno.craigslist.org/search/apa,1,2020-07-31,apa,06019004704,0.0,POINT (-119.82860 36.78630)
756,40911,7168240689,2020-07-31T15:10:00,615,2,445,36.811967,-119.760020,Fresno,https://fresno.craigslist.org/search/apa,1,2020-07-31,apa,06019005403,0.0,POINT (-119.76002 36.81197)


In [43]:
def join_prepare_data(listings_14, listings_20, tracts_gdf):
    """Build the tract-by-period table used for the difference-in-differences fit.

    The 2014 listings are spatially joined to ``tracts_gdf``; the 2020 listings
    are taken as already joined (their ``index_right`` column is renamed to
    ``geoid``). Both sets are stacked, ``post_treatment`` (``time * group``)
    and ``rent_sqft`` (``rent / sqft``) are derived, extreme ``rent_sqft``
    values are trimmed, and the numeric columns are reduced to their median
    for each ``(geoid, time)`` pair.

    Parameters
    ----------
    listings_14 : GeoDataFrame
        2014 listings with geometry plus ``rent``, ``sqft`` and ``time``.
    listings_20 : DataFrame or GeoDataFrame
        2020 listings carrying ``index_right``, ``group``, ``rent``, ``sqft``
        and ``time``.
    tracts_gdf : GeoDataFrame
        Tract layer for the spatial join. The joined result must contain
        ``group`` and ``BASENAME``.
        NOTE(review): the 2014 rows also need a ``geoid`` column after the
        join (presumably supplied by ``tracts_gdf``) or they will not form
        groups in the final aggregation — confirm against the tract files.

    Returns
    -------
    DataFrame
        One row per ``(geoid, time)`` holding the median of every numeric
        column, including ``rent_sqft``, ``group`` and ``post_treatment``.
    """

    def join_to_tracts(listing_gdf, tracts_gdf):
        """Attach tract attributes to listings that fall within a tract."""
        try:
            sjoined_gdf = gpd.sjoin(listing_gdf, tracts_gdf, how='left', predicate='within')
        except TypeError:
            # older geopandas releases only accept the keyword under its former name
            sjoined_gdf = gpd.sjoin(listing_gdf, tracts_gdf, how='left', op='within')
        # drop listings not in TCC nor control group, then the unused tract label
        return sjoined_gdf.dropna(subset=['group']).drop(columns='BASENAME')

    def remove_outliers(listing_gdf, quantiles):
        """Keep rows whose rent_sqft lies strictly between the two quantiles."""
        rent_sqft = listing_gdf['rent_sqft']
        # Both cutoffs come from the same, untrimmed distribution; filtering the
        # low tail first would shift the upper quantile.
        lower = rent_sqft.quantile(quantiles[0])
        upper = rent_sqft.quantile(quantiles[1])
        return listing_gdf.loc[(rent_sqft > lower) & (rent_sqft < upper)]

    listings_14_joined = join_to_tracts(listings_14, tracts_gdf)
    # 2020 listings joined in previous notebook
    listings_20_joined = listings_20.rename(columns={'index_right': 'geoid'})

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x
    all_listings = pd.concat([listings_14_joined, listings_20_joined])
    all_listings['post_treatment'] = all_listings['time'] * all_listings['group']
    all_listings['rent_sqft'] = all_listings['rent'] / all_listings['sqft']

    all_listings = remove_outliers(all_listings, (.02, .98))

    # Median of each numeric statistic per tract and period. numeric_only=True
    # keeps text and geometry columns from raising on recent pandas versions.
    for_did = (
        all_listings
        .groupby(['geoid', 'time'])
        .median(numeric_only=True)
        .reset_index()
    )

    return for_did

In [44]:
def estimate_did(df):
    """Fit the difference-in-differences regression by OLS and print its summary.

    ``rent_sqft`` is regressed on ``time``, ``group`` and ``post_treatment``
    plus a constant term. Rows with a null in any of those four columns are
    excluded before fitting. Nothing is returned; the statsmodels summary
    table is written to stdout.

    Parameters
    ----------
    df : DataFrame
        Must contain ``rent_sqft``, ``time``, ``group`` and ``post_treatment``.
    """
    outcome = 'rent_sqft'
    regressors = ['time', 'group', 'post_treatment']

    # restrict to the modelling columns first so nulls elsewhere do not cost rows
    complete = df[[outcome, *regressors]].dropna()

    # design matrix (with constant) and response vector
    exog = sm.add_constant(complete[regressors])
    endog = complete[outcome]

    fitted = sm.OLS(endog, exog).fit()
    print(fitted.summary())

In [48]:
# Fresno: reduce the 2014 and 2020 listings to tract-level medians, then print
# the DiD OLS summary (the `post_treatment` coefficient is the time x group term).
estimate_did(join_prepare_data(fres_14, fres_listings_20, fres_tracts))

                            OLS Regression Results                            
Dep. Variable:              rent_sqft   R-squared:                       0.699
Model:                            OLS   Adj. R-squared:                  0.668
Method:                 Least Squares   F-statistic:                     22.47
Date:                Mon, 10 Aug 2020   Prob (F-statistic):           1.02e-07
Time:                        09:59:34   Log-Likelihood:                 15.251
No. Observations:                  33   AIC:                            -22.50
Df Residuals:                      29   BIC:                            -16.52
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const              0.7606      0.051     14.

In [49]:
# Ontario: reduce the 2014 and 2020 listings to tract-level medians, then print
# the DiD OLS summary (the `post_treatment` coefficient is the time x group term).
estimate_did(join_prepare_data(ont_14, ont_listings_20, ont_tracts))

                            OLS Regression Results                            
Dep. Variable:              rent_sqft   R-squared:                       0.446
Model:                            OLS   Adj. R-squared:                  0.419
Method:                 Least Squares   F-statistic:                     16.64
Date:                Mon, 10 Aug 2020   Prob (F-statistic):           4.85e-08
Time:                        09:59:34   Log-Likelihood:                -14.252
No. Observations:                  66   AIC:                             36.50
Df Residuals:                      62   BIC:                             45.26
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const              1.1679      0.062     18.

In [50]:
# LA: reduce the 2014 and 2020 listings to tract-level medians, then print
# the DiD OLS summary (the `post_treatment` coefficient is the time x group term).
estimate_did(join_prepare_data(la_14, la_listings_20, la_tracts))

                            OLS Regression Results                            
Dep. Variable:              rent_sqft   R-squared:                       0.304
Model:                            OLS   Adj. R-squared:                  0.272
Method:                 Least Squares   F-statistic:                     9.601
Date:                Mon, 10 Aug 2020   Prob (F-statistic):           2.41e-05
Time:                        09:59:35   Log-Likelihood:                -74.856
No. Observations:                  70   AIC:                             157.7
Df Residuals:                      66   BIC:                             166.7
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const              1.4840      0.130     11.