## Notes

* Database access requires your IP address to be added to the AWS security list; contact Eric Dasmalchi or Jason Karpman to request access.

In [18]:
import pandas as pd
import geopandas as gpd
import numpy as np
from arcgis2geojson import arcgis2geojson
import requests
import mysql
import mysql.connector
from mysql.connector import errorcode
import os

In [8]:
# Set up the database connection.
# Your IP must be added to the security list on the AWS console first.
# The password is read from the TCC_SQL_PWD environment variable so it never
# has to be stored in the notebook.
config = {
  'user': 'luskincenter',
  'password': os.environ['TCC_SQL_PWD'],
  'host': 'housing-site-db.cxxl1so9sozw.us-west-1.rds.amazonaws.com',
  'database': 'housing_site_db',
  'raise_on_warnings': True
}
try:
    cnx = mysql.connector.connect(**config)

except mysql.connector.Error as err:
    # Print a friendlier hint for the two most common failures.
    if err.errno == errorcode.ER_ACCESS_DENIED_ERROR:
        print("Something is wrong with your user name or password")
    elif err.errno == errorcode.ER_BAD_DB_ERROR:
        print("Database does not exist")
    else:
        print(err)
    # Re-raise so the notebook stops here: otherwise `cnx` is left undefined
    # (or stale from an earlier run) and the query cells fail far from the cause.
    raise

In [9]:
def get_2020_data(city):
    """Fetch 2020 Craigslist rental listings for one city as a GeoDataFrame.

    Runs one query per region against ``housing_site_db.craigslist_table``
    through the notebook-level connection ``cnx``, keeping rows whose ``dt``
    falls between 2020-05-01 and 2020-08-01. The rows are converted to point
    geometries and then renamed/cleaned to generally match the 2014 data.

    Parameters
    ----------
    city : str
        One of ``'la'``, ``'ontario'`` or ``'fresno'``.

    Returns
    -------
    geopandas.GeoDataFrame
        One row per listing with integer ``rent`` and ``sqft``, ``bedrooms``,
        ``latitude``/``longitude``, a ``date`` string, ``time`` set to 1 and a
        point ``geometry`` built from longitude/latitude. No CRS is assigned.

    Raises
    ------
    AssertionError
        If ``city`` is not one of the supported values.
    """

    def rentdf_to_gdf(rent_df):
        # Empty strings count as missing; rows without coordinates cannot be
        # turned into points, so they are dropped first.
        rent_df = rent_df.replace('', np.nan).dropna(subset=['lat', 'lng'])
        points = gpd.points_from_xy(
            rent_df['lng'].astype('float64'), rent_df['lat'].astype('float64'))
        return gpd.GeoDataFrame(rent_df, geometry=points)

    #reformat 2020 data to generally match 2014 data
    def match_2014_data(gdf_2020rent):
        # NOTE(review): [:11] keeps one character past a 10-character date
        # (a trailing space if dt looks like 'YYYY-MM-DD HH:MM' — confirm).
        # Left unchanged so the exported `date` values stay the same.
        gdf_2020rent['date'] = gdf_2020rent['dt'].apply(lambda x: x[:11])
        gdf_2020rent = gdf_2020rent.rename(columns={'price':'rent',
                                                    'beds':'bedrooms',
                                                    'lat':'latitude',
                                                    'lng':'longitude'})
        # Listings with the same rent, size and date are treated as reposts.
        gdf_2020rent = gdf_2020rent.drop_duplicates(subset=['rent', 'sqft', 'date'])
        gdf_2020rent = gdf_2020rent.dropna(subset=['rent', 'sqft'])
        gdf_2020rent = gdf_2020rent.replace('', np.nan).dropna(subset=[
                                'latitude', 'longitude', 'rent', 'sqft'])
        gdf_2020rent['rent'] = gdf_2020rent['rent'].astype(int)
        gdf_2020rent['sqft'] = gdf_2020rent['sqft'].astype(int)
        return gdf_2020rent

    assert city in ['la', 'ontario', 'fresno']
    print(f'Getting craigslist data for {city}... ', end = '')
    cols = 'pid, dt, price, beds, sqft, lat, lng, region, domain'
    regions = {'la':['Watts1', 'Watts2'], 'ontario':['Ontario'], 'fresno':['Fresno']}

    # One query per region. The region names come from the fixed dict above,
    # never from user input, so interpolating them into the SQL text is safe here.
    region_frames = []
    for region in regions[city]:
        query = (
            f"SELECT {cols} FROM housing_site_db.craigslist_table "
            f"WHERE region = '{region}' "
            "AND STR_TO_DATE(dt, '%Y-%m-%d %H:%i') "
            "BETWEEN '2020-05-01' AND '2020-08-01';"
        )
        region_frames.append(pd.read_sql(query, con=cnx))

    # Concatenate once: DataFrame.append was removed in pandas 2.0, and growing
    # a frame inside a loop copies the data on every iteration.
    df = pd.concat(region_frames)

    #add variable for DiD analysis
    df['time'] = 1
    # reset_index() keeps the old per-query row labels as an 'index' column;
    # this is unchanged so the output schema stays the same.
    df = df.reset_index()
    print('Done!')

    return match_2014_data(rentdf_to_gdf(df))

In [10]:
# Pull the 2020 Fresno listings from the database (uses the `cnx` connection opened above).
fres_listings_20 = get_2020_data('fresno')

Getting craigslist data for fresno... Done!


In [11]:
# Pull the 2020 LA listings; get_2020_data queries the 'Watts1' and 'Watts2' regions for 'la'.
la_listings_20 = get_2020_data('la')

Getting craigslist data for la... Done!


In [12]:
# Pull the 2020 Ontario listings from the database (uses the `cnx` connection opened above).
ont_listings_20 = get_2020_data('ontario')

Getting craigslist data for ontario... Done!


In [13]:
def filter_apt_only(listing_gdf):
    """Keep only apartment and room listings from the 2020 data.

    The listing type is taken to be the last '/'-separated segment of the
    ``domain`` column and is returned in a new ``type_code`` column. Only
    rows whose type is 'apa' or 'roo' are kept, which excludes for-sale
    properties.

    The input frame is not modified. Rows with a missing ``domain`` get a
    missing ``type_code`` and are filtered out instead of raising.

    Parameters
    ----------
    listing_gdf : pandas.DataFrame or geopandas.GeoDataFrame
        Listings with a ``domain`` column.

    Returns
    -------
    Same type as ``listing_gdf``
        The apartment/room rows, with the extra ``type_code`` column.

    Raises
    ------
    AssertionError
        If ``listing_gdf`` has no ``domain`` column.
    """
    assert 'domain' in listing_gdf.columns
    # Vectorised equivalent of x.split('/')[-1] for each domain value.
    type_code = listing_gdf['domain'].str.split('/').str[-1]
    keep = type_code.isin(['apa', 'roo'])
    # assign() returns a new frame, so the caller's frame gains no column.
    return listing_gdf.assign(type_code=type_code).loc[keep]

In [14]:
# Restrict each city's 2020 listings to apartments and rooms.
fres_listings_20, la_listings_20, ont_listings_20 = map(
    filter_apt_only,
    (fres_listings_20, la_listings_20, ont_listings_20),
)

In [15]:
# fres_listings_20.to_file('craigslist_data/2020/fresno_listings.geojson', driver='GeoJSON')
# la_listings_20.to_file('craigslist_data/2020/la_listings.geojson', driver='GeoJSON')
# ont_listings_20.to_file('craigslist_data/2020/ontario_listings.geojson', driver='GeoJSON')

In [19]:
# FIPS codes of the census tracts that make up the control and TCC groups in
# each region. Each dict maps a group name ('control' / 'tcc') to its tract codes.
la_tracts = {
    'control': [
        '06037239601', '06037219901', '06037232120', '06037221500',
        '06037237720', '06037238310', '06037238320', '06037237710',
        '06037241120', '06037231100', '06037231210', '06037231300',
        '06037231600', '06037231710', '06037240500', '06037237500',
        '06037232500', '06037232700', '06037240600', '06037237101',
        '06037237202', '06037237401', '06037239202', '06037239501',
        '06037239602', '06037239802', '06037239801', '06037228500',
        '06037231720', '06037237102', '06037241400', '06037240010',
        '06037241202', '06037240401', '06037541604', '06037535102',
        '06037540901', '06037600304',
    ],
    'tcc': [
        '06037241001', '06037240900', '06037242700', '06037242100',
        '06037242000', '06037240800', '06037242300', '06037242200',
        '06037243000', '06037242600', '06037243100',
    ],
}

fresno_tracts = {
    'control': [
        '06019001202', '06019001304', '06019001407', '06019002800',
        '06019003202', '06019003807', '06019004704', '06019004802',
        '06019005100', '06019005403',
    ],
    'tcc': [
        '06019000700', '06019001100', '06019001000', '06019000901',
        '06019000200', '06019000300', '06019000400', '06019000600',
        '06019000902', '06019000100',
    ],
}

ontario_tracts = {
    'control': [
        '06071000603', '06071003803', '06071000207', '06071002804',
        '06071002602', '06071002902', '06071003200', '06071003102',
        '06071003301', '06071003101', '06071003509', '06071004700',
        '06071004604', '06071006700', '06071007000', '06071000201',
        '06071003401', '06071000904', '06071001104', '06071001001',
        '06071001305', '06071003607', '06071006604', '06071002204',
        '06071006302', '06071000303', '06071002402', '06071002401',
        '06071002501', '06071003302',
    ],
    'tcc': [
        '06071001600', '06071001702', '06071001400', '06071001813',
        '06071001707', '06071001812', '06071001504', '06071001706',
        '06071001501', '06071001503',
    ],
}

# Snapshot of the per-city tract-code dicts, taken when this cell runs.
# A later cell rebinds the name `la_tracts` to the fetched GeoDataFrame, so
# looking the dict up by its global name at call time would fail on any call
# made after that; holding a reference here keeps the function re-runnable.
TRACT_GEOIDS_BY_CITY = {
    'la': la_tracts,
    'ontario': ontario_tracts,
    'fresno': fresno_tracts,
}

#get geography for all control and tcc tracts in a region
def ctrl_tcc_tract_gdfs(city):
    """Download the control and TCC census-tract geometries for one city.

    Makes one request per tract to the Census TIGERweb ArcGIS REST service
    (the ACS 2016 map service, output requested with ``outsr=4326``), converts
    each response to GeoJSON with ``arcgis2geojson`` and stacks the results.

    Parameters
    ----------
    city : str
        One of ``'la'``, ``'ontario'`` or ``'fresno'``.

    Returns
    -------
    geopandas.GeoDataFrame
        The control tracts followed by the TCC tracts, indexed by ``geoid``.
        It carries the attributes returned by the service plus a numeric
        ``group`` column (0 = control, 1 = TCC). No CRS is assigned.

    Raises
    ------
    AssertionError
        If ``city`` is not one of the supported values.
    requests.HTTPError
        If the service answers with an HTTP error status.
    """
    api_url = 'https://tigerweb.geo.census.gov/arcgis/rest/services/TIGERweb/tigerWMS_ACS2016/MapServer/8/query?f=json&outsr=4326&where=GEOID={}'

    def get_tract_geog(geoid):
        # A timeout stops a stalled request from hanging the notebook, and
        # raise_for_status() surfaces HTTP errors before JSON parsing does.
        response = requests.get(api_url.format(geoid), timeout=60)
        response.raise_for_status()
        data = arcgis2geojson(response.json())
        gdf = gpd.GeoDataFrame.from_features(data['features'])
        gdf['geoid'] = geoid
        return gdf.set_index('geoid')

    def get_tracts_geog(geoid_list):
        # Fetch every tract, then concatenate once. DataFrame.append was
        # removed in pandas 2.0; pd.concat works on old and new versions.
        return pd.concat([get_tract_geog(geoid) for geoid in geoid_list])

    assert city in ['la', 'ontario', 'fresno']
    print(f'Getting tract shape data for {city}... ', end = '')

    geoids = TRACT_GEOIDS_BY_CITY[city]
    ctrl_tracts = get_tracts_geog(geoids['control'])
    tcc_tracts = get_tracts_geog(geoids['tcc'])

    #assign numeric treatment group variable for DiD analysis
    ctrl_tracts['group'] = 0
    tcc_tracts['group'] = 1

    print('Done!')
    return pd.concat([ctrl_tracts, tcc_tracts])

In [20]:
# Fetch the control + TCC tract geometries for each region (one TIGERweb request per tract).
# NOTE: `la_tracts` is rebound here from the FIPS-code dict defined above to the
# fetched GeoDataFrame; the cells below use it as a GeoDataFrame.
fres_tracts = ctrl_tcc_tract_gdfs('fresno')
la_tracts = ctrl_tcc_tract_gdfs('la')
ont_tracts = ctrl_tcc_tract_gdfs('ontario')

Getting tract shape data for fresno... Done!
Getting tract shape data for la... Done!
Getting tract shape data for ontario... Done!


In [21]:
# Save each region's tract GeoDataFrame (control + TCC, with the `group` column) as GeoJSON.
# NOTE(review): the craigslist_data/tracts/ directory is not created here — presumably it must already exist; confirm.
fres_tracts.to_file('craigslist_data/tracts/fresno_tracts.geojson', driver='GeoJSON')
la_tracts.to_file('craigslist_data/tracts/la_tracts.geojson', driver='GeoJSON')
ont_tracts.to_file('craigslist_data/tracts/ontario_tracts.geojson', driver='GeoJSON')

In [22]:
def join_to_tracts(listing_gdf, tracts_gdf):
    """Attach tract attributes to listings and keep only listings inside a tract.

    Performs a left spatial join of the listing points to the tract polygons
    with the 'within' predicate, then drops listings that matched no tract
    (they have no ``group`` value) and removes the ``BASENAME`` column.

    Parameters
    ----------
    listing_gdf : geopandas.GeoDataFrame
        Point listings.
    tracts_gdf : geopandas.GeoDataFrame
        Tract polygons with ``group`` and ``BASENAME`` columns.

    Returns
    -------
    geopandas.GeoDataFrame
        The listings that fall within a control or TCC tract, with the tract
        columns (including ``group``) appended.
    """
    try:
        # geopandas >= 0.10 calls this argument `predicate`; the older `op`
        # spelling was deprecated and later removed.
        sjoined_gdf = gpd.sjoin(listing_gdf, tracts_gdf, how='left', predicate='within')
    except TypeError:
        # Older geopandas releases only accept the original `op` keyword.
        sjoined_gdf = gpd.sjoin(listing_gdf, tracts_gdf, how='left', op='within')

    return (
        sjoined_gdf
        #drop listings not in TCC nor control group
        .dropna(subset=['group'])
        .drop(columns='BASENAME')
    )

In [25]:
# Spatially join each city's filtered 2020 listings to its control/TCC tracts and
# write the matched listings as GeoJSON. These are the same output paths as the
# commented-out exports in an earlier cell.
join_to_tracts(fres_listings_20, fres_tracts).to_file(
    'craigslist_data/2020/fresno_listings.geojson', driver='GeoJSON')
join_to_tracts(la_listings_20, la_tracts).to_file(
    'craigslist_data/2020/la_listings.geojson', driver='GeoJSON')
join_to_tracts(ont_listings_20, ont_tracts).to_file(
    'craigslist_data/2020/ontario_listings.geojson', driver='GeoJSON')