In [1]:
%load_ext sql

# imports
import pandas as pd
import datetime as dt

# today's date for output filenames
today = dt.date.today()

# %sql mysql://root@localhost/rental_nerd
%sql mysql://prod:nerd@52.2.153.189/rental_nerd

'Connected: prod@rental_nerd'

In [2]:
def top_zipcodes(n = 100):
    # query the top 100 zipcodes in the database (roughly equal to all zipcodes >10k properties)
    query = %sql (\
    SELECT zipcode, COUNT(id) \
    FROM properties \
    GROUP BY zipcode \
    ORDER BY 2 DESC \
    limit :n)

    zipcode_filter = query.DataFrame()
#     print("Top zipcode by count is",zipcode_filter.iloc[0,0],"with",zipcode_filter.iloc[0,1],"properties")
#     print("100th zipcode by count is",zipcode_filter.iloc[99,0],"with",zipcode_filter.iloc[99,1],"properties")
    return zipcode_filter.zipcode.values

def city_query():
    query = %sql (\
    SELECT area_name, COUNT(id) \
    FROM area_name_zipcodes \
    GROUP BY area_name \
    ORDER BY 2 DESC \
    limit 100)
    return query.DataFrame().area_name.values

def sanitize(data, zipcode_list = None):
    # abort if the city has no top zipcodes
    if data.empty:
        return 0    
    
    data.drop(['abnormal', 'bookmarked', 'created_at', 'ignore', 'is_latest', 'closed_diff_id', 'id', 'listed_diff_id',
                      'notes', 'source', 'updated_at', 'home_type', 'sfh', 'multifamily', 'description', 
                    'event_name', 'neighborhood'], axis=1, inplace=True)
    
    # filters out any non-sensical values or fat finger mistakes in MLS listings
    print("Entries before filter: ", len(data))

    if(data.transaction_type.iloc[0] == 'sales'):
        data = data[ data.price > 50000 ]
    else:
        data = data [ data.price > 500 ]
    
    if(zipcode_list is not None):
        data = data[data.zipcode.isin(zipcode_list)]

    print("Entries after filter: ",len(data))
    
    # fills in some sensible defaults where data is missing
    data.loc["near_golf_course"] = data["near_golf_course"].apply(lambda x: True if x == 1.0 else False)
    data.loc["has_pool"] = data["has_pool"].apply(lambda x: True if x == 1.0 else False)
    data.loc["garage"] = data["garage"].apply(lambda x: True if x == 1.0 else False)
    data.loc["multifamily"] = data["home_type"].apply(lambda x: True if x == "mfh" else False)
    data.loc['date_closed'] = data['date_closed'].apply(lambda x: today if x == None else (x - dt.date(2000, 1, 1)).days)
    data.loc['date_closed'] = data['date_closed'].astype(int)
    
    # convert the area name into dummy variables
    dm = pd.get_dummies(data[['area_name', 'zipcode']], prefix=['area_name','zipcode'])
    data = pd.concat([data, dm], axis=1)
    data.drop(['area_name', 'zipcode'], axis=1, inplace=True)
    del dm
    
    return data

def query(city="%", zipcode=None, limit=100, start_date="2000-01-01 10:01:13", end_date=today):
    # convert array of zipcodes into sql string which looks like a tuple
    placeholders = tuple(zipcode)
    
    # sql query helper function
    query = %sql (\
    select  \
    *  \
    from  \
    properties, \
    property_transaction_logs, \
    area_name_zipcodes \
    where  \
    property_transaction_logs.abnormal != true and \
    properties.sqft between 1 and 10000 and \
    property_transaction_logs.price between 500 and 400000 and \
    properties.bedrooms <= 6 and \
    properties.bathrooms <= 6 and \
    properties.home_type = 'sfh' and \
    property_transaction_logs.date_closed > :start_date and \
    property_transaction_logs.date_closed < :end_date and \
    area_name_zipcodes.`area_name` LIKE :city and \
    area_name_zipcodes.`zipcode` = properties.`zipcode` and     \
    properties.zipcode IN :placeholders and \
    properties.`id` = property_transaction_logs.`property_id` and \
    property_transaction_logs.`is_latest` = true \
    order by \
    property_transaction_logs.id desc \
    limit :limit) 

    return query.DataFrame().T.groupby(level=0).first().T

def queue_city_queries(city, zipcode_list, for_sale_zipcode_list):
    q = query(zipcode=zipcode_list, limit=limit)
    q = sanitize(q)
    
    for_sale = q[(q.transaction_type == "sales") & 
                 (q.transaction_status == "open") & 
                 (q.date_listed > (today - dt.timedelta(days=30))) &
                 (q.zipcode.isin(for_sale_zipcode_list))]
    sales = q[(q.transaction_type == "sales") & (q.transaction_status == "closed")]
    rent = q[(q.transaction_type == "rental") & (q.transaction_status == "closed")]
        
    data = {'sales': sales, 'rentals': rent, 'for_sale': for_sale }
    
    return data

In [3]:
# get list of top zipcodes to only run the model on them (put down 2000 to get every zipcode)
zipcode_list = top_zipcodes(2000)

# we filter the current listings further to only see the top zipcodes to not predict prices in areas with weak coverage
focus_zipcodes = top_zipcodes(200)

# limit on number of lines returned from sql queries (for debugging)
limit = 100000000

1200 rows affected.
200 rows affected.


In [5]:
q = queue_city_queries("%", zipcode_list, focus_zipcodes)

for k,v in q.items():
    v.to_csv('CSV_backups/ALL-' + k + '.csv')
    

1000000 rows affected.
Entries before filter:  1000000
Entries after filter:  762237


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


TypeError: unsupported operand type(s) for -: 'float' and 'datetime.date'