In [1]:
%load_ext sql

# imports
import pandas as pd
import datetime as dt

# today's date for output filenames
today = dt.date.today()

# %sql mysql://root@localhost/rental_nerd
%sql mysql://prod:nerd@52.2.153.189/rental_nerd

'Connected: prod@rental_nerd'

In [2]:
def top_zipcodes(n = 100):
    # query the top 100 zipcodes in the database (roughly equal to all zipcodes >10k properties)
    query = %sql (\
    SELECT zipcode, COUNT(id) \
    FROM properties \
    GROUP BY zipcode \
    ORDER BY 2 DESC \
    limit :n)

    zipcode_filter = query.DataFrame()
#     print("Top zipcode by count is",zipcode_filter.iloc[0,0],"with",zipcode_filter.iloc[0,1],"properties")
#     print("100th zipcode by count is",zipcode_filter.iloc[99,0],"with",zipcode_filter.iloc[99,1],"properties")
    return zipcode_filter.zipcode.values

def city_query():
    query = %sql (\
    SELECT area_name, COUNT(id) \
    FROM area_name_zipcodes \
    GROUP BY area_name \
    ORDER BY 2 DESC \
    limit 100)
    return query.DataFrame().area_name.values

def sanitize(data, zipcode_list = None):
    # abort if the city has no top zipcodes
    if data.empty:
        return 0    
    
    data.drop(['abnormal', 'bookmarked', 'created_at', 'ignore', 'is_latest', 'closed_diff_id', 'id', 'listed_diff_id',
                      'notes', 'source', 'updated_at', 'home_type', 'sfh', 'description', 
                    'event_name', 'neighborhood'], axis=1, inplace=True)
    
    # filters out any non-sensical values or fat finger mistakes in MLS listings
    print("Entries before filter: ", len(data))

    if(data.transaction_type.iloc[0] == 'sales'):
        data = data[ data.price > 50000 ]
    else:
        data = data [ data.price > 500 ]
    
    if(zipcode_list is not None):
        data = data[data.zipcode.isin(zipcode_list)]

    print("Entries after filter: ",len(data))
    
    # fills in some sensible defaults where data is missing
    data["near_golf_course"] = data["near_golf_course"].apply(lambda x: True if x == 1.0 else False)
    data["has_pool"] = data["has_pool"].apply(lambda x: True if x == 1.0 else False)
    data["garage"] = data["garage"].apply(lambda x: True if x == 1.0 else False)
    data['date_closed'] = data['date_closed'].apply(lambda x: 0 if x == None else (x - dt.date(2000, 1, 1)).days)
    data['date_closed'] = data['date_closed'].astype(int)
    
    # convert the area name into dummy variables
    dm = pd.get_dummies(data[['area_name', 'zipcode']], prefix=['area_name','zipcode'])
    data = pd.concat([data, dm], axis=1)
    del dm
    
    return data

def query(city="%", zipcode=None, limit=100, start_date="2000-01-01 10:01:13", ttype='sales',tstatus='open'):
    # convert array of zipcodes into sql string which looks like a tuple
    placeholders = tuple(zipcode)
    
    # sql query helper function
    query = %sql (\
    SELECT  \
    *, \
    property_transaction_logs.id as 'transaction_id' \
    FROM  \
    properties, \
    property_transaction_logs, \
    area_name_zipcodes \
    where  \
    property_transaction_logs.abnormal != true and \
    properties.sqft between 1 and 10000 and \
    property_transaction_logs.price between 500 and 400000 and \
    properties.bedrooms <= 6 and \
    properties.bathrooms <= 6 and \
    properties.home_type = 'sfh' and \
    area_name_zipcodes.`area_name` LIKE :city and \
    area_name_zipcodes.`zipcode` = properties.`zipcode` and     \
    properties.zipcode IN :placeholders and \
    properties.`id` = property_transaction_logs.`property_id` and \
    property_transaction_logs.`transaction_type` = :ttype and \
    property_transaction_logs.`transaction_status` = :tstatus and \
    property_transaction_logs.`is_latest` = true \
    order by \
    property_transaction_logs.id desc \
    limit :limit) 

    q = query.DataFrame()
    q.set_index('property_id', inplace=True)
    q.index.name = 'property_id'
    return q.T.groupby(level=0).first().T

def queue_city_queries(city, zipcode_list, for_sale_zipcode_list):
    i = query(city=city, zipcode=zipcode_list, limit=limit,ttype = 'sales',tstatus='closed')
    j = query(city=city, zipcode=for_sale_zipcode_list, limit=limit,ttype = 'sales',tstatus='open') 
    k = query(city=city, zipcode=zipcode_list, limit=limit,ttype = 'rental',tstatus='closed')
    
    q = pd.concat([i,j,k])
    q = sanitize(q)
    
    for_sale = q[(q.transaction_type == "sales") & 
                 (q.transaction_status == "open") & 
                 (q.date_listed > (today - dt.timedelta(days=6000))) &
                 (q.zipcode.isin(for_sale_zipcode_list))]
    sales = q[(q.transaction_type == "sales") & (q.transaction_status == "closed")]
    rent = q[(q.transaction_type == "rental") & (q.transaction_status == "closed")]
        
    data = {'sales': sales, 'rentals': rent, 'for_sale': for_sale }
    
    return data

In [3]:
# get list of top zipcodes to only run the model on them (put down 2000 to get every zipcode)
zipcode_list = top_zipcodes(2000)

# we filter the current listings further to only see the top zipcodes to not predict prices in areas with weak coverage
focus_zipcodes = top_zipcodes(2000)

# limit on number of lines returned from sql queries (for debugging)
limit = 2000000

1217 rows affected.
1217 rows affected.


In [4]:
q = queue_city_queries("PH", zipcode_list, focus_zipcodes)

for k,v in q.items():
    v.to_csv('CSV_backups/ALL-' + k + '.csv')
    

188554 rows affected.
7938 rows affected.
41737 rows affected.
Entries before filter:  238229
Entries after filter:  173930


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is tryin

In [5]:
q['sales'].head()

Unnamed: 0_level_0,address,adult,area_name,bathrooms,bedrooms,construction,date_closed,date_listed,date_transacted_latest,days_on_market,...,zipcode_85379,zipcode_85381,zipcode_85382,zipcode_85383,zipcode_85387,zipcode_85388,zipcode_85390,zipcode_85392,zipcode_85395,zipcode_85396
property_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7498334,"16202 W Mohave St, Goodyear, AZ 85338",,PH,3,5,,4717,2012-09-13,2012-11-30,78.0,...,0,0,0,0,0,0,0,0,0,0
7491241,"12544 W Alegre Dr, Litchfield Park, AZ 85340",,PH,2,5,,5354,2014-05-07,2014-08-29,114.0,...,0,0,0,0,0,0,0,0,0,0
7481107,"15542 W Supai Cir, Goodyear, AZ 85338",,PH,2,3,,5149,2014-01-03,2014-02-05,33.0,...,0,0,0,0,0,0,0,0,0,0
7477511,"9038 E Crystal Dr, Sun Lakes, AZ 85248",,PH,2,2,,5595,2015-02-12,2015-04-27,74.0,...,0,0,0,0,0,0,0,0,0,0
1041628,"4590 E Hazeltine Way, Chandler, AZ 85249",,PH,3,5,,6296,,2017-03-28,,...,0,0,0,0,0,0,0,0,0,0


In [6]:
len(q['for_sale'].index)

6745