In [18]:
%load_ext sql

# imports
import pandas as pd
import datetime as dt
import math

# today's date for output filenames
today = (dt.date.today() - dt.date(2000, 1, 1)).days

# %sql mysql://root@localhost/rental_nerd
%sql mysql://prod:nerd@52.2.153.189/rental_nerd


The sql extension is already loaded. To reload it, use:
  %reload_ext sql


'Connected: prod@rental_nerd'

In [19]:
def top_zipcodes(n = 100):
    # query the top 100 zipcodes in the database (roughly equal to all zipcodes >10k properties)
    query = %sql (\
    SELECT zipcode, COUNT(id) \
    FROM properties \
    GROUP BY zipcode \
    ORDER BY 2 DESC \
    limit :n)

    zipcode_filter = query.DataFrame()
#     print("Top zipcode by count is",zipcode_filter.iloc[0,0],"with",zipcode_filter.iloc[0,1],"properties")
#     print("100th zipcode by count is",zipcode_filter.iloc[99,0],"with",zipcode_filter.iloc[99,1],"properties")
    return zipcode_filter.zipcode.values

def city_query():
    query = %sql (\
    SELECT city_code, COUNT(id) \
    FROM property_transaction_logs \
    GROUP BY city_code \
    ORDER BY 2 DESC \
    limit 100)
    return query.DataFrame().city_code.values

def sanitize(data):
    """Clean a raw transaction/property frame and append one-hot area columns.

    Steps, in order:
      * drop bookkeeping / free-text columns that are not used as features;
      * default a missing `year_built` to 1980 and every other missing value to 0;
      * turn 0/1 flag columns into real booleans;
      * encode `date_closed` / `date_listed` as whole days since 2000-01-01
        (0 when the date is missing);
      * cast count-like columns to int and scale the `dist_to_*` columns by 100;
      * append dummy columns for `city_code`, `zipcode` and `school_district_id`.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing every column referenced below. It is not modified;
        all work happens on a new frame.

    Returns
    -------
    pandas.DataFrame, or the int 0 when `data` is empty (kept for backward
    compatibility with existing callers).

    Raises
    ------
    KeyError
        If an expected column is absent, e.g. when the function is applied to
        a frame that has already been sanitized.
    """
    # abort if the city has no top zipcodes
    if data.empty:
        return 0

    # The 1980 default has to be computed BEFORE the blanket zero fill below;
    # otherwise missing build years are already 0 and the default never applies.
    year_built = data['year_built'].fillna(1980)

    # drop() and fillna() each return a new frame, so the caller's frame is
    # left untouched and re-running a notebook cell starts from intact input.
    data = data.drop(['abnormal', 'bookmarked', 'created_at', 'ignore', 'closed_diff_id', 'id', 'listed_diff_id',
                      'notes', 'source', 'updated_at', 'home_type', 'sfh', 'description',
                      'event_name', 'pended', 'neighborhood'], axis=1).fillna(value=0)
    data['year_built'] = year_built

    # flag columns: exactly 1 means True, anything else (including the 0 fill) is False
    for flag in ('near_golf_course', 'has_pool', 'is_latest', 'garage', 'adult',
                 'construction', 'townhouse', 'mobile', 'fsbo', 'fixer', 'foreclosure'):
        data[flag] = data[flag] == 1.0

    # dates become integer day offsets from 2000-01-01; the 0 fill marks "no date"
    epoch = dt.date(2000, 1, 1)
    for date_col in ('date_closed', 'date_listed'):
        offsets = data[date_col].apply(lambda d: 0 if (d == 0) else (d - epoch).days)
        data[date_col] = offsets.astype(int)

    data["school_district_id"] = data["school_district_id"].astype(str)

    # count-like columns are stored as plain ints (missing values are already 0)
    for count_col in ('lot', 'hoa_fees', 'rooms', 'saves', 'stories'):
        data[count_col] = data[count_col].apply(lambda v: int(v))

    # convert to km
    # NOTE(review): the factor 100 presumably approximates km per degree - confirm.
    for dist_col in ('dist_to_lightrail_station', 'dist_to_lightrail_line', 'dist_to_hiway',
                     'dist_to_waterway', 'dist_to_airport', 'dist_to_starbucks',
                     'dist_to_railway', 'dist_to_park', 'dist_to_shopping'):
        data[dist_col] = data[dist_col] * 100

    # convert the area name into dummy variables
    dm = pd.get_dummies(data[['city_code', 'zipcode', 'school_district_id']],
                        prefix=['city_code', 'zipcode', 'school_district_id'])
    return pd.concat([data, dm], axis=1)

def query(city="%", zipcode=None, limit=100, start_date="2000-01-01 10:01:13", ttype='sales'):
    # convert array of zipcodes into sql string which looks like a tuple
    placeholders = tuple(zipcode)
    
    # sql query helper function
    query = %sql (\
    SELECT  \
    *, \
    properties.id as 'property_id', \
    property_transaction_logs.id as 'transaction_id', \
    property_school_districts.school_district_id \
    FROM  \
    property_transaction_logs, \
    properties \
    LEFT JOIN \
    property_school_districts ON property_school_districts.property_id = properties.id \
    where  \
    ( abnormal = false OR abnormal IS NULL OR abnormal = 0 ) and \
    properties.sqft between 500 and 4000 and \
    property_transaction_logs.price between 50000 and 400000 and \
    properties.bedrooms <= 6 and \
    properties.bathrooms <= 6 and \
    properties.home_type = 'sfh' and \
    ((properties.latitude BETWEEN 33.421516 AND 33.665268 and \
    properties.longitude BETWEEN -112.274780 AND -111.810608 ) OR \
    (properties.latitude BETWEEN 33.283149 AND 33.513597 and \
    properties.longitude BETWEEN -111.972656 AND -111.846313 )) and \
    properties.`id` = property_transaction_logs.`property_id` and \
    property_transaction_logs.`city_code` = :city and \
    property_transaction_logs.`transaction_type` = :ttype and \
    (properties.fixer is Null or properties.fixer = False) and \
    (properties.townhouse is Null or properties.townhouse = False) and \
    (properties.foreclosure is Null or properties.foreclosure = False) and \
    (properties.adult is Null or properties.adult = False) and \
    (properties.construction is Null or properties.construction = False) and \
    (properties.mobile is Null or properties.mobile = False) \
    order by \
    property_transaction_logs.id desc \
    limit :limit) 
    
    q = query.DataFrame()
    q = q.loc[:,~q.columns.duplicated()]
    q.set_index('property_id', inplace=True)
    q.index.name = 'property_id'
    return q

def queue_city_queries(city, zipcode_list, for_sale_zipcode_list):
    """Fetch and sanitize one city's sales, split into closed sales and open listings.

    Parameters
    ----------
    city : str
        City code passed to query().
    zipcode_list : iterable
        Passed to query() as `zipcode`.
    for_sale_zipcode_list : iterable
        Open listings are restricted to these zipcodes.

    Returns
    -------
    dict with two DataFrames:
      'sales'    - rows whose transaction_status is "closed";
      'for_sale' - rows whose transaction_status is "open", listed within the
                   last 6000 days and located in `for_sale_zipcode_list`.
    Both frames are empty when the city returns no rows.

    Relies on the notebook-level `limit` and `today` variables.
    """
    # query() takes no status argument and filters on neither status nor
    # zipcode, so a single fetch returns everything; a second call would only
    # duplicate rows. The closed/open split is done in pandas below.
    q = query(city=city, zipcode=zipcode_list, limit=limit, ttype='sales')
    q = sanitize(q)

    # sanitize() signals "no rows" with the int 0 instead of a frame
    if isinstance(q, int):
        return {'sales': pd.DataFrame(), 'for_sale': pd.DataFrame()}

    # `today` and `date_listed` are both integer day counts since 2000-01-01,
    # so the look-back window is a plain integer as well.
    for_sale = q[(q.transaction_type == "sales") &
                 (q.transaction_status == "open") &
                 (q.date_listed > (today - 6000)) &
                 (q.zipcode.isin(for_sale_zipcode_list))]
    sales = q[(q.transaction_type == "sales") & (q.transaction_status == "closed")]

    data = {'sales': sales, 'for_sale': for_sale}

    return data

In [20]:
# Restrict modelling to the zipcodes with the most properties (ask for 2000 to
# get every zipcode). Current listings are filtered to these as well, so prices
# are not predicted in areas with weak coverage.
focus_zipcodes = top_zipcodes(200)

# keep only the zipcodes whose code starts with '85'
phoenix_zips = [zipcode for zipcode in focus_zipcodes if zipcode.startswith('85')]
print(phoenix_zips)

# row cap applied to the sql queries (lower it when debugging)
limit = 200000000

200 rows affected.
['85375', '85142', '85143', '85326', '85249', '85234', '85086', '85225', '85041', '85204', '85205', '85209', '85295', '85037', '85248', '85233', '85286', '85331', '85224', '85226', '85048', '85022', '85251', '85298', '85018', '85138', '85029', '85051', '85254', '85255', '85302', '85020', '85297', '85323', '85016', '85335', '85388', '85373', '85202', '85027', '85706', '85035', '85257', '85043', '85023', '85203', '85212', '85304', '85303', '85206', '85044', '85392', '85301', '85053', '85374', '85712', '85262']


In [21]:
# Fetch the 'sales' transactions for city code 'PH', capped at `limit` rows.
# NOTE(review): query() accepts `zipcode` but its SQL never references it, so
# phoenix_zips does not narrow this result.
q = query(city='PH', zipcode=phoenix_zips, limit=limit,ttype = 'sales')

523355 rows affected.


In [22]:
# Clean the raw frame and append the one-hot area columns.
# sanitize() returns the int 0 (not a frame) when its input is empty.
# NOTE(review): `q` is overwritten, so re-running this cell presumably fails
# because sanitize() drops columns that are no longer present - re-run the
# query cell first.
q = sanitize(q)

In [23]:
# every row tagged as a sale
sales = q.loc[q.transaction_type == "sales"]

# of those, the listings that are still open, were listed within the last
# 365 days and are flagged as the latest record
for_sale = sales.loc[
    (sales.transaction_status == "open")
    & (sales.date_listed > (today - 365))
    & (sales.is_latest == True)
]

# write both frames to the CSV backup folder
for_sale.to_csv('CSV_backups/ALL-for_sale.csv')
sales.to_csv('CSV_backups/ALL-sales.csv')
