Each models is required to generate a stack ranking of all properties listed for sale in the database.

This script runs daily, reads in CSV files with a sorted order of homes (most attractive at the top to least attractive at the bottom), and a 'ypred' variable that reflects the scripts absolute metric (like P&L or probability of sale).

The script compares each list with homes that were sold, and identifies closed transactions ('positives') as well as the top 10 properties that each model identified that didn't sell ('negatives'). The remainder is considered 'control set'. 

For the overall data set, and specifically for each of those 3 groups, we output the average 'ypred' value, what number and percentage of homes sold from each.

In [1]:
import pandas as pd
import datetime as dt
import os
from slacker import Slacker
import json
import requests

csv_path = '/home/ilya/Code/rentalnerd/scraper/'
today = dt.date.today()

%load_ext sql
%sql mysql://prod:nerd@52.2.153.189/rental_nerd
        
        
# slack secrets (in your ~/.bashrc)
webhook_url = os.environ.get('SLACK_URL')
slacker = Slacker(os.environ.get('SLACK_TOKEN'))

In [2]:
def slack(text, url = None, title = None):
    print("Slacking: " + text)
    
    if url == None:
        data=json.dumps({"text": text})
    else:
        data = json.dumps( { "text": text, "attachments": [ { "fallback": "Model MAE"
                                           , "title": title
                                           , "image_url": url } ] } )
    
    response = requests.post(webhook_url, data , headers={'Content-Type': 'application/json'})
    if response.status_code != 200:
        raise ValueError('Request to slack returned an error %s, the response is:\n%s' % (response.status_code, response.text))


In [3]:
limit = 100000

query = %sql (\
    SELECT \
    area_name_zipcodes.area_name, \
    properties.*, \
    property_transaction_logs.id as 'transaction_id', \
    property_transaction_logs.* \
    FROM  \
    property_transaction_logs \
    LEFT JOIN \
    properties on properties.id = property_transaction_logs.`property_id`  \
    LEFT JOIN \
    area_name_zipcodes on properties.zipcode = area_name_zipcodes.zipcode \
    where \
    home_type = 'sfh' AND \
    transaction_type = 'sales' and  \
    date_closed is not null and \
    price_closed is not null and \
    days_on_market is not null and \
    transaction_status = 'closed' \
    ORDER BY property_transaction_logs.date_closed DESC \
    LIMIT :limit )


closed = query.DataFrame()
closed.set_index('property_id', inplace=True)
closed.index.name = 'property_id'

100000 rows affected.


In [4]:
# read in prior target list for backtesting purposes
prior_target_list = pd.read_csv(csv_path + 'good_sell/target_list_' + (today-dt.timedelta(days=1)).strftime('%Y%m%d') + '.csv', index_col = 0)

backtest = prior_target_list.merge(closed[['transaction_id','price']], how="inner").sort_values(by='ypred',ascending=False)
backtest['good_sell'] = (backtest.price >= backtest.list)
y_all = prior_target_list.ypred.mean()
y_sold = backtest.ypred.mean()

pos_trigger = 0.6
slack("Prior target list length: %i\tNum sold: %i\tAvg ypred: %f\tAvg ypred of sold: %f" 
      % (len(prior_target_list.index), len(backtest.index),y_all, y_sold))

num_pos = len(prior_target_list[prior_target_list.ypred > pos_trigger].index)
positives = backtest[backtest.ypred > pos_trigger]
y_pos = (-(positives.price - positives.list) / positives.list).mean()
num_sold = len(positives.index)
if num_sold == 0:
    slack("Num of homes with ypred > %f: %i\tPerc of those sold: NONE\tAvg disc to list: NA" % (pos_trigger, num_pos) )    
else:
    slack("Num of homes with ypred > %f: %i\tPerc of those sold: %f\tAvg disc to list: %f" 
          % (pos_trigger, num_pos, num_sold / num_pos *  100, y_pos * 100))

neg_trigger = 0.3
num_neg = len(prior_target_list[prior_target_list.ypred < neg_trigger].index)
negatives = backtest[backtest.ypred < neg_trigger]
y_neg = (-(negatives.price - negatives.list) / negatives.list).mean()
num_sold = len(negatives.index)

if num_sold == 0:
    slack("Num of homes with ypred < %f: %i\tPerc of those sold: NONE\tAvg disc to list: NA" % (neg_trigger, num_neg))
else:
    slack("Num of homes with ypred < %f: %i\tPerc of those sold: %f\tAvg disc to list:%f" 
          % (neg_trigger, num_neg, len(negatives.index) / num_neg * 100, y_neg * 100))

slack(backtest.to_string())

Slacking: Prior target list length: 9371	Num sold: 0	Avg ypred: 0.003631	Avg ypred of sold: nan
Slacking: Num of homes with ypred > 0.600000: 0	Perc of those sold: NONE	Avg disc to list: NA
Slacking: Num of homes with ypred < 0.300000: 9370	Perc of those sold: NONE	Avg disc to list: NA
Slacking: Empty DataFrame
Columns: [transaction_id, address, ypred, list, url, price, good_sell]
Index: []


In [5]:
# read in prior target list for backtesting purposes
prior_target_list = pd.read_csv(csv_path + 'value_buy/target_list_' + (today-dt.timedelta(days=1)).strftime('%Y%m%d') + '.csv', index_col = 0)

backtest = prior_target_list.merge(closed[['transaction_id','price']], how="inner").sort_values(by='ypred',ascending=False)
backtest['good_sell'] = (backtest.price >= backtest.list)
avg_pnl = backtest.ypred.mean()
predicted_price = backtest.predicted_price.mean()
sale_price = backtest.price.mean()

slack("Prior target list length: %i\tNum sold: %i\tAvg P&L: %f\tAvg predict price: %f\tAvg sale price: %f" 
      % (len(prior_target_list.index), len(backtest.index),y_all, predicted_price, sale_price))

pos_trigger = 0
num_pos = len(prior_target_list[prior_target_list.ypred > pos_trigger].index)
positives = backtest[backtest.ypred > pos_trigger]
y_pos = (-(positives.price - positives.list) / positives.list).mean()
num_sold = len(positives.index)
if num_sold == 0:
    slack("Num of homes with ypred > %f: %i\tPerc of those sold: NONE\tAvg disc to list: NA" % (pos_trigger, num_pos) )    
else:
    slack("Num of homes with ypred > %f: %i\tPerc of those sold: %f\tAvg disc to list: %f" 
          % (pos_trigger, num_pos, num_sold / num_pos, y_pos))

neg_trigger = -30000
num_neg = len(prior_target_list[prior_target_list.ypred < neg_trigger].index)
negatives = backtest[backtest.ypred < neg_trigger]
y_neg = (-(negatives.price - negatives.list) / negatives.list).mean()
num_sold = len(negatives.index)

if num_sold == 0:
    slack("Num of homes with ypred < %f: %i\tPerc of those sold: NONE\tAvg disc to list: NA" % (neg_trigger, num_neg))
else:
    slack("Num of homes with ypred < %f: %i\tPerc of those sold: %f\tAvg disc to list:%f" 
          % (neg_trigger, num_neg, len(negatives.index) / num_neg, y_neg))

slack(backtest.to_string())

Slacking: Prior target list length: 6606	Num sold: 0	Avg P&L: 0.003631	Avg predict price: nan	Avg sale price: nan
Slacking: Num of homes with ypred > 0.000000: 332	Perc of those sold: NONE	Avg disc to list: NA
Slacking: Num of homes with ypred < -30000.000000: 6000	Perc of those sold: NONE	Avg disc to list: NA
Slacking: Empty DataFrame
Columns: [transaction_id, address, ypred, predicted_price, list, url, price, good_sell]
Index: []
