In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

In [2]:
DATA_START = pd.to_datetime('2015-01-01')
DATASET_END = pd.to_datetime('2018-01-01')
NUM_SAMPLES = int(1e5)
MIN_APT_SIZE = 600

In [3]:
listing_dates = DATA_START + pd.to_timedelta(np.random.randint(0, 365*3, size=NUM_SAMPLES), unit='D')

In [4]:
house_qualities = np.random.exponential(size=NUM_SAMPLES)
years_since_remodel = np.random.exponential(scale=1e1,size=NUM_SAMPLES)
size_bonus = np.random.rand(NUM_SAMPLES) * 100
listing_price = np.clip(house_qualities * 400000 - years_since_remodel * 1000, 1000, None)

In [5]:
sqft = house_qualities * 1000 + size_bonus + MIN_APT_SIZE

In [6]:
bedroom_bonus = np.random.rand(NUM_SAMPLES) * 1000
bedrooms = np.floor(np.power((sqft + bedroom_bonus) * house_qualities // 300, .4))
#If more bedrooms per same space, fewer bathrooms can fit (hypothetical to make data more interesting)
bathroom_bonus = np.random.rand(NUM_SAMPLES) * 1000 * house_qualities - 1000 * bedrooms
bathrooms = np.floor(np.power(np.clip((sqft + bathroom_bonus) // 10, 0, None) + 1, .3))

In [7]:
#inspection

In [8]:
print('bedroom listing_price correlation,', np.corrcoef(bedrooms, listing_price)[0,1])

bedroom listing_price correlation, 0.960233311836979


In [9]:
print('bathrooms listing_price correlation,', np.corrcoef(bathrooms, listing_price)[0,1])

bathrooms listing_price correlation, 0.316635484405189


In [10]:
print('bedroom bathroom correlation,', np.corrcoef(bedrooms, bathrooms)[0,1])
print('bedroom min mean max', bedrooms.min(), bedrooms.mean(), bedrooms.max())
print('bathroom miin mean max',bathrooms.min(), bathrooms.mean(), bathrooms.max())

bedroom bathroom correlation, 0.1350486312938089
bedroom min mean max 0.0 1.51745 11.0
bathroom miin mean max 1.0 2.68071 8.0


In [11]:
(listing_dates - DATA_START).days.values.astype(float)

array([544., 694., 293., ..., 138., 505., 836.])

In [12]:
time_bonus = (listing_dates - DATA_START).days.values.astype(float)

In [13]:
bath_bed_penalty = bathrooms - bedrooms
bath_bed_ind = bathrooms > bedrooms

In [14]:
broker = np.random.randint(0, 2, NUM_SAMPLES)

In [15]:
sell_inv = bedrooms * bathrooms/bedrooms.std()/bathrooms.std() + bedrooms/bedrooms.std()
sell_inv += broker
sell_inv += time_bonus/time_bonus.std()
sell_inv += bathrooms/bathrooms.std() + sqft/sqft.std()
sell_inv += house_qualities*2
sell_inv += size_bonus/size_bonus.std()
sell_inv -= years_since_remodel/years_since_remodel.std()
sell_inv += np.random.rand(NUM_SAMPLES) + 10
sell_inv[bath_bed_ind] = sell_inv[bath_bed_ind] - bath_bed_penalty[bath_bed_ind]/2
sales_duration = np.floor(1/sell_inv * 3000).astype(int)

In [16]:
sales_duration.min(), sales_duration.mean(), sales_duration.std(), sales_duration.max()

(22, 139.50689, 40.298149492599535, 540)

In [17]:
sales_duration

array([142, 127,  70, ..., 170, 194, 183])

In [18]:
def plt_clipped(a):
    clip_max = np.percentile(a, 99)
    a = np.clip(a, a.min(), clip_max)
    #a = a[a < clip_max]
    plt.hist(a, bins=100)

In [19]:
sales_date = listing_dates + pd.to_timedelta(sales_duration,unit='D')

In [20]:
final_df = pd.DataFrame({'SalesDate':sales_date, 'ListingDate':listing_dates,
              'bedrooms':bedrooms,'bathrooms':bathrooms,'sqft':sqft,'years_since_remodel':years_since_remodel,
                        'broker': broker})

In [21]:
final_df.loc[final_df.SalesDate > DATASET_END, 'SalesDate'] = np.nan

In [22]:
cnt = 0
for ridx in np.random.choice(final_df.shape[0], final_df.shape[0]//6, replace = False):
    cnt += 1
    if cnt % 10000 == 0:
        print(cnt, cnt/final_df.shape[0]*6)
    if final_df.loc[ridx,'broker']:
        if np.random.rand() > .5:
            final_df.loc[ridx, 'sqft'] += 100
            final_df.loc[ridx, 'bedrooms'] = np.nan
        else:
            final_df.loc[ridx, 'bedrooms'] += 1
            final_df.loc[ridx, 'sqft'] = np.nan
    else:
        feature = np.random.randint(2, final_df.shape[1])
        final_df.iloc[ridx, feature] = np.nan
    final_df.loc[ridx,'broker'] = np.nan


10000 0.6000000000000001


In [23]:
final_df.to_csv('houselistings_simulated.csv', index=False)

In [24]:
final_df.isnull().sum()

SalesDate              11858
ListingDate                0
bedrooms                5756
bathrooms               1666
sqft                    5805
years_since_remodel     1701
broker                 16666
dtype: int64

In [25]:
final_df

Unnamed: 0,SalesDate,ListingDate,bedrooms,bathrooms,sqft,years_since_remodel,broker
0,2016-11-17,2016-06-28,2.0,1.0,1572.384151,5.276257,1.0
1,2017-04-01,2016-11-25,1.0,3.0,1391.115888,4.907824,1.0
2,2015-12-30,2015-10-21,4.0,4.0,3433.835995,16.754513,1.0
3,2016-12-04,2016-10-26,7.0,6.0,6626.121440,26.910622,0.0
4,2015-05-22,2015-01-14,2.0,2.0,2042.792376,4.068259,1.0
5,2016-11-06,2016-05-11,2.0,1.0,1598.766533,50.332959,1.0
6,2017-04-08,2016-12-26,2.0,3.0,1930.417254,2.140238,0.0
7,2017-11-09,2017-07-04,1.0,3.0,1606.767406,2.295865,0.0
8,2017-04-10,2016-12-26,3.0,1.0,2821.169249,0.543219,0.0
9,2015-09-11,2015-05-09,2.0,3.0,1637.479501,23.411825,1.0
