In [1]:
import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

In [2]:
# Simulation configuration: everything a reader might want to tune lives here.
DATA_START = pd.to_datetime('2015-01-01')   # earliest possible listing date
DATASET_END = pd.to_datetime('2018-01-01')  # observation cut-off for the dataset
NUM_SAMPLES = int(1e5)                      # number of simulated listings
MIN_APT_SIZE = 600                          # constant added to every simulated sqft value

# Seed the global NumPy RNG so that Restart & Run All reproduces the same dataset.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

In [3]:
# Draw one listing date per sample, uniformly over [DATA_START, DATASET_END).
# The number of days is derived from the configured bounds instead of a hard-coded
# 365*3, so leap days and any change to the config constants are honoured.
listing_dates = DATA_START + pd.to_timedelta(
    np.random.randint(0, (DATASET_END - DATA_START).days, size=NUM_SAMPLES), unit='D')

In [4]:
# Latent per-house draws (the order of the random calls is kept as-is).
house_qualities = np.random.exponential(size=NUM_SAMPLES)
years_since_remodel = np.random.exponential(scale=10.0, size=NUM_SAMPLES)
size_bonus = 100 * np.random.rand(NUM_SAMPLES)
# Price rises with quality, falls with years since remodel, and is floored at 1000.
listing_price = np.maximum(400000 * house_qualities - 1000 * years_since_remodel, 1000)

In [5]:
# Square footage: quality-driven size plus a random bonus on top of the MIN_APT_SIZE constant.
sqft = 1000 * house_qualities + size_bonus + MIN_APT_SIZE

In [6]:
bedroom_bonus = 1000 * np.random.rand(NUM_SAMPLES)
# Bedroom count grows sub-linearly (power 0.4) with quality-weighted size.
bedrooms = np.floor(((sqft + bedroom_bonus) * house_qualities // 300) ** .4)
# If more bedrooms share the same space, fewer bathrooms can fit
# (a hypothetical relationship, added to make the data more interesting).
bathroom_bonus = 1000 * np.random.rand(NUM_SAMPLES) * house_qualities - 1000 * bedrooms
# The inner value is clipped at 0 and shifted by 1 before the power is taken.
bathrooms = np.floor((np.clip((sqft + bathroom_bonus) // 10, 0, None) + 1) ** .3)

In [7]:
# Inspection: sanity-check how the simulated features relate to each other.

In [8]:
# Pearson correlation between bedroom count and listing price (off-diagonal entry).
print('bedroom listing_price correlation,', np.corrcoef(bedrooms, listing_price)[0][1])

bedroom listing_price correlation, 0.9600408459038612


In [9]:
# Pearson correlation between bathroom count and listing price (off-diagonal entry).
print('bathrooms listing_price correlation,', np.corrcoef(bathrooms, listing_price)[0][1])

bathrooms listing_price correlation, 0.30991081995284797


In [10]:
# Correlation between the two room counts, then min / mean / max of each.
print('bedroom bathroom correlation,', np.corrcoef(bedrooms, bathrooms)[0,1])
print('bedroom min mean max', bedrooms.min(), bedrooms.mean(), bedrooms.max())
# Label typo fixed ("miin" -> "min").
print('bathroom min mean max', bathrooms.min(), bathrooms.mean(), bathrooms.max())

bedroom bathroom correlation, 0.12783694505764476
bedroom min mean max 0.0 1.50921 12.0
bathroom miin mean max 1.0 2.67764 7.0


In [11]:
# Preview: whole days between DATA_START and each listing date, as floats.
np.asarray((listing_dates - DATA_START).days, dtype=float)

array([1011., 1084., 1007., ...,  682.,  733.,  885.])

In [12]:
# Days elapsed since DATA_START for each listing, stored as a float array.
time_bonus = np.asarray((listing_dates - DATA_START).days, dtype=float)

In [13]:
# Flag listings with more bathrooms than bedrooms, and record the signed difference.
bath_bed_ind = np.greater(bathrooms, bedrooms)
bath_bed_penalty = np.subtract(bathrooms, bedrooms)

In [14]:
# Binary broker indicator: each listing independently gets 0 or 1.
broker = np.random.randint(low=0, high=2, size=NUM_SAMPLES)

In [15]:
# Build `sell_inv`, an inverse-duration score: the final line divides 3000 by it,
# so a larger score means a shorter simulated time on the market.
# Most terms are divided by their own standard deviation before being added.
# Start with a bedrooms x bathrooms interaction term plus the bedrooms term.
sell_inv = bedrooms * bathrooms/bedrooms.std()/bathrooms.std() + bedrooms/bedrooms.std()
# Listings with broker == 1 get a flat +1.
sell_inv += broker
# Later listing dates (more days since DATA_START) raise the score.
sell_inv += time_bonus/time_bonus.std()
sell_inv += bathrooms/bathrooms.std() + sqft/sqft.std()
sell_inv += house_qualities*2
sell_inv += size_bonus/size_bonus.std()
# More years since the last remodel lower the score.
sell_inv -= years_since_remodel/years_since_remodel.std()
# Uniform noise in [0, 1) plus a constant offset of 10.
sell_inv += np.random.rand(NUM_SAMPLES) + 10
# Where bathrooms exceed bedrooms, subtract half of the surplus from the score.
sell_inv[bath_bed_ind] = sell_inv[bath_bed_ind] - bath_bed_penalty[bath_bed_ind]/2
# Convert the score into a whole number of days (used with unit='D' further down).
sales_duration = np.floor(1/sell_inv * 3000).astype(int)

In [16]:
# Summary of simulated durations: (min, mean, std, max).
(np.min(sales_duration), np.mean(sales_duration), np.std(sales_duration), np.max(sales_duration))

(22, 139.41186, 40.31076173108615, 463)

In [17]:
# Display the simulated sales durations (NumPy abbreviates the repr of a large array).
sales_duration

array([ 91, 138, 184, ..., 133, 188,  94])

In [18]:
def plt_clipped(a, percentile=99, bins=100):
    """Plot a histogram of ``a`` with its upper tail clamped.

    Values above the ``percentile``-th percentile are clamped to that
    percentile (not dropped), so every sample is still counted and the
    outliers pile up in the right-most bin.

    Parameters
    ----------
    a : array-like
        Values to plot.
    percentile : float, optional
        Percentile (0-100) used as the upper clamp. Defaults to 99.
    bins : int, optional
        Number of histogram bins passed to ``plt.hist``. Defaults to 100.

    Returns
    -------
    None
        The histogram is drawn on the current matplotlib axes.
    """
    clip_max = np.percentile(a, percentile)
    # Only the upper side needs clamping: the lower bound used to be a.min(),
    # which never changes any value. np.minimum also accepts plain sequences.
    clipped = np.minimum(a, clip_max)
    plt.hist(clipped, bins=bins)

In [19]:
# Sale date = listing date shifted forward by the simulated number of days on market.
sales_date = listing_dates + pd.to_timedelta(arg=sales_duration, unit='D')

In [20]:
# Assemble the dataset; the two date columns are deliberately listed first.
final_df = pd.DataFrame(data={
    'SalesDate': sales_date,
    'ListingDate': listing_dates,
    'bedrooms': bedrooms,
    'bathrooms': bathrooms,
    'sqft': sqft,
    'years_since_remodel': years_since_remodel,
    'broker': broker,
})

In [21]:
# Sales that would close after DATASET_END have not been observed yet: blank them out.
final_df['SalesDate'] = final_df['SalesDate'].mask(final_df['SalesDate'] > DATASET_END)

In [22]:
# Inject missing values into a random sixth of the rows (sampled without replacement).
cnt = 0
for cnt, ridx in enumerate(np.random.choice(len(final_df), len(final_df) // 6, replace=False), start=1):
    # Progress report: rows handled so far and the fraction of the sample completed.
    if not cnt % 10000:
        print(cnt, cnt/len(final_df)*6)
    if not final_df.loc[ridx, 'broker']:
        # broker == 0: blank one randomly chosen column from position 2 onwards.
        feature = np.random.randint(2, len(final_df.columns))
        final_df.iloc[ridx, feature] = np.nan
    elif np.random.rand() > .5:
        # broker == 1, first variant: bump sqft and hide bedrooms.
        final_df.loc[ridx, 'sqft'] += 100
        final_df.loc[ridx, 'bedrooms'] = np.nan
    else:
        # broker == 1, second variant: bump bedrooms and hide sqft.
        final_df.loc[ridx, 'bedrooms'] += 1
        final_df.loc[ridx, 'sqft'] = np.nan
    # The broker flag itself is always hidden for a selected row.
    final_df.loc[ridx, 'broker'] = np.nan


10000 0.6000000000000001


In [23]:
# Persist the simulated listings, without the row index.
final_df.to_csv(path_or_buf='houselistings_simulated.csv', index=False)

In [24]:
# Count of missing values per column.
final_df.isnull().sum(axis=0)

ListingDate                0
SalesDate              11643
bathrooms               1701
bedrooms                5795
broker                 16666
sqft                    5827
years_since_remodel     1663
dtype: int64

In [25]:
# Show only the first rows instead of dumping the whole frame.
final_df.head(10)

Unnamed: 0,ListingDate,SalesDate,bathrooms,bedrooms,broker,sqft,years_since_remodel
0,2017-10-08,NaT,4.0,2.0,0.0,2263.200297,3.189165
1,2017-12-20,NaT,2.0,,,1159.398712,13.919031
2,2017-10-04,NaT,3.0,0.0,0.0,771.294248,2.384003
3,2015-02-19,2015-08-06,1.0,1.0,1.0,1012.003459,2.515936
4,2017-09-22,NaT,3.0,2.0,1.0,1732.544302,4.220575
5,2015-05-22,2015-09-17,3.0,,,1978.633589,3.165217
6,2015-08-26,2016-01-26,3.0,1.0,0.0,1456.249476,3.764364
7,2017-06-10,2017-10-01,2.0,2.0,1.0,2087.071320,1.497180
8,2015-09-22,2016-01-30,2.0,2.0,0.0,2115.377784,7.116858
9,2017-07-12,2017-11-15,1.0,2.0,1.0,1574.613888,9.179006
