In [11]:
import json
from datetime import datetime, timedelta

In [24]:
def strip_int_column(dataf, columns):
    """Add a ``<column>_clean`` integer column for each requested text column.

    Every character that is not an ASCII digit is removed from a value and the
    remaining digits are parsed as an int. Values without any digit, and
    missing values (``None`` / ``NaN``), become 0.

    Parameters
    ----------
    dataf : pandas.DataFrame
        Frame holding the raw columns. It is modified in place.
    columns : iterable of str
        Names of the columns to clean.

    Returns
    -------
    pandas.DataFrame
        The same ``dataf`` object, extended with the ``*_clean`` columns.
    """
    for c in columns:
        cleaned = []
        for raw in dataf[c].values:
            # Missing entries show up as None or NaN (NaN is the only value
            # that is not equal to itself); re.sub would raise TypeError on
            # them, so map them to the same 0 used for "no digits found".
            if raw is None or raw != raw:
                cleaned.append(0)
                continue
            digits = re.sub('[^0-9]', '', raw)
            cleaned.append(int(digits) if digits else 0)
        dataf[f'{c}_clean'] = cleaned
    return dataf

def prepare_housing_dict(dicty):
    """Turn the raw housing dict into a cleaned frame of available houses.

    Parameters
    ----------
    dicty : dict
        Mapping whose values are dicts of house fields (one entry per house).
        The fields ``location``, ``price``, ``meters`` and ``sold`` are read.

    Returns
    -------
    pandas.DataFrame
        The rows whose ``sold`` field equals ``'available'`` and for which a
        city could be extracted, with the derived columns ``city``,
        ``price_clean``, ``meters_clean`` and ``meter_price`` added.
    """
    # merge dict into pandas frame: one row per key of `dicty`
    housesdf = pd.DataFrame.from_dict(dicty, orient='index')

    # the city is the second comma-separated part of the location;
    # rows without such a part get a sentinel value instead
    no_city = 'noaddress'
    housesdf['city'] = housesdf['location'].str.split(',').str[1].fillna(no_city)

    # convert given columns to ints
    housesdf = strip_int_column(housesdf, ['price', 'meters'])

    # divide extremely high prices by 1000 to correct for a dtype error in source
    too_high = housesdf['price_clean'] > 99999999
    if too_high.any():
        # Cast first: the division yields fractional values, and writing those
        # into an integer column relies on implicit upcasting.
        housesdf['price_clean'] = housesdf['price_clean'].astype(float)
        housesdf.loc[too_high, 'price_clean'] = housesdf.loc[too_high, 'price_clean'] / 1000

    # calculate meter price on row level and replace failed calculations with 0:
    # x/0 gives +/-inf and 0/0 gives NaN, so both have to be handled
    housesdf['meter_price'] = housesdf['price_clean'] / housesdf['meters_clean']
    housesdf['meter_price'] = housesdf['meter_price'].replace([np.inf, -np.inf], 0).fillna(0)

    # filter on available houses only & houses with at least the field city filled
    is_available = housesdf['sold'] == 'available'
    has_city = housesdf['city'] != no_city
    # NOTE(review): 'noadress' (sic) is kept from the original filter; presumably
    # it is the source's own marker for a missing address -- confirm.
    has_location = housesdf['location'] != 'noadress'

    return housesdf.loc[is_available & has_city & has_location]

In [39]:
def housing_fact_dict(housesdf, area):
    """Summarise price and size statistics of the given houses for one area.

    Parameters
    ----------
    housesdf : pandas.DataFrame
        Frame with the columns ``city``, ``price_clean``, ``meters_clean`` and
        ``meter_price``.
    area : str
        Label used as the single key of the returned dict.

    Returns
    -------
    dict
        ``{area: {...}}`` holding the number of rows with a non-null city plus
        the mean and median (rounded to 2 decimals) of price, meters and meter
        price. Only plain ``int``/``float`` values are used so the result can
        be passed to ``json.dump``. Statistics are NaN when no row has a
        positive value for that column.
    """

    def _positive_stats(column):
        # Only strictly positive values take part in the statistics.
        positive = housesdf.loc[housesdf[column] > 0, column]
        return float(round(positive.mean(), 2)), float(round(positive.median(), 2))

    # Series.count() yields a numpy integer, which the json module refuses to
    # serialise; convert it to a built-in int.
    n_houses = int(housesdf['city'].count())
    average_price, median_price = _positive_stats('price_clean')
    average_meters, median_meters = _positive_stats('meters_clean')
    average_meter_price, median_meter_price = _positive_stats('meter_price')

    # create dict
    houses_dict = {
        area: {
            'number_available': n_houses,
            'price_mean': average_price,
            'price_median': median_price,
            'meters_mean': average_meters,
            'meters_median': median_meters,
            'meter_price_mean': average_meter_price,
            'meter_price_median': median_meter_price,
        }
    }

    return houses_dict

In [40]:
def build_full_dict(housesdf, cities=('Amsterdam', 'Rotterdam', 'Utrecht', 'Hilversum', 'Amstelveen')):
    """Build the fact dict for the whole frame plus one entry per city.

    Parameters
    ----------
    housesdf : pandas.DataFrame
        Prepared houses frame; its ``city`` column is used to select rows.
    cities : iterable of str, optional
        City names to report on. A row belongs to a city when its ``city``
        value contains the name as a literal substring. Defaults to the five
        cities that were previously hard-coded.

    Returns
    -------
    dict
        The ``'The Netherlands'`` entry (all rows) followed by one entry per
        requested city, each as produced by ``housing_fact_dict``.
    """
    # run totals
    overall_dict = housing_fact_dict(housesdf, area='The Netherlands')

    cities_dicts = {}
    # run on cities
    for city in cities:
        # literal substring match; rows with a missing city never match
        in_city = housesdf['city'].str.contains(city, regex=False, na=False)
        cities_dicts.update(housing_fact_dict(housesdf.loc[in_city], area=city))

    # merge dicts
    return {**overall_dict, **cities_dicts}

In [46]:
# Parse the housing data file into a plain dict.
with open('../data/housing_data_2020_03_28.json', 'r') as s:
    housesdict = json.loads(s.read())

# Clean the raw entries, then compute the overall and per-city facts.
housesdf = prepare_housing_dict(housesdict)
full_dict = build_full_dict(housesdf)

# Wrap the facts together with the creation date (YYYY-MM-DD) and a version.
today = format(datetime.now(), '%Y-%m-%d')
output_dict = dict(creation_date=today, version=0.1, housing_facts=full_dict)

In [6]:
# Current local date as an underscore-separated string (YYYY_MM_DD).
today = f"{datetime.now():%Y_%m_%d}"

In [15]:
# Local date of one day ago, formatted as YYYY_MM_DD.
yesterday = (datetime.now() - timedelta(days=1)).strftime('%Y_%m_%d')

In [7]:
# Hard-coded housing facts per area, used below to assemble example output
# dicts without loading the data file.
tt = {
    'The Netherlands': {
        'number_available': 51641,
        'price_mean': 448247.66,
        'price_median': 349000.0,
        'meters_mean': 143.42,
        'meters_median': 125.0,
        'meter_price_mean': 3684.8,
        'meter_price_median': 2733.58,
    },
    'Amsterdam': {
        'number_available': 1661,
        'price_mean': 733219.22,
        'price_median': 525000.0,
        'meters_mean': 140.34,
        'meters_median': 122.0,
        'meter_price_mean': 6059.79,
        'meter_price_median': 4306.57,
    },
    'Rotterdam': {
        'number_available': 1056,
        'price_mean': 464835.87,
        'price_median': 324750.0,
        'meters_mean': 135.92,
        'meters_median': 118.0,
        'meter_price_mean': 3845.83,
        'meter_price_median': 2840.91,
    },
    'Utrecht': {
        'number_available': 489,
        'price_mean': 483461.98,
        'price_median': 400000.0,
        'meters_mean': 135.1,
        'meters_median': 124.0,
        'meter_price_mean': 4118.69,
        'meter_price_median': 3339.2,
    },
    'Hilversum': {
        'number_available': 306,
        'price_mean': 693917.06,
        'price_median': 585000.0,
        'meters_mean': 144.29,
        'meters_median': 127.0,
        'meter_price_mean': 6020.18,
        'meter_price_median': 4324.92,
    },
    'Amstelveen': {
        'number_available': 254,
        'price_mean': 745368.8,
        'price_median': 622500.0,
        'meters_mean': 144.0,
        'meters_median': 126.0,
        'meter_price_mean': 6283.83,
        'meter_price_median': 4791.67,
    },
}

In [32]:
# Entry keyed by today's date string, holding the version and the facts.
output_dict = {today: dict(version=0.1, housing_facts=tt)}

In [33]:
# Entry keyed by yesterday's date string, holding the version and the facts.
output_dict2 = {yesterday: dict(version=0.1, housing_facts=tt)}

In [35]:
# Combine both entries into one mapping; if a key occurs in both, the value
# from output_dict replaces the one from output_dict2.
fulldict = dict(output_dict2)
fulldict.update(output_dict)

In [21]:
# Display the dict for visual inspection (last expression of the cell).
output_dict2

{'creation_date': '2020_03_31',
 'version': 0.1,
 'housing_facts': {'The Netherlands': {'number_available': 51641,
   'price_mean': 448247.66,
   'price_median': 349000.0,
   'meters_mean': 143.42,
   'meters_median': 125.0,
   'meter_price_mean': 3684.8,
   'meter_price_median': 2733.58},
  'Amsterdam': {'number_available': 1661,
   'price_mean': 733219.22,
   'price_median': 525000.0,
   'meters_mean': 140.34,
   'meters_median': 122.0,
   'meter_price_mean': 6059.79,
   'meter_price_median': 4306.57},
  'Rotterdam': {'number_available': 1056,
   'price_mean': 464835.87,
   'price_median': 324750.0,
   'meters_mean': 135.92,
   'meters_median': 118.0,
   'meter_price_mean': 3845.83,
   'meter_price_median': 2840.91},
  'Utrecht': {'number_available': 489,
   'price_mean': 483461.98,
   'price_median': 400000.0,
   'meters_mean': 135.1,
   'meters_median': 124.0,
   'meter_price_mean': 4118.69,
   'meter_price_median': 3339.2},
  'Hilversum': {'number_available': 306,
   'price_mean':

In [18]:
# Display the dict for inspection. A dict has no `append` attribute, so the
# former `output_dict2.append` raised AttributeError and stopped "Run All".
output_dict2

{'creation_date': '2020_03_30',
 'version': 0.1,
 'housing_facts': {'The Netherlands': {'number_available': 51641,
   'price_mean': 448247.66,
   'price_median': 349000.0,
   'meters_mean': 143.42,
   'meters_median': 125.0,
   'meter_price_mean': 3684.8,
   'meter_price_median': 2733.58},
  'Amsterdam': {'number_available': 1661,
   'price_mean': 733219.22,
   'price_median': 525000.0,
   'meters_mean': 140.34,
   'meters_median': 122.0,
   'meter_price_mean': 6059.79,
   'meter_price_median': 4306.57},
  'Rotterdam': {'number_available': 1056,
   'price_mean': 464835.87,
   'price_median': 324750.0,
   'meters_mean': 135.92,
   'meters_median': 118.0,
   'meter_price_mean': 3845.83,
   'meter_price_median': 2840.91},
  'Utrecht': {'number_available': 489,
   'price_mean': 483461.98,
   'price_median': 400000.0,
   'meters_mean': 135.1,
   'meters_median': 124.0,
   'meter_price_mean': 4118.69,
   'meter_price_median': 3339.2},
  'Hilversum': {'number_available': 306,
   'price_mean':

In [59]:
# Write output_dict to test.json as JSON indented by two spaces.
with open('test.json', 'w') as teststream:
    json.dump(obj=output_dict, fp=teststream, indent=2)