# Format data for housing project
1. [Format FIPs, county name and date](#1.-Format-FIPs,-county-name-and-date)
2. [Select categories (median_listing_price, median_days_on_market, new_listing_count, active_listing_count)](#2.-Select-categories)
3. Filter out counties that don't have data for every month
4. Filter out counties with fewer than a certain number of active listings
5. Format data for the latest month
6. [Format metadata](#6.-Format-metadata)
7. [Format timeseries data and save to file for each fips code](#7.-Format-timeseries-data-and-save-to-file-for-each-fips-code)
8. Format national data

In [1]:
import pandas as pd
import json
import warnings

# NOTE(review): this silences every warning for the rest of the notebook,
# including pandas warnings about chained assignment and mask reindexing.
warnings.filterwarnings(action='ignore')

In [2]:
# Load the county-level inventory history. The cells below read its
# month_date_yyyymm and county_fips columns.
df = pd.read_csv('data/RDC_Inventory_Core_Metrics_County_History.csv')

## Common functions

In [3]:
def format_date(x):
    """Convert a ``yyyymm`` value into a ``yyyy-mm-01`` date string.

    The value is stringified first, so an int such as ``202401`` is accepted
    as well as a string.
    """
    digits = str(x)
    year, month = digits[:4], digits[4:6]
    return f"{year}-{month}-01"

def format_fips(x):
    """Return ``x`` as text, left-padded with zeros to at least five characters."""
    return f"{x!s}".zfill(5)

# Columns kept from the raw inventory files (used for both the county and
# the national data).
selected_categories = [
    'month_date',
    'quality_flag',
    'median_listing_price',
    'median_days_on_market',
    'new_listing_count',
    'active_listing_count',
]


## 1. Format FIPs, county name and date

#### Format date

In [4]:
# Derive an ISO-style 'yyyy-mm-01' string from the yyyymm column.
df['month_date'] = df['month_date_yyyymm'].map(format_date)


#### Format FIPs

In [5]:
# Zero-pad the FIPS codes to five characters so they can be used as string keys.
df['county_fips'] = df['county_fips'].map(format_fips)

In [6]:
# Load the FIPS -> county-name lookup and index it by the zero-padded FIPS code.
df_meta = pd.read_csv('data/county_fips.csv')
df_meta.columns = ['county_fips', 'county_name']
# Use the shared helper (rather than an inline lambda) so the lookup keys are
# padded by exactly the same code as the inventory data's county_fips column.
df_meta['county_fips'] = df_meta['county_fips'].apply(format_fips)
df_meta = df_meta.set_index('county_fips')

#### Merge formatted datasets

In [7]:
# Join the inventory rows to the county-name lookup (pandas' default inner join).
# 'county_fips' is a column in df and the index name in df_meta. Columns of df
# that clash with df_meta get an '_old' suffix, df_meta's names are kept as-is.
df_merge = df.merge(df_meta, on='county_fips', suffixes=('_old', ''))

## 2. Select categories
- median_listing_price
- median_days_on_market
- new_listing_count
- active_listing_count

In [8]:
# Keep the selected metric columns plus the two county identifiers.
df_selected = df_merge[[*selected_categories, 'county_name', 'county_fips']]


## 3. Filter out counties that don't have data for every month

In [9]:
# Keep rows from January 2018 onward. month_date is a 'yyyy-mm-01' string, so
# plain string comparison orders it chronologically. There is no upper bound,
# so despite the name the window is not limited to six years.
df_six_years = df_selected.loc[df_selected['month_date'] >= '2018-01-01']

In [10]:
# Count the distinct months available per county, largest counts first.
df_totals = (
    df_six_years.groupby('county_fips')['month_date']
    .nunique()
    .sort_values(ascending=False)
    .to_frame()
)

In [11]:
# Largest number of distinct months observed for any single county; used
# below as the bar a county must meet to count as having complete data.
unique_month_count = df_totals['month_date'].max()
unique_month_count

np.int64(88)

In [12]:
# Counties whose distinct-month count equals the maximum, i.e. no month is missing.
df_complete = df_totals.loc[df_totals['month_date'].eq(unique_month_count)]

In [13]:
# Display the counties that have the maximum month count.
df_complete

Unnamed: 0_level_0,month_date
county_fips,Unnamed: 1_level_1
01001,88
39073,88
39055,88
39057,88
39059,88
...,...
22005,88
22003,88
22001,88
21237,88


In [14]:
# Restrict the 2018+ rows to the counties listed in df_complete (its index
# holds the county_fips codes).
df_selected_counties = df_six_years.loc[df_six_years['county_fips'].isin(df_complete.index)]

## 4. Filter out counties with fewer than a certain number of active listings

In [15]:
# Keep only the county-month rows with more than 10 active listings.
# One comparison is sufficient: NaN > 10 evaluates to False, so rows with a
# missing active_listing_count are excluded by the same mask. Building the
# mask from the frame being indexed also keeps mask and frame aligned.
df_non_zero_listings = df_selected_counties[df_selected_counties['active_listing_count'] > 10]

In [16]:
# Per county, count the distinct months that passed the active-listing filter.
df_listing_totals = (
    df_non_zero_listings.groupby('county_fips')['month_date']
    .nunique()
    .to_frame()
)

In [17]:
# Keep every 2018+ row for the counties that appear in df_listing_totals.
# NOTE(review): a county is kept as soon as one of its months passed the
# active-listing filter (it only has to appear in df_listing_totals' index).
# Confirm that is the intended rule rather than "every month must pass".
df_filtered_counties = df_six_years.loc[df_six_years['county_fips'].isin(df_listing_totals.index)]

## 5. Format data for the latest month

In [33]:
# Load the county population estimates. encoding_errors='ignore' drops bytes
# that cannot be decoded instead of raising, so affected names may lose
# characters silently.
df_population = pd.read_csv('data/co-est2024-formatted.csv', encoding_errors='ignore')

In [35]:
# Zero-pad the population table's FIPS codes so they match county_fips.
df_population['FIPS'] = df_population['FIPS'].map(format_fips)

In [36]:
# Display the population table for inspection.
df_population

Unnamed: 0,STNAME,CTYNAME,POPESTIMATE2024,FIPS
0,Alabama,Alabama,5157699,01000
1,Alabama,Autauga County,61464,01001
2,Alabama,Baldwin County,261608,01003
3,Alabama,Barbour County,24358,01005
4,Alabama,Bibb County,22258,01007
...,...,...,...,...
3190,Wyoming,Sweetwater County,41273,56037
3191,Wyoming,Teton County,23272,56039
3192,Wyoming,Uinta County,20621,56041
3193,Wyoming,Washakie County,7662,56043


In [37]:
# Most recent month in the filtered data. month_date holds 'yyyy-mm-01'
# strings, so the string maximum is also the latest date.
latest_month = df_filtered_counties['month_date'].max()

In [38]:
# Same month one year earlier: decrement the four-digit year and keep the
# '-mm-01' remainder of the string unchanged.
last_year = f"{int(latest_month[:4]) - 1}{latest_month[4:]}"

In [39]:
# Rows for the two months being compared: the latest month and the same
# month one year earlier.
df_latest_month = df_filtered_counties[df_filtered_counties['month_date'].isin([latest_month, last_year])]


In [47]:
# Reshape to one row per county with (metric, month) MultiIndex columns for
# the two months held in df_latest_month.
df_pivot_yoy = df_latest_month.pivot(index='county_fips', columns='month_date', values=['median_listing_price', 'active_listing_count'])

# Year-over-year change expressed as a fraction of last year's value:
# (latest - last_year) / last_year.
df_pivot_yoy['median_listing_price_yoy'] = (df_pivot_yoy[('median_listing_price', latest_month)] - df_pivot_yoy[('median_listing_price', last_year)]) / df_pivot_yoy[('median_listing_price', last_year)]
df_pivot_yoy['active_listing_yoy'] = (df_pivot_yoy[('active_listing_count', latest_month)] - df_pivot_yoy[('active_listing_count', last_year)]) / df_pivot_yoy[('active_listing_count', last_year)]
# Drop the prior-year values so each metric keeps only its latest-month column.
df_pivot_yoy = df_pivot_yoy.drop([('median_listing_price', last_year), ('active_listing_count', last_year)], axis=1)
# Turn county_fips back into a regular column before flattening the header.
df_pivot_yoy = df_pivot_yoy.reset_index()
# Discard the month level of the column MultiIndex, leaving plain metric names.
df_pivot_yoy.columns = df_pivot_yoy.columns.droplevel(-1)

# Attach the county name from df_meta and the 2024 population estimate.
df_pivot_yoy = df_pivot_yoy.merge(df_meta, how='left', on='county_fips')
df_pivot_yoy = df_pivot_yoy.merge(df_population[['FIPS', 'POPESTIMATE2024']], how='left', left_on='county_fips', right_on='FIPS')
# FIPS was only needed as the join key; county_fips carries the same code.
df_pivot_yoy = df_pivot_yoy.drop('FIPS', axis=1)
df_pivot_yoy = df_pivot_yoy.rename({'POPESTIMATE2024': 'population_2024'}, axis=1)

In [57]:
# Write the latest-month table as a JSON array with one object per row.
df_pivot_yoy.to_json('../public/data/latest.json', orient='records')

## 6. Format metadata
Save to `../src/assets/fips.json` as a list of `{'county_name': '', 'county_fips': ''}` records

In [22]:
# One row per county: keeps the first row seen for each county_fips.
df_unique_counties = df_filtered_counties.drop_duplicates(subset='county_fips')

In [24]:
# Write the county name / FIPS pairs as a JSON array with one object per county.
df_unique_counties[['county_name', 'county_fips']].to_json('../src/assets/fips.json', orient='records')

## 7. Format timeseries data and save to file for each fips code
Save to `../public/data/counties/[FIPS].json`

In [23]:
def format_moving_average(df):
    """Return ``df`` ordered by ``month_date`` with 12-row rolling means added.

    Two columns are appended, ``median_listing_price_rolling`` and
    ``active_listing_count_rolling``, each the mean of the current row and
    the 11 rows before it in ``month_date`` order. Rows without a full
    12-row window get NaN. The frame passed in is left unmodified because
    sorting produces a new frame.
    """
    ordered = df.sort_values('month_date')
    for source_column in ('median_listing_price', 'active_listing_count'):
        ordered[f'{source_column}_rolling'] = ordered[source_column].rolling(window=12).mean()
    return ordered

In [28]:
# Write one JSON file per county. Each metric is stored as a list of
# [month_date, value] pairs ordered by month.
for county_fips in df_unique_counties['county_fips'].tolist():
    # Take an explicit copy so the month_date assignment below modifies this
    # county's frame only, never a slice of df_filtered_counties.
    df_selected_fips = df_filtered_counties[df_filtered_counties['county_fips'] == county_fips].copy()
    df_selected_fips['month_date'] = df_selected_fips['month_date'].apply(str)
    df_selected_fips = format_moving_average(df_selected_fips)
    obj = {
        # .iloc is positional, so this is the first row in month order
        # regardless of the index labels left over from filtering.
        'county_name': df_selected_fips['county_name'].iloc[0],
        # Round-trip through DataFrame.to_json so missing values (e.g. the
        # first rows of the rolling columns) are written as null, not NaN.
        **{
            column: json.loads(df_selected_fips[['month_date', column]].to_json(orient='values'))
            for column in (
                'median_listing_price',
                'median_listing_price_rolling',
                'active_listing_count_rolling',
                'active_listing_count',
                'median_days_on_market',
            )
        },
    }
    with open(f'../public/data/counties/{county_fips}.json', 'w') as outfile:
        json.dump(obj, outfile)

## 8. Format national data

In [56]:
# Load the country-level inventory history. The next cell reads its country
# and month_date_yyyymm columns.
df_national = pd.read_csv('data/RDC_Inventory_Core_Metrics_Country_History.csv')

In [57]:
# Keep the 'United States' rows, derive month_date, trim to the shared
# columns and add the rolling-average columns.
# .copy() makes the filtered rows an independent frame, so adding the
# month_date column does not write into a slice of the frame read from disk.
df_national = df_national[df_national['country'] == 'United States'].copy()
df_national['month_date'] = df_national['month_date_yyyymm'].apply(format_date)
df_national = df_national[selected_categories]
df_national = format_moving_average(df_national)

In [59]:
# Build the national payload: each metric maps to a list of
# [month_date, value] pairs, produced via DataFrame.to_json and parsed back
# into Python objects before being written out as a single file.
obj = {
    column: json.loads(df_national[['month_date', column]].to_json(orient='values'))
    for column in (
        'median_listing_price',
        'median_listing_price_rolling',
        'active_listing_count_rolling',
        'active_listing_count',
        'median_days_on_market',
    )
}
with open('../public/data/national.json', 'w') as outfile:
    json.dump(obj, outfile)