# Format data for housing project
1. Format FIPs, county name and date
2. Select categories (median_listing_price, median_days_on_market, new_listing_count, active_listing_count)
3. Filter out counties that don't have data for every month
4. Filter out counties with less than a certain amount of active listings
5. Format data for the latest month

In [35]:
import pandas as pd
import json
import warnings

warnings.filterwarnings(action='once')

In [36]:
df = pd.read_csv('data/RDC_Inventory_Core_Metrics_County_History.csv')

## 1. Format FIPs, county name and date

#### Format date

In [37]:
df['month_date'] = df['month_date_yyyymm'].apply(lambda x: str(x)[0:4] + '-' + str(x)[4:6] + '-01')

#### Format FIPs

In [38]:
df['county_fips'] = df['county_fips'].apply(lambda x: str(x).zfill(5))

In [39]:
df_meta = pd.read_csv('data/county_fips.csv')
df_meta.columns = ['county_fips', 'county_name']
df_meta['county_fips'] = df_meta['county_fips'].apply(lambda x: str(x).zfill(5))
df_meta = df_meta.set_index('county_fips')

#### Merge formatted datasets

In [15]:
df_merge = pd.merge(df, df_meta, on='county_fips', suffixes=('_old', ''))

## 2. Select categories
- median_listing_price
- median_days_on_market
- new_listing_count
- active_listing_count)

In [16]:
df_selected = df_merge[['county_name', 'month_date', 'quality_flag', 'county_fips', 'median_listing_price', 'median_days_on_market', 'new_listing_count', 'active_listing_count']]


## 3. Filter out counties that don't have data for every month

In [17]:
df_six_years = df_selected[df_selected['month_date'] >= '2018-01-01']

In [18]:
df_totals = pd.DataFrame(df_six_years.groupby('county_fips')['month_date'].nunique().sort_values(ascending=False))

In [19]:
unique_month_count = df_totals['month_date'].max()

In [20]:
unique_month_count

np.int64(87)

In [21]:
df_complete = df_totals[df_totals['month_date'] == unique_month_count]

In [22]:
df_complete

Unnamed: 0_level_0,month_date
county_fips,Unnamed: 1_level_1
01001,87
39073,87
39055,87
39057,87
39059,87
...,...
21135,87
21109,87
19053,87
21105,87


In [23]:
df_selected_counties = df_six_years[df_six_years['county_fips'].isin(df_complete.index.tolist())]

To-do: check every column for a value and only show counties with all data

## 4. Filter out counties with less than a certain amount of active listings

In [24]:
df_non_zero_listings = df_selected_counties[df_selected_counties['active_listing_count'] > 10]
df_non_zero_listings = df_non_zero_listings[~df_selected_counties['active_listing_count'].isna()]

  df_non_zero_listings = df_non_zero_listings[~df_selected_counties['active_listing_count'].isna()]


In [25]:
df_listing_totals = pd.DataFrame(df_non_zero_listings.groupby('county_fips')['month_date'].nunique())

In [26]:
df_filtered_counties = df_six_years[df_six_years['county_fips'].isin(df_listing_totals.index)]

## 5. Format data for the latest month

In [27]:
latest_month = df_filtered_counties['month_date'].max()

In [28]:
last_year = str(int(latest_month[0:4]) - 1) + latest_month[4:]

In [29]:
df_latest_month = df_filtered_counties[(df_filtered_counties['month_date'] == latest_month) | (df_filtered_counties['month_date'] == last_year)]


In [30]:
df_pivot_yoy = df_latest_month.pivot(index='county_fips', columns='month_date', values=['median_listing_price', 'active_listing_count'])

df_pivot_yoy['median_listing_price_yoy'] = (df_pivot_yoy[('median_listing_price', latest_month)] - df_pivot_yoy[('median_listing_price', last_year)]) / df_pivot_yoy[('median_listing_price', last_year)]
df_pivot_yoy['active_listing_yoy'] = (df_pivot_yoy[('active_listing_count', latest_month)] - df_pivot_yoy[('active_listing_count', last_year)]) / df_pivot_yoy[('active_listing_count', last_year)]
df_pivot_yoy = df_pivot_yoy.drop([('median_listing_price', last_year), ('active_listing_count', last_year)], axis=1)
df_pivot_yoy = df_pivot_yoy.reset_index()
df_pivot_yoy.columns = df_pivot_yoy.columns.droplevel(-1)

df_pivot_yoy = df_pivot_yoy.merge(df_meta, how='left', on='county_fips')


In [31]:
df_pivot_yoy.to_json('../public/data/latest.json', orient='records')

## 6. Format meta data
Save to `../src/assets/fips.json` as `{'county_name': '', 'county_fips': ''}`

In [32]:
df_unique_counties = df_filtered_counties.drop_duplicates('county_fips')

In [33]:
df_unique_counties[['county_name', 'county_fips']].to_json('../src/assets/fips.json', orient='records')

## 7. Format timeseries data and save to file for each fips code
Save to `../public/data/counties/[FIPS].json`

In [52]:
def format_moving_average(df):
    df = df.sort_values('month_date')
    df['median_listing_price_rolling'] = df['median_listing_price'].rolling(window=12).mean()
    df['active_listing_count_rolling'] = df['active_listing_count'].rolling(window=12).mean()
    return df

In [None]:
for county_fips in df_unique_counties['county_fips'].tolist():
    df_selected_fips = df_filtered_counties[df_filtered_counties['county_fips'] == county_fips]
    # print(df_selected_fips)
    df_selected_fips['month_date'] = df_selected_fips['month_date'].apply(str)
    df_selected_fips = format_moving_average(df_selected_fips)
    obj = {
        # 'median_listing_price', 'median_days_on_market', 'new_listing_count', 'active_listing_count']
        'county_name': df_selected_fips.reset_index()['county_name'][0],
        'median_listing_price': json.loads(df_selected_fips[['month_date', 'median_listing_price']].to_json(orient='values')),
        'median_listing_price_rolling': json.loads(df_selected_fips[['month_date', 'median_listing_price_rolling']].to_json(orient='values')),
        'active_listing_count_rolling': json.loads(df_selected_fips[['month_date', 'active_listing_count_rolling']].to_json(orient='values')),
        'active_listing_count': json.loads(df_selected_fips[['month_date', 'active_listing_count']].to_json(orient='values')),
        'median_days_on_market': json.loads(df_selected_fips[['month_date', 'median_days_on_market']].to_json(orient='values'))
    }
    with open('../public/data/counties/%s.json' % county_fips, 'w') as outfile:
        json.dump(obj, outfile)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_fips['month_date'] = df_selected_fips['month_date'].apply(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_fips['month_date'] = df_selected_fips['month_date'].apply(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_fips['month_date'] = df_selected_fips['mont

## Format data for the latest month