# Format data for housing project
1. Format FIPs, county name and date
2. Select categories (median_listing_price, median_days_on_market, new_listing_count, active_listing_count)
3. Filter out counties that don't have data for every month
4. Filter out counties with less than a certain amount of active listings
5. Calculate 12-month moving average for median_listing_price and active_listing_count)

In [205]:
import pandas as pd
import json

In [206]:
df = pd.read_csv('data/RDC_Inventory_Core_Metrics_County_History.csv')

## 1. Format FIPs, county name and date

#### Format date

In [207]:
df['month_date'] = df['month_date_yyyymm'].apply(lambda x: str(x)[0:4] + '-' + str(x)[4:6] + '-01')

#### Format FIPs

In [208]:
df['county_fips'] = df['county_fips'].apply(lambda x: str(x).zfill(5))

In [209]:
df_meta = pd.read_csv('data/county_fips.csv')

In [210]:
df_meta['FP'] = df_meta['FP'].apply(lambda x: str(x).zfill(5))

In [211]:
df_meta = df_meta.set_index('FP')

#### Merge formatted datasets

In [212]:
df_merge = pd.merge(df, df_meta, left_on='county_fips', right_on='FP')

## 2. Select categories
- median_listing_price
- median_days_on_market
- new_listing_count
- active_listing_count)

In [213]:
df_selected = df_merge[['NAME', 'month_date', 'quality_flag', 'county_fips', 'median_listing_price', 'median_days_on_market', 'new_listing_count', 'active_listing_count']]

## 3. Filter out counties that don't have data for every month

In [214]:
df_six_years = df_selected[df_selected['month_date'] >= '2018-01-01']

In [215]:
df_totals = pd.DataFrame(df_six_years.groupby('county_fips')['month_date'].nunique().sort_values(ascending=False))

In [216]:
unique_month_count = df_totals['month_date'].max()

In [217]:
unique_month_count

np.int64(87)

In [218]:
df_complete = df_totals[df_totals['month_date'] == unique_month_count]

In [219]:
df_complete

Unnamed: 0_level_0,month_date
county_fips,Unnamed: 1_level_1
01001,87
39079,87
39061,87
39063,87
39065,87
...,...
21073,87
21099,87
21069,87
21075,87


In [220]:
df_selected_counties = df_six_years[df_six_years['county_fips'].isin(df_complete.index.tolist())]

To-do: check every column for a value and only show counties with all data

## 4. Filter out counties with less than a certain amount of active listings

In [221]:
df_non_zero_listings = df_selected_counties[df_selected_counties['active_listing_count'] > 10]
df_non_zero_listings = df_non_zero_listings[~df_selected_counties['active_listing_count'].isna()]

  df_non_zero_listings = df_non_zero_listings[~df_selected_counties['active_listing_count'].isna()]


In [222]:
df_listing_totals = pd.DataFrame(df_non_zero_listings.groupby('county_fips')['month_date'].nunique())

In [223]:
df_filtered_counties = df_six_years[df_six_years['county_fips'].isin(df_listing_totals.index)]

In [224]:
df_filtered_counties = df_filtered_counties.rename(columns={'NAME': 'county_name'})

## 5. Calculate 12-month moving average for median_listing_price and active_listing_count)

## 6. Format meta data
Save to `../src/assets/fips.json` as `{'county_name': '', 'county_fips': ''}`

In [225]:
df_unique_counties = df_filtered_counties.drop_duplicates('county_fips')

In [226]:
df_unique_counties[['county_name', 'county_fips']].to_json('../src/assets/fips.json', orient='records')

## 7. Format timeseries data and save to file for each fips code
Save to `../public/data/counties/[FIPS].json`

In [227]:
df_individual_file_data = df_filtered_counties.drop('county_name', axis=1) 

In [228]:
for county_fips in df_unique_counties['county_fips'].tolist():
    df_selected_fips = df_individual_file_data[df_individual_file_data['county_fips'] == county_fips]
    # print(df_selected_fips)
    df_selected_fips['month_date'] = df_selected_fips['month_date'].apply(str)
    obj = {
        # 'median_listing_price', 'median_days_on_market', 'new_listing_count', 'active_listing_count']
        'median_listing_price': json.loads(df_selected_fips[['month_date', 'median_listing_price']].to_json(orient='values')),
        'active_listing_count': json.loads(df_selected_fips[['month_date', 'active_listing_count']].to_json(orient='values')),
        'median_days_on_market': json.loads(df_selected_fips[['month_date', 'median_days_on_market']].to_json(orient='values'))
    }
    with open('../public/data/counties/%s.json' % county_fips, 'w') as outfile:
        json.dump(obj, outfile)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_fips['month_date'] = df_selected_fips['month_date'].apply(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_fips['month_date'] = df_selected_fips['month_date'].apply(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected_fips['month_date'] = df_selected_fips['mont