In [1]:
#Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.impute import KNNImputer

In [2]:
# Load the two input datasets and report how many values are missing per column.
# Note: Need to add these files locally before running
income_file = '17zpallnoagi.csv'
mh_file = 'MarketHealthIndex_Zip.csv'

df_income = pd.read_csv(income_file)
# The market health file is decoded with the 'unicode_escape' codec.
df_mh = pd.read_csv(mh_file, encoding='unicode_escape')

# Per-column null counts for each dataset.
income_null_counts = df_income.isnull().sum()
mh_null_counts = df_mh.isnull().sum()

# The empty string in the first call produces the blank separator line
# between the two reports.
print("Missing values in income data:", income_null_counts, "", sep="\n")
print("Missing values in market health data:", mh_null_counts, sep="\n")

Missing values in income data:
STATEFIPS    0
STATE        0
ZIPCODE      0
AGI_STUB     0
N1           0
            ..
A11900       0
N11902       0
A11902       0
N12000       0
A12000       0
Length: 153, dtype: int64

Missing values in market health data:
RegionType                  0
RegionName                  0
City                        0
State                       0
Metro                     600
CBSATitle                 600
SizeRank                14089
MarketHealthIndex           0
SellForGain              4609
PrevForeclosed          11544
ForeclosureRatio        10836
ZHVI                      130
MoM                       130
YoY                       130
ForecastYoYPctChange     2743
StockOfREOs             14089
NegativeEquity            395
Delinquency               395
DaysOnMarket              127
dtype: int64


In [3]:
# Discard every market-health row that has no ZHVI value, then show the
# remaining per-column null counts.
required_columns = ['ZHVI']
df_mh = df_mh.dropna(axis=0, subset=required_columns)

remaining_null_counts = df_mh.isnull().sum()
print(remaining_null_counts)

RegionType                  0
RegionName                  0
City                        0
State                       0
Metro                     598
CBSATitle                 598
SizeRank                13959
MarketHealthIndex           0
SellForGain              4609
PrevForeclosed          11496
ForeclosureRatio        10812
ZHVI                        0
MoM                         0
YoY                         0
ForecastYoYPctChange     2613
StockOfREOs             13959
NegativeEquity            395
Delinquency               395
DaysOnMarket              127
dtype: int64


In [4]:
# Impute missing values for home value data filter.
# Every column of df_mh that is not listed in excluded_columns is passed
# through a KNNImputer (default settings) and written back into df_mh.
imputer = KNNImputer()

# Columns left out of the imputation: the region/location label columns plus
# SizeRank and StockOfREOs.
excluded_columns = ['RegionType', 'RegionName', 'City', 'State', 'Metro', 'CBSATitle', 'SizeRank', 'StockOfREOs']

columns_for_impute = [col for col in df_mh.columns if col not in excluded_columns]

print('Missing values before imputing:')
print(df_mh.isnull().sum())
print()

# fit_transform returns a plain NumPy array, so the row labels are lost.
# The imputed frame must be rebuilt with df_mh's own index: df_mh has gaps in
# its index after rows were dropped earlier, and column assignment aligns on
# index labels. With a fresh 0..n-1 index the imputed values would be written
# to the wrong rows and the trailing labels would receive NaN.
df_mh_impute_values = pd.DataFrame(
    imputer.fit_transform(df_mh[columns_for_impute]),
    columns=columns_for_impute,
    index=df_mh.index,
)

df_mh[columns_for_impute] = df_mh_impute_values

# With the indexes aligned no row has to be dropped after imputing; fail loudly
# instead of silently discarding rows if a gap is ever left behind.
assert not df_mh[columns_for_impute].isnull().any().any(), \
    'Imputed columns still contain missing values'

print('-'*50)
print('Missing values after imputing:')
print(df_mh.isnull().sum())


Missing values before imputing:
RegionType                  0
RegionName                  0
City                        0
State                       0
Metro                     598
CBSATitle                 598
SizeRank                13959
MarketHealthIndex           0
SellForGain              4609
PrevForeclosed          11496
ForeclosureRatio        10812
ZHVI                        0
MoM                         0
YoY                         0
ForecastYoYPctChange     2613
StockOfREOs             13959
NegativeEquity            395
Delinquency               395
DaysOnMarket              127
dtype: int64

  RegionType  RegionName                 City State            Metro  \
0        Zip        1001               Agawam    MA  Springfield, MA   
1        Zip        1002              Amherst    MA  Springfield, MA   
2        Zip        1005        Town of Barre    MA    Worcester, MA   
3        Zip        1007  Town of Belchertown    MA  Springfield, MA   
4        Zip        1008