In [257]:
import numpy as np 
import pandas as pd

Load in the data frame. Get lists of column labels.

In [258]:
# Read the raw Zillow state-level table. The first five columns are treated
# as per-state metadata; every column from the sixth onward is treated as a
# timestamp column.
all_data = pd.read_csv("Data_Files/state_housing_zillow.csv")
col_labels = list(all_data.columns)
timestamps = col_labels[5:]

Transpose the dataset so that it is indexed by time and each column represents a state.

In [259]:
# Flip the wide table: one row per timestamp, one column per state
# (columns are labelled with the values of the RegionName column).
price_by_state_series = pd.DataFrame(
    data=all_data.loc[:, timestamps].to_numpy().T,
    index=timestamps,
    columns=all_data['RegionName'],
)

Create a dataframe of labels. Add two columns, LeadingNaN and ContainsNaN, to record how many leading and intermediate NaN values the original dataframe contained for that state. Default to 0 (none).

In [260]:
# Build the per-state labels table from the first five (metadata) columns,
# indexed by state name.
# Selecting the columns directly (instead of rebuilding the frame from
# `.values`) keeps each column's own dtype; `.values` on a mixed-dtype
# selection would upcast every column to `object`.
# `drop=False` keeps RegionName as a regular column as well as the index.
state_labels = (
    all_data[col_labels[:5]]
    .drop(columns=['StateName'])
    .set_index('RegionName', drop=False)
)

# NaN bookkeeping columns, filled in during cleaning. Default to 0.
state_labels['LeadingNaN'] = 0
state_labels['ContainsNaN'] = 0

Address NaN values. Replace any leading NaN values with the oldest recorded price measurement (backwards fill method). For intermediate NaN values, take the average of the values before and after (linear interpolation). Update the LeadingNaN and ContainsNaN columns of the labels dataframe with the number of values filled by each method.

In [261]:
# Find the state columns that contain at least one NaN and report how many.
col_nans = price_by_state_series.isna().any()
print(np.sum(col_nans.values))

# Step 1: interpolate intermediate NaN values.
# The number of NaNs removed by interpolation is recorded in 'ContainsNaN'.
for state in all_data['RegionName']:
    if col_nans[state]:
        all_nans = price_by_state_series[state].isna().sum()
        # Assign the result back to the frame. Calling an in-place method on
        # `price_by_state_series[state]` operates on a temporary Series and
        # does not update the frame when pandas Copy-on-Write is active.
        price_by_state_series[state] = price_by_state_series[state].interpolate(method='linear')
        new_nans = price_by_state_series[state].isna().sum()
        state_labels.loc[state, 'ContainsNaN'] = all_nans - new_nans
        print(state, all_nans - new_nans, all_nans, new_nans)

print()

# Step 2: backward fill whatever is still NaN after interpolation (the leading
# NaN values). The number of such values is recorded in 'LeadingNaN'.
col_nans = price_by_state_series.isna().any()
print(np.sum(col_nans.values))
for state in all_data['RegionName']:
    if col_nans[state]:
        leading_nans = price_by_state_series[state].isna().sum()
        print(state, leading_nans)
        state_labels.loc[state, 'LeadingNaN'] = leading_nans
        # `.bfill()` replaces the deprecated `fillna(method='bfill')`; the
        # result is assigned back for the same reason as above.
        price_by_state_series[state] = price_by_state_series[state].bfill()


9
Arizona 1 1 0
New Mexico 0 27 27
Idaho 1 1 0
West Virginia 1 1 0
Montana 0 61 61
South Dakota 1 1 0
North Dakota 0 108 108
Alaska 1 1 0
Wyoming 0 27 27

4
New Mexico 27
Montana 61
North Dakota 108
Wyoming 27


In [262]:
# Sanity check after cleaning: recompute the per-state "has any NaN" flag
# and print it.
col_nans = price_by_state_series.isna().any(axis=0)
print(col_nans)

RegionName
California              False
Texas                   False
Florida                 False
New York                False
Pennsylvania            False
Illinois                False
Ohio                    False
Georgia                 False
North Carolina          False
Michigan                False
New Jersey              False
Virginia                False
Washington              False
Arizona                 False
Massachusetts           False
Tennessee               False
Indiana                 False
Maryland                False
Missouri                False
Wisconsin               False
Colorado                False
Minnesota               False
South Carolina          False
Alabama                 False
Louisiana               False
Kentucky                False
Oregon                  False
Oklahoma                False
Connecticut             False
Utah                    False
Iowa                    False
Nevada                  False
Arkansas                False

Export cleaned datasets.

In [263]:
# Write the cleaned price series and the per-state labels table to CSV.
price_by_state_series.to_csv(path_or_buf="Data_Files/price_by_state_cleaned.csv")
state_labels.to_csv(path_or_buf="Data_Files/state_data_labels.csv")

In [264]:
# Display the labels table (indexed by RegionName) with its LeadingNaN and
# ContainsNaN columns.
state_labels

Unnamed: 0_level_0,RegionID,SizeRank,RegionName,RegionType,LeadingNaN,ContainsNaN
RegionName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
California,9,0,California,state,0,0
Texas,54,1,Texas,state,0,0
Florida,14,2,Florida,state,0,0
New York,43,3,New York,state,0,0
Pennsylvania,47,4,Pennsylvania,state,0,0
Illinois,21,5,Illinois,state,0,0
Ohio,44,6,Ohio,state,0,0
Georgia,16,7,Georgia,state,0,0
North Carolina,36,8,North Carolina,state,0,0
Michigan,30,9,Michigan,state,0,0
