# Import Data

In [1]:
import pandas as pd
import matplotlib.pyplot as plt 
from statsmodels.tsa.seasonal import seasonal_decompose
from statsmodels.tsa.stattools import adfuller
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
from statsmodels.tsa.arima_model import ARIMA
import numpy as np



# Load the Zillow ZHVI CSV. RegionName holds the zip code, so it is read as a
# string (object dtype) to keep leading zeros intact.
zhvi_csv_path = 'Zillow_data/zhvi_all-homes_zipcode.csv'
column_dtypes = {'RegionName': object}
raw_data = pd.read_csv(zhvi_csv_path, dtype=column_dtypes)

# Preview the first three rows of the wide table.
raw_data.head(3)

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,1996-01-31,...,2019-09-30,2019-10-31,2019-11-30,2019-12-31,2020-01-31,2020-02-29,2020-03-31,2020-04-30,2020-05-31,2020-06-30
0,61639,0,10025,Zip,NY,NY,New York,New York-Newark-Jersey City,New York County,233265.0,...,1248340.0,1234262.0,1229890.0,1226466.0,1208024.0,1182758.0,1150900.0,1134880.0,1120949.0,1112549.0
1,84654,1,60657,Zip,IL,IL,Chicago,Chicago-Naperville-Elgin,Cook County,211748.0,...,494425.0,493485.0,492514.0,491726.0,491562.0,492618.0,494017.0,494766.0,494546.0,494435.0
2,61637,2,10023,Zip,NY,NY,New York,New York-Newark-Jersey City,New York County,245773.0,...,1161916.0,1153259.0,1156287.0,1175142.0,1193746.0,1205413.0,1203165.0,1209735.0,1211403.0,1212520.0


In [2]:
# Print a structural summary of the wide table: index range, number of
# columns, per-dtype column counts, and memory usage.
raw_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30443 entries, 0 to 30442
Columns: 303 entries, RegionID to 2020-06-30
dtypes: float64(294), int64(2), object(7)
memory usage: 70.4+ MB


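The data was imported as a pandas DataFrame and stored in the `raw_data` variable.  `raw_data` is in wide format: each of its 30,443 rows is one zip code, and each month from 1996-01-31 through 2020-06-30 is its own column.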

# Cleaning & Preprocessing Functions

In [3]:
def melt_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Turn DataFrame from Wide to Long Format

    Every column other than the five identifier columns ('zipcode', 'City',
    'State', 'CountyName', 'Metro') is treated as a date column: its name is
    parsed into a datetime and stored in 'time', and its cell goes to 'value'.
    Rows whose 'value' is missing are dropped.

    Arguments:
    df -- Pandas DataFrame with dates in Wide format.  Must contain the five
          identifier columns listed above; all other column names must be
          parseable as dates.

    Return:
    Pandas DataFrame in Long format with columns
    ['zipcode', 'City', 'State', 'CountyName', 'Metro', 'time', 'value'].
    The input frame is not modified.

    Raises:
    KeyError -- if one of the identifier columns is missing from df.
    """
    id_columns = ['zipcode', 'City', 'State', 'CountyName', 'Metro']
    melted = pd.melt(df, id_vars=id_columns, var_name='time')

    # `infer_datetime_format` is deprecated since pandas 2.0 (it only emits a
    # warning there, because a consistent format is now inferred by default),
    # so it is intentionally not passed.
    melted['time'] = pd.to_datetime(melted['time'])

    # A missing value means no observation for that zipcode/month.
    return melted.dropna(subset=['value'])

In [4]:
def clean_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean DataFrame.  Rename columns, drop unneeded columns, transform to Long
    format, and fill missing values with 'rural'.

    The input frame is NOT modified: renaming and dropping are done on copies,
    so the cell that calls this function can be re-run on the same raw frame
    without raising a KeyError for already-dropped columns.

    Arguments:
    df -- Pandas DataFrame in Wide format.  Must contain 'RegionID',
          'RegionType', 'StateName' and 'SizeRank' (which are dropped), plus
          whatever columns melt_data requires after 'RegionName' has been
          renamed to 'zipcode'.

    Return:
    Cleaned and melted Pandas DataFrame

    Raises:
    KeyError -- if any of the columns to drop is missing from df.
    """
    wide = (
        df
        # rename RegionName to 'zipcode'
        .rename(columns={'RegionName': 'zipcode'})
        # drop unneeded columns
        .drop(columns=['RegionID', 'RegionType', 'StateName', 'SizeRank'])
    )

    # Change to long format
    df_long = melt_data(wide)

    # Non-metro zipcodes now called 'rural'.
    # NOTE(review): fillna runs on the whole frame, so a missing value in any
    # other column is also replaced with 'rural' -- confirm whether only
    # 'Metro' was intended.
    return df_long.fillna('rural')