## Imports

In [1]:
# Imports

import pandas as pd
import numpy as np
from uszipcode import SearchEngine
from sklearn import preprocessing
import folium
from folium import plugins

from state_heatmapwithtime import *

In [2]:
# Load the provided Zillow CSV into a DataFrame and preview the first rows
df = pd.read_csv(filepath_or_buffer="zillow_data.csv")
df.head(5)

Unnamed: 0,RegionID,RegionName,City,State,Metro,CountyName,SizeRank,1996-04,1996-05,1996-06,...,2017-07,2017-08,2017-09,2017-10,2017-11,2017-12,2018-01,2018-02,2018-03,2018-04
0,84654,60657,Chicago,IL,Chicago,Cook,1,334200.0,335400.0,336500.0,...,1005500,1007500,1007800,1009600,1013300,1018700,1024400,1030700,1033800,1030600
1,90668,75070,McKinney,TX,Dallas-Fort Worth,Collin,2,235700.0,236900.0,236700.0,...,308000,310000,312500,314100,315000,316600,318100,319600,321100,321800
2,91982,77494,Katy,TX,Houston,Harris,3,210400.0,212200.0,212200.0,...,321000,320600,320200,320400,320800,321200,321200,323000,326900,329900
3,84616,60614,Chicago,IL,Chicago,Cook,4,498100.0,500900.0,503100.0,...,1289800,1287700,1287400,1291500,1296600,1299000,1302700,1306400,1308500,1307000
4,93144,79936,El Paso,TX,El Paso,El Paso,5,77300.0,77300.0,77300.0,...,119100,119400,120000,120300,120300,120300,120300,120500,121000,121500


## Pre-Processing

In [3]:
# Use the zip code (RegionName) as the row index; drop=False leaves the
# RegionName column itself in the frame
df = df.set_index("RegionName", drop=False)

# Keep the per-zip state labels aside so they can be re-attached for mapping
state_list = df.loc[:, "State"]

In [4]:
# Remove the descriptive (non-price) columns
df = df.drop(
    [
        "RegionID", "RegionName", "City", "State",
        "Metro", "CountyName", "SizeRank",
    ],
    axis="columns",
)

In [13]:
# Keep every row, but only the columns from "2004-01" through the last one
df = df.loc[:, slice("2004-01", None)]

In [35]:
# Spot-check a single column: the one at (0-based) position 99
df[df.columns[99]]

RegionName
60657     740300.0
75070     201800.0
77494     252400.0
60614     938100.0
79936     112800.0
77084     112000.0
10467     295200.0
60640     552400.0
77449     115200.0
94109    2303700.0
11226     538600.0
32162     196500.0
11375     682600.0
11235     549600.0
37013     122300.0
90250     345600.0
60647     276200.0
37211     137400.0
78660     164700.0
60618     292900.0
10128    5578000.0
77573     182800.0
77584     183400.0
28269     133100.0
79912     174600.0
78572      80100.0
94565     167300.0
30349      54200.0
20002     386800.0
90046     941100.0
           ...    
3457      210300.0
3284      195100.0
95728     349800.0
28757     283100.0
95497     622200.0
12156     174700.0
97149     337800.0
77457     165200.0
80481     223700.0
20625     202200.0
20618     183000.0
80510     194100.0
49710      52200.0
1270      165300.0
12480     148500.0
29915     367900.0
92322     106300.0
3812      176300.0
89413    1194400.0
92341     116400.0
3765      113500.0
8

In [14]:
# Dropping any rows with null values.
# The result is reassigned rather than using inplace=True: `df` comes from a
# .loc column slice, and mutating such a derived frame in place can raise
# SettingWithCopyWarning. Reassignment gives the same rows without that risk.
df = df.dropna(axis=0)

# Sanity Check: total number of nulls left in the whole frame (expected: 0)
df.isnull().sum().sum()

0

In [15]:
# Scale every row (one zip code's series) to unit L2 norm; the result is a
# plain array, so the labels are lost at this step
df_norm = preprocessing.normalize(df, norm="l2", axis=1)

# Wrap the array back into a DataFrame carrying the original row/column labels
df_scaled = pd.DataFrame(data=df_norm, columns=df.columns, index=df.index)

In [16]:
# Post-processing check: print the (rows, columns) shape, then preview the top rows
print((len(df_scaled.index), len(df_scaled.columns)))
df_scaled.head(5)

(13684, 172)


Unnamed: 0_level_0,2004-01,2004-02,2004-03,2004-04,2004-05,2004-06,2004-07,2004-08,2004-09,2004-10,...,2017-07,2017-08,2017-09,2017-10,2017-11,2017-12,2018-01,2018-02,2018-03,2018-04
RegionName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
60657,0.066883,0.067228,0.067653,0.068149,0.068681,0.069238,0.06977,0.070239,0.070682,0.071116,...,0.089039,0.089216,0.089242,0.089402,0.089729,0.090207,0.090712,0.09127,0.091545,0.091261
75070,0.062166,0.06253,0.063027,0.063524,0.063921,0.064219,0.064418,0.064617,0.064816,0.065047,...,0.102009,0.102671,0.103499,0.104029,0.104327,0.104857,0.105354,0.105851,0.106348,0.10658
77494,0.065783,0.066152,0.066265,0.066123,0.065868,0.065641,0.065471,0.065499,0.065698,0.066038,...,0.091097,0.090983,0.09087,0.090927,0.09104,0.091154,0.091154,0.091664,0.092771,0.093623
60614,0.069419,0.06959,0.069829,0.07013,0.070466,0.070828,0.071198,0.071574,0.071992,0.072436,...,0.088256,0.088113,0.088092,0.088373,0.088722,0.088886,0.089139,0.089392,0.089536,0.089433
79936,0.056091,0.056559,0.057093,0.057694,0.058228,0.058762,0.059363,0.059897,0.060365,0.060832,...,0.079529,0.07973,0.08013,0.080331,0.080331,0.080331,0.080331,0.080464,0.080798,0.081132


In [17]:
# Re-attach each zip code's state label, aligned on the RegionName index
# (rows removed during cleaning simply have no counterpart picked up)
df_scaled["State"] = state_list.reindex(df_scaled.index)
df_scaled.head(5)

Unnamed: 0_level_0,2004-01,2004-02,2004-03,2004-04,2004-05,2004-06,2004-07,2004-08,2004-09,2004-10,...,2017-08,2017-09,2017-10,2017-11,2017-12,2018-01,2018-02,2018-03,2018-04,State
RegionName,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
60657,0.066883,0.067228,0.067653,0.068149,0.068681,0.069238,0.06977,0.070239,0.070682,0.071116,...,0.089216,0.089242,0.089402,0.089729,0.090207,0.090712,0.09127,0.091545,0.091261,IL
75070,0.062166,0.06253,0.063027,0.063524,0.063921,0.064219,0.064418,0.064617,0.064816,0.065047,...,0.102671,0.103499,0.104029,0.104327,0.104857,0.105354,0.105851,0.106348,0.10658,TX
77494,0.065783,0.066152,0.066265,0.066123,0.065868,0.065641,0.065471,0.065499,0.065698,0.066038,...,0.090983,0.09087,0.090927,0.09104,0.091154,0.091154,0.091664,0.092771,0.093623,TX
60614,0.069419,0.06959,0.069829,0.07013,0.070466,0.070828,0.071198,0.071574,0.071992,0.072436,...,0.088113,0.088092,0.088373,0.088722,0.088886,0.089139,0.089392,0.089536,0.089433,IL
79936,0.056091,0.056559,0.057093,0.057694,0.058228,0.058762,0.059363,0.059897,0.060365,0.060832,...,0.07973,0.08013,0.080331,0.080331,0.080331,0.080331,0.080464,0.080798,0.081132,TX


## Visualization Time!

In [18]:
# Build the Florida input for the time-animated heat map.
# getdata_stateheatmapwithtime is defined in state_heatmapwithtime.py (brought
# in by the wildcard import at the top of the notebook). It receives the
# normalized frame, including its "State" column, plus a two-letter state code;
# its result is what gets handed to Folium's HeatMapWithTime.
# NOTE(review): presumably it filters rows to the given state and geocodes each
# zip code into per-month [lat, lon, weight] entries — confirm in the helper.
fl_data = getdata_stateheatmapwithtime(df_scaled, "FL")

In [19]:
# Florida base map, centred on the state
fl_map = folium.Map(
    location=[27.6648, -81.5158],
    zoom_start=6,
    prefer_canvas=True,
)

# Attach the time-animated heat layer built from fl_data
fl_map.add_child(plugins.HeatMapWithTime(fl_data))

# Last expression of the cell, so the map renders inline
fl_map

In [20]:
# Trying again with a different state: same helper (from
# state_heatmapwithtime.py) and same normalized frame, now with the "GA" code.
# NOTE(review): assumes "GA" matches the values stored in the "State" column —
# confirm against the helper's filtering logic.
ga_data = getdata_stateheatmapwithtime(df_scaled, "GA")

In [25]:
# Georgia base map, one zoom level closer than the default used for Florida
ga_map = folium.Map(
    location=[32.3656, -82.9001],
    zoom_start=7,
    prefer_canvas=True,
)

# Attach the time-animated heat layer built from ga_data
ga_map.add_child(plugins.HeatMapWithTime(ga_data))

# Last expression of the cell, so the map renders inline
ga_map