In [314]:
# imports
import json
import pandas as pd
import numpy as np
import gmaps
import gmaps.datasets
google_api_key = %env GOOGLE_API_KEY
gmaps.configure(api_key=google_api_key)

First step is to get armed with shapefiles that describe the opportunity. I google for "Stockton Shapefiles" and found a pretty [decent government site](http://www.stocktongov.com/services/gis/mapdatDat.html) with a range of links to different shapefiles. I downloaded two of them to start - the "City Limits" and "Parcels". Let's go ahead and graph the city limits of Stockton using the `gmaps` library

In [315]:
with open('data/city_limits_stockton.json') as f: 
    geometry = json.load(f)

geometry['features'] = geometry['features'][0:1] # eliminate the county map that is included in the map
fig = gmaps.figure()
geojson_layer = gmaps.geojson_layer(geometry,fill_color=None,fill_opacity=0)
fig.add_layer(geojson_layer)
fig

Figure(layout=FigureLayout(height='420px'))

We will be using the parcel shapefile for a different purpose - to convert from Zillow listing pages to property coordinates. We do this using the Assessor Parcel Number (APN) or `parcel_number` identifier. Each property carries a unique APN and Zillow listings almost always include it. It's like a VIN for real estate - no need to resolve ambiguities in addresses. 

In [316]:
with open('data/parcels_stockton.json') as f: 
    geometry = json.load(f)

From a quick pass, I was able to scrape about 10k listings from Zillow with their Zestimate (`computed price`) in the Stockton area. Note that there are about 100k unique parcels listed in the shapefile, so we already have about 10% of the area in the dataset. Scraping Zillow is very complicated in general so I won't go into detail of how I did that.

In [318]:
homes = pd.read_csv('data/price_and_apn_stockton.csv',dtype={'parcel_number': np.str})
# remove trailing 4 zeros from any APN in the dataset
homes.parcel_number = homes.parcel_number.apply(lambda x: x[:-4] if x[-4:] == '0000' else x)
print("Number of homes in dataset: %i" % len(homes.index))
homes.head()

Number of homes in dataset: 10245


Unnamed: 0,parcel_number,computed price
0,7105010,4281175
1,10104012,3246555
2,6311006,3159637
3,8913057,2770055
4,6311038,2748240


Let's match up the Zillow dataset with the coordinates in the Stockton shapefile, and graph the parcels visually. Most of the smaller properties are not visible but a fair chunk of the city is already in the dataset. Because the most valuable downtown or golf course parcels are very small, this map doesn't reflect the true coverage we have of the Stockton area. We'll fix in the next iteration.

In [319]:
APN_list = homes.parcel_number
geometry['features'] = list(filter(lambda d: str(int(d['properties']['APN'])) in APN_list.tolist(), geometry['features']))

In [320]:
print("Number of homes in dataset after matching with coords %i" % len(geometry['features']))

Number of homes in dataset after matching with coords 10217


In [321]:
geojson_layer = gmaps.geojson_layer(geometry,fill_color='Blue')
fig.add_layer(geojson_layer)
fig

Figure(layout=FigureLayout(height='420px'))

In [322]:
# extract lats and longs of properties that sold
x = []
for e in geometry['features']:
    if(len(e['geometry']['coordinates'][0][0]) == 2):
        r = [e['geometry']['coordinates'][0][0][1],e['geometry']['coordinates'][0][0][0],str(int(e['properties']['APN']))]
        x.append(r)
    else:
        print(e['properties']['APN'])
    
df = pd.DataFrame(x,columns=['Latitude','Longitude','parcel_number']).set_index('parcel_number')
df.head()

10127002
08525049
{'type': 'Feature', 'properties': {'POLY_CODE': 7, 'APN': '05926065', 'Shape_STAr': 251390.505266, 'Shape_STLe': 2620.29487278}, 'geometry': {'type': 'Polygon', 'coordinates': [[[-121.2550640221702, 38.057547878578106], [-121.25587676635047, 38.05754101049694], [-121.25595179762256, 38.06049852275644], [-121.25514249857456, 38.060502341799676], [-121.2550640221702, 38.057547878578106]]]}}
10104006


Unnamed: 0_level_0,Latitude,Longitude
parcel_number,Unnamed: 1_level_1,Unnamed: 2_level_1
17138012,37.93323,-121.243643
17113213,37.938146,-121.245315
6312021,38.06527,-121.184043
16405019,37.911541,-121.319315
16405053,37.913384,-121.320513


In [325]:
homes_with_price = pd.concat([df,homes.set_index('parcel_number')],axis=1)
print("Number of homes mapped %i" % len(homes_with_price[['Latitude','Longitude']].dropna().index))

homes_with_price.head()

Number of homes mapped 10214


Unnamed: 0,Latitude,Longitude,computed price
10006018,38.009221,-121.36205,373156
10042012,37.998651,-121.352485,394622
10102130,37.987818,-121.240074,294958
10102142,37.987734,-121.238299,329572
10102205,37.988321,-121.231107,544573


Finally I create a heatmap of houses, with each dot weighted by the Z-estimate amount. Now we can see that most of the northern and downtown areas are well represented

In [326]:
homes_with_price = homes_with_price.dropna()
# adjust price to a range of 0 to 1

heatmap_layer = gmaps.heatmap_layer(homes_with_price[['Latitude','Longitude']],weights=homes_with_price['computed price'].rank(pct = True))
fig.add_layer(heatmap_layer)
fig

Figure(layout=FigureLayout(height='420px'))