# EDA

In [12]:
import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from matplotlib.ticker import PercentFormatter
# Plot defaults: 8x5 inch figures, white figure/axes backgrounds, black axes edges.
plt.rcParams.update(
    {
        "figure.figsize": (8, 5),
        "figure.facecolor": "w",
        "axes.facecolor": "white",
        "axes.edgecolor": "black",
    }
)
pd.plotting.register_matplotlib_converters()
# Render floats with three decimals in DataFrame/Series output.
pd.set_option("display.float_format", "{:.3f}".format)

In [13]:
# NOTE(review): the wildcard import hides which names come from my_functions.
# get_zip_means (used in later cells) is not defined in this notebook, so it
# presumably comes from here — consider importing it by name.
from my_functions import *

In [14]:
# Load the King County sales data, indexed by the `id` column.
# `yr_renovated` is listed in parse_dates, but the preview below still shows raw
# values such as 0.0 / 1991.0, so it is converted in the cleaning section instead.
df = pd.read_csv(
    "data/King_County_House_prices_dataset.csv",
    index_col="id",
    parse_dates=["date", "yr_built", "yr_renovated"],
)
df.head()

Unnamed: 0_level_0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7129300520,2014-10-13,221900.0,3,1.0,1180,5650,1.0,,0.0,3,7,1180,0.0,1955-01-01,0.0,98178,47.511,-122.257,1340,5650
6414100192,2014-12-09,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,3,7,2170,400.0,1951-01-01,1991.0,98125,47.721,-122.319,1690,7639
5631500400,2015-02-25,180000.0,2,1.0,770,10000,1.0,0.0,0.0,3,6,770,0.0,1933-01-01,,98028,47.738,-122.233,2720,8062
2487200875,2014-12-09,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,5,7,1050,910.0,1965-01-01,0.0,98136,47.521,-122.393,1360,5000
1954400510,2015-02-18,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,3,8,1680,0.0,1987-01-01,0.0,98074,47.617,-122.045,1800,7503


|    | Column name   | Description   |
|---:|:--------------|:--------------|
|  0 | date          | Date of the sale             |
|  1 | price         | Price of sale |
|  2 | bedrooms      | Number of bedrooms |
|  3 | bathrooms     | Number of bathrooms |
|  4 | sqft_living   | Area of living space |
|  5 | sqft_lot      | Area of plot of land  |
|  6 | floors        | Number of floors within property |
|  7 | waterfront    | On the waterfront [Y/N] |
|  8 | view          | Quality of the view on a scale of 0 to 4 |
|  9 | condition     | Condition on a scale of 1 to 5 |
| 10 | grade         | ? |
| 11 | sqft_above    |  Living area not including basement             |
| 12 | sqft_basement | Living area in basement |
| 13 | yr_built      | Year property was built |
| 14 | yr_renovated  | Year property was last renovated |
| 15 | zipcode       | Zip code |
| 16 | lat           | Latitude |
| 17 | long          | Longitude |
| 18 | sqft_living15 | ? |
| 19 | sqft_lot15    |  ? |

## Clean `yr_renovated` column

In [15]:
# Turn the "0.0" placeholder (presumably "never renovated" — confirm against the
# data dictionary) into a missing value, then parse strings such as "1991.0" as
# dates: "%Y.0" matches a four-digit year followed by a literal ".0".
# np.nan is used instead of the np.NaN alias, which was removed in NumPy 2.0.
df["yr_renovated"] = pd.to_datetime(df["yr_renovated"].replace("0.0", np.nan), format="%Y.0")
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 21597 entries, 7129300520 to 1523300157
Data columns (total 20 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   date           21597 non-null  datetime64[ns]
 1   price          21597 non-null  float64       
 2   bedrooms       21597 non-null  int64         
 3   bathrooms      21597 non-null  float64       
 4   sqft_living    21597 non-null  int64         
 5   sqft_lot       21597 non-null  int64         
 6   floors         21597 non-null  float64       
 7   waterfront     19221 non-null  float64       
 8   view           21534 non-null  float64       
 9   condition      21597 non-null  int64         
 10  grade          21597 non-null  int64         
 11  sqft_above     21597 non-null  int64         
 12  sqft_basement  21597 non-null  object        
 13  yr_built       21597 non-null  datetime64[ns]
 14  yr_renovated   744 non-null    datetime64[ns]
 15  zipco

## 5 highest-value zip codes

In [16]:
# Aggregate the sales frame per zip code. get_zip_means is not defined in this
# notebook (presumably provided by the wildcard import from my_functions).
# The displayed result is indexed by zipcode with columns price, count,
# latitude, longitude and name.
# NOTE(review): `count` shows 0 for every row in the output below — confirm the
# helper returns real counts, since later cells size markers and filter on it.
zips = get_zip_means(df)
zips

Unnamed: 0_level_0,price,count,latitude,longitude,name
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
98001,281000,0,47.310,-122.265,Auburn
98002,234000,0,47.305,-122.207,Auburn
98003,294000,0,47.320,-122.312,Federal Way
98004,1357000,0,47.615,-122.207,Bellevue
98005,810000,0,47.615,-122.166,Bellevue
...,...,...,...,...,...
98177,676000,0,47.747,-122.369,Seattle
98178,311000,0,47.492,-122.236,Seattle
98188,289000,0,47.448,-122.273,Seattle
98198,303000,0,47.393,-122.313,Seattle


In [17]:
def map_zips(zips, title="Zip Codes in King's County", mapbox_style="stamen-toner"):
    """Show an interactive scatter map with one marker per zip code.

    Markers are coloured by ``price`` and sized by ``count``; the hover title
    reads ``"<zipcode> (<name>)"``. The figure is displayed with ``fig.show()``
    and nothing is returned.

    Parameters
    ----------
    zips : pandas.DataFrame
        Frame indexed by zip code with the columns ``latitude``, ``longitude``,
        ``price``, ``count`` and ``name``. It is not modified.
    title : str
        Figure title.
    mapbox_style : str
        Base-map style handed to Plotly. The default keeps the style this
        notebook has always used.
        NOTE(review): Stamen tiles are now hosted by Stadia Maps — confirm
        "stamen-toner" still renders without a token, otherwise pass e.g.
        "open-street-map" or "carto-positron".
    """
    # Kept local so the notebook's other cells run without plotly installed.
    import plotly.express as px

    # assign() returns a new frame, so the caller's data stays untouched and no
    # throwaway helper columns are needed to build the label.
    plot_data = zips.assign(
        hover=zips.index.astype(str) + " (" + zips["name"] + ")"
    )
    fig = px.scatter_mapbox(
        plot_data,
        mapbox_style=mapbox_style,
        lat="latitude",
        lon="longitude",
        color="price",
        size="count",
        hover_name="hover",
        title=title,
        width=800,
        height=600,
        zoom=8.5,
    )
    fig.show()
# Map every zip code, coloured by its average price (title typo "Country" fixed).
map_zips(zips, title="Zip Codes in King's County: Average Price")

From the map, we can see that zip codes start to look expensive (purple) at a mean price of around 850k. This is well above the 75th percentile of the mean prices across zip codes.

In [18]:
# 75th percentile of the per-zip prices, rounded to the nearest thousand for display.
price_p75 = zips["price"].quantile(0.75)
print(f"The 75th percentile is: {int(round(price_p75, -3))}")

The 75th percentile is: 646000


In [19]:
# Count the zip codes above the ~850k "expensive" threshold read off the map.
# Only a cell's last expression is rendered, so the count is printed explicitly;
# as a bare first expression it was silently discarded.
expensive_threshold = 850000
n_expensive = int((zips["price"] > expensive_threshold).sum())
print(f"Zip codes with a price above {expensive_threshold}: {n_expensive}")

# Let's get this down to 5
zips.sort_values("price", ascending=False).iloc[0:5]

Unnamed: 0_level_0,price,count,latitude,longitude,name
zipcode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
98039,2161000,0,47.627,-122.231,Medina
98004,1357000,0,47.615,-122.207,Bellevue
98040,1195000,0,47.56,-122.228,Mercer Island
98112,1096000,0,47.63,-122.297,Seattle
98102,900000,0,47.63,-122.321,Seattle


Do the same for historical buildings (i.e. built in 1945 or earlier)

In [20]:
# Cut-off for "historical" homes. yr_built is parsed to 1 January of the build
# year (see the data preview), so this comparison also keeps homes built in 1945.
historic_year = pd.Timestamp("1945-12-31")
is_historic = df["yr_built"] < historic_year
historical_zips = get_zip_means(df.loc[is_historic])
map_zips(historical_zips, title="Zip Codes by Average Price, Historical Only")

Here we can clearly see the range of prices in various zip codes.
* The cheapest neighbourhoods start at \$0.2M
* The priciest range up to almost \$2M

In [21]:
# Five most expensive zip codes among the historical homes.
top_historical_zips = historical_zips.sort_values(by="price", ascending=False).head(5)
map_zips(top_historical_zips, title="Top 5 historical zip codes by price")

In [22]:
# Keep only zip codes whose `count` is at least `min_count` (presumably the number
# of sales behind each mean), so one expensive sale cannot put a zip code in the top 5.
# The title is built from the same threshold: it used to say "n > 10" while the
# filter is ">= 10".
# NOTE(review): `count` shows 0 for every row in the tables displayed earlier in this
# notebook — confirm get_zip_means returns real counts, otherwise this filter
# leaves nothing to plot.
min_count = 10
top_historical_zips_min_count = (
    historical_zips[historical_zips["count"] >= min_count]
    .sort_values("price", ascending=False)
    .iloc[0:5]
)
map_zips(top_historical_zips_min_count, f"Top 5 historical zip codes, n >= {min_count}")