# EDA

In [1]:
import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from matplotlib.ticker import PercentFormatter
# Plot styling: default figure size, white backgrounds and black axes edges.
plt.rcParams.update(
    {
        "figure.figsize": (8, 5),
        "axes.facecolor": "white",
        "axes.edgecolor": "black",
        "figure.facecolor": "w",
    }
)
# Register the pandas date/time converters with matplotlib.
pd.plotting.register_matplotlib_converters()
# Render floats with three decimal places in pandas output.
pd.set_option("display.float_format", lambda value: "%.3f" % value)

In [2]:
from my_functions import *

In [3]:
# import data
df = pd.read_csv(
    "data/King_County_House_prices_dataset.csv",
    parse_dates=["date", "yr_built", "yr_renovated"],
)
# info() prints its summary itself; head() goes last so the notebook renders
# it as the cell result instead of silently discarding it.
df.info()
df.head(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   id             21597 non-null  int64         
 1   date           21597 non-null  datetime64[ns]
 2   price          21597 non-null  float64       
 3   bedrooms       21597 non-null  int64         
 4   bathrooms      21597 non-null  float64       
 5   sqft_living    21597 non-null  int64         
 6   sqft_lot       21597 non-null  int64         
 7   floors         21597 non-null  float64       
 8   waterfront     19221 non-null  float64       
 9   view           21534 non-null  float64       
 10  condition      21597 non-null  int64         
 11  grade          21597 non-null  int64         
 12  sqft_above     21597 non-null  int64         
 13  sqft_basement  21597 non-null  object        
 14  yr_built       21597 non-null  datetime64[ns]
 15  yr_renovated   1775

|    | Column name   | Description   |
|---:|:--------------|:--------------|
|  0 | date          | Date of the sale             |
|  1 | price         | Price of sale |
|  2 | bedrooms      | Number of bedrooms |
|  3 | bathrooms     | Number of bathrooms |
|  4 | sqft_living   | Area of living space |
|  5 | sqft_lot      | Area of plot of land  |
|  6 | floors        | Number of floors within property |
|  7 | waterfront    | On the waterfront [Y/N] |
|  8 | view          | Quality of the view on a scale of 1 to 4 |
|  9 | condition     | Condition on a scale of 1 to 5 |
| 10 | grade         | Overall grade given to the housing unit (King County grading system) |
| 11 | sqft_above    |  Living area not including basement             |
| 12 | sqft_basement | Living area in basement |
| 13 | yr_built      | Year property was built |
| 14 | yr_renovated  | Year property was last renovated |
| 15 | zipcode       | Zip code |
| 16 | lat           | Latitude |
| 17 | long          | Longitude |
| 18 | sqft_living15 | Living area of the nearest 15 neighbours |
| 19 | sqft_lot15    | Lot area of the nearest 15 neighbours |

## Clean `yr_renovated` column

In [4]:
# Treat the placeholder string "0.0" as "no renovation date" and parse the
# remaining "<year>.0" strings into datetimes.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0.
df["yr_renovated"] = pd.to_datetime(df["yr_renovated"].replace("0.0", np.nan), format="%Y.0")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   id             21597 non-null  int64         
 1   date           21597 non-null  datetime64[ns]
 2   price          21597 non-null  float64       
 3   bedrooms       21597 non-null  int64         
 4   bathrooms      21597 non-null  float64       
 5   sqft_living    21597 non-null  int64         
 6   sqft_lot       21597 non-null  int64         
 7   floors         21597 non-null  float64       
 8   waterfront     19221 non-null  float64       
 9   view           21534 non-null  float64       
 10  condition      21597 non-null  int64         
 11  grade          21597 non-null  int64         
 12  sqft_above     21597 non-null  int64         
 13  sqft_basement  21597 non-null  object        
 14  yr_built       21597 non-null  datetime64[ns]
 15  yr_renovated   744 

In [5]:
# Summary statistics (count, mean, std, min, quartiles, max) of the sale prices.
df["price"].describe()

count     21597.000
mean     540296.574
std      367368.140
min       78000.000
25%      322000.000
50%      450000.000
75%      645000.000
max     7700000.000
Name: price, dtype: float64

## 5 highest-value zip codes

In [7]:
# Per-zip-code summary of the listings. The cells below read its "name",
# "latitude", "longitude", "mean_price" and "count" columns.
# NOTE(review): get_zip_means presumably comes from my_functions — confirm.
zips = get_zip_means(df)

In [8]:
def map_zips(zips, title="Zip Codes in King's County", sizecount=False, save_html=True, save_jpg=True):
    """Plot zip codes on an interactive Plotly map, optionally saving HTML/JPEG copies.

    Every row of ``zips`` becomes one marker, coloured by its mean price.

    Args:
        zips (pd.DataFrame): One row per zip code with the columns "name",
            "latitude", "longitude" and "mean_price" (plus "count" when
            ``sizecount`` is True). The index is used as the label shown on
            hover (presumably the zip code itself).
        title (str, optional): The title. Defaults to "Zip Codes in King's County".
            It is also turned into the file name of any saved output.
        sizecount (bool, optional): Size the markers by "count" (number of
            data points) instead of "mean_price". Defaults to False.
        save_html (bool, optional): Whether to save to ``html/<name>.html``.
            Defaults to True.
        save_jpg (bool, optional): Whether to save to ``images/<name>.jpg``.
            Defaults to True.
    """
    import os

    import plotly.express as px

    size = "count" if sizecount else "mean_price"
    # Work on a copy so the caller's frame does not gain the helper column.
    zips = zips.copy()
    zips["hover"] = zips.index.astype(str) + " (" + zips["name"] + ")"
    # NOTE(review): the "stamen-terrain" tiles may need a Stadia Maps token in
    # recent Plotly versions — confirm the map still renders.
    fig = px.scatter_mapbox(
        zips,
        mapbox_style="stamen-terrain",
        lat="latitude",
        lon="longitude",
        color="mean_price",
        title=title,
        hover_name="hover",
        width=1024,
        height=576,
        size=size,
        zoom=8.5
        )
    fig.show()
    if not (save_html or save_jpg):
        return
    # Build a file-system friendly name: spaces become underscores, every other
    # non-alphanumeric character is dropped, and the result is lower-cased.
    file_name = title.replace(" ", "_")
    file_name = ''.join(e for e in file_name if (e.isalnum() or e == "_")).lower()
    if save_html:
        # Create the target directory so the write cannot fail on a fresh checkout.
        os.makedirs("html", exist_ok=True)
        fig.write_html(f"html/{file_name}.html")
    if save_jpg:
        os.makedirs("images", exist_ok=True)
        fig.write_image(f"images/{file_name}.jpg")
# Map every zip code, coloured and sized by its mean price ("County", not "Country").
map_zips(zips, title="Zip Codes in King's County: Average Price")

From the map, we can see that zip codes start to get expensive (purple) at a mean price of around 850k. This is well above the 75th percentile of the mean price across zip codes.

In [None]:
# 75th percentile of the per-zip mean price, rounded to the nearest thousand.
p75_mean_price = zips["mean_price"].quantile(0.75)
print(f"The 75th percentile is: {int(round(p75_mean_price, -3))}")

In [None]:
# Let's get the top 5: order by mean price (highest first) and keep five rows.
zips_by_price = zips.sort_values("mean_price", ascending=False)
top_5_zips = zips_by_price.head(5)
# Show them without the coordinate columns.
top_5_zips.drop(columns=["latitude", "longitude"])

In [None]:
# Map only the five zip codes with the highest mean price.
map_zips(top_5_zips, title="Top 5 most expensive zip codes")

Do the same for historical buildings (i.e. built in 1945 or earlier)

In [None]:
# Cut-off date: everything built before the end of 1945 counts as historic.
historic_year = pd.Timestamp("1945-12-31")
mask_historic = df["yr_built"].lt(historic_year)
# Zip-code summary restricted to the historic properties.
historical_zips = get_zip_means(df.loc[mask_historic])
map_zips(historical_zips, title="Zip Codes by Average Price, Historical Only")

Here we can clearly see the range of prices in various zip codes.
* The cheapest neighbourhoods start at \$0.2M
* The priciest range up to almost \$2M

In [None]:
# Five historic zip codes with the highest mean price.
historical_by_price = historical_zips.sort_values("mean_price", ascending=False)
top_historical_zips = historical_by_price.head(5)
# Show them without the coordinate columns.
top_historical_zips.drop(columns=["latitude", "longitude"])

In [None]:
# Map the five most expensive zip codes among the historic properties.
map_zips(top_historical_zips, "Top 5 historical zip codes by price")

Are these different from the overall most expensive zip codes? The two lists have four zip codes in common; only two (one from each list) appear in one list but not the other.

In [None]:
# Zip codes that appear in exactly one of the two top-5 lists.
set(top_historical_zips.index) ^ set(top_5_zips.index)

In [None]:
# Same ranking, but only for zip codes backed by at least 10 historic
# properties, so that single sales cannot dominate the mean.
top_historical_zips_min_count = (
    historical_zips[historical_zips["count"] >= 10]
    .sort_values("mean_price", ascending=False)
    .iloc[0:5]
)
# The title states the filter actually applied (>= 10, not > 10).
map_zips(top_historical_zips_min_count, "Top 5 historical zip codes, n >= 10")

Does this give us different results than without the n ≥ 10 criterion?

In [None]:
# Zip codes present in both historic top-5 lists (with and without the minimum count).
set(top_historical_zips_min_count.index) & set(top_historical_zips.index)

Again, two zip codes appear in one list but not the other (the cell above shows the ones the lists share).

## Should I sell now, or wait until autumn?

In [None]:
# Sale price over time. sns.lineplot returns a matplotlib Axes, so the axis
# labels are set on that object (there is no `fig` and no set_axis_labels here).
p = sns.lineplot(data=df[["price", "date"]], x="date", y="price")
p.set(xlabel="Date", ylabel="Price")

In [None]:
def plot_monthly_prices(df, title=None):
    """Draw a line plot of the mean sale price per calendar month.

    Args:
        df: DataFrame with a datetime "date" column and a "price" column.
            The caller's frame is left unmodified.
        title (str, optional): Title for the plot; omitted when None.
    """
    with_month = df.copy()
    with_month["month"] = with_month["date"].dt.month.astype(int)
    # One row per month (the index), holding the mean price of that month.
    monthly_mean = with_month[["month", "price"]].groupby("month").mean()
    axes = sns.lineplot(monthly_mean, x="month", y="price")
    if title is None:
        return
    axes.set_title(title)

# Baseline: monthly mean price over the whole dataset.
plot_monthly_prices(df, title="Average price by month for all properties")

In [None]:
# For top-five zip codes: keep only the sales located in those zip codes.
mask_top_5_zips = df["zipcode"].isin(list(top_5_zips.index))
plot_monthly_prices(
    df[mask_top_5_zips],
    "Average price by month for top 5 zipcodes",
)

For the top zip-codes, price-wise it makes little difference whether we sell now or in autumn. There's a larger window of opportunity now though.

If we do renovate, there's a chance to sell in autumn, but if things take longer, the spring window is waiting just after New Year.

In [None]:
# For top five historic zip codes: keep only the sales located in those zip codes.
mask_top_historical_zips = df["zipcode"].isin(list(top_historical_zips.index))
plot_monthly_prices(
    df[mask_top_historical_zips],
    title="Average price by month for top historic zipcodes",
)

For the top historic zipcodes, the picture is slightly different. Autumn is about the worst time to sell!

Is this because of the low number of properties in the data?

In [None]:
# Repeat for the top historic zip codes that have at least 10 properties.
mask_top_historical_zips_min_count = df["zipcode"].isin(
    list(top_historical_zips_min_count.index)
)
plot_monthly_prices(
    df[mask_top_historical_zips_min_count],
    "Top historic zip codes with min 10 properties",
)

Conclusion: historic properties sold for lowest prices in autumn.

## Should I renovate?

Only a small percentage of the dataset has information on renovation date. So we should be suspicious.

In [None]:
def pc_ren(df, return_value=False):
    """Percentage of rows in ``df`` that have a non-null "yr_renovated" value.

    Args:
        df: DataFrame with a "yr_renovated" column.
        return_value (bool, optional): When True, return the percentage
            instead of printing it. Defaults to False.

    Returns:
        float | None: The percentage (NaN for an empty frame) when
        ``return_value`` is True; otherwise None, after printing the
        percentage rounded to two decimals.
    """
    total = df.index.size
    # Compute the value up front so both branches can use it; an empty frame
    # has no meaningful percentage, so report NaN rather than dividing by zero.
    percent_renovated = df["yr_renovated"].count() / total * 100 if total else float("nan")
    if return_value:
        return percent_renovated
    print(f"{round(percent_renovated, 2)}% of buildings have been renovated.")

# Share of all properties that have a renovation year recorded.
pc_ren(df)

What about historic buildings?

In [None]:
# Same share, restricted to the historic properties.
pc_ren(df[mask_historic])

This is surprising... so we should take it with a pinch of salt.

In [None]:
# add a column "really_renovated" – this is either the year renovated, or the year of build if NaN

# True where no renovation date is recorded (isna, not notna).
not_renovated_mask = df["yr_renovated"].isna()
bad_condition_mask = df["condition"] <= 3

df["really_renovated"] = df["yr_renovated"].fillna(df["yr_built"])
# Show a sample instead of dumping the whole frame.
df.head()

In [None]:
# Mean condition for every recorded renovation year.
condition_renovation = df.groupby("yr_renovated")[["condition"]].mean()
condition_renovation

In [None]:
# Mean condition against renovation year ("yr_renovated" is the index of the grouped frame).
sns.scatterplot(condition_renovation, x="yr_renovated", y="condition")

In [None]:
# find average condition by really_renovated year
condition_x_ren_year = df.groupby("really_renovated")[["condition"]].mean()
sns.scatterplot(data=condition_x_ren_year)

Does yr_renovated really add anything? Let's do the same, but for just the build year.

In [None]:
# Mean condition per build year, ignoring any renovation information.
condition_x_build_year = df.groupby("yr_built")[["condition"]].mean()
sns.scatterplot(data=condition_x_build_year)

All the scatter plots seem to show that the year of renovation/build is actually negatively correlated with condition. We can see that older properties are generally in better condition.

**We can't conclude very much from the data because we would expect condition to correlate with renovation... but the opposite is the case. We can assume that condition is valid, and yr_renovated is not.**

### Is there any correlation between condition and price?

In [None]:
# Separate into price bands, group by average condition, and plot a scatter graph
def mean_condition_x_price_band(df, bins=10, return_pb=False):
    """Summarise the condition of properties per equal-width price band.

    Prices are cut into ``bins`` equal-width bands labelled 1..bins, and the
    "condition" column is aggregated per band with ``groupby_mean_and_len``.

    Args:
        df: DataFrame with "price" and "condition" columns (not modified).
        bins (int, optional): Number of price bands. Defaults to 10.
        return_pb (bool, optional): Also return the per-band summary of
            "price". Defaults to False.

    Returns:
        The per-band condition summary, or the tuple
        ``(condition summary, price summary)`` when ``return_pb`` is True.
    """
    banded = df[["price", "condition"]].copy()
    banded["price_bands"] = pd.cut(banded["price"], bins=bins, labels=range(1, bins + 1))
    condition_by_band = groupby_mean_and_len(
        banded, use_cols=["price_bands", "condition"], groupby_col="price_bands"
    )
    if not return_pb:
        # Return the aggregated summary, not the raw banded rows.
        return condition_by_band
    price_by_band = groupby_mean_and_len(
        banded, use_cols=["price_bands", "price"], groupby_col="price_bands"
    )
    return (condition_by_band, price_by_band)
# Per-band condition summary plus the matching per-band price summary for all properties.
condition_x_priceband, price_bands = mean_condition_x_price_band(df, return_pb=True)

In [None]:
# Mean condition per price band; marker size and colour encode how many properties are in the band.
sns.scatterplot(data=condition_x_priceband, x=condition_x_priceband.index, y="mean_condition", size="count", hue="count")

It looks like there is a correlation between price and condition up to price band 8. Many of the price bands don't have much data. Let's make a cut-off of at least 100 items.

In [None]:
# Keep only the price bands backed by at least 100 properties.
data = condition_x_priceband.loc[condition_x_priceband["count"] >= 100]
sns.scatterplot(
    data=data,
    x=data.index,
    y="mean_condition",
    size="count",
    hue="count",
)

In [None]:
# find out price bands: express the mean price of each band in millions
price_bands["mean_price_million"] = price_bands["mean_price"].div(1e6)
price_bands

### Correlation between condition and price for historic properties?

In [None]:
# Same banding as for the full dataset, restricted to the historic properties.
condition_x_priceband_hist, price_bands_hist = mean_condition_x_price_band(
    df[mask_historic],
    return_pb=True,
)
sns.scatterplot(
    data=condition_x_priceband_hist,
    x=condition_x_priceband_hist.index,
    y="mean_condition",
    size="count",
    hue="count",
)

It seems to be the same story - let's see how much data we have for each price band

In [None]:
# Per-band price summary for the historic properties.
price_bands_hist

We're going to have to be less choosy if we want to see data for the highest price bands and go right down to 5!

In [None]:
# Keep only the historic price bands backed by at least 5 properties.
data = condition_x_priceband_hist.loc[condition_x_priceband_hist["count"] >= 5]
sns.scatterplot(
    data=data,
    x=data.index,
    y="mean_condition",
    size="count",
    hue="count",
)

### There were no properties that renovated then resold in the timeframe :(

Only 14 properties were renovated and then sold within the timeframe.

In [None]:
# Get "duplicates" - i.e. sold at least twice
# find relisted (i.e. duplicate) properties
# The frame has a default RangeIndex, which never contains duplicates, so
# repeated sales have to be detected on the "id" column instead.
relisted_mask = df["id"].duplicated(keep=False)

# Get those that were renovated in dataset timeframe
# NOTE(review): yr_renovated was parsed from a bare year, so it presumably
# falls on 1 January; renovations in the first calendar year of the dataset
# may therefore compare as earlier than the first sale date — confirm.
renovated_in_timeframe_mask = df["yr_renovated"] >= df["date"].min()

# Get those that weren't renovated in timeframe
# (rows without a renovation date compare as False above, so they land here)
not_renovated_in_timeframe_mask = ~renovated_in_timeframe_mask

# Only 14 renovated and sold within the timeframe
df[renovated_in_timeframe_mask]

# Exclude those that were renovated twice - only include those with an ID in both
#ids = pd.merge(left=df[renovated_in_timeframe_mask]["id"], right=df[not_renovated_in_timeframe_mask]["id"], how="inner", on="id")

# Groupby renovation status and average price
# Calculate difference
# Can do this in various segments

In [None]:
# They went for $0.7m on average - relatively low
mean_price_renovated = df.loc[renovated_in_timeframe_mask, "price"].mean()
# Round to the nearest thousand, then express in millions.
np.around(mean_price_renovated, -3) / 1e6

### Whether to renovate: Conclusion
We can conclude that:
* condition appears to have an impact on price, when you exclude the bands with very little data
* however, the information on condition is relatively limited - it all falls within quite a small range
* the data is thin above band 5, roughly the $2.5 million price tag
* the data is very thin for historical properties. HOWEVER -> a small increase in condition seems to correlate with a large increase in price.

* Now seems to be a good time to sell, and given the uncertainty in the data, and the risk that is associated with renovating (which could take longer than expected and cost more), it makes sense to pick properties for sale that are in a reasonably good condition already (as far above 3 as possible) but not to renovate them.

## Overall conclusion

* The best districts to sell for historical buildings are:
 * 