# Exploratory Data Analysis of the King County Housing Data - Regression Model

In this notebook, the 'prettier' figures used in the presentation are plotted. They only contain information already explored in the other notebooks.

## Imports

In [11]:
import numpy as np
import pandas as pd
from plotnine import *
import warnings
warnings.filterwarnings('ignore')

In [96]:
# Read the cleaned King County house-price frame that every figure below is drawn from.
# NOTE(review): unpickling can execute arbitrary code, so only load a file from a trusted source.
data = pd.read_pickle(filepath_or_buffer='King_County_House_prices_dataset_CLEANED.pkl')

# For the business case, the final regression models explored price per sqft.
# The derived column is left disabled here; none of the figures in this section reference it.
#data['price_sqft'] = data['price'] / data['sqft_living']

## Figure creation

In [59]:
# Price distribution (in 1000 USD) per construction year: one boxplot per 'yr_built' value.
p = (ggplot(data, aes('yr_built', 'price/1000', group=1))
# First pass: boxes drawn in orange with a thickened median bar (fatten=3).
+ geom_boxplot(aes(group='yr_built'),
               outlier_size=0.01, outlier_alpha=0.2,
               outlier_color="#595959", color="#dd8047", fatten=3)
# Second pass: the same boxes redrawn in grey, unfilled and with fatten=0, so the
# grey outline covers the orange one while the orange median bar stays visible.
+ geom_boxplot(aes(group='yr_built'), outlier_size=0.01, outlier_alpha=0.2,
               color="#595959", fatten=0, fill=None)
# Zoom the y axis instead of limiting the scale: ylim() would discard every sale
# above 1500 before the boxplot statistics are computed (and the resulting warning
# is silenced by the notebook-wide warnings filter), which distorts the boxes.
+ coord_cartesian(ylim=(0, 1500))
+ labs(x="year built", y="price in 1000 USD")
+ theme_classic()
+ theme(axis_line=element_line(color="#595959"),
        axis_ticks=element_line(color="#595959"),
        text=element_text(color="#595959", size=15)))

ggsave(plot=p, filename='yr_price.png', dpi=300, width=11.5, height=4.5)

In [145]:
# Number of sales per construction year, as a histogram over 'yr_built' with bins of width 1.
p = ggplot(data, aes('yr_built'))
p += geom_histogram(fill="white", color="#595959", binwidth=1)
p += labs(x="year built", y="amount of house sales in dataset")
# The classic base theme goes first so the grey overrides below are applied on top of it.
p += theme_classic()
p += theme(
    text=element_text(color="#595959", size=15),
    axis_line=element_line(color="#595959"),
    axis_ticks=element_line(color="#595959"),
)

# Write the figure to disk in the wide presentation format.
p.save(filename='yr_count.png', dpi=300, width=11.5, height=4.5)

In [190]:
# Scatter of every sale at its longitude/latitude, coloured by construction year.
# The colour scale diverges around 1950: orange towards older houses, white near
# the midpoint, blue towards newer houses.
p = ggplot(data, aes('long', 'lat'))
# Small, mostly transparent points keep dense areas readable.
p += geom_point(aes(colour='yr_built'), size=0.1, alpha=0.3)
p += theme_classic()
p += labs(
    title="Sales of old and new Houses in King County",
    x="degree longitude",
    y="degree latitude",
    colour="year built",
)
p += scale_color_gradient2(midpoint=1950,
                           low="#dd8047",
                           mid="white",
                           high="#94b6d2")
p += theme(
    legend_position='right',
    plot_title=element_text(size=20),
    text=element_text(color="#595959", size=10),
    axis_line=element_line(color="#595959"),
    axis_ticks=element_line(color="#595959"),
)

p.save(filename='map.png', dpi=300, width=8, height=5)

In [102]:
# Per-zipcode price summary for the zipcode figure.
# Resulting columns: zipcode, order, price (median), std, price_upper_bound, price_lower_bound.
#
# Median and standard deviation are computed in ONE aggregation and sorted ONCE by
# median price, so every statistic stays attached to its own zipcode. (Sorting the
# standard deviations separately and gluing them on by position would pair each
# zipcode with the spread of a different zipcode.)
zip_subset = (
    data.groupby('zipcode')
    .filter(lambda x: len(x) > 1)   # keep only zipcodes with more than one sale
    .groupby('zipcode')['price']
    .agg(['median', 'std'])
    .rename(columns={'median': 'price'})
    .sort_values('price')           # ascending median price
    .reset_index()                  # zipcode becomes a regular column, index is 0..n-1
)
zip_subset['std'] = zip_subset['std'].round()

# Rank by median price; used as the x position so the plot reads from cheapest to most expensive.
zip_subset.insert(1, 'order', range(len(zip_subset)))

# Error-bar limits: median price plus/minus one standard deviation.
zip_subset['price_upper_bound'] = zip_subset['price'] + zip_subset['std']
zip_subset['price_lower_bound'] = zip_subset['price'] - zip_subset['std']

# Axis tick labels, in the same order as the rows of zip_subset.
labels = [str(i) for i in zip_subset['zipcode']]

In [124]:
# Median price per zipcode (in 1000 USD), ordered by ascending median price,
# with error bars spanning price_lower_bound .. price_upper_bound.
p = (ggplot(zip_subset)
+ geom_errorbar(aes(x='order', ymax='price_upper_bound/1000', ymin='price_lower_bound/1000'), color="#595959")
+ geom_point(aes('order', 'price/1000'), color="#595959")
# Highlight the last four rows (the highest median prices) in blue ...
+ geom_point(zip_subset[-4:], aes('order', 'price/1000'), color="#94b6d2")
# ... and paint the very last row in orange on top of its blue point.
+ geom_point(zip_subset[-1:], aes('order', 'price/1000'), color="#dd8047")
# One tick per zipcode. The breaks are derived from `labels` instead of a fixed
# count so the two sequences always have the same length.
+ scale_x_continuous(breaks=range(len(labels)), labels=labels)
+ theme_classic()
+ labs(x="zipcode", y="median price in 1000 USD")
+ theme(axis_text_x=element_text(rotation=90, size=8),
        axis_line=element_line(color="#595959"),
        axis_ticks=element_line(color="#595959"),
        text=element_text(color="#595959", size=15)))

ggsave(plot=p, filename='zipcodes.png', dpi=300, width=11.5, height=4.5)

In [130]:
# Price (in 1000 USD) per building grade: one grey boxplot for each 'grade' value.
p = ggplot(data, aes('grade', 'price/1000', group=1))
# The layer-level group mapping replaces the plot-level group=1, giving one box per grade.
p += geom_boxplot(aes(group='grade'),
                  width=0.5, color="#595959",
                  outlier_color="#595959", outlier_alpha=0.2, outlier_size=0.3)
p += theme_classic()
p += labs(x="building grade", y="price in 1000 USD")
p += theme(
    text=element_text(color="#595959", size=15),
    axis_line=element_line(color="#595959"),
    axis_ticks=element_line(color="#595959"),
)

p.save(filename='grade.png', dpi=300, width=11.5, height=4.5)

In [132]:
# Price (in 1000 USD) per building condition: one grey boxplot for each 'condition' value.
p = ggplot(data, aes('condition', 'price/1000', group=1))
# The layer-level group mapping replaces the plot-level group=1, giving one box per condition.
p += geom_boxplot(aes(group='condition'),
                  width=0.5, color="#595959",
                  outlier_color="#595959", outlier_alpha=0.2, outlier_size=0.3)
p += theme_classic()
p += labs(x="building condition", y="price in 1000 USD")
p += theme(
    text=element_text(color="#595959", size=15),
    axis_line=element_line(color="#595959"),
    axis_ticks=element_line(color="#595959"),
)

p.save(filename='condition.png', dpi=300, width=11.5, height=4.5)

In [136]:
# Price (in 1000 USD) against the number of bedrooms: one grey boxplot per bedroom
# count, overlaid with a dashed orange smoothing line.
p = ggplot(data, aes('bedrooms', 'price/1000'))
p += geom_boxplot(aes(group='bedrooms'),
                  width=0.5, color="#595959",
                  outlier_color="#595959", outlier_alpha=0.2, outlier_size=0.3)
# The smoother uses the plot-level x/y mapping; se=False turns off the confidence band.
p += geom_smooth(se=False, linetype='dashed', size=.75, color="#dd8047")
p += theme_classic()
p += labs(x="number of bedrooms", y="price in 1000 USD")
p += theme(
    text=element_text(color="#595959", size=15),
    axis_line=element_line(color="#595959"),
    axis_ticks=element_line(color="#595959"),
)

p.save(filename='bedrooms.png', dpi=300, width=11.5, height=4.5)

In [139]:
# Price (in 1000 USD) against the number of bathrooms: one grey boxplot per bathroom
# count, overlaid with a dashed blue smoothing line.
p = ggplot(data, aes('bathrooms', 'price/1000'))
p += geom_boxplot(aes(group='bathrooms'),
                  color="#595959",
                  outlier_color="#595959", outlier_alpha=0.2, outlier_size=0.3)
# The smoother uses the plot-level x/y mapping; se=False turns off the confidence band.
p += geom_smooth(se=False, linetype='dashed', size=.75, color="#94b6d2")
p += theme_classic()
p += labs(x="number of bathrooms", y="price in 1000 USD")
p += theme(
    text=element_text(color="#595959", size=15),
    axis_line=element_line(color="#595959"),
    axis_ticks=element_line(color="#595959"),
)

p.save(filename='bathrooms.png', dpi=300, width=11.5, height=4.5)