In [35]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split

# Project Overview

## Business Case/Problem

## Data Description

## Data Load

In [6]:
# Load the raw house data CSV into a DataFrame. The path is relative, so it
# resolves against the current working directory of the kernel.
kc_house_data = pd.read_csv('data/kc_house_data.csv')

In [7]:
# Preview the first five rows of the raw frame.
kc_house_data.head(n=5)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,7129300520,10/13/2014,221900.0,3,1.0,1180,5650,1.0,,0.0,...,7,1180,0.0,1955,0.0,98178,47.5112,-122.257,1340,5650
1,6414100192,12/9/2014,538000.0,3,2.25,2570,7242,2.0,0.0,0.0,...,7,2170,400.0,1951,1991.0,98125,47.721,-122.319,1690,7639
2,5631500400,2/25/2015,180000.0,2,1.0,770,10000,1.0,0.0,0.0,...,6,770,0.0,1933,,98028,47.7379,-122.233,2720,8062
3,2487200875,12/9/2014,604000.0,4,3.0,1960,5000,1.0,0.0,0.0,...,7,1050,910.0,1965,0.0,98136,47.5208,-122.393,1360,5000
4,1954400510,2/18/2015,510000.0,3,2.0,1680,8080,1.0,0.0,0.0,...,8,1680,0.0,1987,0.0,98074,47.6168,-122.045,1800,7503


In [8]:
# Summarise dtypes and non-null counts to spot columns with missing values.
kc_house_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 21 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   id             21597 non-null  int64  
 1   date           21597 non-null  object 
 2   price          21597 non-null  float64
 3   bedrooms       21597 non-null  int64  
 4   bathrooms      21597 non-null  float64
 5   sqft_living    21597 non-null  int64  
 6   sqft_lot       21597 non-null  int64  
 7   floors         21597 non-null  float64
 8   waterfront     19221 non-null  float64
 9   view           21534 non-null  float64
 10  condition      21597 non-null  int64  
 11  grade          21597 non-null  int64  
 12  sqft_above     21597 non-null  int64  
 13  sqft_basement  21597 non-null  object 
 14  yr_built       21597 non-null  int64  
 15  yr_renovated   17755 non-null  float64
 16  zipcode        21597 non-null  int64  
 17  lat            21597 non-null  float64
 18  long  

Features to remove:
- 'yr_renovated': a large number of missing values (only 17,755 of 21,597 rows are non-null).

## Data Cleaning

Let's review columns to see which ones make the most sense to keep.

In [10]:
# List every column name in the raw frame.
kc_house_data.columns

Index(['id', 'date', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
       'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade',
       'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode',
       'lat', 'long', 'sqft_living15', 'sqft_lot15'],
      dtype='object')

In [11]:
# Names of raw-data columns to keep: everything except 'date', 'yr_renovated',
# 'lat', 'long', 'sqft_living15' and 'sqft_lot15'.
# NOTE(review): filter_columns defines its own narrower list (without 'id',
# 'view' and 'sqft_basement') and does not read this variable -- confirm
# whether this cell is still needed.
columns_to_keep = [
    'id', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
    'sqft_lot', 'floors', 'waterfront', 'view', 'condition',
    'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'zipcode',
]

In [30]:
def filter_columns(df, columns=None):
    """Return a copy of ``df`` restricted to the selected columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains at least the requested columns.
    columns : sequence of str, optional
        Column names to keep, in output order. When omitted, the default
        list below is used: the target ``'price'`` followed by eleven
        predictor columns.

    Returns
    -------
    pandas.DataFrame
        A new frame holding only the selected columns. It is an explicit
        copy, so later assignments on the result neither touch ``df`` nor
        raise ``SettingWithCopyWarning``.

    Raises
    ------
    KeyError
        If any requested column is missing from ``df``.
    """
    if columns is None:
        columns = [
            'price',
            'bedrooms',
            'bathrooms',
            'sqft_living',
            'sqft_lot',
            'floors',
            'waterfront',
            'condition',
            'grade',
            'sqft_above',
            'yr_built',
            'zipcode',
        ]
    # list(...) lets callers pass a tuple or Index; a bare tuple would
    # otherwise be interpreted by pandas as a single (MultiIndex) key.
    return df[list(columns)].copy()

In [31]:
# Reduce the raw frame to the columns chosen in filter_columns, then preview it.
house_data_filtered = kc_house_data.pipe(filter_columns)
house_data_filtered.head(n=5)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,grade,sqft_above,yr_built,zipcode
0,221900.0,3,1.0,1180,5650,1.0,,3,7,1180,1955,98178
1,538000.0,3,2.25,2570,7242,2.0,0.0,3,7,2170,1951,98125
2,180000.0,2,1.0,770,10000,1.0,0.0,3,6,770,1933,98028
3,604000.0,4,3.0,1960,5000,1.0,0.0,5,7,1050,1965,98136
4,510000.0,3,2.0,1680,8080,1.0,0.0,3,8,1680,1987,98074


In [32]:
# Re-check dtypes and non-null counts after filtering.
house_data_filtered.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   price        21597 non-null  float64
 1   bedrooms     21597 non-null  int64  
 2   bathrooms    21597 non-null  float64
 3   sqft_living  21597 non-null  int64  
 4   sqft_lot     21597 non-null  int64  
 5   floors       21597 non-null  float64
 6   waterfront   19221 non-null  float64
 7   condition    21597 non-null  int64  
 8   grade        21597 non-null  int64  
 9   sqft_above   21597 non-null  int64  
 10  yr_built     21597 non-null  int64  
 11  zipcode      21597 non-null  int64  
dtypes: float64(4), int64(8)
memory usage: 2.0 MB


Missing data in the 'waterfront' column will need to be addressed. ('view' also has missing values in the raw data, but it is dropped by `filter_columns`.)

In [28]:
# Distinct values of 'waterfront', including NaN for missing entries.
house_data_filtered['waterfront'].unique()

array([nan,  0.,  1.])

Before going further, we should set aside a hold-out set.

In [34]:
# Separate the target ('price') from the predictors. Dropping 'price' by name
# stays correct if the column order or count changes, unlike the positional
# slice iloc[:, 1:12].
target = house_data_filtered['price']
features = house_data_filtered.drop(columns='price')

# Hold out 10% of the rows. A fixed random_state makes the split identical on
# every Restart & Run All, so downstream results are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    features, target, test_size=0.1, random_state=42
)

Unnamed: 0,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,condition,grade,sqft_above,yr_built,zipcode
0,3,1.0,1180,5650,1.0,,3,7,1180,1955,98178
1,3,2.25,2570,7242,2.0,0.0,3,7,2170,1951,98125
2,2,1.0,770,10000,1.0,0.0,3,6,770,1933,98028
3,4,3.0,1960,5000,1.0,0.0,5,7,1050,1965,98136
4,3,2.0,1680,8080,1.0,0.0,3,8,1680,1987,98074


## Model A

## Feature Engineering

## Model Training

## Model Evaluation

## Conclusions/Summary