## Final Project Submission

Please fill out:
* Student name: Josh Blumer
* Student pace: Self Paced 
* Scheduled project review date/time: Feb. 12th 12:00P.M. 
* Instructor name: Jeff Herman
* Blog post URL:


# Exploratory Data Analysis

## Import Necessary Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from scipy import stats
import datetime as dt

## Import and Preview Data File

In [2]:
# Read the housing data CSV into a pandas DataFrame

df = pd.read_csv('kc_house_data.csv')
# Preview the first rows to confirm the file loaded with the expected columns
df.head()

Unnamed: 0,price,bedrooms,bathrooms,floors,waterfront,view,condition,grade,yrs_old,renovated,...,SD_MercerIsland,SD_Northshore,SD_Renton,SD_Riverview,SD_Seattle,SD_Shoreline,SD_SnoqualmieValley,SD_Tahoma,SD_Tukwila,SD_VashonIsland
0,221900.0,3,1.0,1.0,0.0,0.0,3,7,59,0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,538000.0,3,2.25,2.0,0.0,0.0,3,7,63,1,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
2,180000.0,2,1.0,1.0,0.0,0.0,3,6,82,1,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,604000.0,4,3.0,1.0,0.0,0.0,5,7,49,0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
4,510000.0,3,2.0,1.0,0.0,0.0,3,8,28,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Explore summary statistics for every column; include = 'all' reports on
# non-numeric columns as well as numeric ones

df.describe(include = 'all')

In [None]:
# Explore file metadata: column names, non-null counts and data types

df.info()

### The sqft_basement and date columns are object data types which will require conversion to be analyzed and represented visually.

## Begin EDA and Cleaning

In [None]:
# Change sqft_basement to a numeric datatype. errors = 'coerce' turns any entry
# that cannot be parsed as a number into NaN instead of raising an error, so
# this step can introduce new missing values.

df['sqft_basement'] = pd.to_numeric(df['sqft_basement'], errors = 'coerce')

In [None]:
# Change date datatype to datetime. pandas infers the string format on its
# own; the infer_datetime_format flag is deprecated in pandas 2.x and only
# triggers a warning there, so it is left out.

df['date'] = pd.to_datetime(df['date'])

In [None]:
# Verify change: the dtype printed under the values should now be a datetime type

df['date'].head()

In [None]:
# Derive the age of each house at the time of sale by subtracting yr_built
# from the year of the sale date. This gives a continuous numeric feature in
# place of the raw date. The .dt accessor extracts the year for the whole
# column at once instead of looping over every timestamp in Python.

df['yrs_old'] = df['date'].dt.year - df['yr_built']
df['yrs_old'].head()

In [None]:
# The sale date is no longer needed now that yrs_old has been derived from it

df.drop(columns = 'date', inplace = True)

In [None]:
# Check to verify change: date should be gone and yrs_old should be listed

df.info()

### Inspect Columns for Missing Values

In [None]:
# Count the missing entries in each column

df.isna().sum()

In [None]:
# Fill the gaps in 'view' with the most frequent value of that column

most_common_view = df['view'].mode().iloc[0]
df['view'] = df['view'].fillna(most_common_view)

In [None]:
# Fill the gaps in 'waterfront' with the most frequent value of that column

most_common_waterfront = df['waterfront'].mode().iloc[0]
df['waterfront'] = df['waterfront'].fillna(most_common_waterfront)

In [None]:
# Binarize yr_renovated into a renovated flag (1 = has a renovation year,
# 0 = none) and drop yr_renovated.
# Missing years are filled with 0 first: NaN never compares equal to 0.0, so
# testing the raw column would flag every house with an unknown renovation
# year as renovated.

df['renovated'] = (df['yr_renovated'].fillna(0) != 0).astype(int)
df.drop(['yr_renovated'], axis = 1, inplace = True)

In [None]:
# Binarize sqft_basement into a basement flag (1 = has basement area,
# 0 = none) and drop sqft_basement.
# Entries that could not be parsed as numbers were coerced to NaN earlier;
# they are filled with 0 first because NaN never compares equal to 0.0, which
# would otherwise flag every unknown value as having a basement.
# NOTE(review): this assumes an unknown basement size means "no basement" --
# confirm that is the intended imputation.

df['basement'] = (df['sqft_basement'].fillna(0) != 0).astype(int)
df.drop(['sqft_basement'], axis = 1, inplace = True)

In [None]:
# Verify null entries were edited. Any remaining missing values will be removed
# after checking features for multicollinearity

df.isnull().sum()

## Check Features for Multicollinearity

In [None]:
# Plot every column against every other column to look for linear relationships

pd.plotting.scatter_matrix(frame = df, figsize = (18, 18))
plt.show()

In [None]:
# Examine correlations between the independent variables and the dependent
# variable (price) for feature selection, and between pairs of independent
# variables for multicollinearity

df.corr()

In [None]:
# Flag every pair of variables whose absolute correlation exceeds 0.75, the
# threshold used here for multicollinearity, to decide which variables to remove

df.corr().abs() > 0.75

`sqft_living` is correlated with `sqft_above` and `sqft_living15`. It is the feature most highly correlated with the target variable, but it will be removed because of its high correlation with several other variables. We will also remove the `id` column because it doesn't provide any useful information to the model, and `yr_built` because it is highly correlated with `yrs_old`.

In [None]:
# Drop the square-footage columns together with id and yr_built

df.drop(
    columns = [
        'sqft_living',
        'sqft_above',
        'sqft_living15',
        'sqft_lot',
        'sqft_lot15',
        'id',
        'yr_built',
    ],
    inplace = True,
)

In [None]:
# Review changes: the dropped columns should no longer appear in the preview

df.head()

## Examine Feature Variables for Outliers

In [None]:
# Examine each remaining feature's relationship with the target variable to
# check the linearity assumption and to spot visible outliers.
# price itself is skipped because plotting the target against itself carries
# no information. No legend is drawn: the scatter has no labelled series, so
# plt.legend() would only emit a "no handles" warning for every figure.

for i, col in enumerate(df.columns.drop('price')):
    plt.figure(i)
    plt.scatter(x = df[col], y = df['price'])
    plt.ylabel('price')
    plt.xlabel(col)

In [None]:
# Plot a kernel density estimate of every column, one figure per column, to
# inspect the shape of each distribution
# NOTE(review): recent seaborn releases appear to deprecate the `bw` keyword in
# favour of `bw_method` / `bw_adjust` -- confirm against the installed version
for i, col in enumerate(df.columns):
    plt.figure(i)
    sns.kdeplot(df[col], bw = 0.5)

In [None]:
# List the remaining column names
df.columns

In [None]:
# Collect the categorical features in a dataframe of their own

df_cat = df.loc[:, [
    'bedrooms',
    'bathrooms',
    'floors',
    'waterfront',
    'view',
    'condition',
    'grade',
    'renovated',
    'basement',
]]

In [None]:
# Draw one box plot per categorical feature, showing how price is distributed
# across that feature's values
for fig_num, feature in enumerate(df_cat.columns):
    plt.figure(fig_num)
    sns.boxplot(x = df_cat[feature], y = df['price'])

In [None]:
# The price distribution is heavily skewed, so to preserve model
# interpretability and accuracy, outliers are filtered manually by keeping only
# the rows whose price lies between 0 and 1,000,000 (both ends included)

df = df[df['price'].between(0, 1000000)]

In [None]:
# Review distribution change.
# The column is passed as x explicitly: newer seaborn versions treat a
# positional Series as `data`, which changes how the plot is drawn, while the
# x keyword gives the same horizontal box plot across versions.

sns.boxplot(x = df['price'])

In [None]:
# Count the rows remaining after the price filter
df.price.shape

In [None]:
# Inspect the bedrooms distribution for outliers. The column is passed as x
# explicitly because newer seaborn versions treat a positional Series as `data`.
sns.boxplot(x = df['bedrooms'])

In [None]:
# Remove the outliers by keeping only rows with 0 to 10 bedrooms (both ends
# included)

df = df[df['bedrooms'].between(0, 10)]

In [None]:
# Count the rows remaining after the bedrooms filter
df.bedrooms.shape

# Run Baseline Regression

In [None]:
# The baseline regression will be our frame of reference for the model's
# expected performance once engineered features are added

# Save file alterations for continued processing

In [None]:
# Save file to continue work in another notebook; index = False keeps the
# dataframe's row index out of the CSV

df.to_csv('kc2_house_data.csv', index = False)