In [None]:
import pandas as pd
import numpy as np
# NOTE(review): wildcard import — `replace_null_basement`, `to_int` and `to_posix`, which the
# cleaning cells below call, are not defined in this notebook and presumably come from here.
from helpers import *
import statsmodels.api as sm
from statsmodels.formula.api import ols
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
# Matplotlib 3.6 renamed its bundled seaborn style sheets to "seaborn-v0_8*" and 3.8 removed
# the old names, so prefer the new name and fall back to the legacy one on older installs.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')

# Load the raw housing data; the path is relative, so the CSV must sit in the
# working directory the notebook is run from.
df = pd.read_csv("kc_house_data.csv")

## Data Cleaning

In [None]:
# Missing values in these three columns are replaced with 0.
zero_fill_columns = ['view', 'waterfront', 'yr_renovated']
for column_name in zero_fill_columns:
    df[column_name] = df[column_name].fillna(0)

In [None]:
# Rebuild `sqft_basement` one row at a time (axis=1 hands each row to the callable).
# Presumably this fills in missing/placeholder basement sizes — verify in helpers.py.
# NOTE(review): `replace_null_basement(df)` is *called* here, so `DataFrame.apply` receives its
# return value rather than the helper itself; this only works if the helper returns a per-row
# callable. Confirm against helpers.py.
df["sqft_basement"] = df.apply(replace_null_basement(df), axis = 1)

In [None]:
# Run the `to_int` helper over every value of the zero-filled columns.
int_columns = ('waterfront', 'view', 'yr_renovated')
for column_name in int_columns:
    df[column_name] = df[column_name].apply(to_int)

In [None]:
# Derive `posix_date` from the `date` values as loaded; this runs before the
# column is converted with pd.to_datetime in a later cell.
df['posix_date'] = df['date'].apply(to_posix)

In [None]:
# Overwrite `date` with parsed pandas datetimes (the original values are replaced in place).
df['date'] = pd.to_datetime(df['date'])

## Data Overview

In [None]:
# Label-based column slice: keeps every column from `price` through `posix_date`
# (both endpoints included) and leaves out whatever precedes `price`.
df_corr = df.loc[:, "price":"posix_date"]
# Show the selected column names, in order.
df_corr.columns

## Our Random Correlation Heatmap

In [None]:
# Pearson correlation matrix of the selected columns. rowvar=False treats each
# column as a variable and each row as an observation; the result is a bare ndarray.
map_corr = np.corrcoef(df_corr, rowvar=False)

In [None]:
# Human-readable names for the correlation matrix. The same list labels both rows and
# columns, so the two heatmap axes cannot drift apart (the last entry used to read
# 'Date in Posix' on the columns but 'Posix Date' on the index).
# NOTE(review): the order is assumed to follow the column order of `df_corr` — confirm
# against the `df_corr.columns` output.
corr_labels = ['Price', 'Bedrooms', 'Bathrooms', 'Living SqFt', 'Lot SqFt', 'Floors',
               'Waterfront', 'View', 'Condition', 'Grade', 'Above-Ground SqFt',
               'Basement SqFt', 'Year Built', 'Year Renovated', 'Zipcode', 'Lat', 'Long',
               'Neighbor Lv', 'Neighbor Lot', 'Posix Date']

# Wrap the raw correlation array in a DataFrame labelled on both axes.
map_corr = pd.DataFrame(map_corr, columns=corr_labels, index=corr_labels)

map_corr

In [None]:
# Draw the labelled correlation matrix as a heatmap on a 12x10-inch figure.
heatmap_fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(data=map_corr, ax=ax)
# Tilt the x tick labels so the longer names do not overlap.
plt.xticks(rotation=45)
plt.show()

#### Thoughts from the Correlation Heatmap:
- `sqft_above` and `sqft_basement` can probably be removed: both are already accounted for in `sqft_living`, so keeping all three would add redundant, highly correlated predictors.

In [None]:
# Preview the first ten rows of the cleaned frame.
df.head(10)

In [None]:
# Column dtypes and non-null counts, to check the result of the cleaning steps.
df.info()

## Histograms

In [None]:
# Store the natural log of `price` on `df`, then plot the raw and logged
# distributions together for comparison.
df["log_price"] = np.log(df["price"])
price_columns = ["price", "log_price"]
df[price_columns].hist(figsize=(12, 6))

In [None]:
# Distribution of `date`.
df["date"].hist(figsize=(6, 6))

In [None]:
# Distribution of `bedrooms`.
df["bedrooms"].hist(figsize=(6, 6))

In [None]:
# Distribution of `bathrooms`.
df["bathrooms"].hist(figsize=(6, 6))

In [None]:
# Distribution of `sqft_living`.
df["sqft_living"].hist(figsize=(6, 6))

In [None]:
# Distribution of `sqft_lot`.
df["sqft_lot"].hist(figsize=(6, 6))

In [None]:
# Distribution of `floors`.
df["floors"].hist(figsize=(6, 6))

In [None]:
# Distribution of `waterfront`.
df["waterfront"].hist(figsize=(6, 6))

In [None]:
# Distribution of `view`.
df["view"].hist(figsize=(6, 6))

In [None]:
# Distribution of `condition`.
df["condition"].hist(figsize=(6, 6))

In [None]:
# Distribution of `grade`.
df["grade"].hist(figsize=(6, 6))

In [None]:
# Store the natural log of `sqft_above` on `df`, then plot the raw and logged
# distributions together for comparison.
df["log_sqft_above"] = np.log(df["sqft_above"])
above_columns = ["sqft_above", "log_sqft_above"]
df[above_columns].hist(figsize=(12, 6))

In [None]:
# Keep only rows with a positive basement area, so the log below is taken over
# strictly positive values.
# .copy() makes the subset an independent frame: assigning a new column onto a filtered
# selection of `df` otherwise raises pandas' SettingWithCopyWarning.
df_with_basement = df.loc[df['sqft_basement'] > 0].copy()

# Natural log of the basement area, plotted next to the raw values for comparison.
df_with_basement["log_sqft_basement"] = np.log(df_with_basement["sqft_basement"])
df_with_basement[["sqft_basement", "log_sqft_basement"]].hist(figsize = [12,6])

In [None]:
# Distribution of `yr_built`.
df["yr_built"].hist(figsize=(6, 6))

In [None]:
# Distribution of `yr_renovated` (missing values were filled with 0 during cleaning).
df["yr_renovated"].hist(figsize=(6, 6))

In [None]:
# Distribution of `zipcode`.
df["zipcode"].hist(figsize=(6, 6))

In [None]:
# Distribution of `lat`.
df["lat"].hist(figsize=(6, 6))

In [None]:
# Distribution of `long`.
df["long"].hist(figsize=(6, 6))

In [None]:
# Distribution of `sqft_living15`.
df["sqft_living15"].hist(figsize=(6, 6))

In [None]:
# Distribution of `sqft_lot15`.
df["sqft_lot15"].hist(figsize=(6, 6))