## Import Data

In [26]:
# Setup Packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math
from scipy import stats
from scipy.stats import norm, skew
from scipy.stats import pearsonr
from sklearn.impute import SimpleImputer

%matplotlib inline

# Used for auto-formatting code
# terminal - pip install nb_black
%load_ext nb_black

# Display all rows and columns
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)

The nb_black extension is already loaded. To reload it, use:
  %reload_ext nb_black


<IPython.core.display.Javascript object>

In [4]:
house_data = pd.read_csv("train.csv")

<IPython.core.display.Javascript object>

## Data Manipulation

In [5]:
# Log transform response variable
house_data["SalePrice_log"] = np.log(house_data["SalePrice"])

<IPython.core.display.Javascript object>

### Impute Continuous Variables

In [22]:
#LotFrontage
idx = house_data.loc[ pd.isna(house_data["LotFrontage"]), :].index
house_data["LotFrontage"][idx] = house_data["LotFrontage"].mean()

#MasVnrArea
idx = house_data.loc[ pd.isna(house_data["MasVnrArea"]), :].index
house_data["MasVnrArea"][idx] = house_data["MasVnrArea"].mean()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


<IPython.core.display.Javascript object>

## Feature Engineering

In [51]:
#Total area of house = Above ground living area + basement living area
house_data["TotalSF"] = house_data["GrLivArea"] + house_data["TotalBsmtSF"]

#Total finished area = Above ground living area + finished basement area
#house_data["TotalFinishSF"] = house_data["GrLivArea"] + house_data["BsmtFinSF1"]
    #NOTE: BsmtFinSF1 isn't the finished basement area, just SF of Type 1 finish
    
#Has Second Floor = 0 (no) or 1 (yes)
house_data['Has2ndFl'] = house_data['2ndFlrSF']
house_data['Has2ndFl'].loc[ house_data['Has2ndFl'] > 0 ] = 1

#Age = Year sold - year built
house_data["Age"] = house_data["YrSold"] + house_data["YearBuilt"]

#AgeRemod = Year sold - year remodeled
house_data["Age"] = house_data["YrSold"] + house_data["YearRemodAdd"]

#Total porch SF = OpenPorchSF + EnclosedSF + 3SsnPorch + ScreenPorch
house_data["TotPorchSF"] = house_data["OpenPorchSF"] + house_data["EnclosedPorch"] + house_data["3SsnPorch"] + house_data["ScreenPorch"]

#Has Porch = 0 (no) or 1 (yes)
house_data['HasPorch'] = house_data['TotPorchSF']
house_data['HasPorch'].loc[ house_data['HasPorch'] > 0 ] = 1

#Has Deck = 0 (no) or 1 (yes)
house_data['HasDeck'] = house_data['WoodDeckSF']
house_data['HasDeck'].loc[ house_data['HasDeck'] > 0 ] = 1

#Total bathrooms = Full bath + 0.5*Halfbath (for both house and basement)
house_data['TotBaths'] = house_data['FullBath'] + house_data['BsmtFullBath'] + 0.5*house_data['HalfBath'] + 0.5*house_data['BsmtHalfBath']

#Has Pool = 0 (no) or 1 (yes)
house_data['HasPool'] = house_data['PoolArea']
house_data['HasPool'].loc[ house_data['HasPool'] > 0 ] = 1

#Total number of car garage = 1, 2, 3+ cars
house_data['TotCarGarage'] = house_data['GarageCars']
house_data['TotCarGarage'].loc[ house_data['TotCarGarage'] >= 3 ] = 3

#HasFirePlace = 0 (no), 1 (yes, not excellent quality), or 2 (excellent quality)
house_data['HasFirePlace'] = [1] * len(house_data['FireplaceQu'])
idx_none = house_data.index[house_data['FireplaceQu'].isna()]  # indexes of 0 fireplaces
idx_excel = house_data.index[ house_data['FireplaceQu'] == 'Ex' ]  # indexes of excellent
house_data['HasFirePlace'].idx_none = 0
house_data['HasFirePlace'].idx_excel = 2



<IPython.core.display.Javascript object>

## Remove Variables

In [28]:
var_toremove = ['MoSold', 
                'OverallCond', 
                'BedroomAbvGr', 
                'KitchenAbvGr', 
                'Fireplaces', 
                'PoolArea', 
                'MoSold', 
                'YrSold', 
                'GarageCars', 
                'Heating', 
                'Fence', 
                'MiscFeature', 
                'SaleType', 
                'Electrical', 
                'PoolQC', 
                'BsmtFinType2', 
                'Functional',
                'BsmtQual',
                'BsmtExposure',
                'Functional',
                'GarageCond',
                'SaleCondition',
                'Street']




<IPython.core.display.Javascript object>