### Part 1: Import necessary libraries and load the dataset

In [2]:
#import libraries

import pandas as pd
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import sklearn as sk

In [3]:
#load training dataset

df = pd.read_csv("train.csv")
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
#Get the shape of the dataset
print(df.shape)

(1460, 81)


### Part 2: Recode categorical variables into numbers

In [12]:
#create replacement library

cleanup = {"LotShape" : {"Reg":0, "IR1":-1, "IR2":-2, "IR3":-3},
           "LandSlope": {"Gtl":0, "Mod":-1, "Sev":-2},
           "HouseStyle": {"1Story":1, "1.5Fin":1.5, "1.5Unf":1.25, "2Story":2, "2.5Fin":2.5, "2.5Unf":2.25, "SFoyer": -1, "SLvl": -1.5}, 
           #code external quality based on the numbers assigned to OverallCond & OverallQual
           "ExterQual": {"Ex": 9, "Gd": 7, "TA": 5, "Fa": 3, "Po": 1},
           "ExterCond" : {"Ex": 9, "Gd": 7, "TA": 5, "Fa": 3, "Po": 1}, 
           "BsmtQual" : {"Ex": 9, "Gd": 7, "TA": 5, "Fa": 3, "Po": 1, "NA": 0},
           "BsmtCond": {"Ex": 9, "Gd": 7, "TA": 5, "Fa": 3, "Po": 1, "NA": 0},
           #split levels or foyers typically score average or above in bsmtExposure
           "BsmtExposure": {"Gd": 4, "Av": 3, "Mn": 2, "No": 1, "NA": 0},
           "BsmtFinType1": {"GLQ": 6, "ALQ": 5, "BLQ":4, "Rec":3, "LwQ":2, "Unf":1, "NA":0},
           "BsmtFinType2": {"GLQ": 6, "ALQ": 5, "BLQ":4, "Rec":3, "LwQ":2, "Unf":1, "NA":0},
           "HeatingQC": {"Ex": 9, "Gd": 7, "TA": 5, "Fa": 3, "Po": 1},
           "CentralAir": {"N": 0, "Y": 1},
           "KitchenQual": {"Ex": 9, "Gd": 7, "TA": 5, "Fa": 3, "Po": 1},
           "Functional": {"Typ":0, "Min1":-1, "Min2":-2, "Mod":-3, "Maj1":-4, "Maj2":-5, "Sev":-6, "Sal":-7},
           "FireplaceQu": {"Ex": 9, "Gd": 7, "TA": 5, "Fa": 3, "Po": 1, "NA": 0},
           "GarageFinish": {"Fin":3, "RFn":2, "Unf":1, "NA":0},
           "GarageQual": {"Ex": 9, "Gd": 7, "TA": 5, "Fa": 3, "Po": 1, "NA": 0},
           "GarageCond": {"Ex": 9, "Gd": 7, "TA": 5, "Fa": 3, "Po": 1, "NA": 0},
           "PavedDrive": {"Y": 2, "P":1, "N":0},
           "PoolQC": {"Ex": 9, "Gd": 7, "TA": 5, "Fa": 3, "NA": 0}}

df.replace(cleanup, inplace = True)
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,0,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,0,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,-1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,-1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,-1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [8]:
df['LandSlope'].unique()

array(['Gtl', 'Mod', 'Sev'], dtype=object)