## Import Statements

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import norm
from sklearn.preprocessing import StandardScaler
from scipy import stats
import xgboost as xgb
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')



## Load data

In [2]:
# Read the training data from 'train.csv' (path is relative to the notebook's working directory).
train_df = pd.read_csv(filepath_or_buffer='train.csv')

In [3]:
# Show the first five rows of the freshly loaded frame.
train_df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


# Convert categorical variables
Use dummy (one-hot) variables to represent the levels of nominal categorical variables, i.e. those whose levels have no natural order.

### Turn 'MSSubClass' levels into dummy variables

In [4]:
# One-hot encode MSSubClass: attach one indicator column per level, then
# drop the original column. The indicator columns are named by the level
# values themselves until the following rename cell gives them readable labels.
dummy_df = pd.get_dummies(train_df['MSSubClass'])
train_df = train_df.join(dummy_df).drop('MSSubClass', axis='columns')

In [5]:
# Give the MSSubClass indicator columns descriptive names.
# Keys are integers (not strings); keys with no matching column are ignored by rename.
train_df.rename(
    columns={
        20: '1-STORY 1946 & NEWER ALL STYLES',
        30: '1-STORY 1945 & OLDER',
        40: '1-STORY W/FINISHED ATTIC ALL AGES',
        45: '1-1/2 STORY - UNFINISHED ALL AGES',
        50: '1-1/2 STORY FINISHED ALL AGES',
        60: '2-STORY 1946 & NEWER',
        70: '2-STORY 1945 & OLDER',
        75: '2-1/2 STORY ALL AGES',
        80: 'SPLIT OR MULTI-LEVEL',
        85: 'SPLIT FOYER',
        90: 'DUPLEX - ALL STYLES AND AGES',
        120: '1-STORY PUD (Planned Unit Development) - 1946 & NEWER',
        150: '1-1/2 STORY PUD - ALL AGES',
        160: '2-STORY PUD - 1946 & NEWER',
        180: 'PUD - MULTILEVEL - INCL SPLIT LEV/FOYER',
        190: '2 FAMILY CONVERSION - ALL STYLES AND AGES',
    },
    inplace=True,
)

### Turn 'MSZoning' levels into dummy variables

In [6]:
# One-hot encode MSZoning: attach one indicator column per zoning code,
# then drop the original column.
dummy_df = pd.get_dummies(train_df['MSZoning'])
train_df = train_df.join(dummy_df).drop('MSZoning', axis='columns')

In [7]:
# Replace the zoning codes used as indicator column names with readable labels.
train_df.rename(
    columns={
        'C (all)': 'Commercial',
        'FV': 'Floating Village Residential',
        'RL': 'Residential Low Density',
        'RM': 'Residential Medium Density',
        'RH': 'Residential High Density',
    },
    inplace=True,
)

### Turn 'Street' levels into dummy variables

In [8]:
# One-hot encode Street: attach one indicator column per surface type,
# then drop the original column.
dummy_df = pd.get_dummies(train_df['Street'])
train_df = train_df.join(dummy_df).drop('Street', axis='columns')

In [9]:
# Label the Street indicators. This has to happen before the Alley indicators
# are joined: Alley uses the same 'Grvl'/'Pave' codes, and DataFrame.join
# raises on overlapping column names.
train_df.rename(
    columns={
        'Grvl': 'Gravel Road',
        'Pave': 'Paved Road',
    },
    inplace=True,
)

### Turn 'Alley' levels into dummy variables

In [10]:
# One-hot encode Alley, then drop the original column. With get_dummies'
# default settings, rows whose Alley is missing (NaN) get no indicator
# column of their own and are 0 in every Alley indicator.
dummy_df = pd.get_dummies(train_df['Alley'])
train_df = train_df.join(dummy_df).drop('Alley', axis='columns')

In [11]:
# Label the Alley indicators. The Street indicators were already renamed in
# an earlier cell, so 'Grvl'/'Pave' refer only to the Alley columns here.
train_df.rename(
    columns={
        'Grvl': 'Gravel Alley Access',
        'Pave': 'Paved Alley Access',
    },
    inplace=True,
)

### Turn 'LotShape' levels into dummy variables

In [12]:
# One-hot encode LotShape: attach one indicator column per shape code,
# then drop the original column.
dummy_df = pd.get_dummies(train_df['LotShape'])
train_df = train_df.join(dummy_df).drop('LotShape', axis='columns')

In [13]:
# Replace the lot-shape codes used as indicator column names with readable labels.
train_df.rename(
    columns={
        'Reg': 'Lot Shape - Regular',
        'IR1': 'Lot Shape - Slightly Irregular',
        'IR2': 'Lot Shape - Moderately Irregular',
        'IR3': 'Lot Shape - Irregular',
    },
    inplace=True,
)

### Turn 'LandContour' levels into dummy variables

In [14]:
# One-hot encode LandContour: attach one indicator column per contour code,
# then drop the original column.
dummy_df = pd.get_dummies(train_df['LandContour'])
train_df = train_df.join(dummy_df).drop('LandContour', axis='columns')

In [15]:
# Replace the land-contour codes used as indicator column names with readable labels.
train_df.rename(
    columns={
        'Bnk': 'Land Contour - Banked',
        'HLS': 'Land Contour - Hillside',
        'Low': 'Land Contour - Depression',
        'Lvl': 'Land Contour - Level',
    },
    inplace=True,
)

### Turn 'Utilities' levels into dummy variables

In [16]:
# One-hot encode Utilities: attach one indicator column per utility code,
# then drop the original column.
dummy_df = pd.get_dummies(train_df['Utilities'])
train_df = train_df.join(dummy_df).drop('Utilities', axis='columns')

In [17]:
# Replace the utility codes used as indicator column names with readable labels.
train_df.rename(
    columns={
        'AllPub': 'All Public Utilities',
        'NoSeWa': 'Electricity and Gas Only',
    },
    inplace=True,
)

In [18]:
# One-hot encode LotConfig: attach one indicator column per lot
# configuration, then drop the original column.
dummy_df = pd.get_dummies(train_df['LotConfig'])
train_df = train_df.join(dummy_df).drop('LotConfig', axis='columns')

In [19]:
# Replace the lot-configuration codes used as indicator column names with readable labels.
train_df.rename(
    columns={
        'Inside': 'Inside lot',
        'Corner': 'Corner lot',
        'CulDSac': 'Cul-de-sac',
        'FR2': 'Frontage on 2 sides of property',
        'FR3': 'Frontage on 3 sides of property',
    },
    inplace=True,
)

In [21]:
# One-hot encode LandSlope: attach one indicator column per slope code,
# then drop the original column.
dummy_df = pd.get_dummies(train_df['LandSlope'])
train_df = train_df.join(dummy_df).drop('LandSlope', axis='columns')

In [22]:
# Replace the slope codes used as indicator column names with readable labels.
train_df.rename(
    columns={
        'Gtl': 'Gentle slope',
        'Mod': 'Moderate Slope',
        'Sev': 'Severe Slope',
    },
    inplace=True,
)

In [23]:
# Spot-check the frame after the encodings so far: first five rows.
train_df.head(n=5)

Unnamed: 0,Id,LotFrontage,LotArea,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,...,All Public Utilities,Electricity and Gas Only,Corner lot,Cul-de-sac,Frontage on 2 sides of property,Frontage on 3 sides of property,Inside lot,Gentle slope,Moderate Slope,Severe Slope
0,1,65.0,8450,CollgCr,Norm,Norm,1Fam,2Story,7,5,...,1,0,0,0,0,0,1,1,0,0
1,2,80.0,9600,Veenker,Feedr,Norm,1Fam,1Story,6,8,...,1,0,0,0,1,0,0,1,0,0
2,3,68.0,11250,CollgCr,Norm,Norm,1Fam,2Story,7,5,...,1,0,0,0,0,0,1,1,0,0
3,4,60.0,9550,Crawfor,Norm,Norm,1Fam,2Story,7,5,...,1,0,1,0,0,0,0,1,0,0
4,5,84.0,14260,NoRidge,Norm,Norm,1Fam,2Story,8,5,...,1,0,0,0,1,0,0,1,0,0


In [24]:
# One-hot encode Neighborhood: attach one indicator column per
# neighborhood code, then drop the original column.
dummy_df = pd.get_dummies(train_df['Neighborhood'])
train_df = train_df.join(dummy_df).drop('Neighborhood', axis='columns')

In [26]:
# Replace the neighborhood codes used as indicator column names with readable labels.
#
# DataFrame.rename silently skips keys that match no column, so a misspelled
# key leaves the raw abbreviation in place without any error. Two keys were
# corrected to the spellings used in the data values:
#   'DOTRR' -> 'IDOTRR'  (leading 'I' was missing)
#   'Names' -> 'NAmes'   (the data dictionary prints "Names"; the CSV values
#                         are capitalised "NAmes")
# NOTE(review): confirm against pd.read_csv('train.csv')['Neighborhood'].unique().
# Identity entries (e.g. 'Edwards') are no-ops, kept so the mapping lists
# every neighborhood.
train_df.rename(
    columns={
        'Blmngtn': 'Bloomington Heights',
        'Blueste': 'Bluestem',
        'BrDale': 'Briardale',
        'BrkSide': 'Brookside',
        'ClearCr': 'Clear Creek',
        'CollgCr': 'College Creek',
        'Crawfor': 'Crawford',
        'Edwards': 'Edwards',
        'Gilbert': 'Gilbert',
        'IDOTRR': 'Iowa DOT and Rail Road',
        'MeadowV': 'Meadow Village',
        'Mitchel': 'Mitchell',
        'NAmes': 'North Ames',
        'NoRidge': 'Northridge',
        'NPkVill': 'Northpark Villa',
        'NridgHt': 'Northridge Heights',
        'NWAmes': 'Northwest Ames',
        'OldTown': 'Old Town',
        'SWISU': 'South & West of Iowa State University',
        'Sawyer': 'Sawyer',
        'SawyerW': 'Sawyer West',
        'Somerst': 'Somerset',
        'StoneBr': 'Stone Brook',
        'Timber': 'Timberland',
        'Veenker': 'Veenker',
    },
    inplace=True,
)