In [1]:
import pandas as pd

# Load the training CSV into `df` and preview its first five rows.
df = pd.read_csv(filepath_or_buffer='./data/train.csv')
df.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Get the Top 5 Columns with the Highest Correlation Coefficient to SalePrice

In [2]:
# "The goal of your analysis is to identify the most important features of houses that affect the sale prices."
#
# Show the five columns whose correlation coefficient with SalePrice is highest.
# Only numeric/boolean columns are correlated: `df` still contains string columns
# at this point, and DataFrame.corr() raises on those in pandas >= 2.0
# (older versions silently skipped them, so the result is unchanged there).
saleprice_corr = (
    df.select_dtypes(include=["number", "bool"])
    .corr()["SalePrice"]
    .sort_values(ascending=False)
)
# Position 0 is SalePrice's correlation with itself (1.0), so skip it.
saleprice_corr.iloc[1:6]

OverallQual    0.790982
GrLivArea      0.708624
GarageCars     0.640409
GarageArea     0.623431
TotalBsmtSF    0.613581
Name: SalePrice, dtype: float64

## Find Missing Values in the Dataset

In [3]:
# Count the missing entries in each column, then display only the columns
# that have at least one missing value.
null_values = df.isna().sum(axis=0)
null_values.loc[null_values > 0]

LotFrontage      259
Alley           1369
MasVnrType         8
MasVnrArea         8
BsmtQual          37
BsmtCond          37
BsmtExposure      38
BsmtFinType1      37
BsmtFinType2      38
Electrical         1
FireplaceQu      690
GarageType        81
GarageYrBlt       81
GarageFinish      81
GarageQual        81
GarageCond        81
PoolQC          1453
Fence           1179
MiscFeature     1406
dtype: int64

In [4]:
# Express each non-zero missing-value count as a percentage of the row count.
n_rows = len(df)
missing_counts = null_values[null_values > 0]
percent_null_values = (missing_counts / n_rows) * 100
percent_null_values

LotFrontage     17.739726
Alley           93.767123
MasVnrType       0.547945
MasVnrArea       0.547945
BsmtQual         2.534247
BsmtCond         2.534247
BsmtExposure     2.602740
BsmtFinType1     2.534247
BsmtFinType2     2.602740
Electrical       0.068493
FireplaceQu     47.260274
GarageType       5.547945
GarageYrBlt      5.547945
GarageFinish     5.547945
GarageQual       5.547945
GarageCond       5.547945
PoolQC          99.520548
Fence           80.753425
MiscFeature     96.301370
dtype: float64

In [5]:
# Drop every column in which more than 10% of the values are missing.
columns_to_drop = list(percent_null_values[percent_null_values > 10].index)
# Re-bind the result instead of mutating in place, and ignore labels that are
# already gone: `percent_null_values` is not recomputed here, so re-running this
# cell would otherwise raise KeyError on the columns removed by the first run.
df = df.drop(columns=columns_to_drop, errors="ignore")

## Get Descriptive Statistics about SalePrice

In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) for the SalePrice column.
df['SalePrice'].describe()

count      1460.000000
mean     180921.195890
std       79442.502883
min       34900.000000
25%      129975.000000
50%      163000.000000
75%      214000.000000
max      755000.000000
Name: SalePrice, dtype: float64

## Examine Columns with String Values

In [7]:
# Collect, then print, the names of all object-dtype (string) columns that have
# fewer than 10 distinct non-null values.
columns_to_encode = [
    col
    for col in df.columns
    if df[col].dtype == "object" and len(df[col].value_counts()) < 10
]
print(columns_to_encode)

['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition']


## One-Hot Encode Categorical Columns with < 10 Unique Values

In [8]:
# One-hot encode the string columns listed in `columns_to_encode`; every other
# column is passed through unchanged.
# dtype="uint8" pins the indicator columns to 0/1 integers. That was the default
# before pandas 2.0; newer versions produce booleans unless a dtype is given.
df = pd.get_dummies(df, columns=columns_to_encode, dtype="uint8")
df.head()

Unnamed: 0,Id,MSSubClass,LotArea,Neighborhood,OverallQual,OverallCond,YearBuilt,YearRemodAdd,Exterior1st,Exterior2nd,...,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,1,60,8450,CollgCr,7,5,2003,2003,VinylSd,VinylSd,...,0,0,0,1,0,0,0,0,1,0
1,2,20,9600,Veenker,6,8,1976,1976,MetalSd,MetalSd,...,0,0,0,1,0,0,0,0,1,0
2,3,60,11250,CollgCr,7,5,2001,2002,VinylSd,VinylSd,...,0,0,0,1,0,0,0,0,1,0
3,4,70,9550,Crawfor,7,5,1915,1970,Wd Sdng,Wd Shng,...,0,0,0,1,1,0,0,0,0,0
4,5,60,14260,NoRidge,8,5,2000,2000,VinylSd,VinylSd,...,0,0,0,1,0,0,0,0,1,0


## Re-calculate Correlation Coefficients

In [9]:
# Correlation of every numeric/boolean column (including the new indicator
# columns) with SalePrice, highest first.
# String columns that were not one-hot encoded are still present in `df`, and
# DataFrame.corr() raises on them in pandas >= 2.0, so they are excluded up front
# (older versions silently skipped them, so the result is unchanged there).
saleprice_corr_encoded = (
    df.select_dtypes(include=["number", "bool"])
    .corr()["SalePrice"]
    .sort_values(ascending=False)
)
# Position 0 is SalePrice's correlation with itself (1.0), so skip it.
saleprice_corr_encoded.iloc[1:]

OverallQual              0.790982
GrLivArea                0.708624
GarageCars               0.640409
GarageArea               0.623431
TotalBsmtSF              0.613581
1stFlrSF                 0.605852
FullBath                 0.560664
BsmtQual_Ex              0.553105
TotRmsAbvGrd             0.533723
YearBuilt                0.522897
YearRemodAdd             0.507101
KitchenQual_Ex           0.504094
Foundation_PConc         0.497734
GarageYrBlt              0.486362
MasVnrArea               0.477493
Fireplaces               0.466929
ExterQual_Gd             0.452466
ExterQual_Ex             0.451164
BsmtFinType1_GLQ         0.434597
HeatingQC_Ex             0.434543
GarageFinish_Fin         0.419678
BsmtFinSF1               0.386420
SaleType_New             0.357509
SaleCondition_Partial    0.352060
GarageType_Attchd        0.335961
MasVnrType_Stone         0.330476
WoodDeckSF               0.324413
KitchenQual_Gd           0.321641
2ndFlrSF                 0.319334
OpenPorchSF   