### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

### Load Data

In [2]:
# read the training set from the working directory
data = pd.read_csv('train.csv')

# quick sanity check: dimensions first, then the first few rows
display(data.shape)
display(data.head())

(1460, 81)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### Missing Values

In [5]:
# columns that have at least one missing entry
vars_with_na = [col for col in data.columns if data[col].isnull().any()]

# share of missing entries per column (a fraction between 0 and 1)
data[vars_with_na].isnull().mean()

LotFrontage     0.177397
Alley           0.937671
MasVnrType      0.005479
MasVnrArea      0.005479
BsmtQual        0.025342
BsmtCond        0.025342
BsmtExposure    0.026027
BsmtFinType1    0.025342
BsmtFinType2    0.026027
Electrical      0.000685
FireplaceQu     0.472603
GarageType      0.055479
GarageYrBlt     0.055479
GarageFinish    0.055479
GarageQual      0.055479
GarageCond      0.055479
PoolQC          0.995205
Fence           0.807534
MiscFeature     0.963014
dtype: float64

In [None]:
# observe relations between missing values and the target

def analyze_na(df, var, target='SalePrice'):
    """Plot the median of `target` for rows where `var` is missing vs. present.

    Parameters
    ----------
    df : DataFrame
        Frame holding both `var` and `target`. It is copied, never modified.
    var : str
        Name of the column whose missingness is examined.
    target : str, default 'SalePrice'
        Name of the target column to summarise per group.
    """
    df = df.copy()
    # 1 where the value is missing, 0 where it is present
    df[var] = np.where(df[var].isnull(), 1, 0)
    # median rather than sum: a sum mostly reflects how many rows fall in
    # each group, not whether the target level differs between the groups
    df.groupby(var)[target].median().plot.bar()

    plt.title(var)
    plt.ylabel('Median ' + target)
    plt.show()

# only columns that actually contain missing values yield a two-bar comparison
for col in vars_with_na:
    analyze_na(data, col)

### Numerical Variables

In [9]:
# numerical columns are those whose dtype is not object
num_vars = [name for name in data.columns if data[name].dtype != 'O']
display(
    'Number of numerical variables: {}'.format(len(num_vars)),
    data[num_vars].head(),
)

'Number of numerical variables: 38'

Unnamed: 0,Id,MSSubClass,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SalePrice
0,1,60,65.0,8450,7,5,2003,2003,196.0,706,...,0,61,0,0,0,0,0,2,2008,208500
1,2,20,80.0,9600,6,8,1976,1976,0.0,978,...,298,0,0,0,0,0,0,5,2007,181500
2,3,60,68.0,11250,7,5,2001,2002,162.0,486,...,0,42,0,0,0,0,0,9,2008,223500
3,4,70,60.0,9550,7,5,1915,1970,0.0,216,...,0,35,272,0,0,0,0,2,2006,140000
4,5,60,84.0,14260,8,5,2000,2000,350.0,655,...,192,84,0,0,0,0,0,12,2008,250000


In [11]:
# columns whose name marks them as a year ('Yr' or 'Year')

year_vars = [col for col in data.columns if any(tag in col for tag in ('Yr', 'Year'))]
year_vars

['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']

### Discrete Variables

In [15]:
# discrete = numerical columns with fewer than 20 distinct values,
# leaving out the Id and the year columns
discrete_vars = [
    var for var in num_vars
    if var not in year_vars + ['Id'] and len(data[var].unique()) < 20
]
display(
    'Number of discrete variables: {}'.format(len(discrete_vars)),
    data[discrete_vars].head(),
)

'Number of discrete variables: 14'

Unnamed: 0,MSSubClass,OverallQual,OverallCond,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageCars,PoolArea,MoSold
0,60,7,5,1,0,2,1,3,1,8,0,2,0,2
1,20,6,8,0,1,2,0,3,1,6,1,2,0,5
2,60,7,5,1,0,2,1,3,1,6,1,2,0,9
3,70,7,5,1,0,1,0,3,1,7,1,3,0,2
4,60,8,5,1,0,2,1,4,1,9,1,3,0,12


In [None]:
# observe relations between discrete values and the target

for each in discrete_vars:
    # the target column of this dataset is 'SalePrice'; the median per level
    # is used because a sum mainly reflects how many houses share that level
    data.groupby(each)['SalePrice'].median().plot.bar()
    plt.title(each)
    plt.ylabel('Median SalePrice')
    plt.show()

### Continuous Variables

In [19]:
# continuous = numerical columns that are neither the Id, a year, nor discrete
cont_vars = [var for var in num_vars if var not in ['Id'] + year_vars + discrete_vars]
display(
    'Number of continuous variables: {}'.format(len(cont_vars)),
    data[cont_vars].head(),
)

'Number of continuous variables: 19'

Unnamed: 0,LotFrontage,LotArea,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,MiscVal,SalePrice
0,65.0,8450,196.0,706,0,150,856,856,854,0,1710,548,0,61,0,0,0,0,208500
1,80.0,9600,0.0,978,0,284,1262,1262,0,0,1262,460,298,0,0,0,0,0,181500
2,68.0,11250,162.0,486,0,434,920,920,866,0,1786,608,0,42,0,0,0,0,223500
3,60.0,9550,0.0,216,0,540,756,961,756,0,1717,642,0,35,272,0,0,0,140000
4,84.0,14260,350.0,655,0,490,1145,1145,1053,0,2198,836,192,84,0,0,0,0,250000


In [None]:
# histogram of every continuous variable, one figure per column

for each in cont_vars:
    ax = data[each].hist(bins=30)
    ax.set_title(each)
    ax.set_ylabel('Number of houses')
    ax.set_xlabel(each)
    plt.show()

In [None]:
# observe the distribution of continuous variables after log transformation

for each in cont_vars:
    # the logarithm is undefined for zero and negative values: skip such columns
    if (data[each] <= 0).any():
        continue

    # transform a temporary Series only; `data` must keep the raw values so
    # that re-running this cell, or any other cell, sees unmodified data
    logged = np.log(data[each])
    logged.hist(bins=30)
    plt.title(each)
    plt.ylabel('Number of houses')
    plt.xlabel(each)
    plt.show()

### Outliers

In [None]:
# observe outliers from the continuous variables (on the log scale)

for each in cont_vars:
    # the logarithm is undefined for zero and negative values: skip such columns
    if (data[each] <= 0).any():
        continue

    # take the log of a temporary Series; assigning it back to `data` would
    # change the column every time this cell is executed
    logged = np.log(data[each])
    # DataFrame has no `box` method; `boxplot` draws the box-and-whisker plot
    logged.to_frame().boxplot(column=each)
    plt.title(each)
    plt.show()

### Categorical Variables

In [23]:
# categorical columns are those stored with the object dtype
cat_vars = [name for name in data.columns if data[name].dtype == 'O']
display(
    'Number of categorical variables: {}'.format(len(cat_vars)),
    data[cat_vars].head(),
)

'Number of categorical variables: 43'

Unnamed: 0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageType,GarageFinish,GarageQual,GarageCond,PavedDrive,PoolQC,Fence,MiscFeature,SaleType,SaleCondition
0,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
1,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
2,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal
3,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,Detchd,Unf,TA,TA,Y,,,,WD,Abnorml
4,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,Attchd,RFn,TA,TA,Y,,,,WD,Normal


In [24]:
# cardinality: number of distinct labels in each categorical column
data.loc[:, cat_vars].nunique()

MSZoning          5
Street            2
Alley             2
LotShape          4
LandContour       4
Utilities         2
LotConfig         5
LandSlope         3
Neighborhood     25
Condition1        9
Condition2        8
BldgType          5
HouseStyle        8
RoofStyle         6
RoofMatl          8
Exterior1st      15
Exterior2nd      16
MasVnrType        4
ExterQual         4
ExterCond         5
Foundation        6
BsmtQual          4
BsmtCond          4
BsmtExposure      4
BsmtFinType1      6
BsmtFinType2      6
Heating           6
HeatingQC         5
CentralAir        2
Electrical        5
KitchenQual       4
Functional        7
FireplaceQu       5
GarageType        6
GarageFinish      3
GarageQual        5
GarageCond        5
PavedDrive        3
PoolQC            3
Fence             4
MiscFeature       4
SaleType          9
SaleCondition     6
dtype: int64

In [31]:
# rare labels: categories present in fewer than 1% of all rows

for each in cat_vars:
    # share of the whole dataset (missing rows stay in the denominator)
    label_share = data[each].value_counts().div(len(data))
    display(label_share[label_share < 0.01])

C (all)    0.006849
Name: MSZoning, dtype: float64

Grvl    0.00411
Name: Street, dtype: float64

Series([], Name: Alley, dtype: float64)

IR3    0.006849
Name: LotShape, dtype: float64

Series([], Name: LandContour, dtype: float64)

NoSeWa    0.000685
Name: Utilities, dtype: float64

FR3    0.00274
Name: LotConfig, dtype: float64

Sev    0.008904
Name: LandSlope, dtype: float64

Veenker    0.007534
NPkVill    0.006164
Blueste    0.001370
Name: Neighborhood, dtype: float64

RRAe    0.007534
PosA    0.005479
RRNn    0.003425
RRNe    0.001370
Name: Condition1, dtype: float64

Feedr     0.004110
Artery    0.001370
RRNn      0.001370
PosN      0.001370
PosA      0.000685
RRAn      0.000685
RRAe      0.000685
Name: Condition2, dtype: float64

Series([], Name: BldgType, dtype: float64)

1.5Unf    0.009589
2.5Unf    0.007534
2.5Fin    0.005479
Name: HouseStyle, dtype: float64

Flat       0.008904
Gambrel    0.007534
Mansard    0.004795
Shed       0.001370
Name: RoofStyle, dtype: float64

Tar&Grv    0.007534
WdShngl    0.004110
WdShake    0.003425
Membran    0.000685
Metal      0.000685
Roll       0.000685
ClyTile    0.000685
Name: RoofMatl, dtype: float64

BrkComm    0.001370
Stone      0.001370
CBlock     0.000685
AsphShn    0.000685
ImStucc    0.000685
Name: Exterior1st, dtype: float64

ImStucc    0.006849
Brk Cmn    0.004795
Stone      0.003425
AsphShn    0.002055
Other      0.000685
CBlock     0.000685
Name: Exterior2nd, dtype: float64

Series([], Name: MasVnrType, dtype: float64)

Fa    0.009589
Name: ExterQual, dtype: float64

Ex    0.002055
Po    0.000685
Name: ExterCond, dtype: float64

Stone    0.004110
Wood     0.002055
Name: Foundation, dtype: float64

Series([], Name: BsmtQual, dtype: float64)

Po    0.00137
Name: BsmtCond, dtype: float64

Series([], Name: BsmtExposure, dtype: float64)

Series([], Name: BsmtFinType1, dtype: float64)

GLQ    0.009589
Name: BsmtFinType2, dtype: float64

Grav     0.004795
Wall     0.002740
OthW     0.001370
Floor    0.000685
Name: Heating, dtype: float64

Po    0.000685
Name: HeatingQC, dtype: float64

Series([], Name: CentralAir, dtype: float64)

FuseP    0.002055
Mix      0.000685
Name: Electrical, dtype: float64

Series([], Name: KitchenQual, dtype: float64)

Maj1    0.009589
Maj2    0.003425
Sev     0.000685
Name: Functional, dtype: float64

Series([], Name: FireplaceQu, dtype: float64)

CarPort    0.006164
2Types     0.004110
Name: GarageType, dtype: float64

Series([], Name: GarageFinish, dtype: float64)

Gd    0.009589
Po    0.002055
Ex    0.002055
Name: GarageQual, dtype: float64

Gd    0.006164
Po    0.004795
Ex    0.001370
Name: GarageCond, dtype: float64

Series([], Name: PavedDrive, dtype: float64)

Gd    0.002055
Ex    0.001370
Fa    0.001370
Name: PoolQC, dtype: float64

MnWw    0.007534
Name: Fence, dtype: float64

Othr    0.001370
Gar2    0.001370
TenC    0.000685
Name: MiscFeature, dtype: float64

ConLD    0.006164
ConLI    0.003425
ConLw    0.003425
CWD      0.002740
Oth      0.002055
Con      0.001370
Name: SaleType, dtype: float64

Alloca     0.008219
AdjLand    0.002740
Name: SaleCondition, dtype: float64