In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply seaborn's default theme to the matplotlib plots drawn in this notebook.
sns.set()

In [2]:
# Directory holding the Kaggle "House Prices" CSV files.
# NOTE(review): absolute, machine-specific path — adjust for your environment.
filepath='D:/downloads/house-prices-advanced-regression-techniques/'

# NOTE(review): newer pandas releases may parse the literal string 'None' as NaN
# in read_csv by default — confirm before upgrading, because later cells compare
# MasVnrType against 'None'.
train = pd.read_csv(filepath+'train.csv')
test = pd.read_csv(filepath+'test.csv')

# Number of training rows: rows [0, split_point) of `data` come from train,
# the remaining rows come from test.
split_point = train.shape[0]

# Stack the train features (target column removed) on top of the test features
# so both are cleaned together. pd.concat replaces DataFrame.append, which was
# deprecated in pandas 1.4 and removed in pandas 2.0.
data = pd.concat([train.drop('SalePrice',axis=1), test], ignore_index=True)

In [3]:
# Count missing values per column once, then show only the affected columns,
# most incomplete first.
na_counts = data.isnull().sum()
na_counts[na_counts > 0].sort_values(ascending=False)

PoolQC          2909
MiscFeature     2814
Alley           2721
Fence           2348
FireplaceQu     1420
LotFrontage      486
GarageFinish     159
GarageYrBlt      159
GarageQual       159
GarageCond       159
GarageType       157
BsmtExposure      82
BsmtCond          82
BsmtQual          81
BsmtFinType2      80
BsmtFinType1      79
MasVnrType        24
MasVnrArea        23
MSZoning           4
BsmtFullBath       2
BsmtHalfBath       2
Utilities          2
Functional         2
Exterior2nd        1
Exterior1st        1
SaleType           1
BsmtFinSF1         1
BsmtFinSF2         1
BsmtUnfSF          1
Electrical         1
KitchenQual        1
GarageCars         1
GarageArea         1
TotalBsmtSF        1
dtype: int64

### Threshold: 100 (columns with more than 100 missing values are dropped)

In [4]:
# Columns with more than `threshold` missing values are selected for removal.
threshold = 100
missing_per_col = data.isnull().sum()
drop_cols = missing_per_col[missing_per_col > threshold].index.values
drop_cols

array(['LotFrontage', 'Alley', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
       'MiscFeature'], dtype=object)

Drop the columns above the threshold

In [5]:
# Remove the sparse columns listed in `drop_cols`. errors='ignore' keeps this
# cell safe to re-run: columns that were already dropped are skipped instead
# of raising KeyError.
data = data.drop(drop_cols, axis=1, errors='ignore')

### Split types : numeric , object

In [6]:
# Boolean mask over the columns: True where the column has object dtype.
is_object = (data.dtypes == 'object').values

# Split the column names by that mask.
object_cols = data.columns[is_object]
numeric_cols = data.columns[~is_object]

numeric type columns with na values

In [7]:
# Missing-value count per numeric column, restricted to columns that have any.
numeric_na = data[numeric_cols].isnull().sum()
numeric_na[numeric_na > 0]

MasVnrArea      23
BsmtFinSF1       1
BsmtFinSF2       1
BsmtUnfSF        1
TotalBsmtSF      1
BsmtFullBath     2
BsmtHalfBath     2
GarageCars       1
GarageArea       1
dtype: int64

object type columns with na values

In [8]:
# Missing-value count per object column, restricted to columns that have any.
object_na = data[object_cols].isnull().sum()
object_na[object_na > 0]

MSZoning         4
Utilities        2
Exterior1st      1
Exterior2nd      1
MasVnrType      24
BsmtQual        81
BsmtCond        82
BsmtExposure    82
BsmtFinType1    79
BsmtFinType2    80
Electrical       1
KitchenQual      1
Functional       2
SaleType         1
dtype: int64

### Fill fake NA values by related column groups (where NaN actually means something)  
1. __`group 1`__  
    - MasVnrArea , MasVnrType  
2. __`group 2`__  
    - BsmtFinSF1 , BsmtFinSF2 , BsmtUnfSF , TotalBsmtSF , BsmtQual , BsmtCond , BsmtExposure , BsmtFinType1 , BsmtFinType2  
3. __`group 3`__  
    - Exterior1st , Exterior2nd  

`group 1` : MasVnrArea , MasVnrType

In [9]:
# Masonry-veneer columns that are cleaned together.
group1 = [
    'MasVnrArea',
    'MasVnrType',
]

In [10]:
# Rows that report zero veneer area but a veneer type other than 'None' are
# inconsistent: show them, then reset their MasVnrType to 'None'.
zero_area_with_type = (data.MasVnrArea==0) & (data.MasVnrType!='None')
print(data.loc[zero_area_with_type, group1])
idx = data.loc[zero_area_with_type].index.values
data.loc[idx,'MasVnrType'] = 'None'

      MasVnrArea MasVnrType
688          0.0    BrkFace
1241         0.0      Stone
2319         0.0    BrkFace


In [11]:
# Rows whose veneer type is 'None' but whose area is not 0 are inconsistent:
# show them, then reset their MasVnrArea to 0.
none_type_with_area = (data.MasVnrArea!=0) & (data.MasVnrType=='None')
print(data.loc[none_type_with_area, group1])
idx = data.loc[none_type_with_area].index.values
data.loc[idx,'MasVnrArea'] = 0

      MasVnrArea MasVnrType
624        288.0       None
773          1.0       None
1230         1.0       None
1300       344.0       None
1334       312.0       None
1669       285.0       None
2452         1.0       None


In [12]:
# When both veneer fields are missing, fill them as "no veneer":
# MasVnrArea -> 0 and MasVnrType -> 'None'.
both_missing = data.MasVnrArea.isnull() & data.MasVnrType.isnull()
print(data.loc[both_missing, group1])
idx = data.loc[both_missing].index.values
data.loc[idx,'MasVnrArea'] = 0
data.loc[idx,'MasVnrType'] = 'None'

      MasVnrArea MasVnrType
234          NaN        NaN
529          NaN        NaN
650          NaN        NaN
936          NaN        NaN
973          NaN        NaN
977          NaN        NaN
1243         NaN        NaN
1278         NaN        NaN
1691         NaN        NaN
1706         NaN        NaN
1882         NaN        NaN
1992         NaN        NaN
2004         NaN        NaN
2041         NaN        NaN
2311         NaN        NaN
2325         NaN        NaN
2340         NaN        NaN
2349         NaN        NaN
2368         NaN        NaN
2592         NaN        NaN
2657         NaN        NaN
2686         NaN        NaN
2862         NaN        NaN


`group 2` : BsmtFinSF1 , BsmtFinSF2 , BsmtUnfSF , TotalBsmtSF , BsmtQual , BsmtCond , BsmtExposure , BsmtFinType1 , BsmtFinType2  

In [13]:
# Basement columns that are cleaned together: the first four entries are the
# area columns, the remaining five are the categorical columns (later cells
# rely on this order via group2[:4] and group2[4:]).
group2 = [
    'BsmtFinSF1',
    'BsmtFinSF2',
    'BsmtUnfSF',
    'TotalBsmtSF',
    'BsmtQual',
    'BsmtCond',
    'BsmtExposure',
    'BsmtFinType1',
    'BsmtFinType2',
]

In [19]:
# NaN count per categorical basement column; the 79 rows that are NaN in all
# of them can be identified as "no basement".
bsmt_cat_na = data[group2[4:]].isnull().sum()
bsmt_cat_na

BsmtQual        81
BsmtCond        82
BsmtExposure    82
BsmtFinType1    79
BsmtFinType2    80
dtype: int64

In [18]:
# For the rows where BsmtFinType1 is NaN, check whether the remaining
# categorical basement columns are NaN as well ==> yes.
type1_missing = data.BsmtFinType1.isnull()
idx = data.loc[type1_missing].index.values
other_bsmt_cols = ['BsmtQual' , 'BsmtCond' , 'BsmtExposure' , 'BsmtFinType2']
data.loc[idx, other_bsmt_cols].isnull().sum()

BsmtQual        79
BsmtCond        79
BsmtExposure    79
BsmtFinType2    79
dtype: int64

In [24]:
# Rows that are NaN in every categorical basement column ==> fill with 'NA'
# (no basement).
# The rows are recomputed here instead of reusing `idx` from an earlier cell,
# so the result does not depend on which cell assigned `idx` last, and a column
# that holds a real value is never overwritten. Re-running the cell is a no-op.
bsmt_cat_cols = ['BsmtQual' , 'BsmtCond' , 'BsmtExposure' , 'BsmtFinType1' , 'BsmtFinType2']
all_cat_missing = data[bsmt_cat_cols].isnull().all(axis=1)
idx = data.loc[all_cat_missing].index.values
data.loc[idx, bsmt_cat_cols] = 'NA'

In [25]:
# Remaining NaN count for every column of the basement group.
bsmt_na = data[group2].isnull().sum()
bsmt_na

BsmtFinSF1      1
BsmtFinSF2      1
BsmtUnfSF       1
TotalBsmtSF     1
BsmtQual        2
BsmtCond        3
BsmtExposure    3
BsmtFinType1    0
BsmtFinType2    1
dtype: int64

In [36]:
# The basement area columns are missing in the same row, and that row's
# categorical basement columns are 'NA' (no basement), so the areas are set to 0.
bsmt_area_cols = group2[:4]
bsmt_cat_cols = group2[4:]

# Show which rows are missing each area column, then the categorical values of
# the row(s) missing BsmtFinSF2.
for col in bsmt_area_cols:
    print(data[group2][data[col].isnull()].index)
print(data[bsmt_cat_cols][data.BsmtFinSF2.isnull()])

# Select the rows from the data instead of hard-coding row label 2120:
# a basement area is missing AND every categorical basement column is 'NA'.
area_missing = data[bsmt_area_cols].isnull().any(axis=1)
no_basement = (data[bsmt_cat_cols] == 'NA').all(axis=1)
data.loc[area_missing & no_basement, bsmt_area_cols] = 0

Int64Index([2120], dtype='int64')
Int64Index([2120], dtype='int64')
Int64Index([2120], dtype='int64')
Int64Index([2120], dtype='int64')
     BsmtQual BsmtCond BsmtExposure BsmtFinType1 BsmtFinType2
2120       NA       NA           NA           NA           NA


### Fill na with RF-classifier , RF-regressor

In [None]:
# TODO: impute the remaining missing values with random-forest models
# (a regressor for numeric columns, a classifier for object columns).
# Both loops are placeholders; the imputation itself is not implemented yet.
for col in numeric_cols:
    # `dataset` is not defined anywhere in this notebook; the combined frame
    # is `data`.
    if data[col].isnull().sum():
        pass  # TODO: fit an RF regressor and predict the missing values of `col`

for col in object_cols:
    pass  # TODO: fit an RF classifier and predict the missing values of `col`
    
    