In [35]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 

# Read the Ames house-price CSV; its first column becomes the row index.
housing = pd.read_csv(
    filepath_or_buffer='Ames_HousePrice.csv',
    index_col=0,
)

In [36]:
# Show the column labels of the loaded frame.
housing.columns

Index(['PID', 'GrLivArea', 'SalePrice', 'MSSubClass', 'MSZoning',
       'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'MasVnrArea', 'ExterQual', 'ExterCond',
       'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF',
       '2ndFlrSF', 'LowQualFinSF', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond

In [37]:
# SalePrice pulled out as its own Series.
price = housing.loc[:, 'SalePrice']
# Keep only the columns whose name ends in "SF" or "Area".
# NOTE(review): the `$` anchors leave out names such as 'BsmtFinSF1' / 'BsmtFinSF2'
# (they end in a digit) — confirm that exclusion is intended.
size_related = housing.filter(regex='SF$|Area$', axis='columns')
size_related.head(n=5)

Unnamed: 0,GrLivArea,LotArea,MasVnrArea,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GarageArea,WoodDeckSF,OpenPorchSF,PoolArea
1,856,7890,0.0,618.0,856.0,856,0,0,399.0,0,0,0
2,1049,4235,149.0,104.0,1049.0,1049,0,0,266.0,0,105,0
3,1001,6060,0.0,100.0,837.0,1001,0,0,216.0,154,0,0
4,1039,8146,0.0,405.0,405.0,717,322,0,281.0,0,0,0
5,1665,8400,0.0,167.0,810.0,810,855,0,528.0,0,45,0


In [38]:
# Replace every NaN in the size-only frame with 0.0.
size_related = size_related.fillna(value=0.0)

In [39]:
# Per-column count of missing values over the whole frame.
print(housing.isna().sum(axis='index'))

PID              0
GrLivArea        0
SalePrice        0
MSSubClass       0
MSZoning         0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
Length: 81, dtype: int64


In [40]:
# Show which columns were selected into size_related.
size_related.columns

Index(['GrLivArea', 'LotArea', 'MasVnrArea', 'BsmtUnfSF', 'TotalBsmtSF',
       '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GarageArea', 'WoodDeckSF',
       'OpenPorchSF', 'PoolArea'],
      dtype='object')

In [41]:
# For each size column, print the rounded mean taken over its non-zero entries only.
for col in size_related.columns:
    print(
        f'{col}: ',
        round(size_related.loc[size_related[col].ne(0), col].mean()),
    )

GrLivArea:  1486
LotArea:  10120
MasVnrArea:  252
BsmtUnfSF:  588
TotalBsmtSF:  1065
1stFlrSF:  1145
2ndFlrSF:  779
LowQualFinSF:  332
GarageArea:  491
WoodDeckSF:  199
OpenPorchSF:  84
PoolArea:  476


In [42]:
# Print the missing-value count for each column that has at least one NaN.
for col in housing.columns:
    if housing[col].isna().sum() == 0:
        continue  # fully populated column: nothing to report
    print(f'{col}: ', housing[col].isna().sum())

LotFrontage:  462
Alley:  2412
MasVnrType:  14
MasVnrArea:  14
BsmtQual:  69
BsmtCond:  69
BsmtExposure:  71
BsmtFinType1:  69
BsmtFinSF1:  1
BsmtFinType2:  70
BsmtFinSF2:  1
BsmtUnfSF:  1
TotalBsmtSF:  1
Electrical:  1
BsmtFullBath:  2
BsmtHalfBath:  2
FireplaceQu:  1241
GarageType:  127
GarageYrBlt:  129
GarageFinish:  129
GarageCars:  1
GarageArea:  1
GarageQual:  129
GarageCond:  129
PoolQC:  2571
Fence:  2055
MiscFeature:  2483


In [43]:
# Build a lookup of feature name -> description text from the 'housePrice_features' file,
# where each usable line has the form "<feature>:<description>".
hp_dict = {}
with open('housePrice_features') as hp_feat:
    for line in hp_feat:
        # Split on the FIRST colon only, so a description that itself contains ':'
        # no longer breaks the two-way unpacking.
        feature, separator, description = line.partition(':')
        if not separator:
            # Blank line or line without a colon: there is no feature/description pair to store.
            continue
        # The description is stored exactly as read (including any leading space and the
        # trailing newline), matching what the lookup returned before.
        hp_dict[feature] = description

In [73]:
def fill_house_nan(col = None, val = None, col_lst = None, val_lst = None, df = None):
    """Fill missing values in one or several columns of a DataFrame, in place.

    Two calling modes are supported:

    * single column: pass ``col`` and ``val``;
    * several columns: pass ``col_lst`` and ``val_lst``; the i-th value fills the i-th column.

    If ``col`` is given it takes precedence and the list arguments are ignored.

    Parameters
    ----------
    col : column label, optional
        Column whose NaN entries are replaced by ``val``.
    val : scalar, optional
        Fill value used together with ``col``.
    col_lst : iterable of column labels, optional
        Columns to fill. An empty iterable is a no-op.
    val_lst : iterable of scalars, optional
        Fill values, one per entry of ``col_lst``.
    df : pandas.DataFrame, optional
        Frame to modify. Defaults to the module-level ``housing`` frame, looked up
        when the function is called.

    Raises
    ------
    ValueError
        If ``col_lst`` is non-empty and ``val_lst`` is missing or has a different
        length (previously the extra entries were silently dropped by ``zip``).
    """
    if df is None:
        # Fall back to the notebook-level frame, as the function did originally.
        df = housing

    # `is not None` rather than truthiness, so a column labelled 0 is still handled.
    if col is not None:
        df[col] = df[col].fillna(val)
        return

    if col_lst is None:
        return

    columns = list(col_lst)
    if not columns:
        # Nothing to fill; kept as a silent no-op so re-running after a fill is harmless.
        return

    if val_lst is None:
        raise ValueError('val_lst is required when col_lst is given')
    values = list(val_lst)
    if len(columns) != len(values):
        raise ValueError(
            f'col_lst has {len(columns)} entries but val_lst has {len(values)}'
        )

    for new_col, new_val in zip(columns, values):
        df[new_col] = df[new_col].fillna(new_val)

# Fill value for each column expected to contain NaN, keyed by column name so the pairing
# no longer depends on the column order of `housing`. The entries reproduce the previous
# positional list, assuming the NaN columns appear in the order LotFrontage ... MiscFeature.
nan_fill_values = {
    'LotFrontage': 0,
    'Alley': 'none',
    'MasVnrType': 'none',
    'MasVnrArea': 0,
    'BsmtQual': 'none',
    'BsmtCond': 'none',
    'BsmtExposure': 'No',
    'BsmtFinType1': 'none',
    'BsmtFinSF1': 0,
    'BsmtFinType2': 'none',
    'BsmtFinSF2': 0,
    'BsmtUnfSF': 0,
    'TotalBsmtSF': 0,
    'Electrical': 'unknown',
    'BsmtFullBath': 0,
    'BsmtHalfBath': 0,
    'FireplaceQu': 'none',
    'GarageType': 'none',
    # NOTE(review): a string sentinel in what looks like a year column — confirm intended.
    'GarageYrBlt': 'none',
    'GarageFinish': 'none',
    'GarageCars': 0,
    'GarageArea': 0,
    'GarageQual': 'none',
    'GarageCond': 'none',
    'PoolQC': 'none',
    'Fence': 'none',
    'MiscFeature': 'none',
}

# Columns of `housing` that currently contain at least one NaN, in frame order.
nan_col_lst = [col for col in housing.columns if housing[col].isnull().any()]

# Fail loudly if a NaN column has no configured fill value, instead of mis-pairing values.
unmapped_cols = [col for col in nan_col_lst if col not in nan_fill_values]
if unmapped_cols:
    raise KeyError(f'no fill value configured for columns: {unmapped_cols}')

# Fill values aligned one-to-one with nan_col_lst.
nan_val_lst = [nan_fill_values[col] for col in nan_col_lst]

In [74]:
# Fill each NaN-containing column with its paired replacement value.
fill_house_nan(
    col_lst=nan_col_lst,
    val_lst=nan_val_lst,
)

In [75]:
# Preview the first five rows after filling.
housing.head(n=5)

Unnamed: 0,PID,GrLivArea,SalePrice,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
1,909176150,856,126000,30,RL,0.0,7890,Pave,0,Reg,...,166,0,none,none,none,0,3,2010,WD,Normal
2,905476230,1049,139500,120,RL,42.0,4235,Pave,0,Reg,...,0,0,none,none,none,0,2,2009,WD,Normal
3,911128020,1001,124900,30,C (all),60.0,6060,Pave,0,Reg,...,0,0,none,none,none,0,11,2007,WD,Normal
4,535377150,1039,114000,70,RL,80.0,8146,Pave,0,Reg,...,111,0,none,none,none,0,5,2009,WD,Normal
5,534177230,1665,227000,60,RL,70.0,8400,Pave,0,Reg,...,0,0,none,none,none,0,11,2009,WD,Normal


In [82]:
# GarageQual entries that are equal to 0.
# NOTE(review): this only matches rows if GarageQual NaNs were filled with 0 — verify
# against the fill values actually applied.
housing.loc[housing['GarageQual'] == 0, 'GarageQual']

18     0
23     0
45     0
49     0
51     0
      ..
665    0
672    0
697    0
724    0
730    0
Name: GarageQual, Length: 129, dtype: object

In [61]:
# Look up the description text stored for the 'BsmtExposure' feature.
hp_dict['BsmtExposure']

' Walkout or garden level basement walls\n'

In [70]:
# Distinct values present in the MiscFeature column.
housing.loc[:, 'MiscFeature'].unique()

array([nan, 'Shed', 'Othr', 'Gar2', 'TenC'], dtype=object)

In [50]:
# Sale prices of the rows whose PoolQC equals 'TA'.
housing.loc[housing['PoolQC'] == 'TA', 'SalePrice']

180    228500
192    130000
657    153000
Name: SalePrice, dtype: int64

In [79]:
# Mean and median SalePrice for each BsmtQual value.
# NOTE(review): the name `pools` does not describe a BsmtQual summary — presumably a
# leftover from an earlier grouping; kept unchanged here.
pools = pd.DataFrame(
    housing.groupby('BsmtQual')['SalePrice'].agg(['mean', 'median'])
)
pools

Unnamed: 0_level_0,mean,median
BsmtQual,Unnamed: 1_level_1,Unnamed: 2_level_1
0,109335.710145,101800.0
Ex,327308.005319,316250.0
Fa,113467.051282,112000.0
Gd,200961.085422,191000.0
Po,84950.0,84950.0
TA,141389.656947,136500.0


In [78]:
# Rows where GarageQual and GarageCond differ, showing just those two columns.
housing.loc[
    housing['GarageQual'].ne(housing['GarageCond']),
    ['GarageQual', 'GarageCond'],
]

Unnamed: 0,GarageQual,GarageCond
3,TA,Po
18,0,none
23,0,none
38,Fa,TA
45,0,none
...,...,...
697,0,none
702,Fa,TA
724,0,none
730,0,none
