In [1]:
cd ..

/home/jovyan/portfolio/ames_housing_data_model_development


In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from sklearn.tree import DecisionTreeRegressor
import lib.eda as eda
%matplotlib inline

# Create Categorical Features

In [3]:
ames_train_df = pd.read_pickle('data/train_df.p')
ames_test_df = pd.read_pickle('data/test_df.p')

In [4]:
ames_train_df.shape, ames_test_df.shape

((1459, 82), (1459, 81))

In [5]:
ames_train_df.sample(5)

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,MasVnrType_1,MasVnrType_2,MasVnrArea_1,MasVnrArea_2
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
731,120,RL,39.0,5389,Pave,,IR1,Lvl,AllPub,Inside,...,0,3,2010,WD,Normal,236500,,,0.0,0.0
84,20,RL,80.0,8892,Pave,,IR1,Lvl,AllPub,Inside,...,0,7,2007,COD,Normal,126500,BrkCmn,BrkCmn,66.0,66.0
305,75,RM,87.0,18386,Pave,,Reg,Lvl,AllPub,Inside,...,0,5,2008,WD,Normal,295000,,,0.0,0.0
227,60,RL,82.0,9950,Pave,,IR1,Lvl,AllPub,Inside,...,0,6,2007,WD,Abnorml,290000,BrkFace,BrkFace,290.0,290.0
677,70,RM,60.0,9600,Pave,Grvl,Reg,Lvl,AllPub,Inside,...,0,5,2006,WD,Normal,87000,,,0.0,0.0


### Which features are numerical and categorical?

In [6]:
for i, col in enumerate(ames_train_df.columns):
    dtype = ames_train_df[col].dtype    
    unique_train = ames_train_df[col].unique()
    unique_test = ames_train_df[col].unique()
    diff_test_train = set(unique_test) - set(unique_train)
    if len(unique_train) > 20:
        unique_train = "list length: {}".format(len(unique_train))
    print("{:3} {:20} {:10} \n{}".format(str(i), col, str(dtype), str(unique_train), str(diff_test_train)))
    if len(diff_test_train) > 0:
        print("diff: ".format(str(diff_test_train)))
    print()

0   MSSubClass           object     
[60 20 70 50 190 45 90 120 30 85 80 160 75 180 40]

1   MSZoning             object     
['RL' 'RM' 'C (all)' 'FV' 'RH']

2   LotFrontage          float64    
list length: 111

3   LotArea              int64      
list length: 1072

4   Street               object     
['Pave' 'Grvl']

5   Alley                object     
['None' 'Grvl' 'Pave']

6   LotShape             object     
['Reg' 'IR1' 'IR2' 'IR3']

7   LandContour          object     
['Lvl' 'Bnk' 'Low' 'HLS']

8   Utilities            object     
['AllPub' 'NoSeWa']

9   LotConfig            object     
['Inside' 'FR2' 'Corner' 'CulDSac' 'FR3']

10  LandSlope            object     
['Gtl' 'Mod' 'Sev']

11  Neighborhood         object     
list length: 25

12  Condition1           object     
['Norm' 'Feedr' 'PosN' 'Artery' 'RRAe' 'RRNn' 'RRAn' 'PosA' 'RRNe']

13  Condition2           object     
['Norm' 'Artery' 'RRNn' 'Feedr' 'PosN' 'PosA' 'RRAn' 'RRAe']

14  BldgType             object 

For consistency, we will treat all quality measures as categorical. We will also treat home features, such as number of rooms, as categorical.

The following features still need to be processed:

These all need to be categorical:

    16  OverallQual          int64      [ 7  6  8  5  9  4 10  3  1  2]
    17  OverallCond          int64      [5 8 6 7 4 2 3 9 1]
    44  BsmtFullBath         int64      [1 0 2 3]
    45  BsmtHalfBath         int64      [0 1 2]
    46  FullBath             int64      [2 1 3 0]
    47  HalfBath             int64      [1 0 2]
    48  BedroomAbvGr         int64      [3 4 1 2 0 5 6 8]
    49  KitchenAbvGr         int64      [1 2 3 0]
    51  TotRmsAbvGrd         int64      [ 8  6  7  9  5 11  4 10 12  3  2 14]
    53  Fireplaces           int64      [0 1 2 3]
    56  GarageYrBlt          object     [7.0 4.0 6.0 8.0 0.0 3.0 2.0 9.0 5.0 1.0 'None']
    58  GarageCars           int64      [2 3 1 0 4]
    73  MoSold               int64      [ 2  5  9 12 10  8 11  4  1  7  3  6]
    74  YrSold               int64      [2008 2007 2006 2009 2010]

Bin these, then make categorical:

    18  YearBuilt            int64      list length: 112
    19  YearRemodAdd         int64      list length: 61
    
Make this a Boolean:     

    38  CentralAir           object     ['Y' 'N']
    
Make two different dataframes for each:    

    78  MasVnrType_1         object     ['BrkFace' 'None' 'Stone' 'BrkCmn' 'CBlock']
    79  MasVnrType_2         object     ['BrkFace' 'None' 'Stone' 'BrkCmn']
    80  MasVnrArea_1         float64    list length: 328
    81  MasVnrArea_2         float64    list length: 327

In [7]:
ames_train_df.OverallQual = ames_train_df.OverallQual.astype('str')
ames_train_df.OverallCond = ames_train_df.OverallCond.astype('str')
ames_train_df.OverallQual = ames_train_df.OverallQual.astype('str') 
ames_train_df.OverallCond = ames_train_df.OverallCond.astype('str') 
ames_train_df.BsmtFullBath = ames_train_df.BsmtFullBath.astype('str')
ames_train_df.BsmtHalfBath = ames_train_df.BsmtHalfBath.astype('str')
ames_train_df.FullBath = ames_train_df.FullBath.astype('str')    
ames_train_df.HalfBath = ames_train_df.HalfBath.astype('str')    
ames_train_df.BedroomAbvGr = ames_train_df.BedroomAbvGr.astype('str')
ames_train_df.KitchenAbvGr = ames_train_df.KitchenAbvGr.astype('str')
ames_train_df.TotRmsAbvGrd = ames_train_df.TotRmsAbvGrd.astype('str')
ames_train_df.Fireplaces = ames_train_df.Fireplaces.astype('str')  
ames_train_df.GarageYrBlt = ames_train_df.GarageYrBlt.astype('str') 
ames_train_df.GarageCars = ames_train_df.GarageCars.astype('str')  
ames_train_df.MoSold = ames_train_df.MoSold.astype('str')      
ames_train_df.YrSold = ames_train_df.YrSold.astype('str')      

ames_test_df.OverallQual = ames_test_df.OverallQual.astype('str')
ames_test_df.OverallCond = ames_test_df.OverallCond.astype('str')
ames_test_df.OverallQual = ames_test_df.OverallQual.astype('str') 
ames_test_df.OverallCond = ames_test_df.OverallCond.astype('str') 
ames_test_df.BsmtFullBath = ames_test_df.BsmtFullBath.astype('str')
ames_test_df.BsmtHalfBath = ames_test_df.BsmtHalfBath.astype('str')
ames_test_df.FullBath = ames_test_df.FullBath.astype('str')    
ames_test_df.HalfBath = ames_test_df.HalfBath.astype('str')    
ames_test_df.BedroomAbvGr = ames_test_df.BedroomAbvGr.astype('str')
ames_test_df.KitchenAbvGr = ames_test_df.KitchenAbvGr.astype('str')
ames_test_df.TotRmsAbvGrd = ames_test_df.TotRmsAbvGrd.astype('str')
ames_test_df.Fireplaces = ames_test_df.Fireplaces.astype('str')  
ames_test_df.GarageYrBlt = ames_test_df.GarageYrBlt.astype('str') 
ames_test_df.GarageCars = ames_test_df.GarageCars.astype('str')  
ames_test_df.MoSold = ames_test_df.MoSold.astype('str')      
ames_test_df.YrSold = ames_test_df.YrSold.astype('str')      

#### Bin Years

In [8]:
YearBuilt = ames_train_df['YearBuilt'].values
_, deciles = pd.qcut(ames_train_df['YearBuilt'], 10, retbins=True)

In [9]:
YearBuilt = ames_train_df['YearBuilt'].copy()
YearBuilt[YearBuilt < deciles[1]] = 0
YearBuilt[YearBuilt >= deciles[9]] = 9
for i in range(1,9):
    YearBuilt[(YearBuilt >= deciles[i]) & (YearBuilt < deciles[i+1])] = i
ames_train_df['YearBuilt'] = YearBuilt.astype('str')

In [10]:
YearBuilt = ames_test_df['YearBuilt'].copy()
YearBuilt[YearBuilt < deciles[1]] = 0
YearBuilt[YearBuilt >= deciles[9]] = 9
for i in range(1,9):
    YearBuilt[(YearBuilt >= deciles[i]) & (YearBuilt < deciles[i+1])] = i
ames_test_df['YearBuilt'] = YearBuilt.astype('str')

In [11]:
ames_test_df['YearBuilt'].unique(), ames_train_df['YearBuilt'].unique()

(array(['3', '7', '6', '4', '5', '9', '8', '0', '2', '1'], dtype=object),
 array(['8', '5', '7', '0', '6', '1', '4', '3', '9', '2'], dtype=object))

In [12]:
YearRemodAdd = ames_train_df['YearRemodAdd'].values
_, deciles = pd.qcut(ames_train_df['YearRemodAdd'], 5, retbins=True)

In [13]:
YearRemodAdd = ames_train_df['YearRemodAdd'].copy()
YearRemodAdd[YearRemodAdd < deciles[1]] = 0
YearRemodAdd[YearRemodAdd >= deciles[4]] = 4
for i in range(1,4):
    YearRemodAdd[(YearRemodAdd >= deciles[i]) & (YearRemodAdd < deciles[i+1])] = i
ames_train_df['YearRemodAdd'] = YearRemodAdd.astype('str')

In [14]:
YearRemodAdd = ames_test_df['YearRemodAdd'].copy()
YearRemodAdd[YearRemodAdd < deciles[1]] = 0
YearRemodAdd[YearRemodAdd >= deciles[4]] = 4
for i in range(1,4):
    YearRemodAdd[(YearRemodAdd >= deciles[i]) & (YearRemodAdd < deciles[i+1])] = i
ames_test_df['YearRemodAdd'] = YearRemodAdd.astype('str')

In [15]:
ames_test_df['YearRemodAdd'].unique(), ames_train_df['YearRemodAdd'].unique()

(array(['0', '3', '2', '4', '1'], dtype=object),
 array(['3', '1', '2', '4', '0'], dtype=object))

#### Boolean Feature

In [16]:
ames_test_df['CentralAir'] = ames_test_df['CentralAir'].mask(ames_test_df['CentralAir'] == 'N', False)
ames_test_df['CentralAir'] = ames_test_df['CentralAir'].mask(ames_test_df['CentralAir'] == 'Y', True)

# Verify Features

In [17]:
for col in ames_train_df.columns:
    if ames_train_df[col].dtype != np.dtype('O'):
        ames_train_df[col] = ames_train_df[col].astype('float')
    if ames_train_df[col].dtype == np.dtype('O'):
        ames_train_df[col] = ames_train_df[col].astype('category')
for col in ames_test_df.columns:
    if ames_test_df[col].dtype != np.dtype('O'):
        ames_test_df[col] = ames_test_df[col].astype('float')  
    if ames_test_df[col].dtype == np.dtype('O'):
        ames_test_df[col] = ames_test_df[col].astype('category')      

In [18]:
for i, col in enumerate(ames_train_df.columns):
    dtype = ames_train_df[col].dtype    
    unique_train = ames_train_df[col].unique()
    unique_test = ames_train_df[col].unique()
    diff_test_train = set(unique_test) - set(unique_train)
    if len(unique_train) > 20:
        unique_train = "list length: {}".format(len(unique_train))
    print("{:3} {:20} {:10} {}".format(str(i), col, str(dtype), str(unique_train)))
    print()

0   MSSubClass           category   [60, 20, 70, 50, 190, ..., 80, 160, 75, 180, 40]
Length: 15
Categories (15, int64): [60, 20, 70, 50, ..., 160, 75, 180, 40]

1   MSZoning             category   [RL, RM, C (all), FV, RH]
Categories (5, object): [RL, RM, C (all), FV, RH]

2   LotFrontage          float64    list length: 111

3   LotArea              float64    list length: 1072

4   Street               category   [Pave, Grvl]
Categories (2, object): [Pave, Grvl]

5   Alley                category   [None, Grvl, Pave]
Categories (3, object): [None, Grvl, Pave]

6   LotShape             category   [Reg, IR1, IR2, IR3]
Categories (4, object): [Reg, IR1, IR2, IR3]

7   LandContour          category   [Lvl, Bnk, Low, HLS]
Categories (4, object): [Lvl, Bnk, Low, HLS]

8   Utilities            category   [AllPub, NoSeWa]
Categories (2, object): [AllPub, NoSeWa]

9   LotConfig            category   [Inside, FR2, Corner, CulDSac, FR3]
Categories (5, object): [Inside, FR2, Corner, CulDSac, FR3

#### Split DataFrames

In [19]:
ames_train_cblock_df = (ames_train_df.drop('MasVnrType_2', axis=1)
                                     .drop('MasVnrArea_2', axis=1))
ames_test_cblock_df = (ames_test_df.drop('MasVnrType_2', axis=1)
                                   .drop('MasVnrArea_2', axis=1))

In [20]:
ames_train_none_df = (ames_train_df.drop('MasVnrType_1', axis=1)
                                   .drop('MasVnrArea_1', axis=1))
ames_test_none_df = (ames_test_df.drop('MasVnrType_1', axis=1)
                                 .drop('MasVnrArea_1', axis=1))

In [21]:
ames_train_cblock_df.to_pickle('data/train_cblock_df.p')
ames_test_cblock_df.to_pickle('data/test_cblock_df.p')
ames_train_none_df.to_pickle('data/train_none_df.p')
ames_test_none_df.to_pickle('data/test_none_df.p')