# 8. AMES HOUSING: FEATURE ENGINEERING
---

## 1. Introduction
Let's start by filtering the training set to just the columns containing no missing values.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
# Display options: allow up to 99 columns / 999 rows to be shown and round floats to 3 decimals.
pd.set_option("display.max_columns", 99)
pd.set_option("display.max_rows", 999)
# Use the fully-qualified option name. The abbreviated 'precision' pattern matches more
# than one option on newer pandas (display.precision and styler.format.precision),
# which makes set_option raise OptionError.
pd.set_option('display.precision', 3)

# The Ames housing data is tab-separated.
ames = pd.read_csv('data/Ames_Housing.txt', delimiter="\t")

# Positional split: the first 1460 rows are the training set, the remainder the test set.
# Take explicit copies so later column assignments on train/test operate on independent
# frames rather than on slices of `ames`.
train = ames.iloc[:1460].copy()
test = ames.iloc[1460:].copy()

# Per-column count of missing values in the training set.
train_null_counts = train.isnull().sum()
# Keep only the columns that have no missing values at all.
df_no_null = train.loc[:, train_null_counts == 0]
# Sanity check: every remaining column should report 0 nulls.
df_no_null.isnull().sum()

Order              0
PID                0
MS SubClass        0
MS Zoning          0
Lot Area           0
Street             0
Lot Shape          0
Land Contour       0
Utilities          0
Lot Config         0
Land Slope         0
Neighborhood       0
Condition 1        0
Condition 2        0
Bldg Type          0
House Style        0
Overall Qual       0
Overall Cond       0
Year Built         0
Year Remod/Add     0
Roof Style         0
Roof Matl          0
Exterior 1st       0
Exterior 2nd       0
Exter Qual         0
Exter Cond         0
Foundation         0
Heating            0
Heating QC         0
Central Air        0
Electrical         0
1st Flr SF         0
2nd Flr SF         0
Low Qual Fin SF    0
Gr Liv Area        0
Full Bath          0
Half Bath          0
Bedroom AbvGr      0
Kitchen AbvGr      0
Kitchen Qual       0
TotRms AbvGrd      0
Functional         0
Fireplaces         0
Garage Cars        0
Garage Area        0
Paved Drive        0
Wood Deck SF       0
Open Porch SF      0

In [6]:
# Tally the no-null columns by their dtype.
(
    df_no_null
    .dtypes
    .value_counts()
)

int64      28
object     28
float64     2
dtype: int64

## 2. Categorical Features
Let's convert all of the text columns that contain no missing values into the categorical data type.

In [5]:
# Column labels of the object-dtype (text) columns among the no-null columns.
text_cols = df_no_null.select_dtypes(include='object').columns
# Print how many there are, then display the labels themselves.
print(text_cols.size)
text_cols

28


Index(['MS Zoning', 'Street', 'Lot Shape', 'Land Contour', 'Utilities',
       'Lot Config', 'Land Slope', 'Neighborhood', 'Condition 1',
       'Condition 2', 'Bldg Type', 'House Style', 'Roof Style', 'Roof Matl',
       'Exterior 1st', 'Exterior 2nd', 'Exter Qual', 'Exter Cond',
       'Foundation', 'Heating', 'Heating QC', 'Central Air', 'Electrical',
       'Kitchen Qual', 'Functional', 'Paved Drive', 'Sale Type',
       'Sale Condition'],
      dtype='object')

In [7]:
# Report the number of distinct values in each text column of the training set.
for col in text_cols:
    distinct_values = train[col].unique()
    print(f"{col}:", distinct_values.size)

MS Zoning: 6
Street: 2
Lot Shape: 4
Land Contour: 4
Utilities: 3
Lot Config: 5
Land Slope: 3
Neighborhood: 26
Condition 1: 9
Condition 2: 6
Bldg Type: 5
House Style: 8
Roof Style: 6
Roof Matl: 5
Exterior 1st: 14
Exterior 2nd: 16
Exter Qual: 4
Exter Cond: 5
Foundation: 6
Heating: 6
Heating QC: 4
Central Air: 2
Electrical: 4
Kitchen Qual: 5
Functional: 7
Paved Drive: 3
Sale Type: 9
Sale Condition: 5


In [11]:
# Convert every no-null text column to the category dtype in a single pass.
# astype with a column -> dtype mapping returns a new frame, which avoids copying
# the whole DataFrame once per column.
train = train.astype(dict.fromkeys(text_cols, 'category'))

# Integer codes assigned to the Utilities categories, with how often each occurs.
train['Utilities'].cat.codes.value_counts()

0    1457
2       2
1       1
dtype: int64

In [14]:
# Frequencies of the Utilities column by category label.
(
    train
    .loc[:, 'Utilities']
    .value_counts()
)

AllPub    1457
NoSewr       2
NoSeWa       1
Name: Utilities, dtype: int64

In [16]:
# Show the dtype of each text column in the training set.
train.dtypes.loc[text_cols]

MS Zoning         category
Street            category
Lot Shape         category
Land Contour      category
Utilities         category
Lot Config        category
Land Slope        category
Neighborhood      category
Condition 1       category
Condition 2       category
Bldg Type         category
House Style       category
Roof Style        category
Roof Matl         category
Exterior 1st      category
Exterior 2nd      category
Exter Qual        category
Exter Cond        category
Foundation        category
Heating           category
Heating QC        category
Central Air       category
Electrical        category
Kitchen Qual      category
Functional        category
Paved Drive       category
Sale Type         category
Sale Condition    category
dtype: object