In [1]:
from helper_funcs import find_lack_vars_and_gen_var_dict, print_table
import numpy as np
import pandas as pd

### <font color='grey'> I. Data Pre-processing</font>

#### <font color='grey'> §I.1 Identifying null values and dropping columns/rows</font>

The first step comes from the particularities of this data set: some columns contain bona fide missing entries, while others contain an entry called `NA`, which signifies the known absence of a given house/property feature. To separate these two cases, we need to consult the variable description file.

In [8]:
# Read the raw Ames housing data; `cols` records the CSV column names in
# their original order.
df = pd.read_csv('AmesHousing.csv')
cols = df.columns.tolist()

In [9]:
var_descr_file = 'VariableDescriptions.txt'
lack_vars, var_descrs = find_lack_vars_and_gen_var_dict(var_descr_file)
# Map description-file variable names to the CSV column names.
# NOTE(review): the two name lists are paired purely by position and zip()
# silently truncates on a length mismatch -- confirm that both sources list
# the variables in the same order.
fix_var_names = dict(zip(var_descrs.keys(), cols))

# Replace `NA` entries (missing values after loading) w/ `'None'` strings
# in every "lack" variable, so a known absence of a feature is an explicit
# category rather than a null.
for var in lack_vars:
    csv_col = fix_var_names[var]
    df[csv_col] = df[csv_col].fillna('None')

# Although we have replaced the NA entries with 'None' strings and have
# thus made all corresponding columns completely non-null, there still may
# be too large a fraction of such entries to justify keeping such columns.
none_thresh = 0.3
# Fraction of 'None' entries per lack variable. Comparing against 'None'
# and averaging yields 0.0 for a column without any 'None' entry, whereas
# indexing value_counts()['None'] would raise a KeyError in that case.
none_fracs = [(df[fix_var_names[var]] == 'None').mean() for var in lack_vars]
none_fracs_dict = dict(zip(lack_vars, none_fracs))
# Derive the fractions from the selected names so both table columns are
# guaranteed to stay aligned.
lack_thresh_vars = [var for var in lack_vars if none_fracs_dict[var] > none_thresh]
none_thresh_fracs = [none_fracs_dict[var] for var in lack_thresh_vars]
print_table(['Feature', 'None Frac'], [lack_thresh_vars, none_thresh_fracs])

Feature         None Frac
 Alley            0.9324232081911262
 FireplaceQu      0.4853242320819113
 Pool QC          0.9955631399317406
 Fence            0.8047781569965871
 Misc Feature     0.9638225255972697


We will immediately drop the `Alley`, `Pool QC`, `Fence`, and `Misc Feature` columns. Although almost half the houses in the dataset lack fireplaces, we will accept this fraction of lack entries for `FireplaceQu` and assess downstream whether this variable helps the regression objective.

In [10]:
# Remove the lack variables that are almost entirely 'None'.
drop_cols = ['Alley', 'Pool QC', 'Fence', 'Misc Feature']
df = df.drop(columns=drop_cols)

In [11]:
# Re-check the above-threshold lack variables that survive the column drop.
# `drop_cols` holds CSV column names while `lack_thresh_vars` holds
# description-file names, so the comparison goes through `fix_var_names`.
# A list comprehension (unlike a set difference) keeps a deterministic order.
dropped_cols = set(drop_cols)
lack_thresh_vars = [var for var in lack_thresh_vars if fix_var_names[var] not in dropped_cols]
# Fraction of 'None' entries per remaining variable; yields 0.0 instead of
# a KeyError when a column holds no 'None' entry.
none_fracs = [(df[fix_var_names[var]] == 'None').mean() for var in lack_thresh_vars]
# Filter names and fractions together so the table columns stay aligned,
# reusing `none_thresh` rather than repeating the literal threshold.
above_thresh = [(var, frac) for var, frac in zip(lack_thresh_vars, none_fracs) if frac > none_thresh]
lack_thresh_vars = [pair[0] for pair in above_thresh]
none_thresh_fracs = [pair[1] for pair in above_thresh]
print_table(['Feature', 'None Frac'], [lack_thresh_vars, none_thresh_fracs])

Feature         None Frac
 FireplaceQu      0.4853242320819113


In [27]:
# Columns that still contain genuine missing values. `.any()` also catches
# a column that is entirely null, which a "two distinct isnull() values"
# test would skip.
cols = list(df.columns)
bona_fide_nulls = [col for col in cols if df[col].isnull().any()]

# One boolean null mask per affected column, computed once and reused below.
null_masks = {col: df[col].isnull() for col in bona_fide_nulls}

# Per-column statistics, stringified for print_table. These iterate over
# `bona_fide_nulls`; the previous code looped over `features`, a name that
# is not defined anywhere in this notebook.
null_fracs = [str(np.around(null_masks[col].mean(), 4)) for col in bona_fide_nulls]
num_nulls = [str(null_masks[col].sum()) for col in bona_fide_nulls]

# Columns with only a handful of nulls are handled by dropping the affected
# rows: collect, per such column, the index labels of its null rows
# (a list of lists).
max_nulls_for_row_drop = 25
drop_rows = [
    df.index[null_masks[col]].tolist()
    for col in bona_fide_nulls
    if null_masks[col].sum() < max_nulls_for_row_drop
]

col_titles = ['Feature', 'Null Frac', 'Num Null']
# NOTE: `cols` is rebound here from the column names to the table columns.
cols = [bona_fide_nulls, null_fracs, num_nulls]
print_table(col_titles, cols)

Feature         Null Frac       Num Null
 Lot Frontage     0.1672           490
 Mas Vnr Type     0.0078           23
 Mas Vnr Area     0.0078           23
 BsmtFin SF 1     0.0003           1
 BsmtFin SF 2     0.0003           1
 Bsmt Unf SF      0.0003           1
 Total Bsmt SF    0.0003           1
 Electrical       0.0003           1
 Bsmt Full Bath   0.0007           2
 Bsmt Half Bath   0.0007           2
 Garage Yr Blt    0.0543           159
 Garage Cars      0.0003           1
 Garage Area      0.0003           1


In [30]:
# Flatten the per-column lists of row labels into one flat list.
flat_drop_rows = []
for row_labels in drop_rows:
    flat_drop_rows.extend(row_labels)
drop_rows = flat_drop_rows

In [31]:
# Remove the rows collected in `drop_rows` (index labels).
df = df.drop(index=drop_rows)

In [32]:
# Keep only the columns that still exist in `df` and still mix null and
# non-null entries after the row drop.
remaining_nulls = []
for name in bona_fide_nulls:
    if name not in df.columns:
        continue
    if df[name].isnull().nunique() > 1:
        remaining_nulls.append(name)
bona_fide_nulls = remaining_nulls
print(bona_fide_nulls)

['Lot Frontage', 'Garage Yr Blt']


Since we cannot impute any data without first encoding our categorical variables, we stop here and move on to identifying those categorical variables and carrying out an appropriate one-hot encoding.