## Data cleaning
- Refer to the data dictionary in the GitHub repository

In [1]:
import pandas as pd
import numpy as np
import pickle

from sklearn.impute import SimpleImputer

In [2]:
# Load the raw training split; the cells below all work from this DataFrame.
df_train = pd.read_csv(
    filepath_or_buffer="../kaggle-california-housing-data/train.csv",
)

### Inspect columns and values
- Let's first take a look at the null values
- Make sure that a null value makes sense in these columns
- Then decide how to replace the null values in each column
- NaN means this in the columns:
    - LotFrontage: No street connected to the property
    - Alley: NaN means no Alley
    - MasVnrType: Not sure, already has a None category
    - MasVnrArea: Not sure, already has a 0 area
    - BsmtQual: No basement
    - BsmtFinType1: No basement
    - BsmtFinType2: No basement
    - Electrical: Not sure, no electrical system?
    - FireplaceQu: No fireplace
    - GarageType: No garage
    - GarageYrBlt: No garage (check)
    - GarageFinish: No garage (check)
    - GarageQual: No garage (check)
    - GarageCond: No garage (check)
    - PoolQC: No pool
    - Fence: No fence
    - MiscFeature: No miscellaneous features

In [3]:
# Columns that hold at least one missing value, selected with a boolean mask
# (one flag per column) over the column index.
df_train.columns[df_train.isna().any().to_numpy()]

Index(['LotFrontage', 'Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Electrical', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence',
       'MiscFeature'],
      dtype='object')

### Individual checks
- Make sure that the NaNs are consistent

In [4]:
# For the rows where BsmtQual is missing, show the distinct value combinations
# found in the basement finish-type columns.
df_train.loc[
    lambda frame: frame["BsmtQual"].isna(),
    ["BsmtFinType1", "BsmtFinType2"],
].drop_duplicates()

Unnamed: 0,BsmtFinType1,BsmtFinType2
17,,


In [5]:
# For the rows where GarageType is missing, show the distinct value
# combinations found in the remaining garage columns.
df_train.loc[
    lambda frame: frame["GarageType"].isna(),
    ["GarageYrBlt", "GarageFinish", "GarageQual", "GarageCond"],
].drop_duplicates()

Unnamed: 0,GarageYrBlt,GarageFinish,GarageQual,GarageCond
39,,,,


## Impute values
- LotFrontage will be changed to 0
- MasVnrType will be changed to "None"
- MasVnrArea will be changed to 0
- Rows with a missing Electrical value will be dropped
- Everything else will be changed to "None"

In [6]:
def replace_nans(df, col, method):
    """Return the values of ``df[col]`` with every NaN replaced by a constant.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column to impute. It is not modified; the caller
        assigns the returned array back to the column.
    col : str
        Label of the column to impute.
    method : scalar
        Despite its name, this is the constant fill value passed to
        ``SimpleImputer(strategy="constant")`` (for example ``0``).

    Returns
    -------
    numpy.ndarray
        1-D array with one entry per row of ``df``.
    """
    imputer = SimpleImputer(
        missing_values=np.nan,
        strategy="constant",
        fill_value=method
    )
    # SimpleImputer expects input shaped (n_samples, n_features). The column is
    # a single feature with len(df) samples, so it must be a column vector.
    # Passing it as one row (reshape(1, -1)) would turn every NaN entry into an
    # "all-missing feature", which scikit-learn may drop from the output,
    # yielding an array shorter than the DataFrame.
    column_2d = np.array(df[col]).reshape(-1, 1)
    return imputer.fit_transform(column_2d)[:, 0]

In [7]:
# Both numeric columns get 0 in place of their missing values.
for numeric_col in ("LotFrontage", "MasVnrArea"):
    df_train[numeric_col] = replace_nans(df_train, numeric_col, 0)

In [8]:
# Drop the rows whose Electrical value is missing, then fill every remaining
# missing value with the string "None".
# NOTE(review): this also writes the string "None" into GarageYrBlt, which is
# listed above among the columns with nulls — confirm that is intended.
df_train = df_train.dropna(subset=["Electrical"]).fillna("None")

## Export cleaned data

In [9]:
# Persist the imputed frame as a pickle next to the raw data.
with open('../kaggle-california-housing-data/df_imputed.pickle', mode='wb') as pickle_file:
    pickle.dump(df_train, pickle_file)