## Imports

In [20]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

## Loading the train and test data
We use the data from the home-data-for-ml-course from Kaggle:
* train.csv, which we split to the training and validation datasets
* test.csv, which we use to test our model

We preliminarily clean the data, by:
* dropping the rows with a null ('NaN') target variable
* separating the target variable (to y) and the predictor variables (to X)

In [2]:
# Load the data
X_full = pd.read_csv('train.csv', index_col='Id')
X_test_full = pd.read_csv('test.csv', index_col='Id')

# Drop the rows (axis=0) with a null target variable, from X_full
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)

# Split X_full to X and y
# We do this so we don't give the model y (the 'answer') by accident, since it is supposed to predict y from X
y = X_full.SalePrice
X = X_full.drop('SalePrice', axis=1)

In [3]:
# Split X, y into training and validation datasets
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X, y, train_size = 0.8, test_size = 0.2, random_state=0)

In [6]:
# Investigating cardinality
object_nunique = list(map(lambda col : X_train_full[col].nunique(), cat_cols))
d = dict(zip(cat_cols, object_nunique))
sorted(d.items(), key=lambda x : x[1])

[('Street', 2),
 ('Alley', 2),
 ('Utilities', 2),
 ('CentralAir', 2),
 ('LandSlope', 3),
 ('GarageFinish', 3),
 ('PavedDrive', 3),
 ('PoolQC', 3),
 ('MiscFeature', 3),
 ('LotShape', 4),
 ('LandContour', 4),
 ('MasVnrType', 4),
 ('ExterQual', 4),
 ('BsmtQual', 4),
 ('BsmtCond', 4),
 ('BsmtExposure', 4),
 ('KitchenQual', 4),
 ('Fence', 4),
 ('MSZoning', 5),
 ('LotConfig', 5),
 ('BldgType', 5),
 ('ExterCond', 5),
 ('HeatingQC', 5),
 ('Electrical', 5),
 ('FireplaceQu', 5),
 ('GarageQual', 5),
 ('GarageCond', 5),
 ('Condition2', 6),
 ('RoofStyle', 6),
 ('Foundation', 6),
 ('BsmtFinType1', 6),
 ('BsmtFinType2', 6),
 ('Heating', 6),
 ('Functional', 6),
 ('GarageType', 6),
 ('SaleCondition', 6),
 ('RoofMatl', 7),
 ('HouseStyle', 8),
 ('Condition1', 9),
 ('SaleType', 9)]

Subsequently we want to split our data into numerical columns and categorical columns.

For categorical columns, we want to choose only low-cardinality columns; which means excluding the last 3 columns.

In [5]:
# Selecting numerical columns
num_cols = [col for col in X_train_full.columns if X_train_full[col].dtypes in ['int64', 'float64']]

# Selecting categorical columns
cat_cols = [col for col in X_train_full.columns if X_train_full[col].dtypes == 'object' and X_train_full[col].nunique() < 10]

# We have split the columns correctly
# len(num_cols) + len(cat_cols) + 3 == len(X_train.columns)

# Keep selected columns only
my_cols = num_cols + cat_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()

# Learning to use pipelines
A Pipeline() is a sequence of transforms (SimpleImputer, OneHotEncoding, LabelEncoding; DecisionTreeRegressor, RandomForestRegressor) that can be fitted and transformed
* Note that Pipelines(), ColumnTransformer() are composite transforms and can be included in Transform1(), Transform2() etc.

        
        my_pipeline = Pipeline(steps=[
            ('name1', Transform1()),
            ('name2', Transform2())
            ])
            
A ColumnTransformer() applies different pipelines/transforms to different columns

        column_transformer = ColumnTransformer(transformers=[
            ('name1', Transform1(), column_list_1), # applies Transform1() to columns in column_list_1
            ('name2', Transform2(), column_list_2)  # applies Transform2() to columns in column_list_2
            ])

## Pre-processing steps
* Numerical data: impute missing data
* Categorical data: impute missing data, and apply one-hot encoding

In [15]:
# Define the transform for numerical columns
# No pipeline is needed, since only one transformation is applied
numerical_transformer = SimpleImputer(strategy='mean')

# Define the transform for categorical column
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

# Define the pre-processor column transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_transformer, num_cols),
    ('cat', categorical_transformer, cat_cols)
])

## Defining the model
* Using the random forest regressor

In [16]:
model = RandomForestRegressor(n_estimators=100, random_state=0)

## Create and evaluate the pipeline

In [25]:
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

my_pipeline.fit(X_train, y_train)
preds = my_pipeline.predict(X_valid)
mae = mean_absolute_error(preds, y_valid)
print('MAE:', mae)

MAE: 17648.417157534244


# Cross-validation
* Principle: iterate training and testing over the whole dataset so that the whole dataset is eventually used to validate the model


* Break the predictor variable dataset X into k 'folds'
    * In experiment 1, use fold 1 as the validation dataset
    * In experiment 2, use fold 2 as the validation dataset
    * ...
    * In experiment k, use fold k as the validation dataset


In [24]:
# Multiply by -1 since cross_val_scores use negative MAE
scores = -1 * cross_val_score(my_pipeline, X, y,
                             cv=5,
                             scoring='neg_mean_absolute_error')

print('Average MAE:', scores.mean())

Average MAE: 17664.960273972603
