# Setup

In [34]:
# General imports:
import pandas as pd

from termcolor import colored
from IPython.display import display

# plotting
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style('darkgrid')

# preprocessing
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

# models
from sklearn.ensemble import RandomForestRegressor

# evaluating
from sklearn.metrics import mean_absolute_error

# Global variables:
# Name of the column to predict; the loading cell splits it off from the training features.
target = 'SalePrice'
# Input CSVs, given as relative paths (resolved against the notebook's working directory).
train_path = '../data/house-prices/train.csv'
test_path = '../data/house-prices/test.csv'
# Destination CSV for the final test-set predictions.
submission_path = '../data/house-prices/submission.csv'

print('Setup complete.')

Setup complete.


In [27]:
# Loading data:
# Rows without a target value cannot be used for supervised training, so they
# are dropped right after reading. New frames are derived at every step instead
# of mutating one frame in place, which keeps each name tied to one stage.
train_labeled = pd.read_csv(train_path).dropna(axis=0, subset=[target])
X_test_full = pd.read_csv(test_path)

# Separate the target from the training features.
y = train_labeled[target]
X_full = train_labeled.drop(columns=[target])

# Quick overview of the training features: column names, column count,
# and how many columns there are of each dtype.
print(colored('Train data columns', color='blue', attrs=['dark', 'bold']))
print(colored(X_full.columns, color='green'))
print(colored('%d columns' % X_full.columns.size, color='red', attrs=['dark', 'bold']))
print(colored(X_full.dtypes.value_counts(), color='red'))

[1m[2m[34mTrain data columns[0m
[32mIndex(['Id', 'MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street',
       'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig',
       'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType',
       'HouseStyle', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'Gara

# Preprocessing and basic checks

In [29]:
# Split train/validation
# The feature frame produced by the loading cell is `X_full`; 80% of its rows
# go to training and 20% to validation, with a fixed seed for reproducibility.
X_train_full, X_valid_full, y_train, y_valid = train_test_split(
    X_full, y,
    train_size=0.8, test_size=0.2,
    random_state=0,
)

# Categorical features: text columns with fewer than 10 distinct values in the
# training split, so one-hot encoding does not create too many columns.
cat_cols = [col for col in X_train_full.columns if
            X_train_full[col].nunique() < 10 and
            X_train_full[col].dtype == 'object']

# Numeric features: every int64/float64 column.
# NOTE(review): 'Id' is presumably numeric and therefore selected here; the
# submission cell reads X_test.Id, so it is kept — confirm whether it should
# really be used as a model feature.
num_cols = [col for col in X_train_full.columns if
            X_train_full[col].dtype in ['int64', 'float64']]

# Restrict all three frames to the same selected columns; copies are taken so
# later changes do not touch the *_full frames.
my_cols = cat_cols + num_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test  = X_test_full[my_cols].copy()

print('done.')

done.


In [30]:
# Preprocessor
# Numeric columns: gaps are filled using SimpleImputer's 'constant' strategy
# (no fill_value is passed, so the imputer's own default is used).
num_transf = SimpleImputer(strategy='constant')

# Categorical columns: fill gaps with the most frequent category, then one-hot
# encode; handle_unknown='ignore' keeps transform from failing on categories
# that were not present when the encoder was fitted.
cat_steps = [
    ('impute', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
cat_transf = Pipeline(cat_steps)

# Route each column group to its own transformer.
column_transformers = [
    ('num', num_transf, num_cols),
    ('cat', cat_transf, cat_cols),
]
preprocessor = ColumnTransformer(column_transformers)

print('preprocessor done.')

preprocessor done.


# Choosing the model

In [59]:
def score_model(n_estimators, X_train, y_train, X_valid, y_valid):
    """Fit a random forest of the given size and return its validation MAE.

    Parameters
    ----------
    n_estimators : number of trees passed to ``RandomForestRegressor``.
    X_train, y_train : features and target the forest is fitted on.
    X_valid, y_valid : held-out features and target used for scoring.

    Returns
    -------
    The ``mean_absolute_error`` between ``y_valid`` and the forest's
    predictions for ``X_valid``.
    """
    # random_state is fixed so repeated calls with the same data are comparable.
    forest = RandomForestRegressor(n_estimators=n_estimators, random_state=0)
    valid_preds = forest.fit(X_train, y_train).predict(X_valid)
    return mean_absolute_error(y_valid, valid_preds)

# Fit the preprocessor on the training split only, then apply the fitted
# transformation to the validation split.
processed_X_train = preprocessor.fit_transform(X_train, y_train)
processed_X_valid = preprocessor.transform(X_valid)

# Validation MAE for each candidate forest size: 700, 710, ..., 890.
candidate_sizes = range(700, 900, 10)
scores_dict = dict(
    (size, score_model(size, processed_X_train, y_train, processed_X_valid, y_valid))
    for size in candidate_sizes
)

# Results recorded from earlier runs:
# best: 800 (mae 17413.38)
# best: 770 (mae 17400.31)

# Pick the forest size with the lowest validation MAE.
best_par = min(scores_dict, key=lambda size: scores_dict[size])
print('best: %d (mae %.2f)' % (best_par, scores_dict[best_par]))

best: 770 (mae 17400.31)


# Creating the final model

In [66]:
# Forest size pinned to the value reported by the search above
# (best: 770, mae 17400.31), so this cell does not require re-running the search.
best_par = 770
model = RandomForestRegressor(n_estimators=best_par, random_state=0)

# Creating pipeline: preprocessing and model are bundled so the exact same
# transformation is applied at fit and at predict time.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', model)
])

# fit on all labelled rows, restricted to the same columns X_test was built
# from, so the fitted and predicted frames have identical column sets.
pipeline.fit(X_full[my_cols], y)

# prediction
preds_test = pipeline.predict(X_test)

# Save test predictions to file
output = pd.DataFrame({'Id': X_test.Id,
                       'SalePrice': preds_test})
output.to_csv(submission_path, index=False)

print('Done.')

Done.
