In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

## Reading data

In [2]:
# Load the training and test CSV files, using the 'Id' column as the row index
X_full = pd.read_csv('../input/train.csv', index_col='Id')
X_test_full = pd.read_csv('../input/test.csv', index_col='Id')

# Discard training rows that have no target value, then split the target
# column off so that X_full holds predictors only
X_full = X_full.dropna(subset=['SalePrice'])
y = X_full['SalePrice']
X_full = X_full.drop(columns=['SalePrice'])

# Break off validation set from training data
#X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y,test_size=0.2, random_state=0)

# Categorical predictors: columns whose dtype is "object"
categorical_cols = [col for col in X_full.columns if X_full[col].dtype == "object"]

# Split the categorical columns by cardinality: a column with more than
# HIGH_CARDINALITY_THRESHOLD distinct values is treated as high-cardinality.
HIGH_CARDINALITY_THRESHOLD = 9
high_card_cat_cols = [col for col in categorical_cols
                      if X_full[col].nunique() > HIGH_CARDINALITY_THRESHOLD]
# Order-preserving difference. A set difference iterates in an order that can
# change between interpreter sessions (string hashing is randomized), which
# would reorder the encoded feature columns from one run to the next.
low_card_cat_cols = [col for col in categorical_cols if col not in high_card_cat_cols]

# Numerical predictors: every column whose dtype is not "object"
numerical_cols = [col for col in X_full.columns if X_full[col].dtype != "object"]

# All selected columns: categorical ones first, numerical ones after
my_cols = categorical_cols + numerical_cols

# Hand-picked list of columns singled out as having outliers
outlied_cols = ['LotArea', 'MiscVal', '1stFlrSF', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'MasVnrArea', 'GrLivArea', 'TotalBsmtSF',
                'WoodDeckSF', 'ScreenPorch', 'GarageArea']
# Every other selected column, kept in my_cols order so the list is reproducible
unoutlied_cols = [col for col in my_cols if col not in outlied_cols]


# Restrict the training and test frames to the selected columns; each result
# is an independent copy of the corresponding full frame
X, X_test = (frame[my_cols].copy() for frame in (X_full, X_test_full))

In [3]:
#unoutlied_cols

In [4]:
# Display the first rows of the selected-column training frame as a quick sanity check
X.head()

Unnamed: 0_level_0,MSZoning,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,...,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,RL,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,548,0,61,0,0,0,0,0,2,2008
2,RL,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,...,460,298,0,0,0,0,0,0,5,2007
3,RL,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,...,608,0,42,0,0,0,0,0,9,2008
4,RL,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,...,642,0,35,272,0,0,0,0,2,2006
5,RL,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,...,836,192,84,0,0,0,0,0,12,2008


In [5]:
#X.boxplot(column=unoutlied_cols, figsize=(40,10))

## Preprocessing and pipeline creation

In [6]:
# Scaler instances shared by the scaling definitions further down
myStandardScaler = StandardScaler()
myRobustScaler = RobustScaler()

# Numerical columns: fill missing values with the column median
numerical_transformer = SimpleImputer(strategy='median')

# High-cardinality categorical columns: fill missing values with the most
# frequent category, then map categories to integer codes (categories not
# seen during fit are encoded as -1)
high_card_categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(handle_unknown="use_encoded_value", unknown_value=-1)),
])

# Remaining categorical columns: fill missing values with the most frequent
# category, then one-hot encode (categories not seen during fit are ignored)
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('oneHote', OneHotEncoder(handle_unknown='ignore')),
])


# Bundle the per-type preprocessing: each transformer is applied to its own
# column list and the results are stacked side by side.
# sparse_threshold=0 forces a dense result. The one-hot branch produces a
# sparse matrix, and by default ColumnTransformer returns the stacked output
# as sparse whenever its overall density is below 0.3; a mean-centering
# StandardScaler placed after this step cannot process sparse input.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('low_card_cat', categorical_transformer, low_card_cat_cols),
        ('high_card_cat', high_card_categorical_transformer, high_card_cat_cols)
    ],
    sparse_threshold=0)

# Scaling stage that handles the outlier columns (outlied_cols, e.g. LotArea)
# separately from the remaining columns (unoutlied_cols).
# NOTE(review): this transformer is not part of my_pipeline, which passes
# myStandardScaler directly as its scaling step — confirm whether it should be
# wired in or removed.
# NOTE(review): StandardScaler is assigned to the outlier columns and
# RobustScaler to the rest; that looks inverted relative to the stated goal —
# confirm the intent.
# NOTE(review): unoutlied_cols is derived from my_cols, so it also contains the
# categorical column names — verify that selection is valid wherever this
# transformer would be applied.
Scaller = ColumnTransformer(
    transformers=[
        ('outlied', myStandardScaler, outlied_cols),
        ('other_cols', myRobustScaler, unoutlied_cols),
    ])

# Regressor constructed with default arguments
model_XGB = XGBRegressor()

# Chain the three stages in order: column preprocessing -> standard scaling -> model
my_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('scaller', myStandardScaler),
    ('model', model_XGB),
])


In [7]:
# Hyper-parameter grid for the search; the 'model__' prefix addresses the
# pipeline step named 'model'
param_grid = {
    'model__n_estimators': [850],
    'model__learning_rate': [0.025],
}

# 5-fold cross-validated grid search scored with negative mean absolute error;
# training-fold scores are recorded alongside the validation scores
gs_XGB = GridSearchCV(
    estimator=my_pipeline,
    param_grid=param_grid,
    cv=5,
    scoring='neg_mean_absolute_error',
    return_train_score=True,
)

gs_XGB.fit(X, y)

# Tabulate the cross-validation results, one row per parameter combination
cv_results = pd.DataFrame(gs_XGB.cv_results_)

# Evaluate the model.
# Report the row of the best-ranked candidate rather than row 0: row 0 is the
# best candidate only while the grid holds a single combination.
# Scores are negative MAE values, so abs() turns them back into errors.
best_row_label = gs_XGB.best_index_
print('Best Train Score:', abs(cv_results.loc[best_row_label, 'mean_train_score']))
print('Mean Valid Score:', abs(cv_results.loc[best_row_label, 'mean_test_score']))
#print('Best Params:', gs_XGB.best_params_)

Best Train Score: 1830.0023196703764
Mean Valid Score: 15964.613192958048


In [8]:
# Build a DataFrame of the grid-search results under the name `cv`
# (presumably for interactive inspection — see the commented-out line below)
cv = pd.DataFrame(gs_XGB.cv_results_)
#cv.columns

In [9]:
#my_pipeline.get_params().keys()

## Predict on the test data

In [10]:
# Predict SalePrice for the test set with the fitted grid-search object.
# X_test is passed in raw, so the pipeline's own preprocessing is applied
# inside predict(); no fitting happens in this cell.
preds_test = gs_XGB.predict(X_test)

In [11]:
# Assemble the submission table (test-set Id plus predicted SalePrice) and
# write it to disk without the DataFrame's own row index
submission_columns = {'Id': X_test.index, 'SalePrice': preds_test}
output = pd.DataFrame(submission_columns)
output.to_csv('submission.csv', index=False)