# This is my second attempt at the Kaggle Advanced House Prices competition I incorporated pipeline into this data cleaning and used an XGBoost model 
## You can find this at https://www.kaggle.com/c/house-prices-advanced-regression-techniques/overview
## The model and cleaning are all done here, whereas the EDA was done in my first attempt

In [9]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [14]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.neural_network import MLPRegressor

# Read the data
X_full = pd.read_csv('./train.csv', index_col='Id')
X_test_full = pd.read_csv('./test.csv', index_col='Id')

# Remove rows with missing target, separate target from predictors
X_full.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X_full.SalePrice
X_full.drop(['SalePrice'], axis=1, inplace=True)

# Break off validation set from training data
X_train_full, X_valid_full, y_train, y_valid = train_test_split(X_full, y, 
                                                                train_size=0.8, test_size=0.2,
                                                                random_state=0)

# "Cardinality" means the number of unique values in a column
# Select categorical columns with relatively low cardinality (convenient but arbitrary)
categorical_cols = [cname for cname in X_train_full.columns if
                    #X_train_full[cname].nunique() < 10 and 
                    X_train_full[cname].dtype == "object"]

# Select numerical columns
numerical_cols = [cname for cname in X_train_full.columns if 
                X_train_full[cname].dtype in ['int64', 'float64']]


# Keep selected columns only
my_cols = categorical_cols + numerical_cols
X_train = X_train_full[my_cols].copy()
X_valid = X_valid_full[my_cols].copy()
X_test = X_test_full[my_cols].copy()

# Preprocessing for numerical data
numerical_transformer = SimpleImputer(strategy='constant')

# Preprocessing for categorical data
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Bundle preprocessing for numerical and categorical data
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])



from sklearn.base import BaseEstimator
class CustomTransformer(BaseEstimator):

    def __init__(self):
        pass

    def fit(self, documents, y=None):
        return self

    def transform(self, x_dataset):
        x_dataset.loc[x_dataset['Fireplaces']==0,'FireplaceQu']='Nothing'
        x_dataset['LotFrontage'] = x_dataset['LotFrontage'].fillna(x_dataset.groupby('1stFlrSF')['LotFrontage'].transform('mean'))
        x_dataset['LotFrontage'].interpolate(method='linear',inplace=True)
        #x_dataset['LotFrontage']=x_dataset['LotFrontage'].astype(int)
        x_dataset['MasVnrArea'] = x_dataset['MasVnrArea'].fillna(x_dataset.groupby('MasVnrType')['MasVnrArea'].transform('mean'))
        x_dataset['MasVnrArea'].interpolate(method='linear',inplace=True)
        #x_dataset['MasVnrArea']=x_dataset['MasVnrArea'].astype(int)
        x_dataset["Fence"] = x_dataset["Fence"].fillna("None")
        x_dataset["FireplaceQu"] = x_dataset["FireplaceQu"].fillna("None")
        x_dataset["Alley"] = x_dataset["Alley"].fillna("None")
        x_dataset["PoolQC"] = x_dataset["PoolQC"].fillna("None")
        x_dataset["MiscFeature"] = x_dataset["MiscFeature"].fillna("None")
        x_dataset.loc[x_dataset['BsmtFinSF1']==0,'BsmtFinType1']='Unf'
        x_dataset.loc[x_dataset['BsmtFinSF2']==0,'BsmtQual']='TA'
        x_dataset['YrBltRmd']=x_dataset['YearBuilt']+x_dataset['YearRemodAdd']
        x_dataset['Total_Square_Feet'] = (x_dataset['BsmtFinSF1'] + x_dataset['BsmtFinSF2'] + x_dataset['1stFlrSF'] + x_dataset['2ndFlrSF'] + x_dataset['TotalBsmtSF'])
        x_dataset['Total_Bath'] = (x_dataset['FullBath'] + (0.5 * x_dataset['HalfBath']) + x_dataset['BsmtFullBath'] + (0.5 * x_dataset['BsmtHalfBath']))
        x_dataset['Total_Porch_Area'] = (x_dataset['OpenPorchSF'] + x_dataset['3SsnPorch'] + x_dataset['EnclosedPorch'] + x_dataset['ScreenPorch'] + x_dataset['WoodDeckSF'])
        x_dataset['exists_pool'] = x_dataset['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
        x_dataset['exists_garage'] = x_dataset['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
        x_dataset['exists_fireplace'] = x_dataset['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
        x_dataset['exists_bsmt'] = x_dataset['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
        x_dataset['old_house'] = x_dataset['YearBuilt'].apply(lambda x: 1 if x <1990 else 0)
        
        
        return x_dataset

        

from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error

# Bundle preprocessing and modeling code in a pipeline
my_pipeline = Pipeline(steps=[('CustomTransformer', CustomTransformer()),
                                ('pre_processing',preprocessor),
                              ('model', XGBRegressor(n_estimators=1000, learning_rate=0.01, n_jobs=4))
                             ])

#from sklearn.model_selection import cross_val_score
#scores = cross_val_score(my_pipeline,X_train,y_train,scoring='neg_mean_squared_error',cv=10)
#finalavg


# Preprocessing of training data, fit model 
my_pipeline.fit(X_train, y_train)

# Preprocessing of validation data, get predictions
preds = my_pipeline.predict(X_valid)

# Evaluate the model
score = mean_absolute_error(y_valid, preds)
print('MAE:', score)


MAE: 15936.357809824487


In [61]:
final_predictions = my_pipeline.predict(X_test)
# adding the ID column back
temp_df = pd.read_csv('./test.csv')
temp_df['SalePrice'] = final_predictions
final_df = temp_df[['Id','SalePrice']]
print(final_df.head())

def fillInAnyNulls(x):
    if (x == 0):
        return final_df['SalePrice'].mean()
    else: 
        return x

final_df.apply(lambda x: fillInAnyNulls(x['SalePrice']),axis=1) 
final_df.to_csv("final_submission7.csv", index=False)
final_df.head()

     Id      SalePrice
0  1461  123159.757812
1  1462  148119.078125
2  1463  189682.078125
3  1464  190631.515625
4  1465  194007.281250


Unnamed: 0,Id,SalePrice
0,1461,123159.757812
1,1462,148119.078125
2,1463,189682.078125
3,1464,190631.515625
4,1465,194007.28125


In [None]:
# Trying to do some customs pipeline transformations




# pre-processsing step
# Drop the columns - 
# Impute the missing values in column Item_Weight by mean
# Scale the data in the column Item_MRP
pre_process = ColumnTransformer(remainder='passthrough',
                                transformers=[
                                            #('impute_item_weight', SimpleImputer(strategy='mean'), ['Item_Weight']),
                                            #  ('scale_data', StandardScaler(),['Item_MRP'])
                                            ('num', SimpleImputer(strategy='constant'), numerical_cols),
                                            ('imputer', SimpleImputer(strategy='most_frequent'), categorical_cols),
                                            ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
                                             ])


# import the BaseEstimator
# define the class OutletTypeEncoder
# This will be our custom transformer that will create 3 new binary columns
# custom transformer must have methods fit and transform
from sklearn.base import BaseEstimator
class CustomTransformer(BaseEstimator):

    def __init__(self):
        pass

    def fit(self, documents, y=None):
        return self

    def transform(self, x_dataset):
        x_dataset.loc[x_dataset['Fireplaces']==0,'FireplaceQu']='Nothing'
        x_dataset['LotFrontage'] = x_dataset['LotFrontage'].fillna(x_dataset.groupby('1stFlrSF')['LotFrontage'].transform('mean'))
        x_dataset['LotFrontage'].interpolate(method='linear',inplace=True)
        x_dataset['LotFrontage']=x_dataset['LotFrontage'].astype(int)
        x_dataset['MasVnrArea'] = x_dataset['MasVnrArea'].fillna(x_dataset.groupby('MasVnrType')['MasVnrArea'].transform('mean'))
        x_dataset['MasVnrArea'].interpolate(method='linear',inplace=True)
        x_dataset['MasVnrArea']=x_dataset['MasVnrArea'].astype(int)
        x_dataset["Fence"] = x_dataset["Fence"].fillna("None")
        x_dataset["FireplaceQu"] = x_dataset["FireplaceQu"].fillna("None")
        x_dataset["Alley"] = x_dataset["Alley"].fillna("None")
        x_dataset["PoolQC"] = x_dataset["PoolQC"].fillna("None")
        x_dataset["MiscFeature"] = x_dataset["MiscFeature"].fillna("None")
        x_dataset.loc[x_dataset['BsmtFinSF1']==0,'BsmtFinType1']='Unf'
        x_dataset.loc[x_dataset['BsmtFinSF2']==0,'BsmtQual']='TA'
        x_dataset['YrBltRmd']=x_dataset['YearBuilt']+x_dataset['YearRemodAdd']
        x_dataset['Total_Square_Feet'] = (x_dataset['BsmtFinSF1'] + x_dataset['BsmtFinSF2'] + x_dataset['1stFlrSF'] + x_dataset['2ndFlrSF'] + x_dataset['TotalBsmtSF'])
        x_dataset['Total_Bath'] = (x_dataset['FullBath'] + (0.5 * x_dataset['HalfBath']) + x_dataset['BsmtFullBath'] + (0.5 * x_dataset['BsmtHalfBath']))
        x_dataset['Total_Porch_Area'] = (x_dataset['OpenPorchSF'] + x_dataset['3SsnPorch'] + x_dataset['EnclosedPorch'] + x_dataset['ScreenPorch'] + x_dataset['WoodDeckSF'])
        x_dataset['exists_pool'] = x_dataset['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
        x_dataset['exists_garage'] = x_dataset['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
        x_dataset['exists_fireplace'] = x_dataset['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
        x_dataset['exists_bsmt'] = x_dataset['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
        x_dataset['old_house'] = x_dataset['YearBuilt'].apply(lambda x: 1 if x <1990 else 0)
        
        return x_dataset


In [5]:
train.loc[train['Fireplaces']==0,'FireplaceQu']='Nothing'
train['LotFrontage'] = train['LotFrontage'].fillna(train.groupby('1stFlrSF')['LotFrontage'].transform('mean'))
train['LotFrontage'].interpolate(method='linear',inplace=True)
train['LotFrontage']=train['LotFrontage'].astype(int)
train['MasVnrArea'] = train['MasVnrArea'].fillna(train.groupby('MasVnrType')['MasVnrArea'].transform('mean'))
train['MasVnrArea'].interpolate(method='linear',inplace=True)
train['MasVnrArea']=train['MasVnrArea'].astype(int)
train["Fence"] = train["Fence"].fillna("None")
train["FireplaceQu"] = train["FireplaceQu"].fillna("None")
train["Alley"] = train["Alley"].fillna("None")
train["PoolQC"] = train["PoolQC"].fillna("None")
train["MiscFeature"] = train["MiscFeature"].fillna("None")
train.loc[train['BsmtFinSF1']==0,'BsmtFinType1']='Unf'
train.loc[train['BsmtFinSF2']==0,'BsmtQual']='TA'
train['YrBltRmd']=train['YearBuilt']+train['YearRemodAdd']
train['Total_Square_Feet'] = (train['BsmtFinSF1'] + train['BsmtFinSF2'] + train['1stFlrSF'] + train['2ndFlrSF'] + train['TotalBsmtSF'])
train['Total_Bath'] = (train['FullBath'] + (0.5 * train['HalfBath']) + train['BsmtFullBath'] + (0.5 * train['BsmtHalfBath']))
train['Total_Porch_Area'] = (train['OpenPorchSF'] + train['3SsnPorch'] + train['EnclosedPorch'] + train['ScreenPorch'] + train['WoodDeckSF'])
train['exists_pool'] = train['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
train['exists_garage'] = train['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
train['exists_fireplace'] = train['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
train['exists_bsmt'] = train['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
train['old_house'] = train['YearBuilt'].apply(lambda x: 1 if x <1990 else 0)