In [1]:
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from matplotlib.pyplot import figure
sns.set()

In [2]:
data = pd.read_csv("data/train.csv")
data.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
from sklearn.model_selection import train_test_split
X = data.drop('SalePrice', axis=1)
y = data.SalePrice
X_test = pd.read_csv('data/test.csv')
X_train, X_val ,y_train, y_val = train_test_split(X,y, random_state=16)
full_X_train = pd.concat([X_train,y_train],axis=1)

In [12]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.compose import ColumnTransformer

class PandasTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.columns_dropped = ['LotFrontage', 'Alley', 'FireplaceQu', 'PoolQC', 'Fence', 'MiscFeature','YearRemodAdd','YearBuilt','MoSold', 'YrSold','GarageType','GarageYrBlt','GarageFinish','GarageArea','GarageCond','Exterior2nd']
        self.cat_col = ['MSZoning','Street','LotShape',
         'LandContour','Utilities','LotConfig','LandSlope','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl','MasVnrType','ExterQual','ExterCond','Foundation','BsmtQual','BsmtCond','BsmtExposure','BsmtFinType1','BsmtFinType2',
         'Heating','HeatingQC','CentralAir','Electrical','KitchenQual','Functional','GarageQual','PavedDrive','SaleType','SaleCondition']
        self.qc_order = {"na":0,"po":1, "fa":2, "ta":3,'gd':4,'ex':5}
        self.col_to_ordinal = ['ExterQual','ExterCond','BsmtQual','BsmtCond','HeatingQC','KitchenQual','GarageQual']
        self.catcol_wout_ordinal = ['MSZoning','Street', 'LotShape',
     'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'Foundation', 'BsmtExposure', 'BsmtFinType1',
     'BsmtFinType2', 'Heating', 'CentralAir', 'Electrical', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']
        self.g1 = [['MeadowV', 'BrDale', 'IDOTRR'],
         ['OldTown',
          'BrkSide',
          'Edwards',
          'Sawyer',
          'NPkVill',
          'SWISU',
          'NAmes',
          'Blueste',
          'Mitchel',
          'SawyerW'],
         ['NWAmes',
          'Gilbert',
          'Blmngtn',
          'CollgCr',
          'Crawfor',
          'ClearCr',
          'Somerst',
          'Veenker'],
         ['Timber', 'StoneBr'],
         ['NridgHt', 'NoRidge']]
        self.g2 = ['BrkComm',
         ['AsphShn', 'CBlock', 'AsbShng'],
         ['Wd Sdng', 'MetalSd', 'WdShing', 'HdBoard'],
         ['Plywood', 'Stucco', 'BrkFace'],
         ['VinylSd', 'CemntBd'],
         ['ImStucc', 'Stone']]
    def fit(self, X, y= None):
        return self
    def transform(self, X):
        print(X.shape)
        X = X.drop(self.columns_dropped,axis=1)
        for col in self.cat_col:
            X[col] = X[col].str.lower()
        X[self.col_to_ordinal] = X[self.col_to_ordinal].fillna("na") 
        X[self.col_to_ordinal] = X[self.col_to_ordinal].replace(self.qc_order)
        
        for x in range(len(self.g1)):
            X = X.replace(self.g1[x], x)
        for x in range(len(self.g2)):
            X = X.replace(self.g2[x], x)
        X = X.set_index(X['Id'])
        X = X.drop('Id',axis=1)
        return X

In [13]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer

num_col = ['MSSubClass', 'LotArea', 'OverallQual', 'OverallCond', 'MasVnrArea', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'HeatingQC', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageQual', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']
cat_col = ['MSZoning','Street', 'LotShape','LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'MasVnrType', 'Foundation', 'BsmtExposure', 'BsmtFinType1','BsmtFinType2', 'Heating', 'CentralAir', 'Electrical', 'Functional', 'PavedDrive', 'SaleType', 'SaleCondition']

custom_pipeline = Pipeline([
    ('pandas_transform', PandasTransform())
    ])
cat_pipeline = Pipeline([
    ('cat_encoder', OneHotEncoder(handle_unknown='ignore'))
    ])
preprocessor = ColumnTransformer(transformers=[
    ('cat', cat_pipeline, cat_col),
    ('num', SimpleImputer(), num_col)
    ])

In [14]:
def make_pipeline(model):
    return Pipeline([
        ('custom', PandasTransform()),
        ('preprocessor', preprocessor),       
        ('model', model)
        ])

In [15]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
pipeline_xgb = make_pipeline( XGBRegressor() )


param_grid = {'model__n_estimators':(100,500),
              'model__learning_rate':(0.05,0.1,0.5,1)
              }


# cv_model_search = GridSearchCV(pipeline_xgb, 
#                                param_grid, 
#                                cv = 3,
#                                refit=False, 
#                                verbose = 1,
#                               error_score='raise') 

# # Fit all (1) models defined in our model-search object
# best = cv_model_search.fit(X_train,y_train)

In [8]:
best.cv_results_['mean_test_score']

NameError: name 'best' is not defined

In [None]:
best.cv_results_['params']

In [16]:
param_grid2 = {'model__n_estimators':(100,500,1000),
              'model__learning_rate':(0.05,0.075,0.08,0.09)
              }


cv_model_search2 = GridSearchCV(pipeline_xgb, 
                               param_grid2, 
                               cv = 3,
                               refit=True, 
                               verbose = 0,
                              error_score='raise') 

# Fit all (1) models defined in our model-search object
cv_model_search2 = cv_model_search2.fit(X_train,y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(730, 80)
(365, 80)
(1095, 80)


In [17]:
cv_model_search2.cv_results_['mean_test_score']

array([0.86951303, 0.87386984, 0.87366273, 0.87989778, 0.87964166,
       0.87960392, 0.87657063, 0.87711472, 0.87703585, 0.87573612,
       0.87474257, 0.87470041])

In [20]:
cv_model_search2.best_estimator_

Pipeline(steps=[('custom', PandasTransform()),
                ('preprocessor',
                 ColumnTransformer(transformers=[('cat',
                                                  Pipeline(steps=[('cat_encoder',
                                                                   OneHotEncoder(handle_unknown='ignore'))]),
                                                  ['MSZoning', 'Street',
                                                   'LotShape', 'LandContour',
                                                   'Utilities', 'LotConfig',
                                                   'LandSlope', 'Condition1',
                                                   'Condition2', 'BldgType',
                                                   'HouseStyle', 'RoofStyle',
                                                   'RoofMatl', 'MasVnrType',
                                                   'Foundati...
                              gamma=0, gpu_id=-1, importance_type=Non

In [18]:
cv_model_search2.cv_results_['params']

[{'model__learning_rate': 0.05, 'model__n_estimators': 100},
 {'model__learning_rate': 0.05, 'model__n_estimators': 500},
 {'model__learning_rate': 0.05, 'model__n_estimators': 1000},
 {'model__learning_rate': 0.075, 'model__n_estimators': 100},
 {'model__learning_rate': 0.075, 'model__n_estimators': 500},
 {'model__learning_rate': 0.075, 'model__n_estimators': 1000},
 {'model__learning_rate': 0.08, 'model__n_estimators': 100},
 {'model__learning_rate': 0.08, 'model__n_estimators': 500},
 {'model__learning_rate': 0.08, 'model__n_estimators': 1000},
 {'model__learning_rate': 0.09, 'model__n_estimators': 100},
 {'model__learning_rate': 0.09, 'model__n_estimators': 500},
 {'model__learning_rate': 0.09, 'model__n_estimators': 1000}]

In [27]:
cv_model_search2.fit(X,y)

pred = cv_model_search2.predict(X_test)
pred = cv_model_search2.predict(X_test)

output = pd.DataFrame({'Id': X_test.index,
                       'SalePrice': pred})
output.to_csv('./submission.csv', index=False)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
(973, 80)
(487, 80)
(973, 80)
(487, 80)
(974, 80)
(486, 80)
(973, 80)
(487, 80)
(973, 80)
(487, 80)
(974, 80)
(486, 80)
(973, 80)
(487, 80)
(973, 80)
(487, 80)
(974, 80)
(486, 80)
(973, 80)
(487, 80)
(973, 80)
(487, 80)
(974, 80)
(486, 80)
(973, 80)
(487, 80)
(973, 80)
(487, 80)
(974, 80)
(486, 80)
(973, 80)
(487, 80)
(973, 80)
(487, 80)
(974, 80)
(486, 80)
(973, 80)
(487, 80)
(973, 80)
(487, 80)
(974, 80)
(486, 80)
(973, 80)
(487, 80)
(973, 80)
(487, 80)
(974, 80)
(486, 80)
(973, 80)
(487, 80)
(973, 80)
(487, 80)
(974, 80)
(486, 80)
(973, 80)
(487, 80)
(973, 80)
(487, 80)
(974, 80)
(486, 80)
(973, 80)
(487, 80)
(973, 80)
(487, 80)
(974, 80)
(486, 80)
(973, 80)
(487, 80)
(973, 80)
(487, 80)
(974, 80)
(486, 80)
(1460, 80)
(1459, 80)
(1459, 80)


In [30]:
output = pd.DataFrame({'Id': X_test.Id,
                       'SalePrice': pred})
output.to_csv('./submission.csv', index=False)

In [29]:
X_test

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,,Reg,Lvl,AllPub,...,0,0,,,,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,,Reg,Lvl,AllPub,...,0,0,,MnPrv,Shed,700,7,2006,WD,Normal
