In [2]:
import pandas as pd 
import numpy as np
pd.options.mode.chained_assignment = None  # default='warn'
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

from xgboost import XGBRegressor

X = pd.read_csv('train.csv')
X_test = pd.read_csv('test.csv') # no sell price

# Remove rows with missing target, separate target from predictors
X.dropna(axis=0, subset=['SalePrice'], inplace=True)
y = X.SalePrice
X.drop(['SalePrice'], axis=1, inplace=True)

# To keep things simple, we'll drop columns with missing values
cols_with_missing = [col for col in X.columns if X[col].isnull().any()] 
X.drop(cols_with_missing, axis=1, inplace=True)
X_test.drop(cols_with_missing, axis=1, inplace=True)

# Break off validation set from training data
X_train, X_valid, y_train, y_valid = train_test_split(X, y,
                                                      train_size=0.8, test_size=0.2,
                                                      random_state=0)



12.06141305936073

In [3]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# function for comparing different approaches
def score_dataset(X_train, X_valid, y_train, y_valid):
    model = RandomForestRegressor(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return mean_absolute_error(y_valid, preds)

In [4]:
# All categorical columns
object_cols = [col for col in X_train.columns if X_train[col].dtype == "object"]

# Columns that can be safely label encoded
good_label_cols = [col for col in object_cols if 
                   set(X_train[col]) == set(X_valid[col])]

numerical_columns = [col for col in X_train if 
                    X_train[col].dtype in ['int64', 'float64']]
        
# Problematic columns that will be dropped from the dataset
bad_label_cols = list(set(object_cols)-set(good_label_cols))
        
print('Categorical columns that will be label encoded:', good_label_cols)
print('\nCategorical columns that will be dropped from the dataset:', bad_label_cols)

Categorical columns that will be label encoded: ['MSZoning', 'Street', 'LotShape', 'LandContour', 'LotConfig', 'BldgType', 'HouseStyle', 'ExterQual', 'CentralAir', 'KitchenQual', 'PavedDrive', 'SaleCondition']

Categorical columns that will be dropped from the dataset: ['HeatingQC', 'ExterCond', 'Exterior2nd', 'SaleType', 'Condition2', 'Heating', 'Condition1', 'Neighborhood', 'Utilities', 'LandSlope', 'Foundation', 'RoofStyle', 'Exterior1st', 'RoofMatl', 'Functional']


In [5]:
my_imputer = SimpleImputer()
imputed_X_train = pd.DataFrame(my_imputer.fit_transform(X_train[numerical_columns]))
imputed_X_valid = pd.DataFrame(my_imputer.transform(X_valid[numerical_columns]))
imputed_X_train.columns = numerical_columns
imputed_X_valid.columns = numerical_columns

In [6]:
from sklearn.preprocessing import LabelEncoder

# Drop categorical columns that will not be encoded
label_X_train = X_train.drop(bad_label_cols, axis=1)
label_X_valid = X_valid.drop(bad_label_cols, axis=1)

# Apply label encoder 
my_encoder = LabelEncoder()
for col in set(good_label_cols):
    label_X_train[col] = my_encoder.fit_transform(X_train[col])
    label_X_valid[col] = my_encoder.transform(X_valid[col])
# Your code here

In [7]:
final_X_train = pd.concat([imputed_X_train,label_X_train], axis=1)
final_X_valid = pd.concat([imputed_X_valid,label_X_valid], axis=1)
final_X_train.dropna(axis=0)
final_X_valid.dropna(axis=0)

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleCondition
1,492.0,50.0,9490.0,6.0,7.0,1941.0,1950.0,403.0,165.0,238.0,...,298.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2007.0,4.0
2,460.0,50.0,7015.0,5.0,4.0,1950.0,1950.0,185.0,0.0,524.0,...,0.0,42.0,0.0,0.0,0.0,0.0,0.0,9.0,2008.0,4.0
4,656.0,160.0,1680.0,6.0,5.0,1971.0,1971.0,0.0,0.0,525.0,...,192.0,84.0,0.0,0.0,0.0,0.0,0.0,12.0,2008.0,4.0
5,1014.0,30.0,7200.0,5.0,4.0,1910.0,2006.0,247.0,465.0,310.0,...,40.0,30.0,0.0,320.0,0.0,0.0,700.0,10.0,2009.0,4.0
9,688.0,160.0,5105.0,7.0,5.0,2004.0,2004.0,239.0,0.0,312.0,...,0.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,2008.0,4.0
14,1015.0,20.0,11664.0,6.0,5.0,1948.0,1950.0,336.0,0.0,746.0,...,0.0,213.0,176.0,0.0,0.0,0.0,0.0,5.0,2008.0,4.0
18,32.0,20.0,8544.0,5.0,6.0,1966.0,2006.0,0.0,0.0,1228.0,...,0.0,102.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,4.0
19,483.0,70.0,2500.0,7.0,8.0,1915.0,2005.0,299.0,0.0,611.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,2009.0,0.0
27,650.0,180.0,1936.0,4.0,6.0,1970.0,1970.0,131.0,499.0,0.0,...,0.0,50.0,0.0,0.0,0.0,0.0,0.0,5.0,2010.0,4.0
29,576.0,50.0,8480.0,5.0,5.0,1947.0,1950.0,442.0,0.0,390.0,...,49.0,0.0,87.0,0.0,0.0,0.0,0.0,5.0,2008.0,4.0


In [8]:
print(score_dataset(label_X_train,label_X_valid,y_train,y_valid)) #WE CHOSE THIS

17675.942500000005


In [9]:
print(score_dataset(imputed_X_train,imputed_X_valid , y_train, y_valid))

17952.591404109586


In [9]:
np.isnan(final_X_train)

Unnamed: 0,Id,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,...,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,MoSold,YrSold,SaleCondition
0,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True
2,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True
3,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
4,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True
5,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True
6,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
7,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
8,False,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
9,False,False,False,False,False,False,False,False,False,False,...,True,True,True,True,True,True,True,True,True,True


In [10]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
model1 = XGBRegressor(objective ='reg:squarederror',random_state=0, n_estimators=1000, learning_rate=0.05, early_stopping_rounds=5, max_depth=7)
model1.fit(label_X_train,y_train)

XGBRegressor(early_stopping_rounds=5, learning_rate=0.05, max_depth=7,
             n_estimators=1000, objective='reg:squarederror')

In [11]:
prediction1 = model1.predict(label_X_valid)

In [12]:
mae1 = mean_absolute_error(prediction1, y_valid)
print("Mae: ", mae1)

Mae:  16733.404363762842


In [33]:
all_scores = []

for early_stopping_rounds in [3, 5, 7]:
    for max_depth in [7, 9, 11]:
        for n_estimators in [200, 500, 1000]:
            for learning_rate in [0.02, 0.05, 0.1]:
                for colsample_bytree in [0.5, 0.7, 0.9]:
                    for reg_alpha in [0.1, 1, 10, 100]:
                            XGBmodel = XGBRegressor(early_stopping_rounds=early_stopping_rounds,
                                                max_depth=max_depth,
                                               learning_rate=learning_rate,
                                               colsample_bytree=colsample_bytree,
                                               reg_alpha=reg_alpha,
                                               reg_lambda=1,
                                               objective ='reg:squarederror',
                                               random_state=1,

                                               )
                            XGBmodel.fit(label_X_train, y_train)
                            prediction = XGBmodel.predict(label_X_valid)
                            mae = mean_absolute_error(prediction, y_valid)
                            all_scores.append([early_stopping_rounds, max_depth, n_estimators,
                                               learning_rate,colsample_bytree, reg_alpha, 
                                            mae])
                            print(all_scores[-1])

[3, 7, 200, 0.02, 0.5, 0.1, 29472.200382598458]
[3, 7, 200, 0.02, 0.5, 1, 29472.21899079623]
[3, 7, 200, 0.02, 0.5, 10, 29471.120612157534]
[3, 7, 200, 0.02, 0.5, 100, 29454.29890839041]
[3, 7, 200, 0.02, 0.7, 0.1, 28922.174550513697]
[3, 7, 200, 0.02, 0.7, 1, 28922.19885755565]
[3, 7, 200, 0.02, 0.7, 10, 28907.299015410958]
[3, 7, 200, 0.02, 0.7, 100, 28934.396685038526]
[3, 7, 200, 0.02, 0.9, 0.1, 28501.49973244863]
[3, 7, 200, 0.02, 0.9, 1, 28491.63025738442]
[3, 7, 200, 0.02, 0.9, 10, 28464.312928082192]
[3, 7, 200, 0.02, 0.9, 100, 28663.677386558218]
[3, 7, 200, 0.05, 0.5, 0.1, 16822.443667059077]
[3, 7, 200, 0.05, 0.5, 1, 16822.441513270547]
[3, 7, 200, 0.05, 0.5, 10, 16785.50667540668]
[3, 7, 200, 0.05, 0.5, 100, 16970.171393407534]
[3, 7, 200, 0.05, 0.7, 0.1, 16282.436630458047]
[3, 7, 200, 0.05, 0.7, 1, 16282.439680543665]
[3, 7, 200, 0.05, 0.7, 10, 16593.645374036816]
[3, 7, 200, 0.05, 0.7, 100, 16802.386250535103]
[3, 7, 200, 0.05, 0.9, 0.1, 17072.76181239298]
[3, 7, 200, 0.

[3, 9, 500, 0.1, 0.9, 0.1, 16939.568158711474]
[3, 9, 500, 0.1, 0.9, 1, 16926.68622913099]
[3, 9, 500, 0.1, 0.9, 10, 17216.106191138697]
[3, 9, 500, 0.1, 0.9, 100, 17290.656209867295]
[3, 9, 1000, 0.02, 0.5, 0.1, 29650.663875214042]
[3, 9, 1000, 0.02, 0.5, 1, 29657.839362157534]
[3, 9, 1000, 0.02, 0.5, 10, 29663.08790400257]
[3, 9, 1000, 0.02, 0.5, 100, 29699.444764019692]
[3, 9, 1000, 0.02, 0.7, 0.1, 29127.094378745718]
[3, 9, 1000, 0.02, 0.7, 1, 29127.12711365582]
[3, 9, 1000, 0.02, 0.7, 10, 29131.289289918663]
[3, 9, 1000, 0.02, 0.7, 100, 29211.123969927226]
[3, 9, 1000, 0.02, 0.9, 0.1, 28845.227565817637]
[3, 9, 1000, 0.02, 0.9, 1, 28845.256902825342]
[3, 9, 1000, 0.02, 0.9, 10, 28803.212101348458]
[3, 9, 1000, 0.02, 0.9, 100, 28894.656383775684]
[3, 9, 1000, 0.05, 0.5, 0.1, 17245.43277771832]
[3, 9, 1000, 0.05, 0.5, 1, 17245.430998501713]
[3, 9, 1000, 0.05, 0.5, 10, 17215.90204944349]
[3, 9, 1000, 0.05, 0.5, 100, 17316.382264019692]
[3, 9, 1000, 0.05, 0.7, 0.1, 16668.932416523974]

[5, 7, 200, 0.1, 0.5, 1, 17024.53533015839]
[5, 7, 200, 0.1, 0.5, 10, 17018.73330479452]
[5, 7, 200, 0.1, 0.5, 100, 17205.877742401542]
[5, 7, 200, 0.1, 0.7, 0.1, 15845.20222870291]
[5, 7, 200, 0.1, 0.7, 1, 15845.980375107021]
[5, 7, 200, 0.1, 0.7, 10, 15809.89070526541]
[5, 7, 200, 0.1, 0.7, 100, 15911.21022848887]
[5, 7, 200, 0.1, 0.9, 0.1, 16663.810399721748]
[5, 7, 200, 0.1, 0.9, 1, 16616.18004869435]
[5, 7, 200, 0.1, 0.9, 10, 16713.045349957192]
[5, 7, 200, 0.1, 0.9, 100, 16852.18751337757]
[5, 7, 500, 0.02, 0.5, 0.1, 29472.200382598458]
[5, 7, 500, 0.02, 0.5, 1, 29472.21899079623]
[5, 7, 500, 0.02, 0.5, 10, 29471.120612157534]
[5, 7, 500, 0.02, 0.5, 100, 29454.29890839041]
[5, 7, 500, 0.02, 0.7, 0.1, 28922.174550513697]
[5, 7, 500, 0.02, 0.7, 1, 28922.19885755565]
[5, 7, 500, 0.02, 0.7, 10, 28907.299015410958]
[5, 7, 500, 0.02, 0.7, 100, 28934.396685038526]
[5, 7, 500, 0.02, 0.9, 0.1, 28501.49973244863]
[5, 7, 500, 0.02, 0.9, 1, 28491.63025738442]
[5, 7, 500, 0.02, 0.9, 10, 28464

[5, 9, 1000, 0.05, 0.9, 0.1, 17568.12598994007]
[5, 9, 1000, 0.05, 0.9, 1, 17609.716248394692]
[5, 9, 1000, 0.05, 0.9, 10, 17605.892685145547]
[5, 9, 1000, 0.05, 0.9, 100, 17546.030741652397]
[5, 9, 1000, 0.1, 0.5, 0.1, 17059.305864726026]
[5, 9, 1000, 0.1, 0.5, 1, 17053.117843000855]
[5, 9, 1000, 0.1, 0.5, 10, 17040.826426048803]
[5, 9, 1000, 0.1, 0.5, 100, 17224.27185894692]
[5, 9, 1000, 0.1, 0.7, 0.1, 15953.90073844178]
[5, 9, 1000, 0.1, 0.7, 1, 15953.943038313357]
[5, 9, 1000, 0.1, 0.7, 10, 15946.830104880137]
[5, 9, 1000, 0.1, 0.7, 100, 16176.89711312072]
[5, 9, 1000, 0.1, 0.9, 0.1, 16939.568158711474]
[5, 9, 1000, 0.1, 0.9, 1, 16926.68622913099]
[5, 9, 1000, 0.1, 0.9, 10, 17216.106191138697]
[5, 9, 1000, 0.1, 0.9, 100, 17290.656209867295]
[5, 11, 200, 0.02, 0.5, 0.1, 29728.04731645976]
[5, 11, 200, 0.02, 0.5, 1, 29728.079609910103]
[5, 11, 200, 0.02, 0.5, 10, 29779.350398651542]
[5, 11, 200, 0.02, 0.5, 100, 29751.50373234161]
[5, 11, 200, 0.02, 0.7, 0.1, 29293.952656785103]
[5, 1

[7, 7, 500, 0.05, 0.5, 10, 16785.50667540668]
[7, 7, 500, 0.05, 0.5, 100, 16970.171393407534]
[7, 7, 500, 0.05, 0.7, 0.1, 16282.436630458047]
[7, 7, 500, 0.05, 0.7, 1, 16282.439680543665]
[7, 7, 500, 0.05, 0.7, 10, 16593.645374036816]
[7, 7, 500, 0.05, 0.7, 100, 16802.386250535103]
[7, 7, 500, 0.05, 0.9, 0.1, 17072.76181239298]
[7, 7, 500, 0.05, 0.9, 1, 17072.768073095034]
[7, 7, 500, 0.05, 0.9, 10, 17173.903801904966]
[7, 7, 500, 0.05, 0.9, 100, 17178.31197827483]
[7, 7, 500, 0.1, 0.5, 0.1, 17024.535035851884]
[7, 7, 500, 0.1, 0.5, 1, 17024.53533015839]
[7, 7, 500, 0.1, 0.5, 10, 17018.73330479452]
[7, 7, 500, 0.1, 0.5, 100, 17205.877742401542]
[7, 7, 500, 0.1, 0.7, 0.1, 15845.20222870291]
[7, 7, 500, 0.1, 0.7, 1, 15845.980375107021]
[7, 7, 500, 0.1, 0.7, 10, 15809.89070526541]
[7, 7, 500, 0.1, 0.7, 100, 15911.21022848887]
[7, 7, 500, 0.1, 0.9, 0.1, 16663.810399721748]
[7, 7, 500, 0.1, 0.9, 1, 16616.18004869435]
[7, 7, 500, 0.1, 0.9, 10, 16713.045349957192]
[7, 7, 500, 0.1, 0.9, 100, 1

[7, 11, 200, 0.02, 0.9, 1, 29035.33700770548]
[7, 11, 200, 0.02, 0.9, 10, 29032.936991652397]
[7, 11, 200, 0.02, 0.9, 100, 28899.853448737158]
[7, 11, 200, 0.05, 0.5, 0.1, 17394.76285584332]
[7, 11, 200, 0.05, 0.5, 1, 17394.042406892124]
[7, 11, 200, 0.05, 0.5, 10, 17379.585977632705]
[7, 11, 200, 0.05, 0.5, 100, 17185.947198737158]
[7, 11, 200, 0.05, 0.7, 0.1, 16609.10283336901]
[7, 11, 200, 0.05, 0.7, 1, 16609.129922945205]
[7, 11, 200, 0.05, 0.7, 10, 16570.024668236303]
[7, 11, 200, 0.05, 0.7, 100, 16663.200502996577]
[7, 11, 200, 0.05, 0.9, 0.1, 17229.90998234161]
[7, 11, 200, 0.05, 0.9, 1, 17242.593602846748]
[7, 11, 200, 0.05, 0.9, 10, 17465.743057041953]
[7, 11, 200, 0.05, 0.9, 100, 17256.35041202911]
[7, 11, 200, 0.1, 0.5, 0.1, 16806.777276862158]
[7, 11, 200, 0.1, 0.5, 1, 16813.398317101884]
[7, 11, 200, 0.1, 0.5, 10, 16791.72850224743]
[7, 11, 200, 0.1, 0.5, 100, 17143.932296125855]
[7, 11, 200, 0.1, 0.7, 0.1, 16259.234308112158]
[7, 11, 200, 0.1, 0.7, 1, 16260.603301583904]


In [34]:
max_scores = sorted(all_scores, key = lambda x: x[-1], reverse = False)[:5]

In [36]:
max_scores

[[3, 7, 200, 0.1, 0.7, 10, 15809.89070526541],
 [3, 7, 500, 0.1, 0.7, 10, 15809.89070526541],
 [3, 7, 1000, 0.1, 0.7, 10, 15809.89070526541],
 [5, 7, 200, 0.1, 0.7, 10, 15809.89070526541],
 [5, 7, 500, 0.1, 0.7, 10, 15809.89070526541]]

In [18]:
max_scores_label = max_scores

In [19]:
max_scores_label

[[7, 200, 0.1, 0.7, 10, 1, 15809.89070526541],
 [7, 500, 0.1, 0.7, 10, 1, 15809.89070526541],
 [7, 1000, 0.1, 0.7, 10, 1, 15809.89070526541],
 [7, 200, 0.1, 0.7, 0.1, 1, 15845.20222870291],
 [7, 500, 0.1, 0.7, 0.1, 1, 15845.20222870291]]