In [1]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.model_selection import RandomizedSearchCV
import pickle

In [45]:
# Read the training and test splits from CSV files in the working directory.
train_df, test_df = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [46]:
# Preview the first rows of the training data.
train_df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [47]:
# Number of missing values in each column of the training data.
train_df.isnull().sum()

Id                  0
MSSubClass          0
MSZoning            0
LotFrontage       259
LotArea             0
Street              0
Alley            1369
LotShape            0
LandContour         0
Utilities           0
LotConfig           0
LandSlope           0
Neighborhood        0
Condition1          0
Condition2          0
BldgType            0
HouseStyle          0
OverallQual         0
OverallCond         0
YearBuilt           0
YearRemodAdd        0
RoofStyle           0
RoofMatl            0
Exterior1st         0
Exterior2nd         0
MasVnrType          8
MasVnrArea          8
ExterQual           0
ExterCond           0
Foundation          0
                 ... 
BedroomAbvGr        0
KitchenAbvGr        0
KitchenQual         0
TotRmsAbvGrd        0
Functional          0
Fireplaces          0
FireplaceQu       690
GarageType         81
GarageYrBlt        81
GarageFinish       81
GarageCars          0
GarageArea          0
GarageQual         81
GarageCond         81
PavedDrive

In [48]:
# Columns removed from both splits before any further processing
# (the training null counts show e.g. Alley and FireplaceQu are largely missing).
sparse_columns = ["Alley", "FireplaceQu", "PoolQC", "Fence", "MiscFeature"]
for df in (train_df, test_df):
    # errors="ignore" keeps this cell re-runnable: the drop is in place, so on a
    # second execution the columns are already gone and would otherwise raise KeyError.
    df.drop(columns=sparse_columns, inplace=True, errors="ignore")


In [49]:
# Stack train and test rows into one frame so cleaning and encoding are applied
# identically to both; the target column is left out of the combined frame.
# ignore_index=True gives every row a unique label. Without it the row labels of
# the two inputs are both kept, so labels repeat and any later label-based
# alignment on combine_df becomes ambiguous.
combine_df = pd.concat([train_df, test_df], sort=False, ignore_index=True).drop(columns='SalePrice')

In [50]:
# Quality / condition columns: a missing entry becomes the explicit category 'None'.
replace_list1 = [
    'BsmtQual', 'BsmtCond', 'ExterQual', 'ExterCond',
    'HeatingQC', 'KitchenQual', 'GarageQual', 'GarageCond',
]
combine_df[replace_list1] = combine_df[replace_list1].fillna('None')


In [51]:
# Basement / garage descriptor columns: a missing entry becomes the explicit category 'None'.
replace_list2 = [
    'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
    'GarageFinish', 'GarageType',
]
combine_df[replace_list2] = combine_df[replace_list2].fillna('None')

In [52]:
# Fraction of missing values in every column of the combined frame.
null_counts = combine_df.isnull().sum(axis=0)
missing = null_counts.div(len(combine_df)).to_frame(name='fraction')
# Keep only the columns that still have at least one missing value.
has_missing = missing['fraction'].gt(0)
missing_df = missing.loc[has_missing]
# Show the worst-affected columns first.
missing_df.sort_values(by='fraction', ascending=False)


Unnamed: 0,fraction
LotFrontage,0.166495
GarageYrBlt,0.054471
MasVnrType,0.008222
MasVnrArea,0.007879
MSZoning,0.00137
BsmtFullBath,0.000685
Utilities,0.000685
Functional,0.000685
BsmtHalfBath,0.000685
GarageArea,0.000343


In [53]:
# Columns that still contain missing values, partitioned by dtype:
# object-dtype columns go to cat_list, everything else to num_list.
list_missing = missing_df.index.to_list()
missing_dtypes = combine_df[list_missing].dtypes.to_frame(name='dtype')
is_object = missing_dtypes['dtype'] == 'object'
cat_list = missing_dtypes.index[is_object].to_list()
num_list = missing_dtypes.index[~is_object].to_list()

In [54]:
# Impute the remaining gaps: most frequent value for object-dtype columns,
# column mean for the others.
# The filled column is assigned back to combine_df rather than calling
# fillna(inplace=True) on combine_df[item]. That chained in-place call acts on an
# intermediate object; pandas warns about it and, with Copy-on-Write enabled,
# it leaves combine_df unchanged.
for item in cat_list:
    combine_df[item] = combine_df[item].fillna(combine_df[item].mode()[0])
for item in num_list:
    combine_df[item] = combine_df[item].fillna(combine_df[item].mean())


In [55]:
def df_dtype_list(df):
    """Split the column labels of ``df`` into two lists by dtype.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns are classified.

    Returns
    -------
    tuple of (list, list)
        ``(cat, num)``: labels of the columns whose dtype compares equal to
        ``'object'``, followed by the labels of all remaining columns. Both
        lists follow the column order of ``df``.
    """
    cat, num = [], []
    for column, dtype in df.dtypes.items():
        # Anything that is not object dtype lands in the "num" list,
        # whatever its actual dtype is.
        if dtype == 'object':
            cat.append(column)
        else:
            num.append(column)
    return cat, num


In [56]:
# Classify every column of the combined frame by dtype and print both lists.
cat,num = df_dtype_list(combine_df)
print(f'Categorical : {cat} \n\nNumerical : {num}')

Categorical : ['MSZoning', 'Street', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'SaleType', 'SaleCondition'] 

Numerical : ['Id', 'MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', '

In [58]:
# New feature: total basement area minus the unfinished basement area
# (presumably the finished basement area).
combine_df['BsmtFinSF'] = combine_df['TotalBsmtSF'].sub(combine_df['BsmtUnfSF'])
# Drop the basement area columns that are no longer used as features.
combine_df.drop(columns=['BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF'], inplace=True)

In [61]:
# Collapse each pair of related categorical columns into one column holding the
# two labels concatenated (left label first), then drop the original pair.
# Each entry is (new column, left source, right source).
paired_columns = (
    ('Exterior', 'Exterior1st', 'Exterior2nd'),
    ('Condition', 'Condition1', 'Condition2'),
    ('Exter', 'ExterCond', 'ExterQual'),
    ('Bsmt', 'BsmtCond', 'BsmtQual'),
)
for merged_name, left_col, right_col in paired_columns:
    combine_df[merged_name] = combine_df[left_col] + combine_df[right_col]
    combine_df.drop([left_col, right_col], axis=1, inplace=True)
combine

0       VinylSdVinylSd
1       MetalSdMetalSd
2       VinylSdVinylSd
3       Wd SdngWd Shng
4       VinylSdVinylSd
5       VinylSdVinylSd
6       VinylSdVinylSd
7       HdBoardHdBoard
8       BrkFaceWd Shng
9       MetalSdMetalSd
10      HdBoardHdBoard
11      WdShingWd Shng
12      HdBoardPlywood
13      VinylSdVinylSd
14      MetalSdMetalSd
15      Wd SdngWd Sdng
16      Wd SdngWd Sdng
17      MetalSdMetalSd
18      VinylSdVinylSd
19      BrkFacePlywood
20      VinylSdVinylSd
21      Wd SdngWd Sdng
22      VinylSdVinylSd
23      CemntBdCmentBd
24      PlywoodPlywood
25      VinylSdVinylSd
26      Wd SdngWd Sdng
27      VinylSdVinylSd
28      MetalSdMetalSd
29      MetalSdMetalSd
             ...      
1429    MetalSdMetalSd
1430    MetalSdMetalSd
1431    Wd SdngWd Sdng
1432    WdShingWd Shng
1433    MetalSdMetalSd
1434    CemntBdCmentBd
1435    CemntBdCmentBd
1436    PlywoodPlywood
1437    PlywoodPlywood
1438    VinylSdVinylSd
1439    PlywoodPlywood
1440    PlywoodPlywood
1441    Vin

In [40]:
# Keep only the second list returned by df_dtype_list (the non-object columns);
# the first list is discarded.
# NOTE(review): in IPython `_` normally refers to the last cell output; binding it
# here shadows that — confirm this is intended.
_,num = df_dtype_list(combine_df)
# Take 'Id' out of the list (list.remove raises ValueError if 'Id' is not present).
num.remove('Id')

In [42]:
# Rebind both cat and num from the current combine_df; any earlier edits made to
# a previous `num` list are replaced by this fresh result.
cat,num = df_dtype_list(combine_df)

In [43]:
# Display the current list of non-object column labels.
num

['Id',
 'MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold']

In [44]:
# Distinct values present in the YrSold column.
combine_df['YrSold'].unique()

array([2008, 2007, 2006, 2009, 2010], dtype=int64)

In [17]:
from sklearn.preprocessing import StandardScaler


In [18]:
# Replace combine_df with its pd.get_dummies encoding (indicator columns for the
# categorical columns). The name is rebound, so the un-encoded frame is no longer
# reachable through combine_df after this cell.
combine_df = pd.get_dummies(combine_df)

In [19]:
# (rows, columns) of the encoded frame.
combine_df.shape

(2919, 280)

In [20]:
# Rows with Id up to this value are treated as the training split; larger Ids
# are treated as test rows.
last_train_id = 1460
# Re-attach the target column to the encoded training rows via the shared Id.
train_df = pd.merge(
    combine_df[combine_df['Id'] <= last_train_id],
    train_df[['Id', 'SalePrice']],
    on='Id',
    how='inner',
)
# .copy() makes test_df an independent frame, so assigning a new column to it
# later does not raise SettingWithCopyWarning on a filtered selection of combine_df.
test_df = combine_df[combine_df['Id'] > last_train_id].copy()

In [30]:
# Feature matrix: every encoded column except the identifier and the target.
X_train = train_df.drop(columns=['Id', 'SalePrice']).values
# Target as a 1-D array of shape (n_samples,). Selecting with [['SalePrice']]
# would produce an (n_samples, 1) column vector, which scikit-learn estimators
# warn about or silently reshape.
y_train = train_df['SalePrice'].values
# errors='ignore' lets this cell be re-run after a 'SalePrice' prediction column
# has been added to test_df, without that column leaking into the features.
X_test = test_df.drop(columns=['Id', 'SalePrice'], errors='ignore').values

In [31]:
# XGBoost regressor created with the library's default hyperparameters.
clf = XGBRegressor()

In [32]:
# Candidate values for each hyperparameter explored by the search.
n_estimators = [100, 500, 900, 1100, 1500]
max_depth = [2, 3, 5, 10, 15]
learning_rate = [0.05, 0.1, 0.15, 0.20]
min_child_weight = [1, 2, 3, 4]
booster = ['gbtree', 'gblinear']
base_score = [0.25, 0.5, 0.75, 1]

# Search space handed to the hyperparameter search: parameter name -> candidates.
hyperparameter_grid = dict(
    n_estimators=n_estimators,
    max_depth=max_depth,
    learning_rate=learning_rate,
    min_child_weight=min_child_weight,
    booster=booster,
    base_score=base_score,
)

In [33]:
# Randomized search over hyperparameter_grid: 50 sampled candidates, each scored
# with 5-fold cross-validation on negative mean absolute error, using 4 workers.
# random_state fixes which candidates are sampled.
search_options = dict(
    n_iter=50,
    cv=5,
    scoring='neg_mean_absolute_error',
    n_jobs=4,
    verbose=5,
    return_train_score=True,
    random_state=42,
)
random_cv = RandomizedSearchCV(clf, hyperparameter_grid, **search_options)

In [None]:
# Run the randomized hyperparameter search on the training data.
# This is slow: the recorded progress log shows 154 of 250 fits taking 30 minutes.
random_cv.fit(X_train,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  10 tasks      | elapsed:  2.4min
[Parallel(n_jobs=4)]: Done  64 tasks      | elapsed: 15.6min
[Parallel(n_jobs=4)]: Done 154 tasks      | elapsed: 30.0min


In [None]:
# From here on, clf is the best estimator found by the search.
clf = random_cv.best_estimator_

In [None]:
# Fit the selected model on the full training data.
# NOTE(review): random_cv is built without a refit argument, and RandomizedSearchCV
# refits the best estimator on the whole dataset by default, so this extra fit
# looks redundant — confirm before removing.
clf.fit(X_train,y_train)

In [None]:
# Persist the fitted model to disk so it can be reused without refitting.
with open('xgb.pickle', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
# Restore the model from the pickle file.
# Unpickling executes code embedded in the file, so only load a file that this
# notebook (or another trusted source) produced.
with open('xgb.pickle', 'rb') as model_file:
    clf = pickle.load(model_file)

In [None]:
# Predict the target for the test rows and store it next to their Ids.
predicted_prices = clf.predict(X_test)
test_df['SalePrice'] = predicted_prices
# Write only the Id and predicted SalePrice columns, without the row index.
test_df.to_csv('sample_submission.csv', columns=['Id', 'SalePrice'], index=False)

In [None]:
# Display the Id / SalePrice columns of the test frame.
test_df[['Id','SalePrice']]

In [23]:
# Classify the columns of train_df by dtype; this rebinds both cat and num.
cat,num = df_dtype_list(train_df)

['Id',
 'MSSubClass',
 'LotFrontage',
 'LotArea',
 'OverallQual',
 'OverallCond',
 'YearBuilt',
 'YearRemodAdd',
 'MasVnrArea',
 'BsmtFinSF1',
 'BsmtFinSF2',
 'TotalBsmtSF',
 '1stFlrSF',
 '2ndFlrSF',
 'LowQualFinSF',
 'GrLivArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'FullBath',
 'HalfBath',
 'BedroomAbvGr',
 'KitchenAbvGr',
 'TotRmsAbvGrd',
 'Fireplaces',
 'GarageYrBlt',
 'GarageCars',
 'GarageArea',
 'WoodDeckSF',
 'OpenPorchSF',
 'EnclosedPorch',
 '3SsnPorch',
 'ScreenPorch',
 'PoolArea',
 'MiscVal',
 'MoSold',
 'YrSold',
 'MSZoning_C (all)',
 'MSZoning_FV',
 'MSZoning_RH',
 'MSZoning_RL',
 'MSZoning_RM',
 'Street_Grvl',
 'Street_Pave',
 'LotShape_IR1',
 'LotShape_IR2',
 'LotShape_IR3',
 'LotShape_Reg',
 'LandContour_Bnk',
 'LandContour_HLS',
 'LandContour_Low',
 'LandContour_Lvl',
 'Utilities_AllPub',
 'Utilities_NoSeWa',
 'LotConfig_Corner',
 'LotConfig_CulDSac',
 'LotConfig_FR2',
 'LotConfig_FR3',
 'LotConfig_Inside',
 'LandSlope_Gtl',
 'LandSlope_Mod',
 'LandSlope_Sev',
 'Neighbor