## Importing:

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt #graphing
import matplotlib.patches as mpatches #style
import seaborn as sns #advanced graphing
import scipy as sp #math stuff
%matplotlib inline
pd.options.display.max_rows=300

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


In [2]:
# Load the competition train and test CSVs.
train = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/train.csv')
test = pd.read_csv('/kaggle/input/house-prices-advanced-regression-techniques/test.csv')

# Flag each row's origin (1 = train, 0 = test) so the combined frame can be split again later.
train['train_test'], test['train_test'] = 1, 0

# Stack both sets into a single frame indexed by the house Id.
all_data = pd.concat([train, test]).set_index('Id')

## Discovering:

In [3]:
# Get a general idea of the variable types and the non-null count of every
# column in the combined train + test frame.
all_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2919 entries, 1 to 2919
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     2919 non-null   int64  
 1   MSZoning       2915 non-null   object 
 2   LotFrontage    2433 non-null   float64
 3   LotArea        2919 non-null   int64  
 4   Street         2919 non-null   object 
 5   Alley          198 non-null    object 
 6   LotShape       2919 non-null   object 
 7   LandContour    2919 non-null   object 
 8   Utilities      2917 non-null   object 
 9   LotConfig      2919 non-null   object 
 10  LandSlope      2919 non-null   object 
 11  Neighborhood   2919 non-null   object 
 12  Condition1     2919 non-null   object 
 13  Condition2     2919 non-null   object 
 14  BldgType       2919 non-null   object 
 15  HouseStyle     2919 non-null   object 
 16  OverallQual    2919 non-null   int64  
 17  OverallCond    2919 non-null   int64  
 18  YearBuil

In [4]:
# Separate the columns into qualitative (object dtype) and quantitative (everything else).
qual_vars = [col for col in all_data.columns if all_data[col].dtype == 'object']
quant_vars = [col for col in all_data.columns if all_data[col].dtype != 'object']

# SalePrice is the prediction target, not a feature.
quant_vars.remove('SalePrice')

# These columns are stored as numbers but are handled as categories from here on,
# so move them from the quantitative list to the qualitative list.
for coded_col in ('MSSubClass', 'YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'MoSold', 'YrSold'):
    quant_vars.remove(coded_col)
    qual_vars.append(coded_col)

In [5]:
# Count the missing values in each quantitative column of the training set.
train[quant_vars].isna().sum()

LotFrontage      259
LotArea            0
OverallQual        0
OverallCond        0
MasVnrArea         8
BsmtFinSF1         0
BsmtFinSF2         0
BsmtUnfSF          0
TotalBsmtSF        0
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       0
BsmtHalfBath       0
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageCars         0
GarageArea         0
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
train_test         0
dtype: int64

In [6]:
# Inspect the distribution of the recorded LotFrontage values in the training set;
# missing entries are then treated as 0.
print(train['LotFrontage'].value_counts())
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x and
# does not modify `train` under Copy-on-Write.
train['LotFrontage'] = train['LotFrontage'].fillna(0)

60.0     143
70.0      70
80.0      69
50.0      57
75.0      53
65.0      44
85.0      40
78.0      25
21.0      23
90.0      23
68.0      19
64.0      19
24.0      19
73.0      18
55.0      17
63.0      17
79.0      17
72.0      17
100.0     16
66.0      15
74.0      15
51.0      15
52.0      14
59.0      13
82.0      12
71.0      12
67.0      12
43.0      12
40.0      12
57.0      12
76.0      11
69.0      11
92.0      10
34.0      10
53.0      10
86.0      10
88.0      10
84.0       9
35.0       9
62.0       9
44.0       9
77.0       9
93.0       8
98.0       8
96.0       8
61.0       8
95.0       7
58.0       7
120.0      7
107.0      7
94.0       6
81.0       6
36.0       6
30.0       6
48.0       6
91.0       6
41.0       6
54.0       6
89.0       6
110.0      6
105.0      6
32.0       5
47.0       5
87.0       5
56.0       5
37.0       5
83.0       5
102.0      4
42.0       4
49.0       4
104.0      3
103.0      3
99.0       3
45.0       3
108.0      3
130.0      3
124.0      2

In [7]:
# Show MasVnrType for the training rows whose MasVnrArea is missing, to check
# that those houses have no masonry veneer recorded; the missing areas are then set to 0.
print(train.loc[train['MasVnrArea'].isna(), 'MasVnrType'])
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x and
# does not modify `train` under Copy-on-Write.
train['MasVnrArea'] = train['MasVnrArea'].fillna(0)

234     NaN
529     NaN
650     NaN
936     NaN
973     NaN
977     NaN
1243    NaN
1278    NaN
Name: MasVnrType, dtype: object


In [8]:
# Count the missing values in each quantitative column of the test set.
test[quant_vars].isna().sum()

LotFrontage      227
LotArea            0
OverallQual        0
OverallCond        0
MasVnrArea        15
BsmtFinSF1         1
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
1stFlrSF           0
2ndFlrSF           0
LowQualFinSF       0
GrLivArea          0
BsmtFullBath       2
BsmtHalfBath       2
FullBath           0
HalfBath           0
BedroomAbvGr       0
KitchenAbvGr       0
TotRmsAbvGrd       0
Fireplaces         0
GarageCars         1
GarageArea         1
WoodDeckSF         0
OpenPorchSF        0
EnclosedPorch      0
3SsnPorch          0
ScreenPorch        0
PoolArea           0
MiscVal            0
train_test         0
dtype: int64

In [9]:
# Inspect the distribution of the recorded LotFrontage values in the test set;
# missing entries are then treated as 0.
print(test['LotFrontage'].value_counts())
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x and
# does not modify `test` under Copy-on-Write.
test['LotFrontage'] = test['LotFrontage'].fillna(0)

60.0     133
80.0      68
70.0      63
50.0      60
75.0      52
65.0      49
85.0      36
24.0      30
63.0      30
21.0      27
68.0      25
64.0      24
74.0      24
90.0      23
72.0      22
62.0      22
78.0      21
82.0      16
73.0      15
52.0      14
57.0      14
53.0      14
56.0      14
76.0      14
59.0      14
100.0     12
88.0      12
51.0      12
81.0      12
79.0      11
43.0      11
66.0      11
35.0      10
44.0      10
58.0      10
48.0      10
67.0      10
84.0       9
61.0       9
110.0      9
69.0       9
91.0       8
95.0       8
42.0       8
41.0       8
83.0       8
45.0       7
71.0       7
120.0      7
55.0       7
94.0       6
96.0       6
77.0       6
105.0      6
102.0      6
87.0       6
34.0       6
92.0       5
39.0       5
30.0       5
93.0       5
86.0       5
107.0      4
54.0       4
40.0       4
89.0       4
98.0       4
103.0      3
108.0      3
97.0       3
125.0      3
113.0      3
26.0       3
38.0       3
32.0       3
112.0      3
106.0      3

In [10]:
# Show MasVnrType for the test rows whose MasVnrArea is missing, to check
# that those houses have no masonry veneer recorded; the missing areas are then set to 0.
print(test.loc[test['MasVnrArea'].isna(), 'MasVnrType'])
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x and
# does not modify `test` under Copy-on-Write.
test['MasVnrArea'] = test['MasVnrArea'].fillna(0)

231     NaN
246     NaN
422     NaN
532     NaN
544     NaN
581     NaN
851     NaN
865     NaN
880     NaN
889     NaN
908     NaN
1132    NaN
1197    NaN
1226    NaN
1402    NaN
Name: MasVnrType, dtype: object


In [11]:
# Show BsmtFinType1 for the test rows whose BsmtFinSF1 is missing, to check
# that no finished basement type is recorded for them; the missing areas are then set to 0.
print(test.loc[test['BsmtFinSF1'].isna(), 'BsmtFinType1'])
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x and
# does not modify `test` under Copy-on-Write.
test['BsmtFinSF1'] = test['BsmtFinSF1'].fillna(0)

660    NaN
Name: BsmtFinType1, dtype: object


In [12]:
# Show BsmtFinType2 for the test rows whose BsmtFinSF2 is missing, to check
# that no second finished basement type is recorded for them; the missing areas are then set to 0.
print(test.loc[test['BsmtFinSF2'].isna(), 'BsmtFinType2'])
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x and
# does not modify `test` under Copy-on-Write.
test['BsmtFinSF2'] = test['BsmtFinSF2'].fillna(0)

660    NaN
Name: BsmtFinType2, dtype: object


In [13]:
# Show BsmtQual for the test rows whose BsmtUnfSF is missing, to check
# that no basement quality is recorded for them; the missing areas are then set to 0.
print(test.loc[test['BsmtUnfSF'].isna(), 'BsmtQual'])
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x and
# does not modify `test` under Copy-on-Write.
test['BsmtUnfSF'] = test['BsmtUnfSF'].fillna(0)

660    NaN
Name: BsmtQual, dtype: object


In [14]:
# Show BsmtQual for the test rows whose TotalBsmtSF is missing, to check
# that no basement quality is recorded for them; the missing areas are then set to 0.
print(test.loc[test['TotalBsmtSF'].isna(), 'BsmtQual'])
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x and
# does not modify `test` under Copy-on-Write.
test['TotalBsmtSF'] = test['TotalBsmtSF'].fillna(0)

660    NaN
Name: BsmtQual, dtype: object


In [15]:
# Show BsmtQual for the test rows whose BsmtFullBath is missing, to check
# that no basement quality is recorded for them; the missing counts are then set to 0.
print(test.loc[test['BsmtFullBath'].isna(), 'BsmtQual'])
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x and
# does not modify `test` under Copy-on-Write.
test['BsmtFullBath'] = test['BsmtFullBath'].fillna(0)

660    NaN
728    NaN
Name: BsmtQual, dtype: object


In [16]:
# Show BsmtQual for the test rows whose BsmtHalfBath is missing, to check
# that no basement quality is recorded for them; the missing counts are then set to 0.
print(test.loc[test['BsmtHalfBath'].isna(), 'BsmtQual'])
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x and
# does not modify `test` under Copy-on-Write.
test['BsmtHalfBath'] = test['BsmtHalfBath'].fillna(0)

660    NaN
728    NaN
Name: BsmtQual, dtype: object


In [17]:
# Show GarageQual for the test rows whose GarageCars is missing, to check
# that no garage quality is recorded for them; the missing counts are then set to 0.
print(test.loc[test['GarageCars'].isna(), 'GarageQual'])
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x and
# does not modify `test` under Copy-on-Write.
test['GarageCars'] = test['GarageCars'].fillna(0)

1116    NaN
Name: GarageQual, dtype: object


In [18]:
# Show GarageQual for the test rows whose GarageArea is missing, to check
# that no garage quality is recorded for them; the missing areas are then set to 0.
print(test.loc[test['GarageArea'].isna(), 'GarageQual'])
# Assign the filled column back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form is deprecated in pandas 2.x and
# does not modify `test` under Copy-on-Write.
test['GarageArea'] = test['GarageArea'].fillna(0)

1116    NaN
Name: GarageQual, dtype: object


In [19]:
# Show the current dtype of every column listed in qual_vars.
all_data[qual_vars].dtypes

MSZoning          object
Street            object
Alley             object
LotShape          object
LandContour       object
Utilities         object
LotConfig         object
LandSlope         object
Neighborhood      object
Condition1        object
Condition2        object
BldgType          object
HouseStyle        object
RoofStyle         object
RoofMatl          object
Exterior1st       object
Exterior2nd       object
MasVnrType        object
ExterQual         object
ExterCond         object
Foundation        object
BsmtQual          object
BsmtCond          object
BsmtExposure      object
BsmtFinType1      object
BsmtFinType2      object
Heating           object
HeatingQC         object
CentralAir        object
Electrical        object
KitchenQual       object
Functional        object
FireplaceQu       object
GarageType        object
GarageFinish      object
GarageQual        object
GarageCond        object
PavedDrive        object
PoolQC            object
Fence             object


In [20]:
# Cast every column in qual_vars to object dtype. This includes the numeric-coded
# columns that were moved into qual_vars (MSSubClass and the year/month columns).
# NOTE(review): presumably done so pd.get_dummies later one-hot encodes them — confirm.
all_data[qual_vars] = all_data[qual_vars].astype('object')

In [21]:
# Explore the levels of each qualitative variable in the training set,
# counting missing values (NaN) as their own level.
for col in qual_vars:
    level_counts = train[col].value_counts(dropna=False)
    print(col, ":")
    print(level_counts, end="\n\n")

MSZoning :
RL         1151
RM          218
FV           65
RH           16
C (all)      10
Name: MSZoning, dtype: int64

Street :
Pave    1454
Grvl       6
Name: Street, dtype: int64

Alley :
NaN     1369
Grvl      50
Pave      41
Name: Alley, dtype: int64

LotShape :
Reg    925
IR1    484
IR2     41
IR3     10
Name: LotShape, dtype: int64

LandContour :
Lvl    1311
Bnk      63
HLS      50
Low      36
Name: LandContour, dtype: int64

Utilities :
AllPub    1459
NoSeWa       1
Name: Utilities, dtype: int64

LotConfig :
Inside     1052
Corner      263
CulDSac      94
FR2          47
FR3           4
Name: LotConfig, dtype: int64

LandSlope :
Gtl    1382
Mod      65
Sev      13
Name: LandSlope, dtype: int64

Neighborhood :
NAmes      225
CollgCr    150
OldTown    113
Edwards    100
Somerst     86
Gilbert     79
NridgHt     77
Sawyer      74
NWAmes      73
SawyerW     59
BrkSide     58
Crawfor     51
Mitchel     49
NoRidge     41
Timber      38
IDOTRR      37
ClearCr     28
StoneBr     25
SW

In [22]:
#getting dummies
# Carry the train/test flag alongside the qualitative columns so the encoded frame
# can be split back into its two parts. Add it only once: re-running this cell would
# otherwise duplicate the 'train_test' column and break the boolean masks below.
if 'train_test' not in qual_vars:
    qual_vars.append('train_test')
# One-hot encode the object-dtype columns; the integer 'train_test' flag is not
# encoded and passes through as a regular column. dummy_na is left at its default,
# so missing values get no indicator column of their own.
dummies = pd.get_dummies(all_data[qual_vars])#,dummy_na=True)
# Split the encoded frame back into train rows (flag 1) and test rows (flag 0).
train_dummies = dummies[dummies['train_test']==1]
test_dummies = dummies[dummies['train_test']==0]

In [23]:
#final split into x, y, test, and train
# 'Id' is the key used to join the quantitative columns with the dummy columns.
# Add it only once: re-running this cell would otherwise select 'Id' twice and
# break the merge below.
if 'Id' not in quant_vars:
    quant_vars.append('Id')
# Training features: quantitative columns joined with the one-hot columns on Id.
X_train = train[quant_vars].merge(train_dummies, on='Id')
y_train = train['SalePrice']
# Test features are built the same way so both matrices share the same columns.
X_test = test[quant_vars].merge(test_dummies, on='Id')
# NOTE(review): 'Id' and the train/test flag remain in the feature matrices;
# 'Id' is read back from X_test when the submission is written.
# Sanity checks: no rows lost in the merge, and identical feature layout.
assert len(X_train) == len(y_train), "merge changed the number of training rows"
assert list(X_train.columns) == list(X_test.columns), "train/test feature columns differ"

## Exploratory Analysis:

In [24]:
#looking at correlations between quantitative variables and price graphically
#for var in quant_vars:
    #sns.regplot(data=train,x=var,y='SalePrice')
    #plt.show()

In [25]:
from sklearn.metrics import r2_score
from sklearn.linear_model import LinearRegression

In [26]:
#looking at correlations between quantitative variables and price using r_squared
# for var in quant_vars:
#     x = X_train[var]
#     X = x[:, None]
#     reg = LinearRegression()
#     reg.fit(X,y_train)
#     y = reg.predict(X_test[var][:, None])
#     r2_score(y)

In [27]:
# X_test.isna().sum()

## Preliminary Correlations

In [28]:
#importing models
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from xgboost import XGBRegressor

In [29]:
#scaling
# (disabled) Standardize the feature matrices before fitting the models.
# NOTE(review): if this is re-enabled, the test set should be scaled with
# scaler.transform(X_test) so it reuses the statistics fitted on the training
# set; calling fit_transform on X_test refits the scaler on test data.
#scaler=StandardScaler()
#X_train_scaled = scaler.fit_transform(X_train)
#X_test_scaled = scaler.fit_transform(X_test)

In [30]:
# #linear regression
# reg = LinearRegression()
# cv = cross_val_score(reg,X_train,y_train,cv=5)
# print(cv)
# print(np.mean(cv))

In [31]:
# #ridge regression
# ridge = Ridge()
# cv = cross_val_score(ridge,X_train,y_train)
# print(cv)
# print(np.mean(cv))

In [32]:
# #lasso regression
# lasso = Lasso()
# cv = cross_val_score(lasso,X_train,y_train)
# print(cv)
# print(np.mean(cv))

In [33]:
# Baseline: cross-validate an XGBRegressor with default hyperparameters on the
# training data, then print the per-fold scores followed by their mean.
xgb = XGBRegressor()
cv = cross_val_score(xgb, X_train, y_train)
print(cv, np.mean(cv), sep="\n")

[0.86317579 0.8329959  0.85554012 0.89258598 0.87288017]
0.8634355915401292


**To do:**

Make dummies

Do more feature engineering

## Training Final Model

In [34]:
# Final model: an XGBRegressor created with its default hyperparameters.
model = XGBRegressor()

In [35]:
# Fit the final model on the full training feature matrix and target.
model.fit(X_train,y_train)

XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
             colsample_bylevel=1, colsample_bynode=1, colsample_bytree=1,
             early_stopping_rounds=None, enable_categorical=False,
             eval_metric=None, gamma=0, gpu_id=-1, grow_policy='depthwise',
             importance_type=None, interaction_constraints='',
             learning_rate=0.300000012, max_bin=256, max_cat_to_onehot=4,
             max_delta_step=0, max_depth=6, max_leaves=0, min_child_weight=1,
             missing=nan, monotone_constraints='()', n_estimators=100, n_jobs=0,
             num_parallel_tree=1, predictor='auto', random_state=0, reg_alpha=0,
             reg_lambda=1, ...)

In [36]:
# Predict SalePrice for every row of the test feature matrix.
y_test = model.predict(X_test)

In [37]:
# Build the submission table: one row per test house with its Id and predicted SalePrice.
final_data = {'Id': X_test['Id'], 'SalePrice': y_test}
submission = pd.DataFrame(data=final_data)
# Write the file with a .csv extension (the original name had none) so it is
# recognized as a CSV; index=False keeps the row index out of the output.
submission.to_csv('submission_xgb.csv', index=False)