# Data loading and processing

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor

In [2]:
data = pd.read_csv('train.csv', index_col = 'Id')

In [3]:
data['YearBuilt'].isna().value_counts()

False    1460
Name: YearBuilt, dtype: int64

In [4]:
data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,...,0,,,,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,...,0,,,,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,...,0,,,,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,...,0,,,,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,...,0,,,,0,12,2008,WD,Normal,250000


In [5]:
data['LotArea'].isna().value_counts()

False    1460
Name: LotArea, dtype: int64

In [6]:
data['LotArea'].describe()

count      1460.000000
mean      10516.828082
std        9981.264932
min        1300.000000
25%        7553.500000
50%        9478.500000
75%       11601.500000
max      215245.000000
Name: LotArea, dtype: float64

In [7]:
fact_features = ['MSZoning', 'Street', 'LotShape', 'Utilities', 'LandSlope', 'BldgType', 'Heating', 
                 'CentralAir', 'KitchenQual', 'GarageType', 'PavedDrive', 'Fence', 'SaleType', 'SaleCondition',
                'Foundation', 'MSSubClass']
other_features = ['LotArea', 'OverallQual', 'OverallCond', 'YearBuilt']

In [8]:
for i in fact_features:
    data[i] = data[i].factorize()[0]

In [9]:
data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0,0,65.0,8450,0,,0,Lvl,0,Inside,...,0,,-1,,0,2,2008,0,0,208500
2,1,0,80.0,9600,0,,0,Lvl,0,FR2,...,0,,-1,,0,5,2007,0,0,181500
3,0,0,68.0,11250,0,,1,Lvl,0,Inside,...,0,,-1,,0,9,2008,0,0,223500
4,2,0,60.0,9550,0,,1,Lvl,0,Corner,...,0,,-1,,0,2,2006,0,1,140000
5,0,0,84.0,14260,0,,1,Lvl,0,FR2,...,0,,-1,,0,12,2008,0,0,250000


In [10]:
X = data[fact_features].join(data[other_features])

In [11]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1460 entries, 1 to 1460
Data columns (total 20 columns):
MSZoning         1460 non-null int64
Street           1460 non-null int64
LotShape         1460 non-null int64
Utilities        1460 non-null int64
LandSlope        1460 non-null int64
BldgType         1460 non-null int64
Heating          1460 non-null int64
CentralAir       1460 non-null int64
KitchenQual      1460 non-null int64
GarageType       1460 non-null int64
PavedDrive       1460 non-null int64
Fence            1460 non-null int64
SaleType         1460 non-null int64
SaleCondition    1460 non-null int64
Foundation       1460 non-null int64
MSSubClass       1460 non-null int64
LotArea          1460 non-null int64
OverallQual      1460 non-null int64
OverallCond      1460 non-null int64
YearBuilt        1460 non-null int64
dtypes: int64(20)
memory usage: 279.5 KB


In [12]:
X['MSZoning'] = X['MSZoning'].astype('category')

In [13]:
X.head()

Unnamed: 0_level_0,MSZoning,Street,LotShape,Utilities,LandSlope,BldgType,Heating,CentralAir,KitchenQual,GarageType,PavedDrive,Fence,SaleType,SaleCondition,Foundation,MSSubClass,LotArea,OverallQual,OverallCond,YearBuilt
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,0,0,0,0,0,0,0,0,0,0,0,-1,0,0,0,0,8450,7,5,2003
2,0,0,0,0,0,0,0,0,1,0,0,-1,0,0,1,1,9600,6,8,1976
3,0,0,1,0,0,0,0,0,0,0,0,-1,0,0,0,0,11250,7,5,2001
4,0,0,1,0,0,0,0,0,0,1,0,-1,0,1,2,2,9550,7,5,1915
5,0,0,1,0,0,0,0,0,0,0,0,-1,0,0,0,0,14260,8,5,2000


In [14]:
y = data['SalePrice']

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Decision Tree model learning

In [16]:
tree = DecisionTreeRegressor(random_state = 1)

In [17]:
tree.fit(X=X_train, y=y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

In [18]:
tree.score(X_test, y_test)

0.652308604691946

# Chossing best hiperparameters with GridSearchCV

In [19]:
estimator = DecisionTreeRegressor(random_state = 1)
param_grid = {'max_depth': range(1, 5),
              'min_samples_leaf': range (1, 20),
             'min_samples_split': range (2, 20),}

In [20]:
model = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=2)

In [21]:
model.fit(X=X, y=y)

GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': range(1, 5), 'min_samples_leaf': range(1, 20), 'min_samples_split': range(2, 20)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [22]:
model.best_params_

{'max_depth': 4, 'min_samples_leaf': 7, 'min_samples_split': 16}

In [23]:
best_params = pd.Series(model.best_params_)

In [24]:
best_params

max_depth             4
min_samples_leaf      7
min_samples_split    16
dtype: int64

In [25]:
model.score(X=X_test, y=y_test)

0.7775372936173762

# Random forest training

In [26]:
sqrt = int(np.sqrt(len(X_train.columns)))+9

In [27]:
forest = RandomForestRegressor(max_depth=best_params['max_depth'],
                              min_samples_leaf = best_params['min_samples_leaf'],
                              min_samples_split = best_params['min_samples_split'], n_estimators=100, random_state = 1,
                              bootstrap=True, max_features=sqrt)

In [28]:
forest.fit(X=X_train, y=y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=4,
           max_features=13, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=7,
           min_samples_split=16, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=None, oob_score=False, random_state=1,
           verbose=0, warm_start=False)

In [29]:
forest.score(X=X_test, y=y_test)

0.7612059341738151