# Data loading and processing

In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor

In [2]:
pd.read_csv('train.csv').head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
data = pd.read_csv('train.csv', index_col = 'Id')
evaluate = pd.read_csv('test.csv', index_col = 'Id')

In [4]:
full = data.append(evaluate, ignore_index=True, sort = False)

In [5]:
full['GarageArea'].isna().value_counts()

False    2918
True        1
Name: GarageArea, dtype: int64

## Features engineering

In [6]:
full['Basement'] = full['BsmtQual'].notnull()

In [7]:
full['AfterRemod'] = full['YearBuilt'].where(full['YearBuilt'] == full['YearRemodAdd']).notnull()

In [8]:
full['GarageArea'] = full['GarageArea'].fillna(0)

In [9]:
fact_features = ['MSSubClass', 'MSZoning', 'Street', 'LotShape', 'Utilities', 'LandSlope', 'BldgType', 'Heating', 
                'CentralAir', 'KitchenQual', 'GarageType', 'GarageFinish', 'GarageArea', 'PavedDrive', 'Fence', 
                 'SaleType', 'SaleCondition',
                'Foundation', 'MSSubClass', 'Basement', 'AfterRemod']
    
other_features = ['LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'SalePrice']

In [10]:
for i in fact_features:
    full[i] = full[i].factorize()[0]

In [11]:
full.tail()

Unnamed: 0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice,Basement,AfterRemod
2914,11,1,21.0,1936,0,,0,Lvl,0,Inside,...,-1,,0,6,2006,0,0,,0,0
2915,11,1,21.0,1894,0,,0,Lvl,0,Inside,...,-1,,0,4,2006,0,1,,0,0
2916,1,0,160.0,20000,0,,0,Lvl,0,Inside,...,-1,,0,9,2006,0,1,,0,1
2917,9,0,62.0,10441,0,,0,Lvl,0,Inside,...,0,Shed,700,7,2006,0,0,,0,0
2918,0,0,74.0,9627,0,,0,Lvl,0,Inside,...,-1,,0,11,2006,0,0,,0,1


## Sets

In [12]:
full = full[fact_features].join(full[other_features])

to_category = fact_features
for i in to_category:
    X[i] = X[i].astype('category')


In [14]:
full.head()

Unnamed: 0,MSSubClass,MSZoning,Street,LotShape,Utilities,LandSlope,BldgType,Heating,CentralAir,KitchenQual,...,SaleCondition,Foundation,MSSubClass.1,Basement,AfterRemod,LotArea,OverallQual,OverallCond,YearBuilt,SalePrice
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,8450,7,5,2003,208500.0
1,1,0,0,0,0,0,0,0,0,1,...,0,1,1,0,0,9600,6,8,1976,181500.0
2,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,1,11250,7,5,2001,223500.0
3,2,0,0,1,0,0,0,0,0,0,...,1,2,2,0,1,9550,7,5,1915,140000.0
4,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,14260,8,5,2000,250000.0


In [15]:
data = full[full['SalePrice'].isna() == False]
evaluate = full[full['SalePrice'].isna() == True].drop(labels = 'SalePrice', axis = 1)

In [16]:
y = data['SalePrice']
X = data.drop(labels = 'SalePrice', axis = 1)

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1)

# Decision Tree model learning

In [18]:
tree = DecisionTreeRegressor(random_state = 1)

In [19]:
tree.fit(X=X_train, y=y_train)

DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best')

In [20]:
tree.score(X_test, y_test)

0.5953088250159249

# Chossing best hiperparameters with GridSearchCV

In [21]:
estimator = DecisionTreeRegressor(random_state = 1)
param_grid = {'max_depth': range(1, 5),
              'min_samples_leaf': range (1, 20),
             'min_samples_split': range (2, 20),}

In [22]:
model = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=2)

In [23]:
model.fit(X=X, y=y)

GridSearchCV(cv=2, error_score='raise-deprecating',
       estimator=DecisionTreeRegressor(criterion='mse', max_depth=None, max_features=None,
           max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           presort=False, random_state=1, splitter='best'),
       fit_params=None, iid='warn', n_jobs=None,
       param_grid={'max_depth': range(1, 5), 'min_samples_leaf': range(1, 20), 'min_samples_split': range(2, 20)},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [24]:
model.best_params_

{'max_depth': 4, 'min_samples_leaf': 13, 'min_samples_split': 2}

In [25]:
best_params = pd.Series(model.best_params_)

In [26]:
best_params

max_depth             4
min_samples_leaf     13
min_samples_split     2
dtype: int64

In [27]:
model.score(X=X_test, y=y_test)

0.80044155157152

# Random forest training

In [28]:
sqrt = int(np.sqrt(len(X_train.columns)))

In [29]:
forest = RandomForestRegressor(max_depth=best_params['max_depth'],
                              min_samples_leaf = best_params['min_samples_leaf'],
                              min_samples_split = best_params['min_samples_split'], n_estimators=100, random_state = 1,
                              bootstrap=True, max_features=sqrt)

In [30]:
forest.fit(X=X_train, y=y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=4,
           max_features=5, max_leaf_nodes=None, min_impurity_decrease=0.0,
           min_impurity_split=None, min_samples_leaf=13,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=None, oob_score=False, random_state=1,
           verbose=0, warm_start=False)

In [31]:
forest.score(X=X_test, y=y_test)

0.6844121362394038

## DummyRegressor

In [32]:
dummy = DummyRegressor(strategy='mean')

In [33]:
dummy.fit(X=X_train, y=y_train)

DummyRegressor(constant=None, quantile=None, strategy='mean')

In [34]:
dummy.score(X = X_test, y=y_test)

-0.005807477762133084

## Predictions for test data

In [35]:
predict = forest.predict(X = evaluate)

In [36]:
predict

array([148308.89585868, 175862.45958265, 216487.74098881, ...,
       157326.81650675, 160816.60987178, 213871.936206  ])

In [37]:
evaluate.index

Int64Index([1460, 1461, 1462, 1463, 1464, 1465, 1466, 1467, 1468, 1469,
            ...
            2909, 2910, 2911, 2912, 2913, 2914, 2915, 2916, 2917, 2918],
           dtype='int64', length=1459)

In [38]:
results = pd.DataFrame(predict, index=evaluate.index, columns = ['SalePrice'])

In [39]:
results.to_csv('submission.csv')