# PyCaret 2 House Price Prediction Example

This notebook is created using PyCaret 2.0. Last updated : 04-08-2020

House Price Prediction data set from Kaggle https://www.kaggle.com/c/house-prices-advanced-regression-techniques <br>
Train Dataset consists of 1460 Samples with 81 features including the SalePrice<br>
Test Dataset consists of 1459 Samples wit 80 features

In [3]:
# Mount Google Drive 
# Skip this step if using on local hardware 
# from google.colab import drive
# drive.mount('/content/gdrive')

In [1]:
# Works with pycaret and pycaret 2
#!pip install pycaret==2.0
from pycaret.regression import *
import pandas as pd

In [2]:
# check version
from pycaret.utils import version
version()

'2.3.0'

In [7]:
# Chane path as per your file structure
# Remove root_path if using local hardware
# 
# root_path = 'gdrive/My Drive/Colab Notebooks/'

data = pd.read_csv('data/kc_house_data_train.csv', index_col=0)

# test_data = pd.read_csv('gdrive/My Drive/Colab Notebooks/HousePrice/test.csv')

print(data.shape)

(17290, 21)


In [8]:
data.head()

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,zipcode,lat,long,sqft_living15,sqft_lot15
0,2591820310,20141006T000000,365000.0,4,2.25,2070,8893,2.0,0,0,4,8,2070,0,1986,0,98058,47.4388,-122.162,2390,7700
1,7974200820,20140821T000000,865000.0,5,3.0,2900,6730,1.0,0,0,5,8,1830,1070,1977,0,98115,47.6784,-122.285,2370,6283
2,7701450110,20140815T000000,1038000.0,4,2.5,3770,10893,2.0,0,2,3,11,3770,0,1997,0,98006,47.5646,-122.129,3710,9685
3,9522300010,20150331T000000,1490000.0,3,3.5,4560,14608,2.0,0,2,3,12,4560,0,1990,0,98034,47.6995,-122.228,4050,14226
4,9510861140,20140714T000000,711000.0,3,2.5,2550,5376,2.0,0,0,3,9,2550,0,2004,0,98052,47.6647,-122.083,2250,4050


In [9]:
# Ignoring features with high null values 

demo = setup(data = data, target = 'price', 
                   ignore_features = ['id', 'date', 'zipcode', 'lat', 'long'],normalize = True,
                   transformation= True, transformation_method = 'yeo-johnson', 
                   transform_target = True, remove_outliers= True,
                   remove_multicollinearity = True,
                   ignore_low_variance = True, combine_rare_levels = True) 

Unnamed: 0,Description,Value
0,session_id,5851
1,Target,price
2,Original Data,"(17290, 21)"
3,Missing Values,False
4,Numeric Features,10
5,Categorical Features,5
6,Ordinal Features,False
7,High Cardinality Features,False
8,High Cardinality Method,
9,Transformed Train Set,"(11496, 21)"


In [12]:
# Blacklist Theil–Sen Regressor 
# Auto sort on R2 
compare_models(blacklist = ['tr'])

TypeError: compare_models() got an unexpected keyword argument 'blacklist'

In [None]:
# Creating models for the best estimators 
huber = create_model('huber')
bayesian_ridge = create_model('br')
cat_boost = create_model('catboost')

In [None]:
# Tuning the created models 
huber = tune_model(huber)
bayesian_ridge = tune_model(bayesian_ridge)
cat_boost = tune_model(cat_boost)

In [None]:
# Blending models
blender = blend_models(estimator_list = [huber, bayesian_ridge, cat_boost])

In [None]:
# Finaliszing model for predictions 
model = finalize_model(blender)
predictions = predict_model(model, data = test_data)

In [None]:
# Generating CSV for Kaggle Submissions 
sub = pd.DataFrame({
        "Id": predictions['Id'],
        "SalePrice": predictions['Label']
    })

sub.to_csv('gdrive/My Drive/Colab Notebooks/HousePrice/submission.csv', index=False)