## House price prediction

In [1]:
import pandas as pd
import numpy as np
import os

# Miscellaneous 
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# Models
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.ensemble import RandomForestRegressor

# Metrics
from sklearn.metrics import mean_absolute_percentage_error
from sklearn.metrics import r2_score
from sklearn.metrics import max_error

In [2]:
# Load the raw listings from disk.
data = pd.read_csv("../archive/house_prices.csv")
# Listings reporting 0 bedrooms are discarded: a property without any bedroom is not meaningful here.
has_bedrooms = data['bedrooms'] != 0
data = data[has_bedrooms].reset_index().drop(columns='Unnamed: 0')

In [3]:
# List the column labels of the filtered frame (note the extra 'index' column added by reset_index()).
data.columns

Index(['index', 'property_type', 'price', 'location', 'city', 'baths',
       'purpose', 'bedrooms', 'Area_in_Marla'],
      dtype='object')

In [4]:
# Feature groups are selected by name rather than by position, so a change in the
# CSV column order cannot silently pick the wrong columns.
# Categorical columns (one-hot encoded later), in their original column order.
cat_data = ['property_type', 'location', 'city', 'baths', 'purpose', 'bedrooms']
# Continuous columns (standardized later).
cont_data = ['price', 'Area_in_Marla']

### 1. Feature Engineering

In [5]:
## Standard Scaler
# Columns to scale: price, Area_in_Marla
# NOTE(review): the scaler is fit on the full dataset before the train/test split and
# it also standardizes the target `price`; confirm both are intended.

scaler = StandardScaler()

# Assign the scaled array directly: the assignment is positional, so it does not
# depend on `data` having a 0..n-1 index (aligning against a freshly built
# DataFrame would), and it avoids the np.matrix round trip that NumPy discourages.
data[cont_data] = scaler.fit_transform(data[cont_data])


In [6]:
## One hot encoding
# Columns to ohe: property_type, location, city, baths, bedrooms and purpose
one_hot = OneHotEncoder(sparse_output=False)

# Fit first so that get_feature_names_out() below describes the fitted categories.
encoded = one_hot.fit_transform(data[cat_data])

# Build the indicator frame in one step: the dense array goes straight into the
# DataFrame (no np.matrix round trip), sharing `data`'s index so concat aligns row by row.
cat_X = pd.DataFrame(encoded, index=data.index, columns=one_hot.get_feature_names_out())

# Replace the original categorical columns with their indicator columns.
data = pd.concat([data.drop(columns=cat_data), cat_X], axis=1)



In [7]:
# Remove the helper column left behind by reset_index(). Reassigning (instead of
# inplace=True) together with errors='ignore' keeps this cell safe to re-run.
data = data.drop(columns='index', errors='ignore')

In [8]:
# Column labels after feature engineering: the categorical columns now appear as one indicator column per category.
data.columns

Index(['price', 'Area_in_Marla', 'property_type_Farm House',
       'property_type_Flat', 'property_type_House',
       'property_type_Lower Portion', 'property_type_Penthouse',
       'property_type_Room', 'property_type_Upper Portion',
       'location_12th Avenue',
       ...
       'baths_6', 'baths_7', 'purpose_For Rent', 'purpose_For Sale',
       'bedrooms_1', 'bedrooms_2', 'bedrooms_3', 'bedrooms_4', 'bedrooms_5',
       'bedrooms_6'],
      dtype='object', length=1415)

### 2. Train-Test split, multiple model selection

In [9]:
# Pick the target by name instead of by position, so the split stays correct
# even if the column order of `data` changes. 70% of the rows go to training.
X_train, X_test, y_train, y_test = train_test_split(
    data.drop(columns='price'),
    data['price'],
    train_size=0.7,
    random_state=1234,
)

In [10]:
# Select the models and the hyperparameter configuration
class EstimatorSelectionHelper:
    """Run one GridSearchCV per estimator and tabulate the per-fold test scores.

    Parameters
    ----------
    models : dict
        Maps a label to an estimator instance.
    params : dict
        Maps the same labels to the parameter grid passed to GridSearchCV.
        Every key of ``models`` must be present.

    Raises
    ------
    ValueError
        If at least one key of ``models`` has no entry in ``params``.
    """

    def __init__(self, models, params):
        missing_params = list(set(models.keys()) - set(params.keys()))
        if missing_params:
            raise ValueError("Some estimators are missing parameters: %s" % missing_params)
        self.models = models
        self.params = params
        self.keys = models.keys()
        # Fitted GridSearchCV objects, keyed by estimator label; filled by fit().
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        """Fit a GridSearchCV for every registered estimator.

        Each call replaces the searches stored by a previous call. All keyword
        arguments are forwarded unchanged to GridSearchCV.
        """
        for key in self.keys:
            print("Running GridSearchCV for %s." % key)
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score'):
        """Return one row per (estimator, parameter combination) with fold statistics.

        Parameters
        ----------
        sort_by : str
            Column used to order the rows, highest value first.

        Returns
        -------
        pandas.DataFrame
            Columns ``estimator``, ``min_score``, ``mean_score``, ``max_score``,
            ``std_score`` followed by one column per hyperparameter. A
            hyperparameter that does not apply to an estimator is left missing.

        Raises
        ------
        ValueError
            If no grid search has been fitted yet.
        """
        if not self.grid_searches:
            raise ValueError("No fitted grid searches to summarise; call fit() first.")

        rows = []
        for name, search in self.grid_searches.items():
            print(name)
            cv_results = search.cv_results_
            # Use the fitted number of folds: `search.cv` is only an int when the
            # caller passed one, it may also be None or a splitter object.
            n_splits = search.n_splits_
            # Shape (n_candidates, n_splits): one row of fold scores per candidate.
            fold_scores = np.column_stack(
                [cv_results["split{}_test_score".format(i)] for i in range(n_splits)]
            )
            for candidate, scores in zip(cv_results['params'], fold_scores):
                rows.append({
                    **candidate,
                    'estimator': name,
                    'min_score': np.min(scores),
                    'max_score': np.max(scores),
                    'mean_score': np.mean(scores),
                    'std_score': np.std(scores),
                })

        # Building from records keeps numeric dtypes for the score columns.
        df = pd.DataFrame(rows).sort_values([sort_by], ascending=False)

        columns = ['estimator', 'min_score', 'mean_score', 'max_score', 'std_score']
        columns = columns + [c for c in df.columns if c not in columns]

        return df[columns]

In [11]:
# Candidate regressors, each created with default settings and keyed by the
# label used to look up its hyperparameter grid.
models = {
    label: estimator_cls()
    for label, estimator_cls in [
        ('ElasticNet', ElasticNet),
        ('LinearRegression', LinearRegression),
        ('GradientBoosting', GradientBoostingRegressor),
        ('RandomForest', RandomForestRegressor),
    ]
}

In [12]:
# Hyperparameter grids to search, one entry per estimator label.
parameters = {
    'ElasticNet': dict(
        alpha=[0.3, 0.6, 0.9],
        l1_ratio=[0.25, 0.5, 0.75],
        fit_intercept=[True, False],
        random_state=[1234],
    ),
    'LinearRegression': dict(
        fit_intercept=[True, False],
    ),
    'GradientBoosting': dict(
        n_estimators=[50, 100, 200, 500],
        random_state=[1234],
        ccp_alpha=[0.1, 0.3, 0.5],
    ),
    'RandomForest': dict(
        n_estimators=[50, 100, 200, 500],
        max_features=['sqrt'],
        random_state=[1234],
        ccp_alpha=[0.1, 0.3, 0.5],
    ),
}

In [13]:
# Grid-search every model once per scoring metric and keep each ranked summary,
# tagged with the metric that produced it.
helper1 = EstimatorSelectionHelper(models, parameters)
results = []

scoring_metrics = ('neg_mean_absolute_percentage_error', 'r2', 'max_error')
for j in scoring_metrics:
    helper1.fit(X_train, y_train, scoring=j, n_jobs=4)
    aux = helper1.score_summary(sort_by='max_score').assign(metric=j)
    results.append(aux)

Running GridSearchCV for ElasticNet.
Fitting 3 folds for each of 18 candidates, totalling 54 fits


Running GridSearchCV for LinearRegression.
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Running GridSearchCV for GradientBoosting.
Fitting 3 folds for each of 12 candidates, totalling 36 fits
Running GridSearchCV for RandomForest.
Fitting 3 folds for each of 12 candidates, totalling 36 fits
ElasticNet
LinearRegression
GradientBoosting
RandomForest
Running GridSearchCV for ElasticNet.
Fitting 3 folds for each of 18 candidates, totalling 54 fits
Running GridSearchCV for LinearRegression.
Fitting 3 folds for each of 2 candidates, totalling 6 fits
Running GridSearchCV for GradientBoosting.
Fitting 3 folds for each of 12 candidates, totalling 36 fits


KeyboardInterrupt: 