In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression,Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor


from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

In [2]:
from sklearn.metrics import r2_score,mean_absolute_error

In [3]:
from sklearn.metrics import mean_squared_error
import hyperopt
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from hyperopt.pyll.base import scope

In [4]:
# Notebook-wide display and warning configuration
import warnings

from sklearn import set_config

# Show estimators and pipelines as HTML diagrams instead of their text repr.
set_config(display='diagram')

# Blanket filter: hides every warning category from this point on.
warnings.filterwarnings('ignore')

In [5]:
# Load the listings table from df.csv (path is relative to the notebook's working directory).
df = pd.read_csv('df.csv')

In [6]:
# Preview the first five rows to check the columns and value formats.
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,floor_category
0,flat,sector 70,1.55,3,3,3,Relatively New,1527.0,1,0,unfurnished,Mid Floor
1,flat,sector 79,0.71,2,2,3+,New Property,1280.0,0,0,unfurnished,High Floor
2,flat,sector 90,1.25,3,4,3+,Old Property,1608.0,1,0,furnished,Low Floor
3,flat,sector 71,0.95,2,2,2,Moderately Old,1084.0,0,0,semifurnished,Mid Floor
4,flat,sector 66,3.2,3,3,3,Relatively New,1800.0,1,1,unfurnished,High Floor


In [7]:
# Pull out the target first, then keep every remaining column as a feature.
y = df['price']
X = df.drop(columns='price')

In [8]:
# Model log(1 + price) instead of the raw price; predictions are mapped back
# to the original scale with np.expm1 further down in the notebook.
y_transformed = np.log1p(y)

In [9]:
from sklearn.model_selection import GridSearchCV

In [12]:
# Grid handed to GridSearchCV. The 'regressor__' prefix routes each setting to the
# pipeline step named 'regressor'. Every list holds one value, so the grid
# describes a single candidate.
param_grid = dict(
    regressor__criterion=['squared_error'],
    regressor__max_depth=[20],
    regressor__max_features=['sqrt'],
    regressor__max_samples=[1.0],
    regressor__n_estimators=[300],
)

In [13]:
import category_encoders as ce

# Categorical columns that receive an integer code from the OrdinalEncoder.
columns_to_encode = ['property_type', 'sector', 'balcony', 'agePossession', 'furnishing_type', 'floor_category']

# Column-wise preprocessing used by the cross-validated pipeline.
# NOTE(review): 'agePossession' and 'sector' are listed under two transformers each,
# so both reach the model in two encodings (ordinal + one-hot / ordinal + target).
# Confirm this duplication is intended.
preprocessor = ColumnTransformer(
    transformers=[
        # Standardize the numeric columns.
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        # Integer-code the categoricals. A category that was not present at fit time
        # is mapped to -1 instead of raising, matching the unknown-tolerant
        # OneHotEncoder below, which sees one of the same columns.
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        # One-hot encode agePossession; unknown categories become an all-zero row.
        ('cat1', OneHotEncoder(handle_unknown='ignore'), ['agePossession']),
        # Target-encode sector (fit against y inside each training fold by the pipeline).
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ],
    remainder='passthrough'
)

In [14]:
# Preprocessing followed by a random forest; the forest's hyperparameters are
# supplied by the grid search through the 'regressor__' parameter names.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # Fixed seed so repeated runs build the same forest and report the same CV scores
    # (the KFold splitter is already seeded with 42).
    ('regressor', RandomForestRegressor(random_state=42))
])

In [15]:
# 10-fold CV with shuffling; the fixed seed keeps the fold assignment the same on every run.
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [16]:
# Cross-validated search over param_grid, scored by R^2. With refit=True the best
# candidate is retrained on the full data once all folds have been evaluated.
search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='r2',
    cv=kfold,
    refit=True,
    n_jobs=-1,
    verbose=4,
)

In [17]:
# Run the cross-validated search on the features and the log1p-transformed price.
search.fit(X, y_transformed)

Fitting 10 folds for each of 1 candidates, totalling 10 fits


In [18]:
# Best pipeline found by the search; the search was created with refit=True,
# so this estimator has already been refit on all of X.
final_pipe = search.best_estimator_

In [19]:
# Mean cross-validated R^2 of the best parameter combination (computed on the log1p-price scale).
search.best_score_

0.9026310946167293

In [21]:
 # Full per-candidate CV report (timings, parameters, per-split scores) as a DataFrame.
 pd.DataFrame(search.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_regressor__criterion,param_regressor__max_depth,param_regressor__max_features,param_regressor__max_samples,param_regressor__n_estimators,params,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,1.828969,0.311471,0.112934,0.04968,squared_error,20,sqrt,1.0,300,"{'regressor__criterion': 'squared_error', 'reg...",...,0.866165,0.909833,0.891467,0.892418,0.906384,0.920287,0.916228,0.902631,0.016134,1


In [23]:
 # Compact view of the search report: tuned parameters plus mean/std of the fold scores, best first.
 (
     pd.DataFrame(search.cv_results_)
     .loc[:, [
         'param_regressor__max_depth',
         'param_regressor__max_features',
         'param_regressor__max_samples',
         'param_regressor__n_estimators',
         'mean_test_score',
         'std_test_score',
     ]]
     .sort_values('mean_test_score', ascending=False)
 )

Unnamed: 0,param_regressor__max_depth,param_regressor__max_features,param_regressor__max_samples,param_regressor__n_estimators,mean_test_score,std_test_score
0,20,sqrt,1.0,300,0.902631,0.016134


In [24]:
# NOTE(review): best_estimator_ was already refit on (X, y_transformed) by GridSearchCV
# (refit=True), so this call retrains the same pipeline on the same data.
# It looks redundant; confirm before removing.
final_pipe.fit(X,y_transformed)

In [25]:
# Preprocessing for the pipeline that is exported below. It replaces the earlier
# `preprocessor`: sector is one-hot encoded here instead of target encoded.
# NOTE(review): 'sector' and 'agePossession' are covered by both the ordinal and the
# one-hot transformer, so each reaches the model in two encodings. Confirm this is intended.
preprocessor = ColumnTransformer(
    transformers=[
        # Standardize the numeric columns.
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        # Integer-code the categoricals. A category not seen during fit is mapped to -1
        # instead of raising, so the exported pipeline keeps predicting on new values,
        # in line with the unknown-tolerant OneHotEncoder below.
        ('cat', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1), columns_to_encode),
        # One-hot encode sector and agePossession, dropping the first level of each;
        # unknown categories are ignored rather than raising.
        ('cat1', OneHotEncoder(drop='first', handle_unknown='ignore'), ['sector', 'agePossession'])
    ],
    remainder='passthrough'
)

In [26]:
# Pipeline that gets trained on the full data and pickled below.
# NOTE(review): this configuration (one-hot sector preprocessor, n_estimators=500, other
# forest settings left at their defaults) is not the one evaluated by the grid search
# (max_depth=20, max_features='sqrt', n_estimators=300), so the CV score reported by the
# search was not measured for this model. Confirm the choice.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    # Fixed seed so re-running the notebook reproduces the same exported model.
    ('regressor', RandomForestRegressor(n_estimators=500, random_state=42))
])

In [27]:
# Train the export pipeline on the full dataset (target on the log1p scale).
pipeline.fit(X,y_transformed)

In [28]:
# Feature names, in the order the single sample row below is written.
columns = [
    'property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
    'agePossession', 'built_up_area', 'servant room', 'store room',
    'furnishing_type', 'floor_category',
]

# One hand-written listing used to smoke-test the fitted pipeline.
data = [[
    'house',          # property_type
    'sector 49',      # sector
    3,                # bedRoom
    3,                # bathroom
    '3+',             # balcony
    'New Property',   # agePossession
    1750,             # built_up_area
    0,                # servant room
    0,                # store room
    'unfurnished',    # furnishing_type
    'Low Floor',      # floor_category
]]

# Wrap the row in a one-row DataFrame so the pipeline can select columns by name.
one_df = pd.DataFrame(data, columns=columns)

one_df


Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,floor_category
0,house,sector 49,3,3,3+,New Property,1750,0,0,unfurnished,Low Floor


In [29]:
# Predict on the log1p scale, then invert with expm1 to get the price in the original units.
np.expm1(pipeline.predict(one_df))

array([2.74005627])

In [30]:
import pickle

# Serialize the fitted pipeline (preprocessing + model) to pipeline.pkl for reuse outside the notebook.
with open('pipeline.pkl', 'wb') as file:
    pickle.dump(pipeline, file)

In [31]:
# Persist the feature frame X (the price column is not included) as df.pkl.
with open('df.pkl', 'wb') as file:
    pickle.dump(X, file)