In [183]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVR

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

from sklearn.decomposition import PCA

In [184]:
df = pd.read_csv('../../../data/raw/nb_data/7. feature-selection/gurgaon_properties_post_feature_selection_v2.csv')

In [185]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,0.0,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,0.0,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,0.0,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,1.0,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,0.0,High,Mid Floor


In [186]:
df['furnishing_type'].value_counts()

furnishing_type
0.0    2349
1.0    1018
2.0     187
Name: count, dtype: int64

In [187]:
# 0 -> unfurnished
# 1 -> semifurnished
# 2 -> furnished
df['furnishing_type'] = df['furnishing_type'].replace({0.0:'unfurnished',1.0:'semifurnished',2.0:'furnished'})

In [188]:
df.head()

Unnamed: 0,property_type,sector,price,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,0.82,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,0.95,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,0.32,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,1.6,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,Mid Floor
4,flat,sector 102,0.48,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor


In [189]:
X = df.drop(columns=['price'])
y = df['price']

In [190]:
# Applying the log1p transformation to the target variable
y_transformed = np.log1p(y)

### Ordinal Encoding

In [191]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

In [192]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode)
    ], 
    remainder='passthrough'
)

In [193]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [194]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [195]:
scores.mean(),scores.std()

(0.7363096633436828, 0.03238005754429938)

In [196]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [197]:
pipeline.fit(X_train,y_train)

In [198]:
y_pred = pipeline.predict(X_test)

In [199]:
y_pred = np.expm1(y_pred)

In [200]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.9463822160089355

In [201]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [202]:
from sklearn.tree import DecisionTreeRegressor, ExtraTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor
from sklearn.neural_network import MLPRegressor
from xgboost import XGBRegressor

In [203]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreeRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [204]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [205]:
model_output

[['linear_reg', 0.7363096633436828, 0.9463822160089355],
 ['svr', 0.7642012011196353, 0.8472636473483927],
 ['ridge', 0.7363125343993552, 0.946338774185337],
 ['LASSO', 0.05943378064493572, 1.528905986892753],
 ['decision tree', 0.7740847697769442, 0.7462645322678639],
 ['random forest', 0.8815233681960916, 0.5308689793815297],
 ['extra trees', 0.7213004765954898, 0.8285867221566898],
 ['gradient boosting', 0.872600262721515, 0.5764291125398039],
 ['adaboost', 0.7581575319058252, 0.8804225698445143],
 ['mlp', 0.8169921894791023, 0.7639569495634707],
 ['xgboost', 0.8894876835260124, 0.5040475141482346]]

In [206]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [207]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.889488,0.504048
5,random forest,0.881523,0.530869
7,gradient boosting,0.8726,0.576429
4,decision tree,0.774085,0.746265
9,mlp,0.816992,0.763957
6,extra trees,0.7213,0.828587
1,svr,0.764201,0.847264
8,adaboost,0.758158,0.880423
2,ridge,0.736313,0.946339
0,linear_reg,0.73631,0.946382


### OneHotEncoding

In [208]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), columns_to_encode),
        ('cat1',OneHotEncoder(drop='first'),['sector','agePossession','furnishing_type'])
    ], 
    remainder='passthrough'
)

In [209]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [210]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [211]:
scores.mean()

0.8546054073648314

In [212]:
scores.std()

0.01599847663314007

In [213]:
X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

In [214]:
pipeline.fit(X_train,y_train)

In [215]:
y_pred = pipeline.predict(X_test)

In [216]:
y_pred = np.expm1(y_pred)

In [217]:
mean_absolute_error(np.expm1(y_test),y_pred)

0.6497382874070646

In [218]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [219]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreeRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [220]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [221]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [222]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
10,xgboost,0.89585,0.493456
5,random forest,0.890531,0.502663
7,gradient boosting,0.876689,0.568733
9,mlp,0.874234,0.577495
0,linear_reg,0.854605,0.649738
2,ridge,0.854678,0.652914
4,decision tree,0.803981,0.679852
6,extra trees,0.783662,0.691898
1,svr,0.769741,0.834124
8,adaboost,0.753712,0.860769


### OneHotEncoding With PCA

In [223]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), [i for i in columns_to_encode if i != 'sector' and i != 'agePossession']),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['sector','agePossession'])
    ], 
    remainder='passthrough'
)

In [224]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('pca', PCA(n_components=0.95)),
    ('regressor', LinearRegression())
])

In [225]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [226]:
scores.mean()

0.763818660211568

In [227]:
scores.std()

0.02854368471829162

In [228]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('pca', PCA(n_components=0.95)),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [229]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreeRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [230]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))



In [231]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [232]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.824185,0.645389
1,svr,0.824158,0.664211
10,xgboost,0.828356,0.666489
9,mlp,0.823148,0.670539
7,gradient boosting,0.820756,0.682793
6,extra trees,0.649891,0.829516
4,decision tree,0.628962,0.892991
2,ridge,0.763858,0.909826
0,linear_reg,0.763819,0.909848
8,adaboost,0.692856,0.930395


### Target Encoder

In [233]:
!pip install category-encoders



In [234]:
import category_encoders as ce

columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), [i for i in columns_to_encode if i != 'sector' and i != 'agePossession']),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [235]:
# Creating a pipeline
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

In [236]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

In [237]:
scores.mean(),scores.std()

(0.8289972166194003, 0.018959795853286922)

In [238]:
def scorer(model_name, model):
    
    output = []
    
    output.append(model_name)
    
    pipeline = Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', model)
    ])
    
    # K-fold cross-validation
    kfold = KFold(n_splits=10, shuffle=True, random_state=42)
    scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')
    
    output.append(scores.mean())
    
    X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)
    
    pipeline.fit(X_train,y_train)
    
    y_pred = pipeline.predict(X_test)
    
    y_pred = np.expm1(y_pred)
    
    output.append(mean_absolute_error(np.expm1(y_test),y_pred))
    
    return output
    

In [239]:
model_dict = {
    'linear_reg':LinearRegression(),
    'svr':SVR(),
    'ridge':Ridge(),
    'LASSO':Lasso(),
    'decision tree': DecisionTreeRegressor(),
    'random forest':RandomForestRegressor(),
    'extra trees': ExtraTreeRegressor(),
    'gradient boosting': GradientBoostingRegressor(),
    'adaboost': AdaBoostRegressor(),
    'mlp': MLPRegressor(),
    'xgboost':XGBRegressor()
}

In [240]:
model_output = []
for model_name,model in model_dict.items():
    model_output.append(scorer(model_name, model))

In [241]:
model_df = pd.DataFrame(model_output, columns=['name','r2','mae'])

In [242]:
model_df.sort_values(['mae'])

Unnamed: 0,name,r2,mae
5,random forest,0.894129,0.462999
10,xgboost,0.896336,0.463656
7,gradient boosting,0.883782,0.52721
4,decision tree,0.802397,0.568109
1,svr,0.863266,0.58307
9,mlp,0.848382,0.597865
6,extra trees,0.764012,0.678287
8,adaboost,0.815937,0.702489
0,linear_reg,0.828997,0.71454
2,ridge,0.829011,0.715103


### Hyperparameter Tuning

In [243]:
from sklearn.model_selection import GridSearchCV

In [244]:
param_grid = {
    'regressor__n_estimators': [50, 100, 200, 300],
    'regressor__max_depth': [None, 10, 20, 30],
    'regressor__max_samples':[0.1, 0.25, 0.5, 1.0],
    'regressor__max_features': ['auto', 'sqrt']
}

In [245]:
columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), [i for i in columns_to_encode if i != 'sector' and i != 'agePossession']),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [246]:
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor())
])

In [247]:
kfold = KFold(n_splits=10, shuffle=True, random_state=42)

In [248]:
search = GridSearchCV(pipeline, param_grid, cv=kfold, scoring='r2', n_jobs=-1, verbose=4)

In [None]:
search.fit(X, y_transformed)

Fitting 10 folds for each of 128 candidates, totalling 1280 fits


640 fits failed out of a total of 1280.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
367 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\vishpand\OneDrive - Nokia\1) Nokia Projects\Projects\.venv\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\vishpand\OneDrive - Nokia\1) Nokia Projects\Projects\.venv\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\vishpand\OneDrive - Nokia\1) Nokia Projects\Projects\.venv\Lib\site-packages\sklearn\pipeline.py", line 476, in fit
  

In [None]:
final_pipe = search.best_estimator_

In [None]:
search.best_params_

{'regressor__max_depth': None,
 'regressor__max_features': 'sqrt',
 'regressor__max_samples': 1.0,
 'regressor__n_estimators': 200}

In [None]:
search.best_score_

0.8901031142961587

In [None]:
final_pipe.fit(X,y_transformed)

### Exporting the model

In [262]:
best_params = {
    'booster': 'gbtree',
    'lambda': 5.343080149402018e-08,
    'alpha': 0.992875781201781,
    'max_depth': 9,
    'eta': 0.1393514456130816,
    'gamma': 2.2922254498761712e-08,
    'grow_policy': 'depthwise',
    # 'sample_type': 'uniform',
    # 'normalize_type': 'tree',
    # 'rate_drop': 1.7710100842560433e-07,
    # 'skip_drop': 1.4610766469331365e-08,
    # 'n_estimators': 300,  # Assuming you want to keep the number of estimators consistent
    # 'random_state': 42  # Setting random state for reproducibility
}

In [263]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), [i for i in columns_to_encode if i != 'sector' and i != 'agePossession']),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [264]:
# Create XGBRegressor with best hyperparameters
best_regressor = XGBRegressor(**best_params)

# Assuming preprocessor is already defined
pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Assuming preprocessor is already defined
    ('regressor', best_regressor)
])

In [265]:
pipeline.fit(X,y_transformed)

In [266]:
# K-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=42)
scores = cross_val_score(pipeline, X, y_transformed, cv=kfold, scoring='r2')

print(f"{scores.mean() = }")

X_train, X_test, y_train, y_test = train_test_split(X,y_transformed,test_size=0.2,random_state=42)

pipeline.fit(X_train,y_train)

y_pred = pipeline.predict(X_test)

y_pred = np.expm1(y_pred)

print(f"{mean_absolute_error(np.expm1(y_test),y_pred) = }")

scores.mean() = 0.8975870374434127
mean_absolute_error(np.expm1(y_test),y_pred) = 0.479577735199707


In [267]:
import joblib

joblib.dump(pipeline, f'../../../models/pipeline.joblib')

['../../../models/pipeline.joblib']

In [268]:
joblib.dump(X, '../../../models/df.joblib')

['../../../models/df.joblib']

In [269]:
X.head()

Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,flat,sector 36,3.0,2.0,2,New Property,850.0,0.0,0.0,unfurnished,Low,Low Floor
1,flat,sector 89,2.0,2.0,2,New Property,1226.0,1.0,0.0,unfurnished,Low,Mid Floor
2,flat,sohna road,2.0,2.0,1,New Property,1000.0,0.0,0.0,unfurnished,Low,High Floor
3,flat,sector 92,3.0,4.0,3+,Relatively New,1615.0,1.0,0.0,semifurnished,High,Mid Floor
4,flat,sector 102,2.0,2.0,1,Relatively New,582.0,0.0,1.0,unfurnished,High,Mid Floor


### Trying out the predictions

In [270]:
X.columns

Index(['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category'],
      dtype='object')

In [271]:
X.iloc[0].values

array(['flat', 'sector 36', 3.0, 2.0, '2', 'New Property', 850.0, 0.0,
       0.0, 'unfurnished', 'Low', 'Low Floor'], dtype=object)

In [272]:
data = [['house', 'sector 102', 4, 3, '3+', 'New Property', 2750, 0, 0, 'unfurnished', 'Low', 'Low Floor']]
columns = ['property_type', 'sector', 'bedRoom', 'bathroom', 'balcony',
       'agePossession', 'built_up_area', 'servant room', 'store room',
       'furnishing_type', 'luxury_category', 'floor_category']

# Convert to DataFrame
one_df = pd.DataFrame(data, columns=columns)

one_df


Unnamed: 0,property_type,sector,bedRoom,bathroom,balcony,agePossession,built_up_area,servant room,store room,furnishing_type,luxury_category,floor_category
0,house,sector 102,4,3,3+,New Property,2750,0,0,unfurnished,Low,Low Floor


In [273]:
np.expm1(pipeline.predict(one_df))

array([2.5958443], dtype=float32)

In [274]:
X.dtypes

property_type       object
sector              object
bedRoom            float64
bathroom           float64
balcony             object
agePossession       object
built_up_area      float64
servant room       float64
store room         float64
furnishing_type     object
luxury_category     object
floor_category      object
dtype: object

In [275]:
sorted(X['sector'].unique().tolist())

['dwarka expressway',
 'gwal pahari',
 'manesar',
 'sector 1',
 'sector 10',
 'sector 102',
 'sector 103',
 'sector 104',
 'sector 105',
 'sector 106',
 'sector 107',
 'sector 108',
 'sector 109',
 'sector 11',
 'sector 110',
 'sector 111',
 'sector 112',
 'sector 113',
 'sector 12',
 'sector 13',
 'sector 14',
 'sector 15',
 'sector 17',
 'sector 2',
 'sector 21',
 'sector 22',
 'sector 23',
 'sector 24',
 'sector 25',
 'sector 26',
 'sector 27',
 'sector 28',
 'sector 3',
 'sector 30',
 'sector 31',
 'sector 33',
 'sector 36',
 'sector 37',
 'sector 37d',
 'sector 38',
 'sector 39',
 'sector 4',
 'sector 40',
 'sector 41',
 'sector 43',
 'sector 45',
 'sector 46',
 'sector 47',
 'sector 48',
 'sector 49',
 'sector 5',
 'sector 50',
 'sector 51',
 'sector 52',
 'sector 53',
 'sector 54',
 'sector 55',
 'sector 56',
 'sector 57',
 'sector 58',
 'sector 59',
 'sector 6',
 'sector 60',
 'sector 61',
 'sector 62',
 'sector 63',
 'sector 63a',
 'sector 65',
 'sector 66',
 'sector 67',
 'se

### XGBoost Hyperparameter Tuning using Optuna

In [250]:
# train, test, cv with 0.6, 0.2, 0.2
from sklearn.model_selection import train_test_split

x, x_test, y, y_test = train_test_split(X, y_transformed, test_size=0.2, train_size=0.8, random_state=42)
x_train, x_val, y_train, y_val = train_test_split(x, y, test_size = 0.25, train_size =0.75, random_state=42)

In [None]:
def create_objective(x_train, x_val, y_train, y_val):
    def objective(trial):

        columns_to_encode = ['property_type','sector', 'balcony', 'agePossession', 'furnishing_type', 'luxury_category', 'floor_category']

        # Creating a column transformer for preprocessing
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
                ('cat', OrdinalEncoder(), [i for i in columns_to_encode if i != 'sector' and i != 'agePossession']),
                ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
                ('target_enc', ce.TargetEncoder(), ['sector'])
            ], 
            remainder='passthrough'
        )

        # Define the hyperparameters to be tuned
        param = {
            'verbosity': 0,
            'objective': 'reg:absoluteerror',  # Change to MAE
            'booster': trial.suggest_categorical('booster', ['gbtree', 'gblinear', 'dart']),
            'lambda': trial.suggest_float('lambda', 1e-8, 1.0, log=True),
            'alpha': trial.suggest_float('alpha', 1e-8, 1.0, log=True)
        }
        
        if param['booster'] == 'gbtree' or param['booster'] == 'dart':
            param['max_depth'] = trial.suggest_int('max_depth', 1, 9)
            param['eta'] = trial.suggest_float('eta', 0.01, 0.3)
            param['gamma'] = trial.suggest_float('gamma', 1e-8, 1.0, log=True)
            param['grow_policy'] = trial.suggest_categorical('grow_policy', ['depthwise', 'lossguide'])
        
        if param['booster'] == 'dart':
            param['sample_type'] = trial.suggest_categorical('sample_type', ['uniform', 'weighted'])
            param['normalize_type'] = trial.suggest_categorical('normalize_type', ['tree', 'forest'])
            param['rate_drop'] = trial.suggest_float('rate_drop', 1e-8, 1.0, log=True)
            param['skip_drop'] = trial.suggest_float('skip_drop', 1e-8, 1.0, log=True)
        
        # Define the pipeline
        pipeline = Pipeline([
            ('preprocessor', preprocessor),  # Assuming preprocessor is already defined
            ('regressor', XGBRegressor(**param, n_estimators=300, random_state=42))
        ])
        
        # Train the pipeline
        pipeline.fit(x_train, y_train)
        
        # Make predictions
        preds = pipeline.predict(x_val)
        mae = mean_absolute_error(y_val, preds)
        
        return mae
    
    return objective


In [None]:
import optuna

In [None]:
objective = create_objective(x_train, x_val, y_train, y_val)
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=100, timeout=600)

print("Best MAE: ", study.best_value)
print("Best hyperparameters: ", study.best_params)


[I 2024-06-21 02:03:47,370] A new study created in memory with name: no-name-062da66d-d48d-4d68-8191-649fb6db38d2
[I 2024-06-21 02:03:48,085] Trial 0 finished with value: 0.15245088381867547 and parameters: {'booster': 'gbtree', 'lambda': 0.0001245208040217347, 'alpha': 3.078216576188828e-07, 'max_depth': 1, 'eta': 0.23538300811567708, 'gamma': 0.0001114893093338104, 'grow_policy': 'depthwise'}. Best is trial 0 with value: 0.15245088381867547.
[I 2024-06-21 02:03:48,608] Trial 1 finished with value: 0.13762631927426436 and parameters: {'booster': 'gbtree', 'lambda': 0.009818270679196555, 'alpha': 1.517130099379337e-07, 'max_depth': 8, 'eta': 0.27669424215191313, 'gamma': 1.27932023920658e-07, 'grow_policy': 'depthwise'}. Best is trial 1 with value: 0.13762631927426436.
[I 2024-06-21 02:03:48,815] Trial 2 finished with value: 0.4635475179677218 and parameters: {'booster': 'gblinear', 'lambda': 1.181555389884381e-06, 'alpha': 1.2285578386374197e-08}. Best is trial 1 with value: 0.1376263

Best MAE:  0.1267862231694066
Best hyperparameters:  {'booster': 'gbtree', 'lambda': 5.343080149402018e-08, 'alpha': 0.992875781201781, 'max_depth': 9, 'eta': 0.1393514456130816, 'gamma': 2.2922254498761712e-08, 'grow_policy': 'depthwise'}


In [251]:
best_params = {
    'booster': 'gbtree',
    'lambda': 5.343080149402018e-08,
    'alpha': 0.992875781201781,
    'max_depth': 9,
    'eta': 0.1393514456130816,
    'gamma': 2.2922254498761712e-08,
    'grow_policy': 'depthwise',
    # 'sample_type': 'uniform',
    # 'normalize_type': 'tree',
    # 'rate_drop': 1.7710100842560433e-07,
    # 'skip_drop': 1.4610766469331365e-08,
    # 'n_estimators': 300,  # Assuming you want to keep the number of estimators consistent
    # 'random_state': 42  # Setting random state for reproducibility
}

In [252]:
# Creating a column transformer for preprocessing
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['bedRoom', 'bathroom', 'built_up_area', 'servant room', 'store room']),
        ('cat', OrdinalEncoder(), [i for i in columns_to_encode if i != 'sector' and i != 'agePossession']),
        ('cat1',OneHotEncoder(drop='first',sparse_output=False),['agePossession']),
        ('target_enc', ce.TargetEncoder(), ['sector'])
    ], 
    remainder='passthrough'
)

In [259]:
# Create XGBRegressor with best hyperparameters
best_regressor = XGBRegressor(**best_params)

# Assuming preprocessor is already defined
pipeline = Pipeline([
    ('preprocessor', preprocessor),  # Assuming preprocessor is already defined
    ('regressor', best_regressor)
])

# Train the pipeline on the entire training data
pipeline.fit(x, y)

In [260]:
y_pred = pipeline.predict(x_test)

y_pred = np.expm1(y_pred)

print(f"{mean_absolute_error(np.expm1(y_test),y_pred) = }")

mean_absolute_error(np.expm1(y_test),y_pred) = 0.479577735199707


In [261]:
from sklearn.metrics import r2_score

print(f"{r2_score(np.expm1(y_test),y_pred) = }")

r2_score(np.expm1(y_test),y_pred) = 0.8606679502924164
