In [27]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.ensemble import AdaBoostRegressor
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score
from sklearn.feature_selection import SelectKBest

In [28]:
# Load your dataset (replace 'cleaned_data_house_price.csv' with your dataset file)
dataset = pd.read_csv("cleaned_data_house_price.csv")


In [29]:
dataset1 = pd.read_csv("cleaned_data_house_price.csv", index_col=None)

# Take a backup of the original dataset
df2 = dataset1.copy()

# Perform one-hot encoding to handle categorical variables
df2 = pd.get_dummies(df2, drop_first=True)  # remove first redundant col wrt to nominal col


# Assuming 'price_per_sqft' is your target variable, and other columns are features
X = df2.drop(columns=['price_per_sqft'])
y = df2['price_per_sqft']

In [30]:
# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [31]:
# Create a pipeline with feature selection and boosting regressor (AdaBoost)
ada_pipeline = Pipeline([
    ('feature_selection', SelectKBest(k=2)),  # Select the top k features
    ('boosting', AdaBoostRegressor())  # AdaBoost regressor
])


In [32]:
# Create a pipeline with feature selection and boosting regressor (LightGBM)
lgbm_pipeline = Pipeline([
    ('feature_selection', SelectKBest(k=2)),  # Select the top k features
    ('boosting', LGBMRegressor())  # LightGBM regressor
])

In [34]:
# Create a pipeline with feature selection and boosting regressor (XGBoost)
xgb_pipeline = Pipeline([
    ('feature_selection', SelectKBest(k=2)),  # Select the top k features
    ('boosting', XGBRegressor())  # XGBoost regressor
])


In [35]:
# Fit the pipelines on the training data
ada_pipeline.fit(X_train, y_train)
lgbm_pipeline.fit(X_train, y_train)
xgb_pipeline.fit(X_train, y_train)

  45  46  47  49  65  66  68  69  72  74  75  78  85  91  95  98 102 106
 107 112 114 118 119 123 125 128 139 148 151 152 156 159 162 170 171 175
 183 186 187 189 196 202 203 211 214 220 221 222 225 229 237 242 245 251
 252 265 266 272 273 274 275 277 284 287 289 293 294 296 300 301 303 315
 323 329 330 337 338 340 342 344 351 355 359 361 362 364 370 376 384 387
 392 394 395 398 401 419 421 424 430 431 433 449 454 456 458 459 469 473
 474 476 481 485 489 498 500 506 507 510 512 522 544 545 547 556 557 558
 560 561 565 570 575 584 599 607 609 611 613 621 623 624 626 627 635 638
 639 641 642 645 649 652 653 656 658 662 663 668 675 678 679 680 685 697
 700 706 708 712 713 714 715 717 718 722 724 727 728 730 734 735 737 739
 741 745 747 753 757 763 768 769 775 786 788 789 790 794 800 804 806 807
 808 810 814 819 820 821 832 836 841 843 846 847 848 849 850 853 857 860
 863 864 865 875 876 879 880 882 888 891 893 895 898 899 900 901 902 903
 904 907] are constant.
  f = msb / msw
  f = msb /

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 6888, number of used features: 0
[LightGBM] [Info] Start training from score 49974.392346


  45  46  47  49  65  66  68  69  72  74  75  78  85  91  95  98 102 106
 107 112 114 118 119 123 125 128 139 148 151 152 156 159 162 170 171 175
 183 186 187 189 196 202 203 211 214 220 221 222 225 229 237 242 245 251
 252 265 266 272 273 274 275 277 284 287 289 293 294 296 300 301 303 315
 323 329 330 337 338 340 342 344 351 355 359 361 362 364 370 376 384 387
 392 394 395 398 401 419 421 424 430 431 433 449 454 456 458 459 469 473
 474 476 481 485 489 498 500 506 507 510 512 522 544 545 547 556 557 558
 560 561 565 570 575 584 599 607 609 611 613 621 623 624 626 627 635 638
 639 641 642 645 649 652 653 656 658 662 663 668 675 678 679 680 685 697
 700 706 708 712 713 714 715 717 718 722 724 727 728 730 734 735 737 739
 741 745 747 753 757 763 768 769 775 786 788 789 790 794 800 804 806 807
 808 810 814 819 820 821 832 836 841 843 846 847 848 849 850 853 857 860
 863 864 865 875 876 879 880 882 888 891 893 895 898 899 900 901 902 903
 904 907] are constant.
  f = msb / msw
  f = msb /

Pipeline(steps=[('feature_selection', SelectKBest(k=2)),
                ('boosting',
                 XGBRegressor(base_score=0.5, booster='gbtree', callbacks=None,
                              colsample_bylevel=1, colsample_bynode=1,
                              colsample_bytree=1, early_stopping_rounds=None,
                              enable_categorical=False, eval_metric=None,
                              gamma=0, gpu_id=-1, grow_policy='depthwise',
                              importance_type=None, interaction_constraints='',
                              learning_rate=0.300000012, max_bin=256,
                              max_cat_to_onehot=4, max_delta_step=0,
                              max_depth=6, max_leaves=0, min_child_weight=1,
                              missing=nan, monotone_constraints='()',
                              n_estimators=100, n_jobs=0, num_parallel_tree=1,
                              predictor='auto', random_state=0, reg_alpha=0,
              

In [36]:
# Evaluate the models
r2_ada = r2_score(y_test, ada_pipeline.predict(X_test))
r2_lgbm = r2_score(y_test, lgbm_pipeline.predict(X_test))
r2_xgb = r2_score(y_test, xgb_pipeline.predict(X_test))



In [37]:
print("AdaBoost R-squared:", r2_ada)
print("LightGBM R-squared:", r2_lgbm)
print("XGBoost R-squared:", r2_xgb)

AdaBoost R-squared: -0.0013587248829549647
LightGBM R-squared: -5.090171304455993e-07
XGBoost R-squared: -4.7408331549370075e-07
