In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [7]:
from sklearn.datasets import fetch_california_housing

In [8]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor


In [12]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-3.0.5-py3-none-win_amd64.whl.metadata (2.1 kB)
Downloading xgboost-3.0.5-py3-none-win_amd64.whl (56.8 MB)
   ---------------------------------------- 0.0/56.8 MB ? eta -:--:--
   ---------------------------------------- 0.5/56.8 MB 4.0 MB/s eta 0:00:14
   - -------------------------------------- 1.6/56.8 MB 3.9 MB/s eta 0:00:15
   - -------------------------------------- 2.6/56.8 MB 4.3 MB/s eta 0:00:13
   -- ------------------------------------- 3.4/56.8 MB 4.4 MB/s eta 0:00:13
   -- ------------------------------------- 4.2/56.8 MB 4.2 MB/s eta 0:00:13
   --- ------------------------------------ 5.0/56.8 MB 4.2 MB/s eta 0:00:13
   ---- ----------------------------------- 5.8/56.8 MB 4.3 MB/s eta 0:00:12
   ---- ----------------------------------- 6.6/56.8 MB 4.1 MB/s eta 0:00:13
   ----- ---------------------------------- 7.3/56.8 MB 4.1 MB/s eta 0:00:12
   ----- ---------------------------------- 7.9/56.8 MB 3.9 MB/s eta 0:00:13
   ------ ---

In [2]:
import xgboost
print(xgboost.__version__)


3.0.5


In [4]:
from xgboost import XGBRegressor
import lightgbm as lgb

In [11]:
data = fetch_california_housing(as_frame=True)

X = data.data
y = data.target

In [12]:
print(data.data)

       MedInc  HouseAge  AveRooms  AveBedrms  Population  AveOccup  Latitude  \
0      8.3252      41.0  6.984127   1.023810       322.0  2.555556     37.88   
1      8.3014      21.0  6.238137   0.971880      2401.0  2.109842     37.86   
2      7.2574      52.0  8.288136   1.073446       496.0  2.802260     37.85   
3      5.6431      52.0  5.817352   1.073059       558.0  2.547945     37.85   
4      3.8462      52.0  6.281853   1.081081       565.0  2.181467     37.85   
...       ...       ...       ...        ...         ...       ...       ...   
20635  1.5603      25.0  5.045455   1.133333       845.0  2.560606     39.48   
20636  2.5568      18.0  6.114035   1.315789       356.0  3.122807     39.49   
20637  1.7000      17.0  5.205543   1.120092      1007.0  2.325635     39.43   
20638  1.8672      18.0  5.329513   1.171920       741.0  2.123209     39.43   
20639  2.3886      16.0  5.254717   1.162264      1387.0  2.616981     39.37   

       Longitude  
0        -122.23  
1

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 42)

In [14]:
## Decision Tree Regression

dt_reg = DecisionTreeRegressor(random_state=42)

In [15]:
## fit X_train, y_train in DT

dt_reg.fit(X_train, y_train)

In [18]:
## Predictions of the model with test data

y_pred_dt = dt_reg.predict(X_test)

In [20]:
## calculating mean_squared_error 

mse_dt = mean_squared_error(y_test, y_pred_dt)
print(f"Decision Tree Regression MSE: {mse_dt:.4f}")

Decision Tree Regression MSE: 0.4952


In [21]:
## Random Forest Regrerssor

rf_reg = RandomForestRegressor(n_estimators = 100, random_state=42)


In [22]:
## Fit X-train, y_train into rf_reg model we created

rf_reg.fit(X_train, y_train)

In [24]:
y_pred_rf = rf_reg.predict(X_test)

In [26]:
mse_rf = mean_squared_error(y_test, y_pred_rf)
print(mse_rf)

0.2553684927247781


In [27]:
## XGBoost Regressor
xgb_reg = XGBRegressor(objective= 'reg:squarederror', random_state = 42)

In [28]:
## Defining Parameters for Hyperparameter tuning for XGBoost

xgb_param_grid = {
    'n_estimators' : [100,200],
    'max_depth' : [3,5,7],
    'learning_rate' : [0.01,0.05,0.1],
    'subsample' : [0.8,1.0],
    'colsample_bytree' : [0.8,1.0]
    
}

In [30]:
from sklearn.model_selection import GridSearchCV

In [34]:
## Using GridSearchCV to implement cross validation on the hyperparameters to the XGBoost Model

xgb_grid = GridSearchCV(estimator=xgb_reg,
                       param_grid= xgb_param_grid,
                       scoring= 'neg_mean_squared_error',
                       cv=3,
                       
                       )

In [35]:
## fitting the X_train, y_train to the model after gridsearchcv

xgb_grid.fit(X_train, y_train)

In [37]:
## Returns the entire model which has the best hyperparameters found by gridsearchcv

xgb_best = xgb_grid.best_estimator_

In [39]:
y_xgb_pred = xgb_best.predict(X_test)

In [40]:
## Calculating the mean squared error
mse_xgb = mean_squared_error(y_test, y_xgb_pred)

In [41]:
print(mse_xgb)

0.19955334536451447


In [43]:
## LightGBM

lgbm_reg = lgb.LGBMRegressor(objective = 'regression', random_state = 42)

In [48]:
## Initializing hyperparameters for LightGBM 

lgbm_param_grid = {
    'n_estimators' : [100,200],
    'num_leaves' : [30,50,70],
    'learning_rate' : [0.01,0.05,0.1],
    'subsample' : [0.8,1.0],
    'colsample_bytree' : [0.8,1.0]}

In [50]:
lgbm_grid = GridSearchCV(estimator=lgbm_reg,
                        param_grid=lgbm_param_grid,
                        scoring= 'neg_mean_squared_error',
                        cv = 3)

In [51]:
lgbm_grid.fit(X_train,y_train)

[WinError 2] The system cannot find the file specified
  File "C:\Users\hvutuku\anaconda3\Lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
        "wmic CPU Get NumberOfCores /Format:csv".split(),
        capture_output=True,
        text=True,
    )
  File "C:\Users\hvutuku\anaconda3\Lib\subprocess.py", line 554, in run
    with Popen(*popenargs, **kwargs) as process:
         ~~~~~^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\hvutuku\anaconda3\Lib\subprocess.py", line 1039, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
    ~~~~~~~~~~~~~~~~~~~^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
                        pass_fds, cwd, env,
                        ^^^^^^^^^^^^^^^^^^^
    ...<5 lines>...
                        gid, gids, uid, umask,
                        ^^^^^^^^^^^^^^^^^^^^^^
                        start_new_session, process_group)
                        ^^^^^^^^^^^^^^^^^^^^^

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000373 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1837
[LightGBM] [Info] Number of data points in the train set: 11008, number of used features: 8
[LightGBM] [Info] Start training from score 2.064393
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000174 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1837
[LightGBM] [Info] Number of data points in the train set: 11008, number of used features: 8
[LightGBM] [Info] Start training from score 2.078156
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000180 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1838
[LightGBM] [Info] Number of data points in the train set: 11008, number of used features: 8
[LightGBM] [Info] Start traini

In [52]:
lgbm_best = lgbm_grid.best_estimator_

In [53]:
y_pred_lgbm = lgbm_best.predict(X_test)

In [54]:
mse_lgbm = mean_squared_error(y_test, y_pred_lgbm)
print(mse_lgbm)

0.19260823006099917
