In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from lazypredict.Supervised import LazyRegressor
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score,KFold,GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
df = pd.read_csv("cleaned_train.csv")
df.head()

Unnamed: 0,id,Place Code,Promotion Name,Store Kind,Store Sales,Store Cost,Is Recyclable?,Store Area,Grocery Area,Frozen Area,...,Department,Bar For Salad,Florist,Coffee Bar,Ready Food,Video Store,Gross Weight,Net Weight,Package Weight,Min. Person Yearly Income
0,mc_ID_0,H11go_ZA,Dimes Off,Deluxe,8760000.0,4292400.0,yes,2842.23,2037.64,481.98,...,Household,1,1,1,1,1,28.2,26.6,1.6,10000.0
1,mc_ID_1,S04ne_WA,Budget Bargains,Supermarket,6360000.0,1971600.0,no,2814.95,2049.72,457.36,...,Snack Foods,0,0,0,0,0,16.57,14.97,1.6,50000.0
2,mc_ID_2,L05es_CA,Shelf Emptiers,Supermarket,10860000.0,4452600.0,yes,2192.32,1322.21,523.32,...,Periodicals,0,1,0,0,0,28.64,27.18,1.45,30000.0
3,mc_ID_4,M10da_YU,Sale Winners,Deluxe,11560000.0,4970800.0,no,2862.3,1872.19,593.93,...,Produce,1,1,1,1,1,12.62,9.71,2.91,50000.0
4,mc_ID_5,S03le_WA,Weekend Discount,Supermarket,5220000.0,1618200.0,yes,1970.17,1236.07,440.92,...,Household,0,0,1,0,0,15.41,13.95,1.45,30000.0


In [9]:
df.columns

Index(['id', 'Place Code', 'Promotion Name', 'Store Kind', 'Store Sales',
       'Store Cost', 'Is Recyclable?', 'Store Area', 'Grocery Area',
       'Frozen Area', 'Meat Area', 'Cost', 'Marriage', 'Gender', 'Children',
       'Degree', 'Work', 'Oreder Brand', 'Product', 'Department',
       'Bar For Salad', 'Florist', 'Coffee Bar', 'Ready Food', 'Video Store',
       'Gross Weight', 'Net Weight', 'Package Weight',
       'Min. Person Yearly Income'],
      dtype='object')

In [3]:
target = "Cost"
X = df.drop(columns=target)
y = df[target]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
ord_categorical_columns = ['Place Code', 'Promotion Name', 'Store Kind', 'Is Recyclable?',
                            'Children', 'Degree', 'Work', 'Oreder Brand',
                            'Product', 'Department']

hot_categorical_columns = ['Marriage', 'Gender']

all_categorical_columns = X.select_dtypes("object").columns

numeric_columns = X.select_dtypes("float").columns

In [6]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OrdinalEncoder(), ord_categorical_columns),
        ('ohe', OneHotEncoder(), hot_categorical_columns),
        ('num', StandardScaler(), numeric_columns)
    ]
)

In [7]:
model = make_pipeline(
    preprocessor,
    DecisionTreeRegressor(max_depth=20, random_state=42)
)
model.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cat', OrdinalEncoder(),
                                                  ['Place Code',
                                                   'Promotion Name',
                                                   'Store Kind',
                                                   'Is Recyclable?', 'Children',
                                                   'Degree', 'Work',
                                                   'Oreder Brand', 'Product',
                                                   'Department']),
                                                 ('ohe', OneHotEncoder(),
                                                  ['Marriage', 'Gender']),
                                                 ('num', StandardScaler(),
                                                  Index(['Store Sales', 'Store Cost', 'Store Area', 'Grocery Area',
       'Frozen Area', 'Meat Area', 'Gross Weight', 'N

In [8]:
# Check if the model fitted well or not
predictions = model.predict(X_train)

mse = mean_squared_error(y_train, predictions)
rmse = np.sqrt(mse)
rmse

16.24279568376655

In [10]:
predictions = model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
rmse

66.13992962598071

In [11]:
rfmodel = make_pipeline(
    preprocessor,
    RandomForestRegressor(n_estimators=100, random_state=42)
)
rfmodel.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cat', OrdinalEncoder(),
                                                  ['Place Code',
                                                   'Promotion Name',
                                                   'Store Kind',
                                                   'Is Recyclable?', 'Children',
                                                   'Degree', 'Work',
                                                   'Oreder Brand', 'Product',
                                                   'Department']),
                                                 ('ohe', OneHotEncoder(),
                                                  ['Marriage', 'Gender']),
                                                 ('num', StandardScaler(),
                                                  Index(['Store Sales', 'Store Cost', 'Store Area', 'Grocery Area',
       'Frozen Area', 'Meat Area', 'Gross Weight', 'N

In [12]:
# Check if the model fitted well or not
predictions = rfmodel.predict(X_train)

mse = mean_squared_error(y_train, predictions)
rmse = np.sqrt(mse)
rmse

19.280206350134836

In [13]:
predictions = rfmodel.predict(X_test)

mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
rmse

53.03544924733378

In [14]:
def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred)**2))

rmse_scorer = make_scorer(rmse, greater_is_better=False)

scores = cross_val_score(rfmodel, X_train, y_train, cv=5, scoring=rmse_scorer)

positive_scores = -scores

print("Cross-validation RMSE scores:", positive_scores)

Cross-validation RMSE scores: [53.65738053 56.10354203 52.34148074 52.18038972 50.51044931]


In [15]:
preprocessor.fit(X_train)

X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

## Try Lazypredict

In [16]:
reg = LazyRegressor(verbose=1,ignore_warnings=False, custom_metric=None)
models,predictions = reg.fit(X_train_transformed, X_test_transformed, y_train, y_test)


  2%|██                                                                                 | 1/41 [00:01<01:18,  1.95s/it]

{'Model': 'AdaBoostRegressor', 'R-Squared': 0.1229879692040099, 'Adjusted R-Squared': 0.11924272209740783, 'RMSE': 147.04110042057997, 'Time taken': 1.9538815021514893}


  5%|████                                                                               | 2/41 [00:09<03:21,  5.16s/it]

{'Model': 'BaggingRegressor', 'R-Squared': 0.8722005560563311, 'Adjusted R-Squared': 0.8716547933063937, 'RMSE': 56.130699031326074, 'Time taken': 7.39117169380188}


  7%|██████                                                                             | 3/41 [00:09<01:53,  2.99s/it]

{'Model': 'BayesianRidge', 'R-Squared': 0.016696374049479545, 'Adjusted R-Squared': 0.012497212657520018, 'RMSE': 155.69683667700198, 'Time taken': 0.40448904037475586}


 12%|██████████                                                                         | 5/41 [00:11<00:54,  1.51s/it]

{'Model': 'DecisionTreeRegressor', 'R-Squared': 0.8071444327947982, 'Adjusted R-Squared': 0.8063208503013952, 'RMSE': 68.95282418809465, 'Time taken': 1.2073006629943848}
{'Model': 'DummyRegressor', 'R-Squared': -3.504148029964682e-06, 'Adjusted R-Squared': -0.004273981745815281, 'RMSE': 157.01340419741697, 'Time taken': 0.12865519523620605}


 15%|████████████▏                                                                      | 6/41 [00:11<00:37,  1.06s/it]

{'Model': 'ElasticNet', 'R-Squared': 0.01438935405967523, 'Adjusted R-Squared': 0.010180340625054574, 'RMSE': 155.87937707768427, 'Time taken': 0.20059704780578613}


 17%|██████████████▏                                                                    | 7/41 [00:13<00:43,  1.29s/it]

{'Model': 'ElasticNetCV', 'R-Squared': 0.016740751681517674, 'Adjusted R-Squared': 0.012541779802577535, 'RMSE': 155.69332324803412, 'Time taken': 1.7603068351745605}


 20%|████████████████▏                                                                  | 8/41 [00:13<00:34,  1.04s/it]

{'Model': 'ExtraTreeRegressor', 'R-Squared': 0.7436033515424032, 'Adjusted R-Squared': 0.7425084192358227, 'RMSE': 79.50457871545507, 'Time taken': 0.5083816051483154}


 22%|██████████████████▏                                                                | 9/41 [00:44<05:34, 10.45s/it]

{'Model': 'ExtraTreesRegressor', 'R-Squared': 0.8952187312382067, 'Adjusted R-Squared': 0.8947712667452739, 'RMSE': 50.8250477268684, 'Time taken': 31.140395641326904}


 24%|████████████████████                                                              | 10/41 [00:45<03:46,  7.31s/it]

{'Model': 'GammaRegressor', 'R-Squared': 0.012413960599017226, 'Adjusted R-Squared': 0.008196511320436528, 'RMSE': 156.03550818807867, 'Time taken': 0.28238964080810547}


 27%|█████████████████████▏                                                         | 11/41 [10:12<1:29:20, 178.70s/it]

{'Model': 'GaussianProcessRegressor', 'R-Squared': -5.9800688849087935, 'Adjusted R-Squared': -6.009877008260718, 'RMSE': 414.82586074184235, 'Time taken': 567.2829926013947}


 29%|███████████████████████                                                        | 12/41 [10:34<1:03:25, 131.22s/it]

{'Model': 'GradientBoostingRegressor', 'R-Squared': 0.4747478194542283, 'Adjusted R-Squared': 0.4725047496440684, 'RMSE': 113.79414071212821, 'Time taken': 22.612470388412476}


 32%|██████████████████████████                                                        | 13/41 [10:41<43:36, 93.45s/it]

{'Model': 'HistGradientBoostingRegressor', 'R-Squared': 0.8243723247407336, 'Adjusted R-Squared': 0.8236223133161389, 'RMSE': 65.8009926167201, 'Time taken': 6.543072462081909}


 34%|████████████████████████████                                                      | 14/41 [10:42<29:29, 65.52s/it]

{'Model': 'HuberRegressor', 'R-Squared': 0.014051531776593795, 'Adjusted R-Squared': 0.009841075684536515, 'RMSE': 155.9060889520537, 'Time taken': 0.9933726787567139}


 37%|██████████████████████████████                                                    | 15/41 [10:51<20:59, 48.44s/it]

{'Model': 'KNeighborsRegressor', 'R-Squared': -0.03713842973321135, 'Adjusted R-Squared': -0.041567490643104055, 'RMSE': 159.9021606324316, 'Time taken': 8.837711095809937}


 39%|███████████████████████████████▌                                                 | 16/41 [14:46<43:39, 104.77s/it]

{'Model': 'KernelRidge', 'R-Squared': -10.95020055635004, 'Adjusted R-Squared': -11.001233441288191, 'RMSE': 542.7796612977268, 'Time taken': 235.5947666168213}
{'Model': 'Lars', 'R-Squared': 0.016506461702418318, 'Adjusted R-Squared': 0.012306489296877055, 'RMSE': 155.7118713638147, 'Time taken': 0.21408700942993164}


 44%|████████████████████████████████████                                              | 18/41 [14:47<19:45, 51.53s/it]

{'Model': 'LarsCV', 'R-Squared': 0.01699069899152994, 'Adjusted R-Squared': 0.012792794503237559, 'RMSE': 155.67353314649284, 'Time taken': 0.7789764404296875}


 46%|██████████████████████████████████████                                            | 19/41 [14:48<13:14, 36.13s/it]

{'Model': 'Lasso', 'R-Squared': 0.016594826256287054, 'Adjusted R-Squared': 0.012395231208271129, 'RMSE': 155.70487603614566, 'Time taken': 0.2490708827972412}


 49%|████████████████████████████████████████                                          | 20/41 [14:49<09:01, 25.81s/it]

{'Model': 'LassoCV', 'R-Squared': 0.017012684028474556, 'Adjusted R-Squared': 0.012814873426460927, 'RMSE': 155.6717923147885, 'Time taken': 1.746098279953003}


 51%|██████████████████████████████████████████                                        | 21/41 [14:50<06:02, 18.13s/it]

{'Model': 'LassoLars', 'R-Squared': -3.504148029964682e-06, 'Adjusted R-Squared': -0.004273981745815281, 'RMSE': 157.01340419741697, 'Time taken': 0.23388361930847168}


 54%|████████████████████████████████████████████                                      | 22/41 [14:50<04:05, 12.95s/it]

{'Model': 'LassoLarsCV', 'R-Squared': 0.016950951023656402, 'Adjusted R-Squared': 0.012752876793152401, 'RMSE': 155.67668044328812, 'Time taken': 0.8482048511505127}


 56%|██████████████████████████████████████████████                                    | 23/41 [14:51<02:44,  9.15s/it]

{'Model': 'LassoLarsIC', 'R-Squared': 0.01714137829930784, 'Adjusted R-Squared': 0.012944117281368905, 'RMSE': 155.6616015812756, 'Time taken': 0.28336143493652344}


 59%|████████████████████████████████████████████████                                  | 24/41 [14:51<01:50,  6.48s/it]

{'Model': 'LinearRegression', 'R-Squared': 0.016506461702418096, 'Adjusted R-Squared': 0.012306489296876832, 'RMSE': 155.7118713638147, 'Time taken': 0.2522721290588379}


 61%|██████████████████████████████████████████████████                                | 25/41 [14:51<01:14,  4.63s/it]

{'Model': 'LinearSVR', 'R-Squared': 0.010171962685455394, 'Adjusted R-Squared': 0.005944939038560526, 'RMSE': 156.2125221139118, 'Time taken': 0.31273865699768066}


 63%|████████████████████████████████████████████████████                              | 26/41 [16:27<08:01, 32.07s/it]

{'Model': 'MLPRegressor', 'R-Squared': 0.08731012415470973, 'Adjusted R-Squared': 0.08341251614398248, 'RMSE': 150.0021850004357, 'Time taken': 96.07265114784241}


 66%|██████████████████████████████████████████████████████                            | 27/41 [19:19<17:12, 73.77s/it]

{'Model': 'NuSVR', 'R-Squared': 0.02547180780149738, 'Adjusted R-Squared': 0.021310121571468166, 'RMSE': 155.00052614784155, 'Time taken': 171.06666731834412}
{'Model': 'OrthogonalMatchingPursuit', 'R-Squared': 0.014820330442103402, 'Adjusted R-Squared': 0.010613157476019874, 'RMSE': 155.84529278898992, 'Time taken': 0.16894030570983887}


 71%|██████████████████████████████████████████████████████████                        | 29/41 [19:19<07:16, 36.36s/it]

{'Model': 'OrthogonalMatchingPursuitCV', 'R-Squared': 0.016798998235654672, 'Adjusted R-Squared': 0.012600275096447455, 'RMSE': 155.68871167997472, 'Time taken': 0.5670478343963623}


 73%|████████████████████████████████████████████████████████████                      | 30/41 [19:20<04:41, 25.55s/it]

{'Model': 'PassiveAggressiveRegressor', 'R-Squared': -0.06439319981599412, 'Adjusted R-Squared': -0.06893865120310871, 'RMSE': 161.98955573181004, 'Time taken': 0.3503565788269043}


 76%|██████████████████████████████████████████████████████████████                    | 31/41 [19:20<03:00, 18.03s/it]

{'Model': 'PoissonRegressor', 'R-Squared': 0.016442369775478638, 'Adjusted R-Squared': 0.012242123667758231, 'RMSE': 155.71694496669497, 'Time taken': 0.4661848545074463}


 78%|████████████████████████████████████████████████████████████████                  | 32/41 [19:21<01:56, 12.90s/it]

{'Model': 'RANSACRegressor', 'R-Squared': -1.3665839682302154, 'Adjusted R-Squared': -1.376690376635469, 'RMSE': 241.5443747451886, 'Time taken': 0.9357280731201172}


 83%|████████████████████████████████████████████████████████████████████              | 34/41 [20:25<02:18, 19.76s/it]

{'Model': 'RandomForestRegressor', 'R-Squared': 0.8858349484275655, 'Adjusted R-Squared': 0.8853474108407793, 'RMSE': 53.05209727756488, 'Time taken': 63.75239706039429}
{'Model': 'Ridge', 'R-Squared': 0.0165068418332045, 'Adjusted R-Squared': 0.012306871050997481, 'RMSE': 155.71184127165878, 'Time taken': 0.1579442024230957}


 85%|██████████████████████████████████████████████████████████████████████            | 35/41 [20:25<01:23, 13.92s/it]

{'Model': 'RidgeCV', 'R-Squared': 0.016510236875647455, 'Adjusted R-Squared': 0.01231028059184236, 'RMSE': 155.71157251088408, 'Time taken': 0.3004794120788574}


 88%|████████████████████████████████████████████████████████████████████████          | 36/41 [20:26<00:49,  9.92s/it]

{'Model': 'SGDRegressor', 'R-Squared': 0.013561621981421412, 'Adjusted R-Squared': 0.009349073747890069, 'RMSE': 155.9448183769587, 'Time taken': 0.5684065818786621}


 93%|████████████████████████████████████████████████████████████████████████████      | 38/41 [23:23<02:06, 42.11s/it]

{'Model': 'SVR', 'R-Squared': 0.03079171715819351, 'Adjusted R-Squared': 0.02665274940228546, 'RMSE': 154.57687643348802, 'Time taken': 177.1359601020813}
{'Model': 'TransformedTargetRegressor', 'R-Squared': 0.016506461702418096, 'Adjusted R-Squared': 0.012306489296876832, 'RMSE': 155.7118713638147, 'Time taken': 0.1798996925354004}


 95%|██████████████████████████████████████████████████████████████████████████████    | 39/41 [23:23<00:59, 29.55s/it]

{'Model': 'TweedieRegressor', 'R-Squared': 0.012382814440567103, 'Adjusted R-Squared': 0.008165232153480528, 'RMSE': 156.03796866653374, 'Time taken': 0.23825287818908691}


 98%|████████████████████████████████████████████████████████████████████████████████  | 40/41 [23:29<00:22, 22.47s/it]

{'Model': 'XGBRegressor', 'R-Squared': 0.8527332424729093, 'Adjusted R-Squared': 0.8521043452877402, 'RMSE': 60.25433963077798, 'Time taken': 5.949170827865601}
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2080
[LightGBM] [Info] Number of data points in the train set: 22580, number of used features: 24
[LightGBM] [Info] Start training from score 519.389129


100%|██████████████████████████████████████████████████████████████████████████████████| 41/41 [23:31<00:00, 34.43s/it]

{'Model': 'LGBMRegressor', 'R-Squared': 0.8287743411010376, 'Adjusted R-Squared': 0.8280431283228215, 'RMSE': 64.97112573940814, 'Time taken': 1.879643440246582}





In [17]:
print(models)

                               Adjusted R-Squared  R-Squared   RMSE  \
Model                                                                 
ExtraTreesRegressor                          0.89       0.90  50.83   
RandomForestRegressor                        0.89       0.89  53.05   
BaggingRegressor                             0.87       0.87  56.13   
XGBRegressor                                 0.85       0.85  60.25   
LGBMRegressor                                0.83       0.83  64.97   
HistGradientBoostingRegressor                0.82       0.82  65.80   
DecisionTreeRegressor                        0.81       0.81  68.95   
ExtraTreeRegressor                           0.74       0.74  79.50   
GradientBoostingRegressor                    0.47       0.47 113.79   
AdaBoostRegressor                            0.12       0.12 147.04   
MLPRegressor                                 0.08       0.09 150.00   
SVR                                          0.03       0.03 154.58   
NuSVR 

In [18]:
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Extra tree regressor

In [20]:
# Preprocess the data by removing missing values and scaling the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_transformed)
X_test = scaler.transform(X_test_transformed)

In [25]:
# Apply PCA to reduce the dimensionality of the data
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train)
X_test_pca = pca.transform(X_test)

In [26]:

# Train an ExtraTreesRegressor model on the reduced dataset
exmodel = make_pipeline(ExtraTreesRegressor(random_state=42))
exmodel.fit(X_train_pca, y_train)


Pipeline(steps=[('extratreesregressor', ExtraTreesRegressor(random_state=42))])

In [27]:
# Check if the model fitted well or not
predictions = exmodel.predict(X_train_pca)

mse = mean_squared_error(y_train, predictions)
rmse = np.sqrt(mse)
rmse

7.680054705663122e-13

In [28]:
# Evaluate the performance of the model on the testing set
y_pred = exmodel.predict(X_test_pca)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {rmse:.2f}')

RMSE: 138.25


# LGBMRegressor

In [35]:
# Preprocess the data by removing missing values and scaling the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [38]:
# Apply pPCA to reduce the dimensionality of the data
ppca = PCA(n_components=10)
X_train_ppca = ppca.fit_transform(X_train)
X_test_ppca = ppca.transform(X_test)

In [41]:
from lightgbm import LGBMRegressor

In [None]:

# Train an LGBMRegressor model on the reduced dataset using GridSearchCV for hyperparameter tuning
model = make_pipeline(LGBMRegressor())
param_grid = {
    'lgbmregressor__learning_rate': [0.01, 0.1],
    'lgbmregressor__n_estimators': range(50, 501, 50),
    'lgbmregressor__num_leaves': [31, 45, 61],
    'lgbmregressor__max_depth': [2, 4, 8, 16, 32],
}
grid_search = GridSearchCV(model, param_grid=param_grid, cv=5,n_jobs=-1,
    verbose=1)
grid_search.fit(X_train_pca, y_train)


In [None]:
# Evaluate the performance of the model on the testing set
y_pred = grid_search.predict(X_test_pca)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {rmse:.2f}')

# XGBRegressor

In [None]:
# Preprocess the data by removing missing values and scaling the features using StandardScaler
scaler = StandardScaler()
X_train_transformed = scaler.fit_transform(X_train)
X_test_transformed = scaler.transform(X_test)

In [None]:
# Apply PCA to reduce the dimensionality of the data
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train_transformed)
X_test_pca = pca.transform(X_test_transformed)

In [None]:
model = make_pipeline(preprocessor, XGBRegressor(n_estimators=500, learning_rate=0.1, max_depth=12, min_split_loss=5))
model.fit(X_train, y_train)

In [None]:
y_pred = model.predict(X_test)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error: {rmse:.2f}')

In [None]:
param_grid = {
    'xgbregressor__learning_rate': [0.01, 0.1],
    'xgbregressor__n_estimators': range(50, 501, 50),
    'xgbregressor__max_depth': [2, 4, 8, 16, 32],
    'xgbregressor__min_split_loss': range(0, 7, 2)
    
}


grid_search = GridSearchCV(
    model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1
)

grid_search.fit(X_train_pca, y_train)

In [None]:
y_pred = grid_search.predict(X_test_pca)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error: {rmse:.2f}')

In [None]:
print(f'Best hyperparameters: {grid_search.best_params_}')


# BaggingRegressor

In [None]:
# Preprocess the data by removing missing values and scaling the features using StandardScaler
scaler = StandardScaler()
X_train_transformed = scaler.fit_transform(X_train)
X_test_transformed = scaler.transform(X_test)

# Apply PCA to reduce the dimensionality of the data
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train_transformed)
X_test_pca = pca.transform(X_test_transformed)

# Train a BaggingRegressor model on the reduced dataset using GridSearchCV for hyperparameter tuning
params = {
    'n_estimators': range(100, 801, 20),
}
BAG_RF = GridSearchCV(
    BaggingRegressor(random_state=42, n_jobs=-1),
    param_grid=params,
    cv=3,
    n_jobs=-1,
    verbose=1
)
BAG_RF.fit(X_train_pca, y_train)


In [None]:
BAG_RF.best_params_
BAG = BAG_RF.best_estimator_
BAG

# RandomForestRegressor

In [None]:
scaler = StandardScaler()
X_train_transformed = scaler.fit_transform(X_train)
X_test_transformed = scaler.transform(X_test)

# Apply PCA to reduce the dimensionality of the data
pca = PCA(n_components=10)
X_train_pca = pca.fit_transform(X_train_transformed)
X_test_pca = pca.transform(X_test_transformed)

In [None]:

params = {
    'n_estimators': range(100, 501, 100),
    'max_depth': [2, 8, 16, 32, 64],
    'min_samples_split': [2,4,6],
    'min_samples_leaf': [1,2,3],
}
GS_RF = GridSearchCV(
    RandomForestRegressor(random_state=42, n_jobs=-1),
    param_grid=params,
    cv=3,
    n_jobs=-1,
    verbose=1
)
GS_RF.fit(X_train_pca, y_train)

# Evaluate the performance of the model on the testing set
y_pred = GS_RF.predict(X_test_pca)
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'RMSE: {rmse:.2f}')

In [None]:
GS_RF.best_params_

In [None]:
RF = GS_RF.best_estimator_
RF

# STacking

In [None]:
from sklearn.ensemble import StackingRegressor, VotingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor


In [None]:
# Define the pipelines for each model
rf = make_pipeline(
    preprocessor,
    RandomForestRegressor(n_estimators=512, max_depth=16, n_jobs=-1, random_state=42)
)

et = make_pipeline(
    preprocessor,
    ExtraTreesRegressor(n_estimators=512, max_depth=16, n_jobs=-1, random_state=42)
)

bag = make_pipeline(
    preprocessor,
    BaggingRegressor(n_estimators=75, n_jobs=-1, random_state=42)
)

xgb = make_pipeline(
    preprocessor,
    XGBRegressor(n_estimators=1000, learning_rate=0.05, max_depth=8, random_state=42)
)

lgbm = make_pipeline(
    preprocessor,
    LGBMRegressor(n_estimators=1000, learning_rate=0.05, max_depth=8, random_state=42)
)

In [None]:
# Define the estimators for the ensemble models
estimators_pca = [
              ('rf', rf),
              ('et', et),
              ('bag', bag),
              ('xgb', xgb),
              ('lgbm', lgbm)
             ]

# Create the stacking model
stack_model_pca = StackingRegressor(estimators=estimators_pca, final_estimator=RandomForestRegressor(random_state=42)) # Train the stacking model with PCA
stack_model_pca.fit(X_train_transformed, y_train)


In [None]:
# Check if the model fitted well or not with PCA
predictions_pca = stack_model_pca.predict(X_train_transformed)
mse_pca = mean_squared_error(y_train, predictions_pca)
rmse_pca = np.sqrt(mse_pca)
print(f'Training Root Mean Squared Error with PCA: {rmse_pca:.2f}')

In [None]:

predictions_pca = stack_model_pca.predict(X_test_transformed)
mse_pca = mean_squared_error(y_test, predictions_pca)
rmse_pca = np.sqrt(mse_pca)
print(f'Testing Root Mean Squared Error with PCA: {rmse_pca:.2f}')



In [None]:
# Create the voting model with PCA
voting_model_pca = VotingRegressor(estimators=estimators_pca)

voting_model_pca.fit(X_train_transformed, y_train)

In [None]:
# Check if the model fitted well or not with PCA
predictions_pca = voting_model_pca.predict(X_train_transformed)
mse_pca = mean_squared_error(y_train, predictions_pca)
rmse_pca = np.sqrt(mse_pca)
print(f'Training Root Mean Squared Error with PCA: {rmse_pca:.2f}')

In [None]:
predictions_pca = voting_model_pca.predict(X_test_transformed)
mse_pca = mean_squared_error(y_test, predictions_pca)
rmse_pca = np.sqrt(mse_pca)
print(f'Testing Root Mean Squared Error with PCA: {rmse_pca:.2f}')


## Extract submissions

In [None]:
samples = pd.read_csv('data/sample_submission.csv')
samples.head()

In [None]:
test = pd.read_csv("data/cleaned_test.csv")
test.head()

In [None]:
test.isna().sum()

In [None]:
y_sub_pred = exmodel.predict(test)

In [None]:
samples["Cost"] = y_sub_pred

In [None]:
samples.to_csv("data/submission.csv", index=False)