In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from lazypredict.Supervised import LazyRegressor
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [3]:
df = pd.read_csv("data/cleaned_train.csv")
df.head()

Unnamed: 0,id,Place Code,Promotion Name,Store Kind,Store Sales,Store Cost,Is Recyclable?,Store Area,Grocery Area,Frozen Area,...,Department,Bar For Salad,Florist,Coffee Bar,Ready Food,Video Store,Gross Weight,Net Weight,Package Weight,Min. Person Yearly Income
0,mc_ID_0,H11go_ZA,Dimes Off,Deluxe,8760000.0,4292400.0,yes,2842.23,2037.64,481.98,...,Household,1,1,1,1,1,28.2,26.6,1.6,10000.0
1,mc_ID_1,S04ne_WA,Budget Bargains,Supermarket,6360000.0,1971600.0,no,2814.95,2049.72,457.36,...,Snack Foods,0,0,0,0,0,16.57,14.97,1.6,50000.0
2,mc_ID_2,L05es_CA,Shelf Emptiers,Supermarket,10860000.0,4452600.0,yes,2192.32,1322.21,523.32,...,Periodicals,0,1,0,0,0,28.64,27.18,1.45,30000.0
3,mc_ID_4,M10da_YU,Sale Winners,Deluxe,11560000.0,4970800.0,no,2862.3,1872.19,593.93,...,Produce,1,1,1,1,1,12.62,9.71,2.91,50000.0
4,mc_ID_5,S03le_WA,Weekend Discount,Supermarket,5220000.0,1618200.0,yes,1970.17,1236.07,440.92,...,Household,0,0,1,0,0,15.41,13.95,1.45,30000.0


In [4]:
target = "Cost"
X = df.drop(columns=target)
y = df[target]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
ord_categorical_columns = ['Place Code', 'Promotion Name', 'Store Kind', 'Is Recyclable?',
                            'Children', 'Degree', 'Work', 'Oreder Brand',
                            'Product', 'Department']

hot_categorical_columns = ['Marriage', 'Gender']

all_categorical_columns = X.select_dtypes("object").columns

numeric_columns = X.select_dtypes("float").columns

In [7]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OrdinalEncoder(), ord_categorical_columns),
        ('ohe', OneHotEncoder(), hot_categorical_columns),
        ('num', StandardScaler(), numeric_columns)
    ]
)

In [8]:
model = make_pipeline(
    preprocessor,
    DecisionTreeRegressor(max_depth=20, random_state=42)
)
model.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cat', OrdinalEncoder(),
                                                  ['Place Code',
                                                   'Promotion Name',
                                                   'Store Kind',
                                                   'Is Recyclable?', 'Children',
                                                   'Degree', 'Work',
                                                   'Oreder Brand', 'Product',
                                                   'Department']),
                                                 ('ohe', OneHotEncoder(),
                                                  ['Marriage', 'Gender']),
                                                 ('num', StandardScaler(),
                                                  Index(['Store Sales', 'Store Cost', 'Store Area', 'Grocery Area',
       'Frozen Area', 'Meat Area', 'Gross Weight', 'N

In [9]:
# Check if the model fitted well or not
predictions = model.predict(X_train)

mse = mean_squared_error(y_train, predictions)
rmse = np.sqrt(mse)
rmse

16.24279568376655

In [10]:
predictions = model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
rmse

66.13992962598071

In [11]:
rfmodel = make_pipeline(
    preprocessor,
    RandomForestRegressor(n_estimators=100, random_state=42)
)
rfmodel.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cat', OrdinalEncoder(),
                                                  ['Place Code',
                                                   'Promotion Name',
                                                   'Store Kind',
                                                   'Is Recyclable?', 'Children',
                                                   'Degree', 'Work',
                                                   'Oreder Brand', 'Product',
                                                   'Department']),
                                                 ('ohe', OneHotEncoder(),
                                                  ['Marriage', 'Gender']),
                                                 ('num', StandardScaler(),
                                                  Index(['Store Sales', 'Store Cost', 'Store Area', 'Grocery Area',
       'Frozen Area', 'Meat Area', 'Gross Weight', 'N

In [12]:
# Check if the model fitted well or not
predictions = rfmodel.predict(X_train)

mse = mean_squared_error(y_train, predictions)
rmse = np.sqrt(mse)
rmse

19.280206350134836

In [13]:
predictions = rfmodel.predict(X_test)

mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
rmse

53.03544924733378

In [14]:
def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred)**2))

rmse_scorer = make_scorer(rmse, greater_is_better=False)

scores = cross_val_score(rfmodel, X_train, y_train, cv=5, scoring=rmse_scorer)

positive_scores = -scores

print("Cross-validation RMSE scores:", positive_scores)

Cross-validation RMSE scores: [53.65738053 56.10354203 52.34148074 52.18038972 50.51044931]


In [15]:
preprocessor.fit(X_train)

X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

## Try Lazypredict

In [35]:
reg = LazyRegressor(verbose=1,ignore_warnings=False, custom_metric=None)
models,predictions = reg.fit(X_train_transformed, X_test_transformed, y_train, y_test)

  2%|█▉                                                                                 | 1/42 [00:00<00:19,  2.13it/s]

{'Model': 'AdaBoostRegressor', 'R-Squared': 0.1229879692040099, 'Adjusted R-Squared': 0.11924272209740783, 'RMSE': 147.04110042057997, 'Time taken': 0.4701058864593506}


  7%|█████▉                                                                             | 3/42 [00:02<00:26,  1.45it/s]

{'Model': 'BaggingRegressor', 'R-Squared': 0.8722005560563311, 'Adjusted R-Squared': 0.8716547933063937, 'RMSE': 56.130699031326074, 'Time taken': 1.659067153930664}
{'Model': 'BayesianRidge', 'R-Squared': 0.016696374049479545, 'Adjusted R-Squared': 0.012497212657520018, 'RMSE': 155.69683667700198, 'Time taken': 0.11887073516845703}


 14%|███████████▊                                                                       | 6/42 [00:02<00:10,  3.58it/s]

{'Model': 'DecisionTreeRegressor', 'R-Squared': 0.8071444327947982, 'Adjusted R-Squared': 0.8063208503013952, 'RMSE': 68.95282418809465, 'Time taken': 0.28182339668273926}
{'Model': 'DummyRegressor', 'R-Squared': -3.504148029964682e-06, 'Adjusted R-Squared': -0.004273981745815281, 'RMSE': 157.01340419741697, 'Time taken': 0.05338740348815918}
{'Model': 'ElasticNet', 'R-Squared': 0.01438935405967523, 'Adjusted R-Squared': 0.010180340625054574, 'RMSE': 155.87937707768427, 'Time taken': 0.062494754791259766}


 19%|███████████████▊                                                                   | 8/42 [00:03<00:08,  3.98it/s]

{'Model': 'ElasticNetCV', 'R-Squared': 0.016740751681517674, 'Adjusted R-Squared': 0.012541779802577535, 'RMSE': 155.69332324803412, 'Time taken': 0.36518239974975586}
{'Model': 'ExtraTreeRegressor', 'R-Squared': 0.7436033515424032, 'Adjusted R-Squared': 0.7425084192358227, 'RMSE': 79.50457871545507, 'Time taken': 0.12048888206481934}


 21%|█████████████████▊                                                                 | 9/42 [00:10<01:16,  2.32s/it]

{'Model': 'ExtraTreesRegressor', 'R-Squared': 0.8952187312382067, 'Adjusted R-Squared': 0.8947712667452739, 'RMSE': 50.8250477268684, 'Time taken': 7.474532604217529}
{'Model': 'GammaRegressor', 'R-Squared': 0.012413960599017226, 'Adjusted R-Squared': 0.008196511320436528, 'RMSE': 156.03550818807867, 'Time taken': 0.06496620178222656}


 26%|████████████████████▋                                                          | 11/42 [12:18<1:25:22, 165.23s/it]

{'Model': 'GaussianProcessRegressor', 'R-Squared': -5.9800688849087935, 'Adjusted R-Squared': -6.009877008260718, 'RMSE': 414.82586074184235, 'Time taken': 727.6633887290955}


 29%|██████████████████████▌                                                        | 12/42 [12:23<1:03:10, 126.34s/it]

{'Model': 'GradientBoostingRegressor', 'R-Squared': 0.4747478194542283, 'Adjusted R-Squared': 0.4725047496440684, 'RMSE': 113.79414071212821, 'Time taken': 5.466938257217407}


 31%|█████████████████████████▍                                                        | 13/42 [12:24<45:26, 94.00s/it]

{'Model': 'HistGradientBoostingRegressor', 'R-Squared': 0.8243723247407336, 'Adjusted R-Squared': 0.8236223133161389, 'RMSE': 65.8009926167201, 'Time taken': 1.0338072776794434}


 33%|███████████████████████████▎                                                      | 14/42 [12:25<32:05, 68.77s/it]

{'Model': 'HuberRegressor', 'R-Squared': 0.014051531776593795, 'Adjusted R-Squared': 0.009841075684536515, 'RMSE': 155.9060889520537, 'Time taken': 0.3321394920349121}


 36%|█████████████████████████████▎                                                    | 15/42 [12:28<22:42, 50.46s/it]

{'Model': 'KNeighborsRegressor', 'R-Squared': -0.03713842973321135, 'Adjusted R-Squared': -0.041567490643104055, 'RMSE': 159.9021606324316, 'Time taken': 2.8709850311279297}


 40%|████████████████████████████████▊                                                | 17/42 [19:24<45:47, 109.90s/it]

{'Model': 'KernelRidge', 'R-Squared': -10.95020055635004, 'Adjusted R-Squared': -11.001233441288191, 'RMSE': 542.7796612977268, 'Time taken': 416.350843667984}
{'Model': 'Lars', 'R-Squared': 0.016841011981447385, 'Adjusted R-Squared': 0.012642468260371631, 'RMSE': 155.6853852310479, 'Time taken': 0.17054057121276855}


 45%|█████████████████████████████████████                                             | 19/42 [19:25<21:05, 55.02s/it]

{'Model': 'LarsCV', 'R-Squared': 0.01699069899152994, 'Adjusted R-Squared': 0.012792794503237559, 'RMSE': 155.67353314649284, 'Time taken': 0.3013269901275635}
{'Model': 'Lasso', 'R-Squared': 0.016594826256287054, 'Adjusted R-Squared': 0.012395231208271129, 'RMSE': 155.70487603614566, 'Time taken': 0.17048358917236328}


 48%|███████████████████████████████████████                                           | 20/42 [19:25<14:15, 38.87s/it]

{'Model': 'LassoCV', 'R-Squared': 0.017012684028474556, 'Adjusted R-Squared': 0.012814873426460927, 'RMSE': 155.6717923147885, 'Time taken': 0.4595353603363037}
{'Model': 'LassoLars', 'R-Squared': -3.504148029964682e-06, 'Adjusted R-Squared': -0.004273981745815281, 'RMSE': 157.01340419741697, 'Time taken': 0.05022311210632324}


 52%|██████████████████████████████████████████▉                                       | 22/42 [19:25<07:02, 21.12s/it]

{'Model': 'LassoLarsCV', 'R-Squared': 0.016950951023656402, 'Adjusted R-Squared': 0.012752876793152401, 'RMSE': 155.67668044328812, 'Time taken': 0.25123095512390137}


 60%|████████████████████████████████████████████████▊                                 | 25/42 [19:26<02:39,  9.41s/it]

{'Model': 'LassoLarsIC', 'R-Squared': 0.01714137829930784, 'Adjusted R-Squared': 0.012944117281368905, 'RMSE': 155.6616015812756, 'Time taken': 0.22081422805786133}
{'Model': 'LinearRegression', 'R-Squared': 0.016506461702418096, 'Adjusted R-Squared': 0.012306489296876832, 'RMSE': 155.7118713638147, 'Time taken': 0.050226449966430664}
{'Model': 'LinearSVR', 'R-Squared': 0.010171962685455394, 'Adjusted R-Squared': 0.005944939038560526, 'RMSE': 156.2125221139118, 'Time taken': 0.13044428825378418}


 62%|██████████████████████████████████████████████████▊                               | 26/42 [19:43<03:00, 11.27s/it]

{'Model': 'MLPRegressor', 'R-Squared': 0.08731012415470973, 'Adjusted R-Squared': 0.08341251614398248, 'RMSE': 150.0021850004357, 'Time taken': 17.570504426956177}


 69%|████████████████████████████████████████████████████████▌                         | 29/42 [20:29<02:31, 11.63s/it]

{'Model': 'NuSVR', 'R-Squared': 0.02547180780149738, 'Adjusted R-Squared': 0.021310121571468166, 'RMSE': 155.00052614784155, 'Time taken': 45.70055341720581}
{'Model': 'OrthogonalMatchingPursuit', 'R-Squared': 0.01482033044210318, 'Adjusted R-Squared': 0.010613157476019652, 'RMSE': 155.84529278898995, 'Time taken': 0.05140376091003418}
{'Model': 'OrthogonalMatchingPursuitCV', 'R-Squared': 0.016798998235654672, 'Adjusted R-Squared': 0.012600275096447455, 'RMSE': 155.68871167997472, 'Time taken': 0.13150930404663086}


 74%|████████████████████████████████████████████████████████████▌                     | 31/42 [20:29<01:20,  7.36s/it]

{'Model': 'PassiveAggressiveRegressor', 'R-Squared': -0.06439319981599412, 'Adjusted R-Squared': -0.06893865120310871, 'RMSE': 161.98955573181004, 'Time taken': 0.0979766845703125}
{'Model': 'PoissonRegressor', 'R-Squared': 0.016442369775478638, 'Adjusted R-Squared': 0.012242123667758231, 'RMSE': 155.71694496669497, 'Time taken': 0.11248302459716797}
QuantileRegressor model failed to execute
Unable to allocate 7.61 GiB for an array with shape (22580, 45210) and data type float64


 79%|████████████████████████████████████████████████████████████████▍                 | 33/42 [21:28<02:05, 13.95s/it]

{'Model': 'RANSACRegressor', 'R-Squared': -1.3665839682302154, 'Adjusted R-Squared': -1.376690376635469, 'RMSE': 241.5443747451886, 'Time taken': 0.402101993560791}


 86%|██████████████████████████████████████████████████████████████████████▎           | 36/42 [21:42<00:49,  8.23s/it]

{'Model': 'RandomForestRegressor', 'R-Squared': 0.8858349484275655, 'Adjusted R-Squared': 0.8853474108407793, 'RMSE': 53.05209727756488, 'Time taken': 14.149781227111816}
{'Model': 'Ridge', 'R-Squared': 0.0165068418332045, 'Adjusted R-Squared': 0.012306871050997481, 'RMSE': 155.71184127165878, 'Time taken': 0.06493830680847168}
{'Model': 'RidgeCV', 'R-Squared': 0.016510236875647455, 'Adjusted R-Squared': 0.01231028059184236, 'RMSE': 155.71157251088408, 'Time taken': 0.08463239669799805}


 88%|████████████████████████████████████████████████████████████████████████▏         | 37/42 [21:43<00:31,  6.39s/it]

{'Model': 'SGDRegressor', 'R-Squared': 0.013561621981421412, 'Adjusted R-Squared': 0.009349073747890069, 'RMSE': 155.9448183769587, 'Time taken': 0.1627511978149414}


 95%|██████████████████████████████████████████████████████████████████████████████    | 40/42 [22:30<00:19,  9.67s/it]

{'Model': 'SVR', 'R-Squared': 0.03079171715819351, 'Adjusted R-Squared': 0.02665274940228546, 'RMSE': 154.57687643348802, 'Time taken': 47.182443380355835}
{'Model': 'TransformedTargetRegressor', 'R-Squared': 0.016506461702418096, 'Adjusted R-Squared': 0.012306489296876832, 'RMSE': 155.7118713638147, 'Time taken': 0.04932141304016113}
{'Model': 'TweedieRegressor', 'R-Squared': 0.012382814440567103, 'Adjusted R-Squared': 0.008165232153480528, 'RMSE': 156.03796866653374, 'Time taken': 0.06249499320983887}
{'Model': 'XGBRegressor', 'R-Squared': 0.8527332424729093, 'Adjusted R-Squared': 0.8521043452877402, 'RMSE': 60.25433963077798, 'Time taken': 1.2772939205169678}
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2080
[LightGBM] [Info] Number of data points in the train set: 22580, number of used features: 24
[LightGBM] [Info] Start training from score 519.389129


100%|██████████████████████████████████████████████████████████████████████████████████| 42/42 [22:32<00:00, 32.20s/it]

{'Model': 'LGBMRegressor', 'R-Squared': 0.8287743411010376, 'Adjusted R-Squared': 0.8280431283228215, 'RMSE': 64.97112573940814, 'Time taken': 0.8558270931243896}





In [36]:
print(models)

                               Adjusted R-Squared  R-Squared   RMSE  \
Model                                                                 
ExtraTreesRegressor                          0.89       0.90  50.83   
RandomForestRegressor                        0.89       0.89  53.05   
BaggingRegressor                             0.87       0.87  56.13   
XGBRegressor                                 0.85       0.85  60.25   
LGBMRegressor                                0.83       0.83  64.97   
HistGradientBoostingRegressor                0.82       0.82  65.80   
DecisionTreeRegressor                        0.81       0.81  68.95   
ExtraTreeRegressor                           0.74       0.74  79.50   
GradientBoostingRegressor                    0.47       0.47 113.79   
AdaBoostRegressor                            0.12       0.12 147.04   
MLPRegressor                                 0.08       0.09 150.00   
SVR                                          0.03       0.03 154.58   
NuSVR 

In [25]:
from sklearn.ensemble import ExtraTreesRegressor

In [26]:
exmodel = make_pipeline(
    preprocessor,
    ExtraTreesRegressor(random_state=42)
)
exmodel.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cat', OrdinalEncoder(),
                                                  ['Place Code',
                                                   'Promotion Name',
                                                   'Store Kind',
                                                   'Is Recyclable?', 'Children',
                                                   'Degree', 'Work',
                                                   'Oreder Brand', 'Product',
                                                   'Department']),
                                                 ('ohe', OneHotEncoder(),
                                                  ['Marriage', 'Gender']),
                                                 ('num', StandardScaler(),
                                                  Index(['Store Sales', 'Store Cost', 'Store Area', 'Grocery Area',
       'Frozen Area', 'Meat Area', 'Gross Weight', 'N

In [29]:
# Check if the model fitted well or not
predictions = exmodel.predict(X_train)

mse = mean_squared_error(y_train, predictions)
rmse = np.sqrt(mse)
rmse

8.697849117431592e-13

In [30]:
predictions = exmodel.predict(X_test)

mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
rmse

50.8250477268684

## Extract submissions

In [16]:
samples = pd.read_csv('data/sample_submission.csv')
samples.head()

Unnamed: 0,ID,Cost
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [17]:
test = pd.read_csv("data/cleaned_test.csv")
test.head()

Unnamed: 0,id,Place Code,Promotion Name,Store Kind,Store Sales,Store Cost,Is Recyclable?,Store Area,Grocery Area,Frozen Area,...,Department,Bar For Salad,Florist,Coffee Bar,Ready Food,Video Store,Gross Weight,Net Weight,Package Weight,Min. Person Yearly Income
0,0,B06ls_CA,Fantastic Discounts,Gourmet,11760000.0,4704000.0,yes,2201.06,1424.85,465.54,...,Snacks,1,1,1,1,1,31.83,28.78,3.05,50000.0
1,1,S01em_OR,Bag Stuffer,Deluxe,2160000.0,669600.0,no,2577.16,1735.17,505.07,...,Frozen Foods,1,1,1,1,1,29.94,27.04,2.91,70000.0
2,2,H11go_ZA,Pick Your Savings,Deluxe,1830000.0,823500.0,no,2837.58,2038.11,481.98,...,Dairy,1,1,1,1,1,29.22,26.31,2.91,130000.0
3,3,M10da_YU,Price Winners,Deluxe,8820000.0,4410000.0,no,2859.04,1871.16,593.93,...,Frozen Foods,1,1,1,1,1,28.05,25.0,3.05,10000.0
4,4,L05es_CA,Dollar Days,Supermarket,4320000.0,1987200.0,no,2193.97,1320.15,523.32,...,Beverages,0,1,0,0,0,23.55,20.64,2.91,30000.0


In [18]:
test.isna().sum()

id                           0
Place Code                   0
Promotion Name               0
Store Kind                   0
Store Sales                  0
Store Cost                   0
Is Recyclable?               0
Store Area                   0
Grocery Area                 0
Frozen Area                  0
Meat Area                    0
Marriage                     0
Gender                       0
Children                     0
Degree                       0
Work                         0
Oreder Brand                 0
Product                      0
Department                   0
Bar For Salad                0
Florist                      0
Coffee Bar                   0
Ready Food                   0
Video Store                  0
Gross Weight                 0
Net Weight                   0
Package Weight               0
Min. Person Yearly Income    0
dtype: int64

In [31]:
y_sub_pred = exmodel.predict(test)

In [32]:
samples["Cost"] = y_sub_pred

In [34]:
samples.to_csv("data/submission.csv", index=False)