In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from lazypredict.Supervised import LazyRegressor
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, train_test_split, cross_val_score, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor

import pre_processing_funcs as pre

knn = KNeighborsRegressor()

In [2]:
df = pre.set_index(pd.read_csv("data/cleaned_train.csv"))
df.head()

Unnamed: 0_level_0,Promotion Name,Store Kind,Store Sales,Store Cost,Is Recyclable?,Store Area,Grocery Area,Frozen Area,Meat Area,Cost,...,Department,Coffee Bar,Video Store,Bar For Salad,Florist,Ready Food,Gross Weight,Net Weight,Package Weight,Min. Person Yearly Income
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mc_ID_0,Dimes Off,Deluxe,8760000.0,4292400.0,True,2842.23,2037.64,481.98,323.0,602.76,...,Household,1,1,1,1,1,28.2,26.6,1.6,10000.0
mc_ID_1,Budget Bargains,Supermarket,6360000.0,1971600.0,False,2814.95,2049.72,457.36,305.02,708.66,...,Snack Foods,0,0,0,0,0,16.57,14.97,1.6,50000.0
mc_ID_2,Shelf Emptiers,Supermarket,10860000.0,4452600.0,True,2192.32,1322.21,523.32,348.85,564.26,...,Periodicals,0,0,0,1,0,28.64,27.18,1.45,30000.0
mc_ID_4,Sale Winners,Deluxe,11560000.0,4970800.0,False,2862.3,1872.19,593.93,395.95,519.76,...,Produce,1,1,1,1,1,12.62,9.71,2.91,50000.0
mc_ID_5,Weekend Discount,Supermarket,5220000.0,1618200.0,True,1970.17,1236.07,440.92,293.95,364.16,...,Household,1,0,0,0,0,15.41,13.95,1.45,30000.0


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28227 entries, mc_ID_0 to mc_ID_6465
Data columns (total 29 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Promotion Name             28227 non-null  object 
 1   Store Kind                 28227 non-null  object 
 2   Store Sales                28227 non-null  float64
 3   Store Cost                 28227 non-null  float64
 4   Is Recyclable?             28227 non-null  bool   
 5   Store Area                 28227 non-null  float64
 6   Grocery Area               28227 non-null  float64
 7   Frozen Area                28227 non-null  float64
 8   Meat Area                  28227 non-null  float64
 9   Cost                       28227 non-null  float64
 10  Marriage                   28227 non-null  object 
 11  Gender                     28227 non-null  object 
 12  Children                   28227 non-null  int64  
 13  Degree                     28227 non-nul

In [4]:
target = "Cost"
X = df.drop(columns=target)
y = df[target]

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [6]:
ord_categorical_columns = ['Store Code', 'Promotion Name', 'Store Kind',
                            'Degree', 'Work', 'Order Brand',
                            'Order', 'Department', "Country ISO2"]

hot_categorical_columns = ['Marriage', 'Gender', 'Is Recyclable?']


numeric_columns = X.select_dtypes("float").columns

In [7]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OrdinalEncoder(), ord_categorical_columns),
        ('ohe', OneHotEncoder(), hot_categorical_columns),
        ('num', StandardScaler(), numeric_columns)
    ]
)

In [8]:
model = make_pipeline(
    preprocessor,
    DecisionTreeRegressor(max_depth=16, random_state=42)
)
model.fit(X_train, y_train)

In [9]:
# Check if the model fitted well or not
predictions = model.predict(X_train)

mse = mean_squared_error(y_train, predictions)
rmse = np.sqrt(mse)
rmse

31.260576325505898

In [10]:
predictions = model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
rmse

63.21910413875263

In [11]:
rfmodel = make_pipeline(
    preprocessor,
    RandomForestRegressor(n_estimators=400, max_depth=16, random_state=42)
)
rfmodel.fit(X_train, y_train)

In [12]:
# Check if the model fitted well or not
predictions = rfmodel.predict(X_train)

mse = mean_squared_error(y_train, predictions)
rmse = np.sqrt(mse)
rmse

29.1118602752378

In [13]:
predictions = rfmodel.predict(X_test)

mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
rmse

51.92352323362081

In [14]:
preprocessor.fit(X_train)

X_train_transformed = preprocessor.transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

## Try Lazypredict

In [15]:
reg = LazyRegressor(verbose=1,ignore_warnings=False, custom_metric=None)
models,predictions = reg.fit(X_train_transformed, X_test_transformed, y_train, y_test)

  2%|▏         | 1/42 [00:00<00:34,  1.19it/s]

{'Model': 'AdaBoostRegressor', 'R-Squared': 0.11951398275584835, 'Adjusted R-Squared': 0.11559723001010036, 'RMSE': 148.32596671558832, 'Time taken': 0.838735818862915}


  5%|▍         | 2/42 [00:03<01:15,  1.88s/it]

{'Model': 'BaggingRegressor', 'R-Squared': 0.8789710935965422, 'Adjusted R-Squared': 0.8784327087815802, 'RMSE': 54.992126062592675, 'Time taken': 2.616075277328491}


  7%|▋         | 3/42 [00:03<00:44,  1.14s/it]

{'Model': 'BayesianRidge', 'R-Squared': 0.016509764253425985, 'Adjusted R-Squared': 0.012134807688716998, 'RMSE': 156.7620672907787, 'Time taken': 0.2481834888458252}


 14%|█▍        | 6/42 [00:04<00:15,  2.35it/s]

{'Model': 'DecisionTreeRegressor', 'R-Squared': 0.8095013645343389, 'Adjusted R-Squared': 0.8086539506755058, 'RMSE': 68.99250129586773, 'Time taken': 0.3981435298919678}
{'Model': 'DummyRegressor', 'R-Squared': -0.00035584667892929467, 'Adjusted R-Squared': -0.004805828203301665, 'RMSE': 158.1004889993646, 'Time taken': 0.03308701515197754}
{'Model': 'ElasticNet', 'R-Squared': 0.013088936013456842, 'Adjusted R-Squared': 0.008698762241274749, 'RMSE': 157.03445972937482, 'Time taken': 0.07258033752441406}


 19%|█▉        | 8/42 [00:04<00:10,  3.31it/s]

{'Model': 'ElasticNetCV', 'R-Squared': 0.016398312342800647, 'Adjusted R-Squared': 0.012022859995571067, 'RMSE': 156.7709494008028, 'Time taken': 0.23041176795959473}
{'Model': 'ExtraTreeRegressor', 'R-Squared': 0.7119498391717426, 'Adjusted R-Squared': 0.71066847724635, 'RMSE': 84.8379208994726, 'Time taken': 0.12218880653381348}


 21%|██▏       | 9/42 [00:12<01:18,  2.37s/it]

{'Model': 'ExtraTreesRegressor', 'R-Squared': 0.8996575285779016, 'Adjusted R-Squared': 0.8992111652708638, 'RMSE': 50.07239361739368, 'Time taken': 7.512455463409424}


 24%|██▍       | 10/42 [00:12<00:56,  1.76s/it]

{'Model': 'GammaRegressor', 'R-Squared': 0.011456746404372997, 'Adjusted R-Squared': 0.007059312002257179, 'RMSE': 157.16426075026226, 'Time taken': 0.2643253803253174}


 26%|██▌       | 11/42 [03:03<26:36, 51.49s/it]

{'Model': 'GaussianProcessRegressor', 'R-Squared': -4.956887052732238, 'Adjusted R-Squared': -4.983385660618058, 'RMSE': 385.8030339598802, 'Time taken': 171.1885266304016}


 29%|██▊       | 12/42 [03:12<19:25, 38.85s/it]

{'Model': 'GradientBoostingRegressor', 'R-Squared': 0.4899331046783577, 'Adjusted R-Squared': 0.48766412382728275, 'RMSE': 112.89365260803874, 'Time taken': 8.674102783203125}


 31%|███       | 13/42 [03:13<13:19, 27.58s/it]

{'Model': 'HistGradientBoostingRegressor', 'R-Squared': 0.8460364058690042, 'Adjusted R-Squared': 0.8453515144360371, 'RMSE': 62.02474338985329, 'Time taken': 0.8890213966369629}


 33%|███▎      | 14/42 [03:13<09:06, 19.51s/it]

{'Model': 'HuberRegressor', 'R-Squared': 0.014186570516145247, 'Adjusted R-Squared': 0.009801279459722423, 'RMSE': 156.9471092070167, 'Time taken': 0.4565892219543457}


 36%|███▌      | 15/42 [03:14<06:14, 13.87s/it]

{'Model': 'KNeighborsRegressor', 'R-Squared': -0.03999229808954352, 'Adjusted R-Squared': -0.04461859834794901, 'RMSE': 161.20221960134683, 'Time taken': 0.6268765926361084}


 40%|████      | 17/42 [04:22<08:48, 21.15s/it]

{'Model': 'KernelRidge', 'R-Squared': -10.89066587424907, 'Adjusted R-Squared': -10.943560295397864, 'RMSE': 545.0784893790535, 'Time taken': 68.14427328109741}
{'Model': 'Lars', 'R-Squared': 0.016799193206001428, 'Adjusted R-Squared': 0.012425524136633115, 'RMSE': 156.73899902947457, 'Time taken': 0.1807422637939453}


 43%|████▎     | 18/42 [04:22<05:57, 14.88s/it]

{'Model': 'LarsCV', 'R-Squared': 0.01571579921878008, 'Adjusted R-Squared': 0.011337310781141197, 'RMSE': 156.82533100512117, 'Time taken': 0.1959371566772461}


 45%|████▌     | 19/42 [04:23<04:01, 10.49s/it]

{'Model': 'Lasso', 'R-Squared': 0.015289896299410577, 'Adjusted R-Squared': 0.01090951327583145, 'RMSE': 156.85925674653728, 'Time taken': 0.25177884101867676}


 48%|████▊     | 20/42 [04:23<02:44,  7.49s/it]

{'Model': 'LassoCV', 'R-Squared': 0.016049949056092805, 'Adjusted R-Squared': 0.011672947050114568, 'RMSE': 156.79870881252756, 'Time taken': 0.4560508728027344}
{'Model': 'LassoLars', 'R-Squared': 0.01528989627725208, 'Adjusted R-Squared': 0.010909513253574366, 'RMSE': 156.85925674830216, 'Time taken': 0.0541079044342041}


 52%|█████▏    | 22/42 [04:23<01:21,  4.09s/it]

{'Model': 'LassoLarsCV', 'R-Squared': 0.015965146132881358, 'Adjusted R-Squared': 0.011587766889700202, 'RMSE': 156.80546560996785, 'Time taken': 0.17483949661254883}


 60%|█████▉    | 25/42 [04:24<00:31,  1.87s/it]

{'Model': 'LassoLarsIC', 'R-Squared': 0.01569954710018695, 'Adjusted R-Squared': 0.011320986366646801, 'RMSE': 156.82662571926724, 'Time taken': 0.2675955295562744}
{'Model': 'LinearRegression', 'R-Squared': 0.01680311598591666, 'Adjusted R-Squared': 0.012429464366636944, 'RMSE': 156.73868635010817, 'Time taken': 0.05727386474609375}
{'Model': 'LinearSVR', 'R-Squared': 0.007139278755110556, 'Adjusted R-Squared': 0.002722638536049593, 'RMSE': 157.50709467688824, 'Time taken': 0.1025247573852539}


 62%|██████▏   | 26/42 [04:38<01:14,  4.69s/it]

{'Model': 'MLPRegressor', 'R-Squared': 0.112529866245907, 'Adjusted R-Squared': 0.10858204536621796, 'RMSE': 148.913074091045, 'Time taken': 14.195226192474365}


 67%|██████▋   | 28/42 [05:15<02:10,  9.31s/it]

{'Model': 'NuSVR', 'R-Squared': 0.027513226698109072, 'Adjusted R-Squared': 0.02318721792007572, 'RMSE': 155.88265978365396, 'Time taken': 36.52607464790344}
{'Model': 'OrthogonalMatchingPursuit', 'R-Squared': 0.01389100916049757, 'Adjusted R-Squared': 0.009504403329360978, 'RMSE': 156.9706349688812, 'Time taken': 0.15606689453125}


 71%|███████▏  | 30/42 [05:15<00:59,  4.96s/it]

{'Model': 'OrthogonalMatchingPursuitCV', 'R-Squared': 0.015009637762740002, 'Adjusted R-Squared': 0.010628008037485315, 'RMSE': 156.8815770304926, 'Time taken': 0.1112680435180664}
{'Model': 'PassiveAggressiveRegressor', 'R-Squared': -0.04897390502801624, 'Adjusted R-Squared': -0.05364015905394148, 'RMSE': 161.89681243755308, 'Time taken': 0.15504932403564453}


 74%|███████▍  | 31/42 [05:15<00:39,  3.59s/it]

{'Model': 'PoissonRegressor', 'R-Squared': 0.016679798697413317, 'Adjusted R-Squared': 0.012305598513682958, 'RMSE': 156.74851550237935, 'Time taken': 0.15692734718322754}


 76%|███████▌  | 32/42 [05:49<02:03, 12.33s/it]

QuantileRegressor model failed to execute
Unable to allocate 7.61 GiB for an array with shape (22581, 45214) and data type float64


 79%|███████▊  | 33/42 [05:51<01:23,  9.29s/it]

{'Model': 'RANSACRegressor', 'R-Squared': -0.8843109783177625, 'Adjusted R-Squared': -0.8926931445914181, 'RMSE': 216.98626533767424, 'Time taken': 1.906635046005249}


 81%|████████  | 34/42 [06:27<02:17, 17.19s/it]

{'Model': 'RandomForestRegressor', 'R-Squared': 0.8905204157969401, 'Adjusted R-Squared': 0.890033406970414, 'RMSE': 52.30250931237643, 'Time taken': 36.103086709976196}


 86%|████████▌ | 36/42 [06:27<00:51,  8.59s/it]

{'Model': 'Ridge', 'R-Squared': 0.016790743235342798, 'Adjusted R-Squared': 0.012417036577136997, 'RMSE': 156.7396725628403, 'Time taken': 0.2502908706665039}
{'Model': 'RidgeCV', 'R-Squared': 0.01674160430395155, 'Adjusted R-Squared': 0.012367679056193293, 'RMSE': 156.74358928948453, 'Time taken': 0.11243343353271484}


 88%|████████▊ | 37/42 [06:27<00:30,  6.10s/it]

{'Model': 'SGDRegressor', 'R-Squared': 0.015620367758093168, 'Adjusted R-Squared': 0.011241454803280426, 'RMSE': 156.83293333565572, 'Time taken': 0.2303481101989746}


 90%|█████████ | 38/42 [07:12<01:10, 17.51s/it]

{'Model': 'SVR', 'R-Squared': 0.030290801166271475, 'Adjusted R-Squared': 0.0259771481465485, 'RMSE': 155.65988796216052, 'Time taken': 44.3139374256134}
{'Model': 'TransformedTargetRegressor', 'R-Squared': 0.01680311598591666, 'Adjusted R-Squared': 0.012429464366636944, 'RMSE': 156.73868635010817, 'Time taken': 0.06800007820129395}


 95%|█████████▌| 40/42 [07:12<00:19,  9.53s/it]

{'Model': 'TweedieRegressor', 'R-Squared': 0.01137724330900114, 'Adjusted R-Squared': 0.006979455245429089, 'RMSE': 157.17058055162107, 'Time taken': 0.26117563247680664}


 98%|█████████▊| 41/42 [07:14<00:07,  7.77s/it]

{'Model': 'XGBRegressor', 'R-Squared': 0.8769225432878411, 'Adjusted R-Squared': 0.876375045704602, 'RMSE': 55.45557497258384, 'Time taken': 2.4443063735961914}
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2516
[LightGBM] [Info] Number of data points in the train set: 22581, number of used features: 25
[LightGBM] [Info] Start training from score 518.858765


100%|██████████| 42/42 [07:15<00:00, 10.37s/it]

{'Model': 'LGBMRegressor', 'R-Squared': 0.8514437425868504, 'Adjusted R-Squared': 0.8507829051428417, 'RMSE': 60.92582667521762, 'Time taken': 0.46050357818603516}





In [16]:
print(models)

                               Adjusted R-Squared  R-Squared   RMSE  \
Model                                                                 
ExtraTreesRegressor                          0.90       0.90  50.07   
RandomForestRegressor                        0.89       0.89  52.30   
BaggingRegressor                             0.88       0.88  54.99   
XGBRegressor                                 0.88       0.88  55.46   
LGBMRegressor                                0.85       0.85  60.93   
HistGradientBoostingRegressor                0.85       0.85  62.02   
DecisionTreeRegressor                        0.81       0.81  68.99   
ExtraTreeRegressor                           0.71       0.71  84.84   
GradientBoostingRegressor                    0.49       0.49 112.89   
AdaBoostRegressor                            0.12       0.12 148.33   
MLPRegressor                                 0.11       0.11 148.91   
SVR                                          0.03       0.03 155.66   
NuSVR 

In [17]:
df = pd.read_csv("data/train_transformed.csv")
test = pd.read_csv("data/test_transformed.csv")

In [18]:
exmodel = make_pipeline(
    preprocessor,
    ExtraTreesRegressor(n_estimators=500, max_depth=16, random_state=42)
)
exmodel.fit(X_train, y_train)

In [19]:
# Check if the model fitted well or not
predictions = exmodel.predict(X_train)

mse = mean_squared_error(y_train, predictions)
rmse = np.sqrt(mse)
rmse

25.296651086740656

In [20]:
predictions = exmodel.predict(X_test)

mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
rmse

49.56561217782004

In [21]:
params = {
    'n_estimators': range(100, 301, 100),
    'max_depth': [2, 8, 16, 32, 64],
    'min_samples_split': [2,4],
    'min_samples_leaf': [1,2],
}

gsCV = GridSearchCV(
    ExtraTreesRegressor(random_state=42),
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=1
)
gsCV

In [22]:
gsCV.fit(X_train_transformed, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


In [23]:
gsCV.best_params_

{'max_depth': 16,
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 300}

In [24]:
best = gsCV.best_estimator_

In [25]:
# Check if the model fitted well or not
predictions = best.predict(X_train_transformed)

mse = mean_squared_error(y_train, predictions)
rmse = np.sqrt(mse)
rmse

26.8816347915983

In [26]:
predictions = best.predict(X_test_transformed)

mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
rmse

49.676162906712754

In [27]:
def rmse(y_true, y_pred):
    return np.sqrt(np.mean((y_true - y_pred)**2))

kf=KFold(n_splits=5,shuffle=True,random_state=42)

rmse_scorer = make_scorer(rmse, greater_is_better=False)

scores=cross_val_score(exmodel,X,y,cv=kf,scoring=rmse_scorer)

positive_scores = -scores

positive_scores

array([49.70247498, 50.51184942, 50.72072477, 47.99071331, 50.0463652 ])

In [62]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor
import xgboost as xgb
model = xgb.XGBRegressor()

In [63]:
model = make_pipeline(preprocessor, xgb.XGBRegressor())
model.fit(X_train, y_train)


In [64]:
y_pred = model.predict(X_test)

In [65]:
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error: {rmse:.2f}')

Root Mean Squared Error: 55.45


In [66]:
param_grid = {
    'xgbregressor__learning_rate': [0.01, 0.1],
    'xgbregressor__n_estimators': [50, 100],
    'xgbregressor__num_leaves': [31, 63],
}


In [70]:
from sklearn.model_selection import GridSearchCV
from xgboost import XGBRegressor

model = xgb.XGBRegressor(random_state=42)

grid_search = GridSearchCV(
    model,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
    verbose=1
)

In [74]:
grid_search.fit(X_train, y_train)
y_pred = grid_search.predict(X_test)


Fitting 5 folds for each of 8 candidates, totalling 40 fits


ValueError: 
All the 40 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "E:\ana\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "E:\ana\Lib\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "E:\ana\Lib\site-packages\xgboost\sklearn.py", line 988, in fit
    train_dmatrix, evals = _wrap_evaluation_matrices(
                           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\ana\Lib\site-packages\xgboost\sklearn.py", line 448, in _wrap_evaluation_matrices
    train_dmatrix = create_dmatrix(
                    ^^^^^^^^^^^^^^^
  File "E:\ana\Lib\site-packages\xgboost\sklearn.py", line 908, in _create_dmatrix
    return DMatrix(**kwargs, nthread=self.n_jobs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\ana\Lib\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
           ^^^^^^^^^^^^^^
  File "E:\ana\Lib\site-packages\xgboost\core.py", line 743, in __init__
    handle, feature_names, feature_types = dispatch_data_backend(
                                           ^^^^^^^^^^^^^^^^^^^^^^
  File "E:\ana\Lib\site-packages\xgboost\data.py", line 970, in dispatch_data_backend
    return _from_pandas_df(data, enable_categorical, missing, threads,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "E:\ana\Lib\site-packages\xgboost\data.py", line 417, in _from_pandas_df
    data, feature_names, feature_types = _transform_pandas_df(
                                         ^^^^^^^^^^^^^^^^^^^^^
  File "E:\ana\Lib\site-packages\xgboost\data.py", line 391, in _transform_pandas_df
    _invalid_dataframe_dtype(data)
  File "E:\ana\Lib\site-packages\xgboost\data.py", line 283, in _invalid_dataframe_dtype
    raise ValueError(msg)
ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:Promotion Name: object, Store Kind: object, Marriage: object, Gender: object, Degree: object, Work: object, Store Code: object, Country ISO2: object, Order Brand: object, Order: object, Department: object


In [None]:
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error: {rmse:.2f}')

In [None]:
print(f'Best hyperparameters: {grid_search.best_params_}')

In [35]:
import lightgbm as lgb

In [36]:
model = make_pipeline(preprocessor, lgb.LGBMRegressor())
model.fit(X_train, y_train)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2501
[LightGBM] [Info] Number of data points in the train set: 22581, number of used features: 25
[LightGBM] [Info] Start training from score 518.858765


In [37]:
y_pred = model.predict(X_test)

In [38]:
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error: {rmse:.2f}')

Root Mean Squared Error: 60.93


In [39]:
param_grid = {
    'lgbmregressor__learning_rate': [0.01, 0.1],
    'lgbmregressor__n_estimators': [50, 100],
    'lgbmregressor__num_leaves': [31, 63],
}

In [40]:
grid_search = GridSearchCV(model, param_grid=param_grid, cv=5)

In [41]:
grid_search.fit(X_train, y_train)
y_pred = grid_search.predict(X_test)

You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2498
[LightGBM] [Info] Number of data points in the train set: 18064, number of used features: 25
[LightGBM] [Info] Start training from score 519.190876
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2503
[LightGBM] [Info] Number of data points in the train set: 18065, number of used features: 25
[LightGBM] [Info] Start training from score 519.171909
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2502
[LightGBM] [Info] Number of data points in the train set: 18065, number of used features: 25
[LightGBM] [Info] Start training from score 518.358197
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2502
[LightGBM] [Info] Number of data points in the train set: 18065, number of used features: 25
[LightGBM] [Info] Start trainin

In [42]:
rmse = mean_squared_error(y_test, y_pred, squared=False)
print(f'Root Mean Squared Error: {rmse:.2f}')

Root Mean Squared Error: 55.35


In [43]:
print(f'Best hyperparameters: {grid_search.best_params_}')

Best hyperparameters: {'lgbmregressor__learning_rate': 0.1, 'lgbmregressor__n_estimators': 100, 'lgbmregressor__num_leaves': 63}


## Extract submissions

In [88]:
samples = pd.read_csv('data/sample_submission.csv')
samples.head()

Unnamed: 0,ID,Cost
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [89]:
test = pd.read_csv("data/cleaned_test.csv")
test.head()

Unnamed: 0,id,Promotion Name,Store Kind,Store Sales,Store Cost,Is Recyclable?,Store Area,Grocery Area,Frozen Area,Meat Area,...,Department,Video Store,Bar For Salad,Florist,Coffee Bar,Ready Food,Gross Weight,Net Weight,Package Weight,Min. Person Yearly Income
0,0,Fantastic Discounts,Gourmet,11760000.0,4704000.0,True,2201.06,1424.85,465.54,308.73,...,Snacks,1,1,1,1,1,31.83,28.78,3.05,50000.0
1,1,Bag Stuffer,Deluxe,2160000.0,669600.0,False,2577.16,1735.17,505.07,336.59,...,Frozen Foods,1,1,1,1,1,29.94,27.04,2.91,70000.0
2,2,Pick Your Savings,Deluxe,1830000.0,823500.0,False,2837.58,2038.11,481.98,321.26,...,Dairy,1,1,1,1,1,29.22,26.31,2.91,130000.0
3,3,Price Winners,Deluxe,8820000.0,4410000.0,False,2859.04,1871.16,593.93,394.58,...,Frozen Foods,1,1,1,1,1,28.05,25.0,3.05,10000.0
4,4,Dollar Days,Supermarket,4320000.0,1987200.0,False,2193.97,1320.15,523.32,348.85,...,Beverages,0,0,1,0,0,23.55,20.64,2.91,30000.0


In [90]:
test.isna().sum()

id                           0
Promotion Name               0
Store Kind                   0
Store Sales                  0
Store Cost                   0
Is Recyclable?               0
Store Area                   0
Grocery Area                 0
Frozen Area                  0
Meat Area                    0
Marriage                     0
Gender                       0
Children                     0
Degree                       0
Work                         0
Store Code                   0
Country ISO2                 0
Oreder Brand                 0
Order                        0
Department                   0
Video Store                  0
Bar For Salad                0
Florist                      0
Coffee Bar                   0
Ready Food                   0
Gross Weight                 0
Net Weight                   0
Package Weight               0
Min. Person Yearly Income    0
dtype: int64

In [92]:
y_sub_pred = best.predict(preprocessor.transform(test))

In [93]:
samples["Cost"] = y_sub_pred

In [94]:
samples.to_csv("data/submission.csv", index=False)