In [177]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from lazypredict.Supervised import LazyRegressor
from sklearn.preprocessing import OrdinalEncoder, StandardScaler, OneHotEncoder
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import KFold, train_test_split, cross_val_score, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.metrics import mean_squared_error
from sklearn.neighbors import KNeighborsRegressor

import pre_processing_funcs as pre

# NOTE(review): `knn` is not referenced by any other cell visible in this
# notebook, and creating an estimator inside the import cell hides it from the
# modelling narrative. Consider removing it or moving it next to its first use.
knn = KNeighborsRegressor()

In [159]:
# Read the cleaned training CSV and hand the frame to the project helper
# `pre.set_index`; the preview below is the cell's displayed output.
df = pd.read_csv("data/cleaned_train.csv").pipe(pre.set_index)
df.head()

Unnamed: 0_level_0,Promotion Name,Store Kind,Store Sales,Store Cost,Is Recyclable?,Store Area,Grocery Area,Frozen Area,Meat Area,Cost,...,Department,Florist,Coffee Bar,Bar For Salad,Video Store,Ready Food,Gross Weight,Net Weight,Package Weight,Min. Person Yearly Income
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
mc_ID_0,Dimes Off,Deluxe,8760000.0,4292400.0,True,2842.23,2037.64,481.98,323.0,602.76,...,Household,1,1,1,1,1,28.2,26.6,1.6,10000.0
mc_ID_1,Budget Bargains,Supermarket,6360000.0,1971600.0,False,2814.95,2049.72,457.36,328.94,708.66,...,Snack Foods,0,0,0,0,0,16.57,14.97,1.6,50000.0
mc_ID_2,Shelf Emptiers,Supermarket,10860000.0,4452600.0,True,2192.32,1322.21,523.32,348.85,564.26,...,Periodicals,1,0,0,0,0,28.64,27.18,1.45,30000.0
mc_ID_4,Sale Winners,Deluxe,11560000.0,4970800.0,False,2862.3,1872.19,593.93,395.95,519.76,...,Produce,1,1,1,1,1,12.62,9.71,2.91,50000.0
mc_ID_5,Weekend Discount,Supermarket,5220000.0,1618200.0,True,1970.17,1236.07,440.92,293.95,364.16,...,Household,0,1,0,0,0,15.41,13.95,1.45,30000.0


In [165]:
# Column dtypes and non-null counts; used below to decide which columns are
# encoded as categoricals and which are scaled as numerics.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 28227 entries, mc_ID_0 to mc_ID_6465
Data columns (total 29 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Promotion Name             28227 non-null  object 
 1   Store Kind                 28227 non-null  object 
 2   Store Sales                28227 non-null  float64
 3   Store Cost                 28227 non-null  float64
 4   Is Recyclable?             28227 non-null  bool   
 5   Store Area                 28227 non-null  float64
 6   Grocery Area               28227 non-null  float64
 7   Frozen Area                28227 non-null  float64
 8   Meat Area                  28227 non-null  float64
 9   Cost                       28227 non-null  float64
 10  Marriage                   28227 non-null  object 
 11  Gender                     28227 non-null  object 
 12  Children                   28227 non-null  int64  
 13  Degree                     28227 non-nul

In [160]:
# Separate the regression target ("Cost") from the predictor columns.
target = "Cost"
X, y = df.drop(columns=target), df[target]

In [167]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [168]:
# Columns grouped by the transformer applied to them in the ColumnTransformer.
# 'Oreder Brand' is spelled this way in the data files, so it must stay as is.
ord_categorical_columns = [
    'Store Code',
    'Promotion Name',
    'Store Kind',
    'Degree',
    'Work',
    'Oreder Brand',
    'Order',
    'Department',
    "Country ISO2",
]

hot_categorical_columns = [
    'Marriage',
    'Gender',
    'Is Recyclable?',
]

# Only float-typed predictors are treated as numeric.
# NOTE(review): integer columns (df.info() lists 'Children' as int64) appear in
# none of the three groups; a ColumnTransformer that keeps its default
# `remainder` setting would presumably drop them, so confirm that is intended.
numeric_columns = X.select_dtypes(include="float").columns

In [169]:
# Per-column preprocessing shared by every pipeline in this notebook:
#   'cat' - integer codes for the columns in `ord_categorical_columns`
#   'ohe' - one-hot vectors for the columns in `hot_categorical_columns`
#   'num' - standardisation of the float columns in `numeric_columns`
#
# Both encoders are told how to handle categories that were absent when the
# transformer was fitted. With their defaults, transform() raises on such rows,
# which can happen for a value that only occurs in a CV validation fold, in the
# held-out split, or in the submission data. Unknown ordinal categories are
# coded as -1 (never a valid code, since fitted codes start at 0); unknown
# one-hot categories become an all-zero row.
preprocessor = ColumnTransformer(
    transformers=[
        (
            'cat',
            OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
            ord_categorical_columns,
        ),
        ('ohe', OneHotEncoder(handle_unknown='ignore'), hot_categorical_columns),
        ('num', StandardScaler(), numeric_columns)
    ]
)

In [170]:
# Baseline: the shared preprocessor followed by a depth-limited decision tree,
# fitted on the training split. The fitted pipeline is the cell's output.
# NOTE(review): the same `preprocessor` object is also passed to the other
# pipelines in this notebook; presumably each fit refits it in place, which is
# harmless only while every fit uses X_train. Confirm.
model = make_pipeline(
    preprocessor,
    DecisionTreeRegressor(max_depth=16, random_state=42)
)
model.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cat', OrdinalEncoder(),
                                                  ['Store Code',
                                                   'Promotion Name',
                                                   'Store Kind', 'Degree',
                                                   'Work', 'Oreder Brand',
                                                   'Order', 'Department',
                                                   'Country ISO2']),
                                                 ('ohe', OneHotEncoder(),
                                                  ['Marriage', 'Gender',
                                                   'Is Recyclable?']),
                                                 ('num', StandardScaler(),
                                                  Index(['Store Sales', 'Store Cost', 'Store Area', 'Grocery Area',
       'Frozen Area', 'Meat Area', 'Gross Weight', 

In [171]:
# RMSE of the decision-tree pipeline on the training split; compare it with the
# held-out RMSE in the next cell to judge how much the tree overfits.
predictions = model.predict(X_train)

mse = mean_squared_error(y_train, predictions)
rmse = np.sqrt(mse)
rmse

31.225928695904216

In [172]:
# RMSE of the decision-tree pipeline on the held-out split.
predictions = model.predict(X_test)

mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
rmse

64.27330280492905

In [173]:
# Random forest (400 trees, same depth cap as the single tree) behind the
# shared preprocessor, fitted on the training split.
rfmodel = make_pipeline(
    preprocessor,
    RandomForestRegressor(n_estimators=400, max_depth=16, random_state=42)
)
rfmodel.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cat', OrdinalEncoder(),
                                                  ['Store Code',
                                                   'Promotion Name',
                                                   'Store Kind', 'Degree',
                                                   'Work', 'Oreder Brand',
                                                   'Order', 'Department',
                                                   'Country ISO2']),
                                                 ('ohe', OneHotEncoder(),
                                                  ['Marriage', 'Gender',
                                                   'Is Recyclable?']),
                                                 ('num', StandardScaler(),
                                                  Index(['Store Sales', 'Store Cost', 'Store Area', 'Grocery Area',
       'Frozen Area', 'Meat Area', 'Gross Weight', 

In [174]:
# RMSE of the random-forest pipeline on the training split (overfitting check
# against the held-out RMSE computed in the next cell).
predictions = rfmodel.predict(X_train)

mse = mean_squared_error(y_train, predictions)
rmse = np.sqrt(mse)
rmse

28.61634461012867

In [175]:
# RMSE of the random-forest pipeline on the held-out split.
predictions = rfmodel.predict(X_test)

mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
rmse

51.940643056981266

In [154]:
# Fit the preprocessor on the training split only, then materialise the
# transformed matrices for tools that take arrays instead of a pipeline.
X_train_transformed = preprocessor.fit(X_train).transform(X_train)
X_test_transformed = preprocessor.transform(X_test)

## Try Lazypredict

In [155]:
# Benchmark LazyRegressor's model zoo on the pre-transformed matrices.
# Note that this rebinds the notebook-level name `predictions`.
reg = LazyRegressor(
    verbose=1,
    ignore_warnings=False,
    custom_metric=None,
)
models, predictions = reg.fit(
    X_train_transformed,
    X_test_transformed,
    y_train,
    y_test,
)

  2%|█▉                                                                                 | 1/42 [00:00<00:35,  1.15it/s]

{'Model': 'AdaBoostRegressor', 'R-Squared': 0.11551702036224154, 'Adjusted R-Squared': 0.1114236495059614, 'RMSE': 147.66606729864304, 'Time taken': 0.8679823875427246}


  7%|█████▉                                                                             | 3/42 [00:02<00:33,  1.17it/s]

{'Model': 'BaggingRegressor', 'R-Squared': 0.8783221797591636, 'Adjusted R-Squared': 0.8777590570595798, 'RMSE': 54.76986610736036, 'Time taken': 1.8664119243621826}
{'Model': 'BayesianRidge', 'R-Squared': 0.017614026497484225, 'Adjusted R-Squared': 0.013067562397970911, 'RMSE': 155.62416892250292, 'Time taken': 0.1361832618713379}


 14%|███████████▊                                                                       | 6/42 [00:03<00:11,  3.07it/s]

{'Model': 'DecisionTreeRegressor', 'R-Squared': 0.7834835604753536, 'Adjusted R-Squared': 0.7824815264013698, 'RMSE': 73.06029258235051, 'Time taken': 0.2960188388824463}
{'Model': 'DummyRegressor', 'R-Squared': -3.504148029964682e-06, 'Adjusted R-Squared': -0.004631501853236353, 'RMSE': 157.01340419741697, 'Time taken': 0.04697299003601074}
{'Model': 'ElasticNet', 'R-Squared': 0.014770269955669324, 'Adjusted R-Squared': 0.010210645003523977, 'RMSE': 155.84925226577428, 'Time taken': 0.05974245071411133}


 19%|███████████████▊                                                                   | 8/42 [00:03<00:09,  3.56it/s]

{'Model': 'ElasticNetCV', 'R-Squared': 0.017618614484656958, 'Adjusted R-Squared': 0.01307217161826335, 'RMSE': 155.62380552026406, 'Time taken': 0.3807823657989502}
{'Model': 'ExtraTreeRegressor', 'R-Squared': 0.695579056170362, 'Adjusted R-Squared': 0.6941702016777364, 'RMSE': 86.63098108393301, 'Time taken': 0.12792754173278809}


 21%|█████████████████▊                                                                 | 9/42 [00:11<01:23,  2.52s/it]

{'Model': 'ExtraTreesRegressor', 'R-Squared': 0.8983151969683448, 'Adjusted R-Squared': 0.8978446015822958, 'RMSE': 50.068432438290785, 'Time taken': 8.096688508987427}
{'Model': 'GammaRegressor', 'R-Squared': 0.012690081462677583, 'Adjusted R-Squared': 0.008120829436694899, 'RMSE': 156.0136935463599, 'Time taken': 0.07796645164489746}


 24%|███████████████████▌                                                              | 10/42 [00:21<01:08,  2.14s/it]


KeyboardInterrupt: 

In [None]:
# NOTE(review): `models` is only bound if the LazyRegressor fit above ran to
# completion; the saved run was stopped with a KeyboardInterrupt, so this cell
# would raise NameError on a fresh kernel in that case.
print(models)

In [None]:
# NOTE(review): this cell rebinds `df` and `test` to the *transformed* CSVs, but
# no later visible cell reads either name before `test` is reloaded from
# data/cleaned_test.csv. The cell has no execution count; consider deleting it
# or giving the frames distinct names so earlier results are not clobbered.
df = pd.read_csv("data/train_transformed.csv")
test = pd.read_csv("data/test_transformed.csv")

In [176]:
# Extra-trees ensemble (500 trees, depth cap 16) behind the shared
# preprocessor, fitted on the training split. This pipeline is also the one
# that is cross-validated further down.
exmodel = make_pipeline(
    preprocessor,
    ExtraTreesRegressor(n_estimators=500, max_depth=16, random_state=42)
)
exmodel.fit(X_train, y_train)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('cat', OrdinalEncoder(),
                                                  ['Store Code',
                                                   'Promotion Name',
                                                   'Store Kind', 'Degree',
                                                   'Work', 'Oreder Brand',
                                                   'Order', 'Department',
                                                   'Country ISO2']),
                                                 ('ohe', OneHotEncoder(),
                                                  ['Marriage', 'Gender',
                                                   'Is Recyclable?']),
                                                 ('num', StandardScaler(),
                                                  Index(['Store Sales', 'Store Cost', 'Store Area', 'Grocery Area',
       'Frozen Area', 'Meat Area', 'Gross Weight', 

In [178]:
# RMSE of the extra-trees pipeline on the training split (overfitting check
# against the held-out RMSE computed in the next cell).
predictions = exmodel.predict(X_train)

mse = mean_squared_error(y_train, predictions)
rmse = np.sqrt(mse)
rmse

25.28647673163381

In [179]:
# RMSE of the extra-trees pipeline on the held-out split.
predictions = exmodel.predict(X_test)

mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
rmse

50.58440706856533

In [74]:
# Hyper-parameter grid for ExtraTreesRegressor:
# 3 tree counts x 5 depths x 2 split sizes x 2 leaf sizes = 60 candidates.
params = {
    'n_estimators': range(100, 301, 100),
    'max_depth': [2, 8, 16, 32, 64],
    'min_samples_split': [2,4],
    'min_samples_leaf': [1,2],
}

# 5-fold exhaustive search using all available cores.
# NOTE(review): no `scoring` is passed, so candidates are ranked by the
# estimator's default score (presumably R^2 for a regressor) rather than by the
# RMSE reported elsewhere in this notebook. Confirm that is intended.
gsCV = GridSearchCV(
    ExtraTreesRegressor(random_state=42),
    param_grid=params,
    cv=5,
    n_jobs=-1,
    verbose=1
)
gsCV

GridSearchCV(cv=5, estimator=ExtraTreesRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [2, 8, 16, 32, 64],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 4],
                         'n_estimators': range(100, 301, 100)},
             verbose=1)

In [75]:
# Run the search on the already-transformed training matrix (the bare
# estimator is tuned here, not a pipeline that includes the preprocessor).
gsCV.fit(X_train_transformed, y_train)

Fitting 5 folds for each of 60 candidates, totalling 300 fits


GridSearchCV(cv=5, estimator=ExtraTreesRegressor(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [2, 8, 16, 32, 64],
                         'min_samples_leaf': [1, 2],
                         'min_samples_split': [2, 4],
                         'n_estimators': range(100, 301, 100)},
             verbose=1)

In [78]:
# Parameter combination with the best mean cross-validated score.
gsCV.best_params_

{'max_depth': 16,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'n_estimators': 300}

In [85]:
# Keep the estimator selected by the grid search. It expects inputs that have
# already gone through `preprocessor`, because the search was fitted on
# X_train_transformed.
best = gsCV.best_estimator_

In [86]:
# RMSE of the tuned extra-trees estimator on the transformed training split
# (overfitting check against the held-out RMSE computed in the next cell).
predictions = best.predict(X_train_transformed)

mse = mean_squared_error(y_train, predictions)
rmse = np.sqrt(mse)
rmse

27.991098921693695

In [87]:
# RMSE of the tuned extra-trees estimator on the transformed held-out split.
predictions = best.predict(X_test_transformed)

mse = mean_squared_error(y_test, predictions)
rmse = np.sqrt(mse)
rmse

51.687590330980036

In [180]:
def rmse(y_true, y_pred):
    """Return the root-mean-squared error between two equally shaped array-likes.

    Both inputs are converted to float NumPy arrays before subtracting, so the
    comparison is strictly positional. Without the conversion, subtracting two
    pandas Series aligns them on their index labels (giving NaN wherever the
    labels differ), and plain Python lists cannot be subtracted at all.

    Parameters
    ----------
    y_true : array-like
        Observed target values.
    y_pred : array-like
        Predicted target values, in the same order as ``y_true``.

    Returns
    -------
    numpy.float64
        ``sqrt(mean((y_true - y_pred) ** 2))``.

    Notes
    -----
    Earlier cells of this notebook bind the name ``rmse`` to a float result;
    this definition rebinds it to the function.
    """
    y_true = np.asarray(y_true, dtype=float)
    y_pred = np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean((y_true - y_pred) ** 2))

# 5-fold cross-validation of the full extra-trees pipeline on all labelled
# rows, scored with the custom RMSE function.
rmse_scorer = make_scorer(rmse, greater_is_better=False)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(exmodel, X, y, scoring=rmse_scorer, cv=kf)

# The scorer was built with greater_is_better=False, so the returned scores
# are negated RMSEs; flip the sign back for display.
positive_scores = -scores
positive_scores

array([50.72777279, 52.1449524 , 51.66919402, 48.70748587, 51.09358654])

In [None]:
from sklearn.ensemble import VotingRegressor, StackingRegressor

In [None]:
# NOTE(review): this cell cannot run as written. None of LogisticRegression,
# MLPClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier,
# LGBMClassifier or CatBoostClassifier is imported in the visible cells, and
# all of them are classifiers, whereas every executed model in this notebook
# is a regressor predicting "Cost". Presumably pasted from a classification
# project. TODO: replace with regressor equivalents and add the imports.
lr = LogisticRegression(random_state=0, max_iter=500, C = 10, penalty= 'l2', solver= 'newton-cg')

mlp = MLPClassifier(random_state=0, hidden_layer_sizes = (512,) , max_iter=100, early_stopping=True,
                    solver='adam', learning_rate = 'adaptive', alpha=0.001, activation='relu', batch_size=128)

gb = GradientBoostingClassifier(random_state=0, learning_rate=0.1, n_estimators=100)

hgb = HistGradientBoostingClassifier(random_state=0, learning_rate=0.01, l2_regularization=0.001)

lgbm = LGBMClassifier(random_state=0, learning_rate=0.01, n_estimators=1000, num_leaves=10)

cat = CatBoostClassifier(n_estimators=1500, learning_rate=0.05, random_state=0, eval_metric= 'MultiClass')

In [None]:
# Stack three boosted models, with `lr` as the meta-learner and the original
# features passed through to it.
# NOTE(review): StackingClassifier is not imported in the visible cells (only
# StackingRegressor is), the base estimators are classifiers although the
# target is continuous, and X_train still contains raw string columns because
# no preprocessing step is applied here. This cell has never been executed.
estimators = [
              ('gb', gb),
              ('lgbm', lgbm),
              ('cat', cat),
             ]

stack_model = StackingClassifier(estimators=estimators, final_estimator = lr
                                 ,n_jobs =-1, verbose = False, passthrough=True)

stack_model.fit(X_train, y_train)

In [None]:
# Soft-voting ensemble over four boosted models. Rebinds `estimators`.
# NOTE(review): VotingClassifier is not imported in the visible cells (only
# VotingRegressor is), the estimators are classifiers although the target is
# continuous, and X_train is passed without preprocessing. `voting='soft'` is
# a classifier-only concept, so a regressor version would presumably have to
# drop it. Confirm. This cell has never been executed.
estimators = [
              ('hgb', hgb),
              ('gb', gb),
              ('lgbm', lgbm),
              ('cat', cat),
             ]

voting_model = VotingClassifier(estimators=estimators, n_jobs =-1, verbose = False, voting = 'soft')

voting_model.fit(X_train,y_train)

## Create the submission file

In [88]:
# Load the submission template; its "Cost" column is overwritten with the
# model's predictions further down.
samples = pd.read_csv('data/sample_submission.csv')
samples.head()

Unnamed: 0,ID,Cost
0,0,0
1,1,0
2,2,0
3,3,0
4,4,0


In [89]:
# Load the cleaned, unlabeled test set. Unlike the training frame it is not
# passed through pre.set_index, so `id` stays a regular column here.
test = pd.read_csv("data/cleaned_test.csv")
test.head()

Unnamed: 0,id,Promotion Name,Store Kind,Store Sales,Store Cost,Is Recyclable?,Store Area,Grocery Area,Frozen Area,Meat Area,...,Department,Video Store,Bar For Salad,Florist,Coffee Bar,Ready Food,Gross Weight,Net Weight,Package Weight,Min. Person Yearly Income
0,0,Fantastic Discounts,Gourmet,11760000.0,4704000.0,True,2201.06,1424.85,465.54,308.73,...,Snacks,1,1,1,1,1,31.83,28.78,3.05,50000.0
1,1,Bag Stuffer,Deluxe,2160000.0,669600.0,False,2577.16,1735.17,505.07,336.59,...,Frozen Foods,1,1,1,1,1,29.94,27.04,2.91,70000.0
2,2,Pick Your Savings,Deluxe,1830000.0,823500.0,False,2837.58,2038.11,481.98,321.26,...,Dairy,1,1,1,1,1,29.22,26.31,2.91,130000.0
3,3,Price Winners,Deluxe,8820000.0,4410000.0,False,2859.04,1871.16,593.93,394.58,...,Frozen Foods,1,1,1,1,1,28.05,25.0,3.05,10000.0
4,4,Dollar Days,Supermarket,4320000.0,1987200.0,False,2193.97,1320.15,523.32,348.85,...,Beverages,0,0,1,0,0,23.55,20.64,2.91,30000.0


In [90]:
# Sanity check: count missing values per column before predicting.
test.isna().sum()

id                           0
Promotion Name               0
Store Kind                   0
Store Sales                  0
Store Cost                   0
Is Recyclable?               0
Store Area                   0
Grocery Area                 0
Frozen Area                  0
Meat Area                    0
Marriage                     0
Gender                       0
Children                     0
Degree                       0
Work                         0
Store Code                   0
Country ISO2                 0
Oreder Brand                 0
Order                        0
Department                   0
Video Store                  0
Bar For Salad                0
Florist                      0
Coffee Bar                   0
Ready Food                   0
Gross Weight                 0
Net Weight                   0
Package Weight               0
Min. Person Yearly Income    0
dtype: int64

In [92]:
# Predict the submission targets with the grid-searched estimator. `best` was
# fitted on preprocessor-transformed features, so the raw test frame goes
# through the same `preprocessor` first.
# NOTE(review): this depends on `preprocessor` being in its X_train-fitted
# state when the cell runs; the object is shared with several pipelines above.
y_sub_pred = best.predict(preprocessor.transform(test))

In [93]:
# Write the predictions into the template's target column.
# Assumes the rows of sample_submission.csv are in the same order as
# cleaned_test.csv. TODO confirm (no ID-based join is performed).
samples["Cost"] = y_sub_pred

In [94]:
# Persist the submission without the DataFrame's positional index.
samples.to_csv("data/submission.csv", index=False)