# Import the necessary Python libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import xgboost as xgb
import lightgbm as lgbm
import catboost as cb

from sklearn.metrics import mean_squared_error as mse
from sklearn.metrics import mean_squared_log_error as msle
from sklearn.model_selection import KFold,cross_val_predict, cross_val_score
from sklearn.ensemble import VotingRegressor, StackingRegressor
from sklearn.pipeline import make_pipeline
from skopt import BayesSearchCV
from skopt.space import Real, Integer, Categorical
from category_encoders import TargetEncoder
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Global seaborn look for every figure in the notebook.
sns.set_theme(style = 'white', palette = 'viridis')
# Explicit colour list indexed by the plotting cells (pal[0], pal[1], pal[2]).
pal = sns.color_palette("tab10")

# Loading Train and Test Datasets

In [None]:
# Read the competition train and test CSVs.
training_dataset = pd.read_csv(r'../input/playground-series-s3e11/train.csv')
testing_dataset = pd.read_csv(r'../input/playground-series-s3e11/test.csv')

# Strip the 'id' column from both frames; the test ids are kept aside
# because the submission cell at the end of the notebook needs them.
training_dataset = training_dataset.drop(columns='id')
testing_ids = testing_dataset.pop('id')

# Loading Original Training Dataset

In [None]:
# Original (non-competition) training data; later compared with and merged into the competition data.
original_training_dataset = pd.read_csv(r'../input/media-campaign-cost-prediction/train_dataset.csv')

* Take a Quick Look at the Data Structure

In [None]:
# Preview the first rows of the competition training frame.
training_dataset.head()

In [None]:
# Preview the first rows of the competition test frame.
testing_dataset.head()

In [None]:
# Preview the first rows of the original training frame.
original_training_dataset.head()

In [None]:
def rmsle_lgbm(y_pred, data):
    """Custom LightGBM evaluation metric: root mean squared logarithmic error.

    The score is computed with NumPy directly instead of
    ``mean_squared_log_error(..., squared=False)``, because the ``squared``
    keyword is deprecated in recent scikit-learn releases and later removed.

    NOTE(review): this function is passed as ``eval_metric`` to
    ``LGBMRegressor.fit``. LightGBM's scikit-learn API documents the call as
    ``func(y_true, y_pred)``, so the first argument presumably receives the
    labels despite its name. The metric is symmetric in its two arguments,
    so the result is the same either way.

    Args:
        y_pred: first array-like of non-negative values.
        data: second array-like of non-negative values, same length.

    Returns:
        Tuple ``('rmsle', score, False)``: metric name, value, and the
        ``is_higher_better`` flag (lower is better).

    Raises:
        ValueError: if the inputs are empty, differ in length, or contain
            negative values (RMSLE is undefined for those).
    """
    first = np.ravel(np.asarray(y_pred, dtype=float))
    second = np.ravel(np.asarray(data, dtype=float))

    if first.size == 0 or first.size != second.size:
        raise ValueError('RMSLE needs two non-empty inputs of equal length.')
    if (first < 0).any() or (second < 0).any():
        raise ValueError('RMSLE cannot be used when values are negative.')

    score = float(np.sqrt(np.mean((np.log1p(second) - np.log1p(first)) ** 2)))

    return 'rmsle', score, False

# Data Pre-processing

## Check for Missing Values

In [None]:
def count_missing_values(df: pd.DataFrame):
    """Return the total number of missing (NaN/None) cells in ``df``.

    Args:
        df: frame to inspect.

    Returns:
        Sum of the per-column missing-value counts.
    """
    return df.isna().sum().sum()

1. Training Dataset

In [None]:
# Report the total count of missing cells in the competition training frame.
print("\nTotal missing values in the Training Dataset:",count_missing_values(training_dataset))

2. Testing Dataset

In [None]:
# Report the total count of missing cells in the competition test frame.
print("\nTotal missing values in the Testing Dataset:",count_missing_values(testing_dataset))

3. Original Training Dataset

In [None]:
# Report the total count of missing cells in the original training frame.
print("\nTotal missing values in the original training dataset Dataset:", count_missing_values(original_training_dataset))

## Check for Duplicate Rows

In [None]:
def count_duplicate_rows(df):
    """Return how many rows of ``df`` repeat an earlier row.

    The first occurrence of each row is not counted, only its later copies.

    Args:
        df: frame to inspect.

    Returns:
        Number of duplicated rows as a plain ``int``.
    """
    repeated = df.duplicated()
    return int(repeated.sum())

1. Training Dataset

In [None]:
# Duplicate-row count for the competition training frame.
print("\nNumber of duplicate rows:", count_duplicate_rows(training_dataset))

2. Testing Dataset

In [None]:
# Duplicate-row count for the competition test frame.
print("\nNumber of duplicate rows:", count_duplicate_rows(testing_dataset))

3. Original Training Dataset

In [None]:
# Duplicate-row count for the original training frame.
print("\nNumber of duplicate rows:", count_duplicate_rows(original_training_dataset))

## Remove Duplicate Rows

In [None]:
# Remove fully duplicated rows from the original training frame only;
# the competition train/test frames are left untouched here.
original_training_dataset = original_training_dataset.drop_duplicates()

# Exploratory Data Analysis

## Visualize Feature Distribution

In [None]:
def plot_feature_distributions(training_df, testing_df, original_training_df, palette):
    """Overlay per-feature KDE curves for the train, test and original-train frames.

    One subplot is drawn for every column of ``testing_df``, so each of those
    columns must also exist in the other two frames. The grid is three plots
    wide and its row count follows the number of columns (five rows for 15
    features, i.e. the previous fixed 5x3 layout). Unused trailing axes are
    hidden instead of being left blank.

    Args:
        training_df: competition training frame.
        testing_df: competition test frame; its columns drive the plots.
        original_training_df: original training frame.
        palette: indexable colour sequence; entries 0, 2 and 1 colour the
            train, test and original-train curves respectively.
    """
    columns = list(testing_df.columns)
    plots_per_row = 3
    # Ceiling division; keep at least one row so an empty frame still renders.
    n_rows = max(1, -(-len(columns) // plots_per_row))

    fig, ax = plt.subplots(
        n_rows,
        plots_per_row,
        figsize=(10, 13 * n_rows / 5),  # 13 inches tall for the usual 5 rows
        dpi=300,
        squeeze=False,
    )
    ax = ax.flatten()

    for i, column in enumerate(columns):
        sns.kdeplot(training_df[column], ax=ax[i], color=palette[0])
        sns.kdeplot(testing_df[column], ax=ax[i], color=palette[2])
        sns.kdeplot(original_training_df[column], ax=ax[i], color=palette[1])

        ax[i].set_title(f'{column} Distribution')
        ax[i].set_xlabel(None)

    # Hide grid cells that have no feature assigned to them.
    for spare_axis in ax[len(columns):]:
        spare_axis.set_visible(False)

    fig.suptitle('Distribution of Feature\nper Dataset\n', fontsize=24, fontweight='bold')
    # Labels follow the plotting order used in the loop above.
    fig.legend(['Train', 'Test', 'Original Train'])
    plt.tight_layout()
    plt.show()


In [None]:
# Compare feature distributions across the three datasets using the tab10 colours.
plot_feature_distributions(training_dataset, testing_dataset, original_training_dataset, pal)

## Visualize Cost Distribution Per Train vs Original Train Datasets

In [None]:
# Overlay the target ('cost') density of the competition and original training data.
plt.figure(figsize = (10, 5), dpi = 300)

# Filled KDE curves; the legend labels below follow this plotting order.
sns.kdeplot(training_dataset['cost'], color = pal[0], fill = True)
sns.kdeplot(original_training_dataset['cost'], color = pal[2], fill = True)

plt.title('Distribution of Cost per Dataset\n', weight = 'bold', fontsize = 25)
plt.legend(['Train', 'Original Train'])
plt.show()

* The distribution in both the original datasets and the competition datasets appears to be quite similar.

## Visualize Correlation Matrix

In [None]:
def plot_correlation_heatmap(data, label=None):
    """Draw an annotated lower-triangle heatmap of ``data``'s pairwise correlations.

    Args:
        data: frame whose column correlations (``data.corr()``) are plotted.
        label: dataset name interpolated into the figure title.
    """
    corr = data.corr()

    # Non-zero entries mark cells to hide: the diagonal and everything above it.
    hidden_cells = np.triu(np.ones(corr.shape))

    plt.figure(figsize=(14, 10), dpi=300)
    sns.heatmap(corr, mask=hidden_cells, annot=True, annot_kws={'size': 7})

    plt.xticks(fontsize=14)
    plt.yticks(fontsize=14)
    plt.title(f'Correlation Matrix for {label} Dataset\n', fontsize=25, weight='bold')
    plt.show()

1. Training Dataset

In [None]:
# Correlation heatmap for the competition training frame (includes the 'cost' target).
plot_correlation_heatmap(training_dataset, 'Training')

2. Testing Dataset

In [None]:
# Correlation heatmap for the competition test frame.
plot_correlation_heatmap(testing_dataset, 'Testing')

* The perfect correlation between '**salad_bar**' and '**prepared_food**' means the two columns carry the same information (in this data, a store has prepared food exactly when it has a salad bar), so one of them is redundant. Removing one of these variables loses no information and should improve our results.

# Feature Engineering

## Separate Features and Target Labels from Datasets

In [None]:
# Split the competition training frame into model inputs and the 'cost' target.
train_features = training_dataset.drop(columns='cost')
train_target = training_dataset['cost']

In [None]:
# Work on a copy so feature engineering does not modify testing_dataset.
test_features = testing_dataset.copy()

In [None]:
# Preview the training features (the frame is named train_features; `features` is never defined).
train_features.head()

In [None]:
# Preview the training target (the series is named train_target; `target` is never defined).
train_target.head()

In [None]:
# Preview the test features.
test_features.head()

In [None]:
# Shared randomness and cross-validation configuration for all model cells.
seed = 42
splits = 5

np.random.seed(seed)

# Shuffled K-fold splitter; the fixed random_state keeps folds identical across cells.
cross_validator = KFold(n_splits=splits, random_state=seed, shuffle=True)

## Feature Generation

In [None]:
# Add the same engineered columns, in the same order, to the train and test
# feature frames. Columns are assigned explicitly rather than through chained
# `df[col].replace(..., inplace=True)` calls, which operate on a temporary
# object under pandas Copy-on-Write and would leave the frame unchanged.
for frame in (train_features, test_features):
    # 'child_ratio': total children per child at home.
    # x / 0 -> inf is mapped to 10 and 0 / 0 -> NaN is mapped to 0.
    frame['child_ratio'] = (
        (frame['total_children'] / frame['num_children_at_home'])
        .replace([np.inf, -np.inf], 10)
        .fillna(0)
    )

    # 'store_score': sum of four store amenity columns.
    frame['store_score'] = (
        frame['coffee_bar'] + frame['video_store'] + frame['salad_bar'] + frame['florist']
    )

    # 'store_score_ratio': store area per amenity point.
    # NOTE(review): a store_score of 0 yields inf here and, unlike
    # 'child_ratio', it is not sanitised - confirm the models accept that.
    frame['store_score_ratio'] = frame['store_sqft'] / frame['store_score']

    # 'independent_child': children not living at home.
    frame['independent_child'] = frame['total_children'] - frame['num_children_at_home']


In [None]:
# Check the test-frame dimensions after adding the engineered columns.
test_features.shape

## Check for Feature Importance

In [None]:
# XGBoost settings used only for this feature-importance pass
# (xgb_params is redefined with tuned values in a later cell).
# NOTE(review): the objective is squared log error while the evaluation metric
# that drives early stopping is plain RMSE - confirm the mismatch is intended.
# NOTE(review): 'gpu_hist' is reportedly deprecated in XGBoost 2.x in favour of
# tree_method='hist' with device='cuda' - confirm against the installed version.
xgb_params = {
    'seed': seed,
    'objective': 'reg:squaredlogerror',
    'eval_metric': 'rmse',
    'tree_method' : 'gpu_hist',
    'n_jobs' : -1,
    'importance_type' : 'total_gain',
    'max_depth' : 12,
    'n_estimators': 100
}

# LightGBM settings for the same pass.
# 'metric': 'custom' presumably disables the built-in metrics so that only the
# rmsle_lgbm callable passed to fit() is evaluated - verify against LightGBM docs.
lgbm_params = {
    'seed': seed,
    'objective' : 'regression',
    'metric' : 'custom',
    'device' : 'gpu',
    'n_jobs' : -1,
    'importance_type' : 'gain',
    'max_depth' : 12,
    'n_estimators': 100
}

# Accumulators for the fold-averaged importance of every feature column.
xgb_importance = np.zeros(len(list(train_features)))
lgbm_importance = np.zeros(len(list(train_features)))

for fold, (train_idx, val_idx) in enumerate(cross_validator.split(train_features)):
    
    # Stop when the validation metric has not improved for 10 rounds and
    # keep the best iteration.
    xgb_model = XGBRegressor(
        **xgb_params,
        callbacks=[xgb.callback.EarlyStopping(rounds=10,
                                              maximize=False,
                                              save_best=True)]
    )
    
    # The held-out fold doubles as the early-stopping evaluation set.
    xgb_model.fit(
        train_features.iloc[train_idx], 
        train_target.iloc[train_idx],
        eval_set = [(train_features.iloc[val_idx], train_target.iloc[val_idx])],
        verbose = False
    )  
    
    lgbm_model = LGBMRegressor(
        **lgbm_params
    )
    
    # Same fold layout; RMSLE is supplied as a custom metric and logging is silenced.
    lgbm_model.fit(
        train_features.iloc[train_idx], 
        train_target.iloc[train_idx],
        eval_set = [(train_features.iloc[val_idx], train_target.iloc[val_idx])],
        eval_metric = rmsle_lgbm,
        callbacks = [lgbm.log_evaluation(False),
                     lgbm.early_stopping(stopping_rounds = 10, verbose = False)]
    )
    
    # Dividing by the fold count turns the running sum into a mean.
    xgb_importance += xgb_model.feature_importances_ / splits
    lgbm_importance += lgbm_model.feature_importances_ / splits
    
# One-column frames (column label 0) indexed by feature name, consumed by the plot cell.
xgb_info = pd.DataFrame(xgb_importance, index = list(train_features))
lgbm_info = pd.DataFrame(lgbm_importance, index = list(train_features))

In [None]:
# Side-by-side horizontal bar charts of the fold-averaged feature importances.
fig, ax = plt.subplots(1, 2, figsize = (10,4), dpi = 300)
ax = ax.flatten()

# Sort descending, then transpose so each feature becomes its own column (one bar each).
sns.barplot(xgb_info.sort_values(ascending = False, by = 0).T, orient = 'h', palette = 'viridis', ax = ax[0])
ax[0].set_title('XGBoost')
ax[0].tick_params(axis = 'both', which = 'major', labelsize = 7)

sns.barplot(lgbm_info.sort_values(ascending = False, by = 0).T, orient = 'h', palette = 'viridis', ax = ax[1])
ax[1].tick_params(axis = 'both', which = 'major', labelsize = 7)
ax[1].set_title('LightGBM')

fig.suptitle('Feature Importance', weight = 'bold')
plt.tight_layout()

## Dropping Unimportant Features

In [None]:
# Columns removed from both feature frames before modelling. The list is
# reused later when the combined dataset is prepared.
# Presumably chosen from the importance plots above - verify when features change.
drop_features = ['low_fat', 'gross_weight', 'recyclable_package', 'store_sales(in millions)', 'units_per_case', 'unit_sales(in millions)','prepared_food']

train_features.drop(drop_features, axis = 1, inplace = True)
test_features.drop(drop_features, axis = 1, inplace = True)

In [None]:
# Check the test-frame dimensions after dropping the listed columns.
test_features.shape

## KFold Cross Validation with XGBoost

In [None]:
def rmsle(y_target,y_pred):
    """Return the root mean squared logarithmic error between two arrays.

    The score is computed with NumPy directly instead of
    ``mean_squared_log_error(..., squared=False)``, because the ``squared``
    keyword is deprecated in recent scikit-learn releases and later removed.

    Args:
        y_target: array-like of non-negative true values.
        y_pred: array-like of non-negative predictions, same length.

    Returns:
        ``sqrt(mean((log1p(y_target) - log1p(y_pred)) ** 2))`` as a float.

    Raises:
        ValueError: if the inputs are empty, differ in length, or contain
            negative values (RMSLE is undefined for those).
    """
    actual = np.ravel(np.asarray(y_target, dtype=float))
    predicted = np.ravel(np.asarray(y_pred, dtype=float))

    if actual.size == 0 or actual.size != predicted.size:
        raise ValueError('RMSLE needs two non-empty inputs of equal length.')
    if (actual < 0).any() or (predicted < 0).any():
        raise ValueError('RMSLE cannot be used when values are negative.')

    log_gap = np.log1p(actual) - np.log1p(predicted)
    score = float(np.sqrt(np.mean(log_gap ** 2)))

    return score

In [None]:
# Train on log1p(cost) from here on; predictions must be mapped back with expm1.
# NOTE: this rebinds train_target, so running the cell twice applies log1p twice.
train_target = np.log1p(train_target)

# Tuned XGBoost settings; this replaces the xgb_params dict defined for the
# feature-importance pass. With a log1p target, squared error on this scale
# corresponds to squared log error on the original cost scale.
xgb_params = {
    'seed': seed,
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse',
    'tree_method' : 'gpu_hist',
    'n_jobs' : -1,
    'importance_type' : 'total_gain',
    'grow_policy' : 'lossguide',
    
    # Model capacity, regularisation and learning rate.
    'max_depth' : 20,
    'n_estimators' : 1000,
    'alpha' : 1,
    'lambda' : 0,
    'eta' : .0561418286569673
}

In [None]:
# 5-fold cross-validation of the tuned XGBoost model.
# train_target already holds log1p(cost), so targets and predictions are
# mapped back with expm1 and both metrics are reported on the original scale.
# Fixes: the previous version referenced undefined names (target,
# train_score, val_score, train_scores, val_scores), applied the log
# transform twice when computing RMSLE, and labelled a root-MSE as "MSE".
train_scores_rmse, val_scores_rmse = [], []   # per-fold RMSLE
train_scores_mse, val_scores_mse = [], []     # per-fold RMSE
for fold, (train_idx, val_idx) in enumerate(cross_validator.split(train_features, train_target)):

    xgb_model = XGBRegressor(**xgb_params)

    xgb_model.fit(
        train_features.iloc[train_idx],
        train_target.iloc[train_idx]
    )

    # Back-transform from log1p space to the original cost scale.
    train_true = np.expm1(train_target.iloc[train_idx])
    val_true = np.expm1(train_target.iloc[val_idx])
    train_preds = np.expm1(xgb_model.predict(train_features.iloc[train_idx]))
    val_preds = np.expm1(xgb_model.predict(train_features.iloc[val_idx]))

    train_score_rmsle = rmsle(train_true, train_preds)
    val_score_rmsle = rmsle(val_true, val_preds)

    # np.sqrt(mse(...)) avoids the `squared=False` keyword, which newer
    # scikit-learn releases no longer accept.
    train_score_mse = np.sqrt(mse(train_true, train_preds))
    val_score_mse = np.sqrt(mse(val_true, val_preds))

    train_scores_rmse.append(train_score_rmsle)
    val_scores_rmse.append(val_score_rmsle)

    train_scores_mse.append(train_score_mse)
    val_scores_mse.append(val_score_mse)

    print(f'Fold {fold}: val RMSLE = {val_score_rmsle:.5f} | train RMSLE = {train_score_rmsle:.5f}')
    print(f'Fold {fold}: val RMSE = {val_score_mse:.5f} | train RMSE = {train_score_mse:.5f}')
    print()

print(f'Average val RMSLE = {np.mean(val_scores_rmse):.5f} | train RMSLE = {np.mean(train_scores_rmse):.5f}')
print(f'Average val RMSE = {np.mean(val_scores_mse):.5f} | train RMSE = {np.mean(train_scores_mse):.5f}')

## KFold Cross Validation with CatBoost

In [None]:
# CatBoost settings shared by the CatBoost, stacking and voting cells.
cb_params = {
    'random_seed': seed,
    'loss_function' : 'RMSE',
    'eval_metric' : 'RMSE',
    'task_type' : 'GPU',
    'bootstrap_type' : 'Bernoulli',
    'verbose' : False,
    
    # Model capacity, learning rate and regularisation.
    'n_estimators' : 1000, 
    'max_depth' : 11,
    'learning_rate' : .09373739891212098,
    'min_data_in_leaf' : 1,
    'reg_lambda' : 10,
    'subsample' : 1
}

In [None]:
# 5-fold cross-validation of the CatBoost model.
# train_target already holds log1p(cost), so targets and predictions are
# mapped back with expm1 and both metrics are reported on the original scale.
# Fixes: the previous version referenced undefined names (target,
# train_score, val_score, train_scores, val_scores), scored log-space
# predictions against a different scale, and labelled a root-MSE as "MSE".
train_scores_rmse, val_scores_rmse = [], []   # per-fold RMSLE
train_scores_mse, val_scores_mse = [], []     # per-fold RMSE

for fold, (train_idx, val_idx) in enumerate(cross_validator.split(train_features, train_target)):

    cb_model = CatBoostRegressor(**cb_params)

    cb_model.fit(train_features.iloc[train_idx], train_target.iloc[train_idx])

    # Back-transform from log1p space to the original cost scale.
    train_true = np.expm1(train_target.iloc[train_idx])
    val_true = np.expm1(train_target.iloc[val_idx])
    train_preds = np.expm1(cb_model.predict(train_features.iloc[train_idx]))
    val_preds = np.expm1(cb_model.predict(train_features.iloc[val_idx]))

    train_score_rmsle = rmsle(train_true, train_preds)
    val_score_rmsle = rmsle(val_true, val_preds)

    # np.sqrt(mse(...)) avoids the `squared=False` keyword, which newer
    # scikit-learn releases no longer accept.
    train_score_mse = np.sqrt(mse(train_true, train_preds))
    val_score_mse = np.sqrt(mse(val_true, val_preds))

    train_scores_rmse.append(train_score_rmsle)
    val_scores_rmse.append(val_score_rmsle)

    train_scores_mse.append(train_score_mse)
    val_scores_mse.append(val_score_mse)

    print(f'Fold {fold}: val RMSLE = {val_score_rmsle:.5f} | train RMSLE = {train_score_rmsle:.5f}')
    print(f'Fold {fold}: val RMSE = {val_score_mse:.5f} | train RMSE = {train_score_mse:.5f}')
    print()

print(f'Average val RMSLE = {np.mean(val_scores_rmse):.5f} | train RMSLE = {np.mean(train_scores_rmse):.5f}')
print(f'Average val RMSE = {np.mean(val_scores_mse):.5f} | train RMSE = {np.mean(train_scores_mse):.5f}')

## KFold Cross Validation with Stacking Approach with XGBoost & CatBoost

In [None]:
# 5-fold cross-validation of a StackingRegressor over XGBoost and CatBoost.
# train_target already holds log1p(cost), so targets and predictions are
# mapped back with expm1 and both metrics are reported on the original scale.
# Fixes: the previous version referenced undefined names (train_score,
# val_score, train_scores, val_scores), applied the log transform twice
# when computing RMSLE, and labelled a root-MSE as "MSE".
train_scores_rmse, val_scores_rmse = [], []   # per-fold RMSLE
train_scores_mse, val_scores_mse = [], []     # per-fold RMSE

for fold, (train_idx, val_idx) in enumerate(cross_validator.split(train_features, train_target)):

    stack = StackingRegressor([('xgb', XGBRegressor(**xgb_params)), ('cb', CatBoostRegressor(**cb_params))])

    stack.fit(train_features.iloc[train_idx], train_target.iloc[train_idx])

    # Back-transform from log1p space to the original cost scale.
    train_true = np.expm1(train_target.iloc[train_idx])
    val_true = np.expm1(train_target.iloc[val_idx])
    train_preds = np.expm1(stack.predict(train_features.iloc[train_idx]))
    val_preds = np.expm1(stack.predict(train_features.iloc[val_idx]))

    train_score_rmsle = rmsle(train_true, train_preds)
    val_score_rmsle = rmsle(val_true, val_preds)

    # np.sqrt(mse(...)) avoids the `squared=False` keyword, which newer
    # scikit-learn releases no longer accept.
    train_score_mse = np.sqrt(mse(train_true, train_preds))
    val_score_mse = np.sqrt(mse(val_true, val_preds))

    train_scores_rmse.append(train_score_rmsle)
    val_scores_rmse.append(val_score_rmsle)

    train_scores_mse.append(train_score_mse)
    val_scores_mse.append(val_score_mse)

    print(f'Fold {fold}: val RMSLE = {val_score_rmsle:.5f} | train RMSLE = {train_score_rmsle:.5f}')
    print(f'Fold {fold}: val RMSE = {val_score_mse:.5f} | train RMSE = {train_score_mse:.5f}')
    print()

print(f'Average val RMSLE = {np.mean(val_scores_rmse):.5f} | train RMSLE = {np.mean(train_scores_rmse):.5f}')
print(f'Average val RMSE = {np.mean(val_scores_mse):.5f} | train RMSE = {np.mean(train_scores_mse):.5f}')

## KFold Cross Validation with Voting Approach with XGBoost & CatBoost

In [None]:
# 5-fold cross-validation of a VotingRegressor over XGBoost and CatBoost.
# train_target already holds log1p(cost), so targets and predictions are
# mapped back with expm1 and both metrics are reported on the original scale.
# Fixes: the previous version referenced undefined names (trian_target,
# train_score, val_score, train_scores, val_scores), applied the log
# transform twice when computing RMSLE, and labelled a root-MSE as "MSE".
train_scores_rmse, val_scores_rmse = [], []   # per-fold RMSLE
train_scores_mse, val_scores_mse = [], []     # per-fold RMSE

for fold, (train_idx, val_idx) in enumerate(cross_validator.split(train_features, train_target)):

    # Single-step pipeline kept as in the original cell.
    vote = make_pipeline(
        VotingRegressor([('xgb', XGBRegressor(**xgb_params)), ('cb', CatBoostRegressor(**cb_params))])
    )

    vote.fit(train_features.iloc[train_idx], train_target.iloc[train_idx])

    # Back-transform from log1p space to the original cost scale.
    train_true = np.expm1(train_target.iloc[train_idx])
    val_true = np.expm1(train_target.iloc[val_idx])
    train_preds = np.expm1(vote.predict(train_features.iloc[train_idx]))
    val_preds = np.expm1(vote.predict(train_features.iloc[val_idx]))

    train_score_rmsle = rmsle(train_true, train_preds)
    val_score_rmsle = rmsle(val_true, val_preds)

    # np.sqrt(mse(...)) avoids the `squared=False` keyword, which newer
    # scikit-learn releases no longer accept.
    train_score_mse = np.sqrt(mse(train_true, train_preds))
    val_score_mse = np.sqrt(mse(val_true, val_preds))

    train_scores_rmse.append(train_score_rmsle)
    val_scores_rmse.append(val_score_rmsle)

    train_scores_mse.append(train_score_mse)
    val_scores_mse.append(val_score_mse)

    print(f'Fold {fold}: val RMSLE = {val_score_rmsle:.5f} | train RMSLE = {train_score_rmsle:.5f}')
    print(f'Fold {fold}: val RMSE = {val_score_mse:.5f} | train RMSE = {train_score_mse:.5f}')
    print()

print(f'Average val RMSLE = {np.mean(val_scores_rmse):.5f} | train RMSLE = {np.mean(train_scores_rmse):.5f}')
print(f'Average val RMSE = {np.mean(val_scores_mse):.5f} | train RMSE = {np.mean(train_scores_mse):.5f}')

# Final Model

## Combine Original & Training Datasets

In [None]:
# Stack the competition rows on top of the original rows, then drop rows whose
# feature values (every training_dataset column except the last) repeat an
# earlier row. Competition rows come first in the concat, so they are the
# ones kept when a duplicate exists.
combined_training_dataset = pd.concat([training_dataset, original_training_dataset])
combined_training_dataset.drop_duplicates(subset = list(training_dataset)[0:-1], inplace = True)

# NOTE(review): the positional slices here and above assume 'cost' is the
# last column of both frames - confirm against the CSV schemas.
combined_features = combined_training_dataset.iloc[:, :-1]
combined_features = combined_features.drop(drop_features, axis = 1)
combined_target = combined_training_dataset['cost']
# Same log1p target transform as used for cross-validation.
combined_target = np.log1p(combined_target)



In [None]:
# Preview the combined feature frame before the engineered columns are added.
combined_features.head()

## Feature Engineering for Combined Datasets

In [None]:
# Engineered columns for the combined training frame, added in the same
# order as for the train/test frames so the column layout matches at predict time.
# Columns are assigned explicitly rather than through chained
# `df[col].replace(..., inplace=True)` calls, which operate on a temporary
# object under pandas Copy-on-Write and would leave the frame unchanged.

# 'child_ratio': total children per child at home.
# x / 0 -> inf is mapped to 10 and 0 / 0 -> NaN is mapped to 0.
combined_features['child_ratio'] = (
    (combined_features['total_children'] / combined_features['num_children_at_home'])
    .replace([np.inf, -np.inf], 10)
    .fillna(0)
)

# 'store_score': sum of four store amenity columns.
combined_features['store_score'] = (
    combined_features['coffee_bar']
    + combined_features['video_store']
    + combined_features['salad_bar']
    + combined_features['florist']
)

# 'store_score_ratio': store area per amenity point.
# NOTE(review): a store_score of 0 yields inf here and, unlike 'child_ratio',
# it is not sanitised - confirm the models accept that.
combined_features['store_score_ratio'] = combined_features['store_sqft'] / combined_features['store_score']

# 'independent_child': children not living at home.
combined_features['independent_child'] = combined_features['total_children'] - combined_features['num_children_at_home']



## Stacking Approach with XGBoost & CatBoost

In [None]:
# Final model: stack XGBoost and CatBoost and fit on the combined data
# (the target is log1p(cost), so predictions come out in log space).
stack = StackingRegressor([('xgb', XGBRegressor(**xgb_params)),('cb', CatBoostRegressor(**cb_params))])
stack.fit(combined_features, combined_target)

## Saving the Model

In [None]:
# Persist the fitted stacking model to the working directory.
import joblib
joblib.dump(stack, 'final_model.joblib')

## Getting Results

In [None]:
# Predict on the engineered test features; values are still in log1p space here.
stack_preds = pd.DataFrame({'cost': stack.predict(test_features)})
# Column-wise concat pairs ids and predictions by index label.
# Presumably both carry the default 0..n-1 index - verify if rows are ever filtered.
result = pd.concat([testing_ids, stack_preds], axis=1)

## Saving the CSV File

In [None]:
# Undo the log1p target transform so 'cost' is back on its original scale.
result['cost'] = np.expm1(result['cost'])
# NOTE(review): the file name is spelled 'submision.csv' - confirm whether
# 'submission.csv' is what the consumer of this file expects.
result.to_csv('submision.csv', index = False)