In [1]:
# Silence library warnings, then list every file shipped with the competition

import os
import warnings

warnings.filterwarnings("ignore")

for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/playground-series-s5e4/sample_submission.csv
/kaggle/input/playground-series-s5e4/train.csv
/kaggle/input/playground-series-s5e4/test.csv


# Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import shap

from sklearn.preprocessing import (
    OrdinalEncoder,
    StandardScaler,
    MinMaxScaler
)
from sklearn.feature_selection import (
    mutual_info_regression
)
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    KFold,
    RandomizedSearchCV
)
from sklearn.metrics import (
    r2_score, 
    mean_squared_error
)
from sklearn.impute import KNNImputer
from xgboost import XGBRegressor
from sklearn.ensemble import BaggingRegressor
from catboost import CatBoostRegressor

# First analysis

Full notebook and analysis:

https://www.kaggle.com/code/les1781/listening-time-prediction-playground-series-s5-e4

In [3]:
# Read the training set, using the `id` column as the row index

listening_new = pd.read_csv(
    "/kaggle/input/playground-series-s5e4/train.csv",
    index_col="id",
)

# Data wrangling

In [4]:
# Summarise missing data: absolute count and percentage of nulls per column

null_counts = listening_new.isnull().sum()

null_values = pd.DataFrame({
    "Amount of Null Data": null_counts,
    "Percentage of Null Data": null_counts / len(listening_new) * 100,
})

null_values.style.background_gradient(cmap="Greens")

Unnamed: 0,Amount of Null Data,Percentage of Null Data
Podcast_Name,0,0.0
Episode_Title,0,0.0
Episode_Length_minutes,87093,11.6124
Genre,0,0.0
Host_Popularity_percentage,0,0.0
Publication_Day,0,0.0
Publication_Time,0,0.0
Guest_Popularity_percentage,146030,19.470667
Number_of_Ads,1,0.000133
Episode_Sentiment,0,0.0


In [5]:
# We replace the erroneous (out-of-range) values.
# Every comparison against NaN is False, so entries that are already missing
# stay missing in all four columns.
# `np.nan` is used instead of the `np.NaN` alias, which NumPy 2.0 removed.

# More than 3 ads is treated as invalid and turned into a missing value
listening_new["Number_of_Ads"] = np.where(
    listening_new["Number_of_Ads"] > 3, np.nan, listening_new["Number_of_Ads"]
)

# Episodes longer than 125 minutes are treated as invalid and turned into a missing value
listening_new["Episode_Length_minutes"] = np.where(
    listening_new["Episode_Length_minutes"] > 125.0, np.nan, listening_new["Episode_Length_minutes"]
)

# Popularity percentages are capped at 100 and rounded to two decimals
listening_new["Host_Popularity_percentage"] = np.where(
    listening_new["Host_Popularity_percentage"] > 100, 100, listening_new["Host_Popularity_percentage"]
).round(decimals=2)

listening_new["Guest_Popularity_percentage"] = np.where(
    listening_new["Guest_Popularity_percentage"] > 100, 100, listening_new["Guest_Popularity_percentage"]
).round(decimals=2)

In [6]:
# We fill null values per podcast: the mode for Number_of_Ads and the median
# for Episode_Length_minutes and Guest_Popularity_percentage

# Series.mode() ignores NaN and returns an empty Series for a group without any
# non-null value; `.get(0, np.nan)` then yields NaN instead of raising KeyError
# the way `mode()[0]` does.
listening_new["Number_of_Ads"] = listening_new["Number_of_Ads"].fillna(
        listening_new.groupby("Podcast_Name")["Number_of_Ads"].transform(
            lambda v: v.mode().get(0, np.nan)
        )
)

listening_new["Episode_Length_minutes"] = (listening_new["Episode_Length_minutes"].fillna(
    listening_new.groupby("Podcast_Name")["Episode_Length_minutes"].transform("median")
))

listening_new["Guest_Popularity_percentage"] = (listening_new["Guest_Popularity_percentage"].fillna(
        listening_new.groupby("Podcast_Name")["Guest_Popularity_percentage"].transform("median")
))

In [7]:
# 1) Non-linear transforms of Episode_Length_minutes: log(1 + x), square, cube root

episode_len = listening_new["Episode_Length_minutes"]

listening_new["len_log"] = np.log1p(episode_len)
listening_new["len_sq"] = episode_len ** 2
listening_new["len_cubert"] = np.cbrt(episode_len)

In [8]:
# 2) Host / guest popularity transforms and host-guest interaction features

host_pop = listening_new["Host_Popularity_percentage"]
guest_pop = listening_new["Guest_Popularity_percentage"]

listening_new["host_log"] = np.log1p(host_pop)
listening_new["host_sq"] = host_pop ** 2
listening_new["guest_log"] = np.log1p(guest_pop)
listening_new["guest_sq"] = guest_pop ** 2
listening_new["pop_diff"] = host_pop - guest_pop
listening_new["pop_sum"] = host_pop + guest_pop
# The +1 keeps the denominator non-zero when guest popularity is 0
listening_new["pop_ratio"] = host_pop / (guest_pop + 1)

In [9]:
# 3) Number_of_Ads transforms: log(1 + ads) and ads per (minute + 1)

ads_count = listening_new["Number_of_Ads"]

listening_new["ads_log"] = np.log1p(ads_count)
# The +1 keeps the denominator non-zero for zero-length episodes
listening_new["ads_per_min"] = ads_count / (listening_new["Episode_Length_minutes"] + 1)

In [10]:
# Store the text (object) columns as pandas categoricals for more efficient memory usage

object_cols = listening_new.select_dtypes(["object"]).columns
listening_new[object_cols] = listening_new[object_cols].astype("category")

In [11]:
# Inspect dtypes and non-null counts after cleaning and feature engineering
listening_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 23 columns):
 #   Column                       Non-Null Count   Dtype   
---  ------                       --------------   -----   
 0   Podcast_Name                 750000 non-null  category
 1   Episode_Title                750000 non-null  category
 2   Episode_Length_minutes       750000 non-null  float64 
 3   Genre                        750000 non-null  category
 4   Host_Popularity_percentage   750000 non-null  float64 
 5   Publication_Day              750000 non-null  category
 6   Publication_Time             750000 non-null  category
 7   Guest_Popularity_percentage  750000 non-null  float64 
 8   Number_of_Ads                750000 non-null  float64 
 9   Episode_Sentiment            750000 non-null  category
 10  Listening_Time_minutes       750000 non-null  float64 
 11  len_log                      750000 non-null  float64 
 12  len_sq                       750000 non-null  flo

# Data preprocessing

In [12]:
# Preprocess a copy so that listening_new itself is not modified by the cells below
listening_end = listening_new.copy()

In [13]:
# Encode Episode_Sentiment as an ordered number (Negative=0, Neutral=1, Positive=2), stored as float

eps_order = {"Negative" : 0, "Neutral" : 1, "Positive" : 2}

sentiment_codes = listening_end["Episode_Sentiment"].map(eps_order)
listening_end["Episode_Sentiment"] = sentiment_codes.astype("float64")

In [14]:
# Turn titles such as "Episode 12" into the number 12.0.
# str.removeprefix drops only the literal "Episode " prefix, whereas
# str.strip("Episode ") treats its argument as a set of characters and strips
# any of them from both ends of the string.
listening_end["Episode_Title"] = (
    listening_end["Episode_Title"].str.removeprefix("Episode ").astype("float64")
)

In [15]:
# Split the frame by dtype: categorical columns go to the encoder, numeric ones are kept as they are

df_categorical = listening_end.select_dtypes(include="category")
df_numerical = listening_end.select_dtypes(include="number")

In [16]:
# Check which columns are still categorical and therefore need encoding
df_categorical.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 4 columns):
 #   Column            Non-Null Count   Dtype   
---  ------            --------------   -----   
 0   Podcast_Name      750000 non-null  category
 1   Genre             750000 non-null  category
 2   Publication_Day   750000 non-null  category
 3   Publication_Time  750000 non-null  category
dtypes: category(4)
memory usage: 8.6 MB


## Encode

In [17]:
# Ordinal-encode the remaining categorical columns and join them with the numeric ones.
# `enc` is kept at module level because the test-data cells use it again.

enc = OrdinalEncoder(categories="auto")
enc.set_output(transform="pandas")

enc_data = enc.fit_transform(df_categorical)

df_listening = pd.concat([enc_data, df_numerical], axis="columns")

In [18]:
# Pairwise correlation matrix of all encoded features and the target, shaded by value
df_listening.corr().style.background_gradient(cmap="Greens")

Unnamed: 0,Podcast_Name,Genre,Publication_Day,Publication_Time,Episode_Title,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes,len_log,len_sq,len_cubert,host_log,host_sq,guest_log,guest_sq,pop_diff,pop_sum,pop_ratio,ads_log,ads_per_min
Podcast_Name,1.0,0.188052,0.003117,-0.002328,0.006225,0.006128,-0.002424,-0.005297,0.008955,0.003115,0.004346,0.004859,0.006615,0.005367,-0.002122,-0.00241,-0.004591,-0.004655,0.002353,-0.005508,0.000468,0.008653,0.001107
Genre,0.188052,1.0,0.003305,-0.001329,0.006555,-0.000616,-0.008807,0.00719,-0.003429,-0.001588,0.00494,-0.000965,-0.000356,-0.000836,-0.008339,-0.008522,0.004108,0.009066,-0.011347,-0.000514,-0.003065,-0.00274,-0.000962
Publication_Day,0.003117,0.003305,1.0,-0.00086,0.002622,0.006538,-0.003698,0.000441,0.004986,0.004594,0.00403,0.002171,0.008003,0.003866,-0.002875,-0.004188,0.002194,-0.001502,-0.002824,-0.002117,-0.001574,0.004845,0.005252
Publication_Time,-0.002328,-0.001329,-0.00086,1.0,-0.001611,0.009949,0.000182,-0.005236,-0.006727,0.008923,0.013225,0.012388,0.008533,0.011527,-7.5e-05,0.000615,-0.003659,-0.005271,0.004063,-0.003741,0.001629,-0.007263,-0.01213
Episode_Title,0.006225,0.006555,0.002622,-0.001611,1.0,-0.018766,0.018271,0.037481,0.005647,-0.002872,-0.017321,-0.022565,-0.014239,-0.021649,0.01938,0.016639,0.031564,0.036831,-0.015894,0.03971,-0.01003,0.006059,0.016911
Episode_Length_minutes,0.006128,-0.000616,0.006538,0.009949,-0.018766,1.0,0.022263,-0.008403,-0.054762,0.024654,0.866402,0.941396,0.971667,0.974891,0.016947,0.026728,-0.006627,-0.005574,0.021329,0.008507,-0.000148,-0.052386,-0.504731
Host_Popularity_percentage,-0.002424,-0.008807,-0.003698,0.000182,0.018271,0.022263,1.0,0.020343,-0.017831,0.007063,0.050854,0.016525,0.023822,0.018947,0.982051,0.985862,0.015697,0.020867,0.658661,0.675483,0.177788,-0.014356,-0.008854
Guest_Popularity_percentage,-0.005297,0.00719,0.000441,-0.005236,0.037481,-0.008403,0.020343,1.0,0.008727,0.000482,-0.014448,-0.003373,-0.010888,-0.005248,0.019968,0.01976,0.903716,0.961647,-0.738885,0.750965,-0.492037,0.008211,0.001269
Number_of_Ads,0.008955,-0.003429,0.004986,-0.006727,0.005647,-0.054762,-0.017831,0.008727,1.0,-0.020627,-0.124201,-0.045867,-0.055882,-0.049839,-0.014544,-0.020775,0.016047,0.001853,-0.018585,-0.005341,-0.022307,0.97966,0.5646
Episode_Sentiment,0.003115,-0.001588,0.004594,0.008923,-0.002872,0.024654,0.007063,0.000482,-0.020627,1.0,0.03947,0.026127,0.022598,0.02579,0.00594,0.008114,0.001803,0.000366,0.004398,0.005021,-0.002936,-0.018584,-0.02716


In [19]:
# Summary statistics per column (transposed so each feature is a row)
df_listening.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Podcast_Name,750000.0,23.515731,14.137577,0.0,11.0,23.0,37.0,47.0
Genre,750000.0,4.648788,2.963073,0.0,2.0,5.0,7.0,9.0
Publication_Day,750000.0,2.962776,1.997399,0.0,1.0,3.0,5.0,6.0
Publication_Time,750000.0,1.522868,1.119361,0.0,1.0,1.0,3.0,3.0
Episode_Title,750000.0,51.445811,28.085623,1.0,28.0,52.0,75.0,100.0
Episode_Length_minutes,750000.0,64.433284,31.004973,0.0,39.42,63.84,90.31,120.93
Host_Popularity_percentage,750000.0,59.859482,22.872221,1.3,39.41,60.05,79.53,100.0
Guest_Popularity_percentage,750000.0,52.48619,25.540435,0.0,34.55,53.72,71.04,100.0
Number_of_Ads,750000.0,1.347873,1.110966,0.0,0.0,1.0,2.0,3.0
Episode_Sentiment,750000.0,0.997969,0.81544,0.0,0.0,1.0,2.0,2.0


## Scaling

In [20]:
# Split the frame into the target series and the feature matrix that will be scaled

target_col = "Listening_Time_minutes"

y_listening = df_listening[target_col]
x_listening = df_listening.drop(columns=[target_col])

In [21]:
# NOTE(review): both scalers are fitted on the full training frame, before the
# train/validation split further down, so validation rows influence the
# scaling statistics.

# Columns that are standardised with StandardScaler

standard_cols = [
    "Podcast_Name",
    "Episode_Title",
    "Episode_Length_minutes",
    "Host_Popularity_percentage",
    "Guest_Popularity_percentage",
]
df_numeric = x_listening[standard_cols]

scaler = StandardScaler()
scaler.set_output(transform="pandas")
scale_num = scaler.fit_transform(df_numeric)

# Columns that are passed through without scaling

passthrough_cols = [
    "Genre",
    "Publication_Day",
    "Publication_Time",
    "Number_of_Ads",
    "Episode_Sentiment",
]
df_rest = x_listening[passthrough_cols]

# Every remaining column (the engineered features) is min-max scaled

df_new_fe = x_listening.drop(columns=standard_cols + passthrough_cols)

mm_scaler = MinMaxScaler()
mm_scaler.set_output(transform="pandas")
scale_mm = mm_scaler.fit_transform(df_new_fe)

# Reassemble the feature matrix: passthrough, standardised, then min-max columns

x_end = pd.concat([df_rest, scale_num, scale_mm], axis=1)

In [22]:
# Summary statistics of the scaled feature matrix (transposed so each feature is a row)
x_end.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Genre,750000.0,4.648788,2.963073,0.0,2.0,5.0,7.0,9.0
Publication_Day,750000.0,2.962776,1.997399,0.0,1.0,3.0,5.0,6.0
Publication_Time,750000.0,1.522868,1.119361,0.0,1.0,1.0,3.0,3.0
Number_of_Ads,750000.0,1.347873,1.110966,0.0,0.0,1.0,2.0,3.0
Episode_Sentiment,750000.0,0.9979693,0.81544,0.0,0.0,1.0,2.0,2.0
Podcast_Name,750000.0,1.7886730000000003e-17,1.000001,-1.663351,-0.885282,-0.036479,0.95379,1.661125
Episode_Title,750000.0,4.092726e-18,1.000001,-1.796145,-0.834798,0.019732,0.838657,1.728793
Episode_Length_minutes,750000.0,5.373598e-17,1.000001,-2.078161,-0.806751,-0.019135,0.834599,1.822184
Host_Popularity_percentage,750000.0,-2.175208e-16,1.000001,-2.56029,-0.894076,0.00833,0.860018,1.754991
Guest_Popularity_percentage,750000.0,1.577594e-16,1.000001,-2.055025,-0.702267,0.048308,0.726449,1.860338


## Feature Selection

In [23]:
# Disabled cell: the mutual-information ranking below is wrapped in a string
# literal, so it is evaluated as text and never executed.
'''
mi_scores = mutual_info_regression(x_end, y_listening)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=x_end.columns)
mi_scores = mi_scores.sort_values(ascending=False)
mi_scores
'''

'\nmi_scores = mutual_info_regression(x_end, y_listening)\nmi_scores = pd.Series(mi_scores, name="MI Scores", index=x_end.columns)\nmi_scores = mi_scores.sort_values(ascending=False)\nmi_scores\n'

In [24]:
# List the final feature columns with their dtypes and non-null counts
x_end.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 22 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Genre                        750000 non-null  float64
 1   Publication_Day              750000 non-null  float64
 2   Publication_Time             750000 non-null  float64
 3   Number_of_Ads                750000 non-null  float64
 4   Episode_Sentiment            750000 non-null  float64
 5   Podcast_Name                 750000 non-null  float64
 6   Episode_Title                750000 non-null  float64
 7   Episode_Length_minutes       750000 non-null  float64
 8   Host_Popularity_percentage   750000 non-null  float64
 9   Guest_Popularity_percentage  750000 non-null  float64
 10  len_log                      750000 non-null  float64
 11  len_sq                       750000 non-null  float64
 12  len_cubert                   750000 non-null  float64
 13  host

# Model Selection

In [25]:
# Hold out 20% of the rows for validation; the fixed seed makes the split reproducible

x_train, x_val, y_train, y_val = train_test_split(
    x_end,
    y_listening,
    test_size=0.2,
    random_state=42,
)

In [26]:
# Create the KFold object: 10 shuffled folds with a fixed seed.
# It is only referenced as `cv=kfold` by the hyperparameter searches in the disabled cells below.

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

## XGBRegressor

In [27]:
# We create the model instance (no hyperparameters are passed)

xgbr = XGBRegressor()

# Train the model with the data
# (the fit call is commented out, so xgbr stays unfitted)

#xgbr.fit(x_train, y_train)

In [28]:
# Show the parameters available on the estimator, as a reference for the search grid
xgbr.get_params()

{'objective': 'reg:squarederror',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [29]:
# Disabled cell: the XGBRegressor randomized search below is wrapped in a
# string literal, so it is evaluated as text and never executed.
'''
# We establish the parameters to test

xgbr_param_grid = {
    "gamma" : [0.1, 0.5, 0.8, 0, 1],
    "max_depth" : [3, 4, 5, 6, 7],
    "learning_rate" : [0.2, 0.1, 0.01, 0.001],
    "subsample" : [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "n_estimators" : [50, 100, 150, 200]
}

xgbr_grid = RandomizedSearchCV(
    xgbr,
    xgbr_param_grid,
    cv=kfold,
    scoring="neg_root_mean_squared_error",
    return_train_score=True
)

xgbr_search = xgbr_grid.fit(x_train, y_train)

print(
    f'Parameters: {xgbr_search.best_params_}\nScore: {xgbr_search.best_score_}'
)
'''

'\n# We establish the parameters to test\n\nxgbr_param_grid = {\n    "gamma" : [0.1, 0.5, 0.8, 0, 1],\n    "max_depth" : [3, 4, 5, 6, 7],\n    "learning_rate" : [0.2, 0.1, 0.01, 0.001],\n    "subsample" : [0.5, 0.6, 0.7, 0.8, 0.9, 1],\n    "n_estimators" : [50, 100, 150, 200]\n}\n\nxgbr_grid = RandomizedSearchCV(\n    xgbr,\n    xgbr_param_grid,\n    cv=kfold,\n    scoring="neg_root_mean_squared_error",\n    return_train_score=True\n)\n\nxgbr_search = xgbr_grid.fit(x_train, y_train)\n\nprint(\n    f\'Parameters: {xgbr_search.best_params_}\nScore: {xgbr_search.best_score_}\'\n)\n'

In [30]:
# We evaluate the performance after an initial optimization

#y_pred_xgbr = xgbr_search.best_estimator_.predict(x_val)

#r2_xgbr = r2_score(y_val, y_pred_xgbr)

#rmse_xgbr = np.sqrt(mean_squared_error(y_val, y_pred_xgbr))

#print(f"XGBRegressor optimization\n\nR-squared score: {r2_xgbr}\nRMSE: {rmse_xgbr}")

- With data dropped in variable management

      RandomizedSearchCV:

      Parameters: {'subsample':0.9, 'n_estimators':150, 'max_depth':6, 'learning_rate':0.2, 'gamma':0.1}

      Score: -10.385593205700733

      R-squared score: 0.8548779072393202

      RMSE: 10.359859724404574

In [31]:
# Disabled cell: wrapped in a string literal, so xgbr_final is never actually
# created or fitted when the notebook runs.
'''
# XGBR different approaches

#xgbr_final = xgbr_search.best_estimator_
xgbr_final = XGBRegressor(
    subsample = 0.9, n_estimators = 150, max_depth = 6, learning_rate = 0.2, gamma = 0.1
)
xgbr_final.fit(x_train, y_train)
'''

'\n# XGBR different approaches\n\n#xgbr_final = xgbr_search.best_estimator_\nxgbr_final = XGBRegressor(\n    subsample = 0.9, n_estimators = 150, max_depth = 6, learning_rate = 0.2, gamma = 0.1\n)\nxgbr_final.fit(x_train, y_train)\n'

## BaggingRegressor

In [32]:
# We create the model instance (no hyperparameters are passed)

bagr = BaggingRegressor()

# Train the model with the data
# (the fit call is commented out, so bagr stays unfitted)

#bagr.fit(x_train, y_train)

In [33]:
# Disabled cell: wrapped in a string literal, so it is never executed.
# If re-enabled, bagr must be fitted first; its fit call is commented out.
'''
y_pred_bagr = bagr.predict(x_val)

r2_bagr = r2_score(y_val, y_pred_bagr)

rmse_bagr = np.sqrt(mean_squared_error(y_val, y_pred_bagr))

print(f"BaggingRegressor\n\nR-squared score: {r2_bagr}\nRMSE: {rmse_bagr}")
'''

'\ny_pred_bagr = bagr.predict(x_val)\n\nr2_bagr = r2_score(y_val, y_pred_bagr)\n\nrmse_bagr = np.sqrt(mean_squared_error(y_val, y_pred_bagr))\n\nprint(f"BaggingRegressor\n\nR-squared score: {r2_bagr}\nRMSE: {rmse_bagr}")\n'

BaggingRegressor (Default)

R-squared score: 0.7586370826352582

RMSE: 13.326718743148652

In [34]:
# Show the parameters available on the estimator, as a reference for the search grid
bagr.get_params()

{'base_estimator': 'deprecated',
 'bootstrap': True,
 'bootstrap_features': False,
 'estimator': None,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [35]:
# Disabled cell: the BaggingRegressor randomized search below is wrapped in a
# string literal, so it is never executed.
# If re-enabled, it needs xgbr_final, which is only defined inside another disabled cell.
'''
# We establish the parameters to test

bagr_param_grid = {
    "estimator" : [None, xgbr, xgbr_final],
    "max_features" : [0.2, 0.5, 1.0],
    "max_samples" : [0.2, 0.5, 1.0],
    "n_estimators" : [10, 20, 30]
}

bagr_grid = RandomizedSearchCV(
    bagr,
    bagr_param_grid,
    cv=kfold,
    scoring="neg_root_mean_squared_error",
    return_train_score=True
)

bagr_search = bagr_grid.fit(x_train, y_train)

print(
    f'Parameters: {bagr_search.best_params_}\nScore: {bagr_search.best_score_}'
)
'''

'\n# We establish the parameters to test\n\nbagr_param_grid = {\n    "estimator" : [None, xgbr, xgbr_final],\n    "max_features" : [0.2, 0.5, 1.0],\n    "max_samples" : [0.2, 0.5, 1.0],\n    "n_estimators" : [10, 20, 30]\n}\n\nbagr_grid = RandomizedSearchCV(\n    bagr,\n    bagr_param_grid,\n    cv=kfold,\n    scoring="neg_root_mean_squared_error",\n    return_train_score=True\n)\n\nbagr_search = bagr_grid.fit(x_train, y_train)\n\nprint(\n    f\'Parameters: {bagr_search.best_params_}\nScore: {bagr_search.best_score_}\'\n)\n'

Parameters: {'n_estimators': 20, 'max_samples': 1.0, 'max_features': 1.0, 'estimator': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=0.1, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.2, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=6, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=150, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)}
             
Score: -13.031101049045333

In [36]:
# Disabled cell: wrapped in a string literal, so it is never executed.
# If re-enabled, it needs bagr_search, which is only defined inside another disabled cell.
'''
# We evaluate the performance after an initial optimization

y_pred_bagr = bagr_search.best_estimator_.predict(x_val)

r2_bagr = r2_score(y_val, y_pred_bagr)

rmse_bagr = np.sqrt(mean_squared_error(y_val, y_pred_bagr))

print(f"BaggingRegressor optimization\n\nR-squared score: {r2_bagr}\nRMSE: {rmse_bagr}")
'''

'\n# We evaluate the performance after an initial optimization\n\ny_pred_bagr = bagr_search.best_estimator_.predict(x_val)\n\nr2_bagr = r2_score(y_val, y_pred_bagr)\n\nrmse_bagr = np.sqrt(mean_squared_error(y_val, y_pred_bagr))\n\nprint(f"BaggingRegressor optimization\n\nR-squared score: {r2_bagr}\nRMSE: {rmse_bagr}")\n'

BaggingRegressor optimization

R-squared score: 0.7705749925584561

RMSE: 12.992967020835195

In [37]:
# BAGR different approaches

#bagr_final = bagr_search.best_estimator_

#bagr_final = BaggingRegressor(n_estimators=20, max_samples=1.0, max_features=1.0, estimator=xgbr_final)

#bagr_final.fit(x_train, y_train)

## CatBoostRegressor

In [38]:
# CatBoost regressor with RMSE as the loss function; silent=True is passed to keep training quiet
cbr = CatBoostRegressor(
    loss_function="RMSE",
    silent=True,
)

# Fit the model
#cbr.fit(x_train, y_train)

In [39]:
# Disabled cell: wrapped in a string literal, so it is never executed.
# If re-enabled here, cbr must be fitted first; the fit call above it is commented out.
'''
y_pred_cbr = cbr.predict(x_val)

r2_cbr = r2_score(y_val, y_pred_cbr)

rmse_cbr = np.sqrt(mean_squared_error(y_val, y_pred_cbr))

print(f"CatBoostRegressor\n\nR-squared score: {r2_cbr}\nRMSE: {rmse_cbr}")
'''

'\ny_pred_cbr = cbr.predict(x_val)\n\nr2_cbr = r2_score(y_val, y_pred_cbr)\n\nrmse_cbr = np.sqrt(mean_squared_error(y_val, y_pred_cbr))\n\nprint(f"CatBoostRegressor\n\nR-squared score: {r2_cbr}\nRMSE: {rmse_cbr}")\n'

CatBoostRegressor

R-squared score: 0.7695329849820068

RMSE: 13.022439466683332

In [40]:
# Disabled cell: the CatBoostRegressor randomized search below is wrapped in a
# string literal, so it is evaluated as text and never executed.
'''
# We establish the parameters to test

cbr_param_grid = {
    "learning_rate" : [0.001, 0.01, 0.004],
    "max_depth" : [3,4,5],
    "l2_leaf_reg" : [1.0, 5.0, 0.5],
    "min_child_samples" : [1,3,6]
}

cbr_grid = RandomizedSearchCV(
    cbr,
    cbr_param_grid,
    cv=kfold,
    scoring="neg_root_mean_squared_error",
    return_train_score=True
)

cbr_search = cbr_grid.fit(x_train, y_train)

print(
    f'Parameters: {cbr_search.best_params_}\nScore: {cbr_search.best_score_}'
)
'''

'\n# We establish the parameters to test\n\ncbr_param_grid = {\n    "learning_rate" : [0.001, 0.01, 0.004],\n    "max_depth" : [3,4,5],\n    "l2_leaf_reg" : [1.0, 5.0, 0.5],\n    "min_child_samples" : [1,3,6]\n}\n\ncbr_grid = RandomizedSearchCV(\n    cbr,\n    cbr_param_grid,\n    cv=kfold,\n    scoring="neg_root_mean_squared_error",\n    return_train_score=True\n)\n\ncbr_search = cbr_grid.fit(x_train, y_train)\n\nprint(\n    f\'Parameters: {cbr_search.best_params_}\nScore: {cbr_search.best_score_}\'\n)\n'

CatBoostRegressor optimization

Parameters: {'min_child_samples': 3, 'max_depth': 5, 'learning_rate': 0.01, 'l2_leaf_reg': 0.5}

Score: -13.197115784814908

In [41]:
# Disabled cell: wrapped in a string literal, so it is never executed.
# If re-enabled, it needs cbr_search, which is only defined inside another disabled cell.
'''
# We evaluate the performance after an initial optimization

y_pred_cbr_s = cbr_search.best_estimator_.predict(x_val)

r2_cbr_s = r2_score(y_val, y_pred_cbr_s)

rmse_cbr_s = np.sqrt(mean_squared_error(y_val, y_pred_cbr_s))

print(f"CatBoostRegressor\n\nR-squared score: {r2_cbr_s}\nRMSE: {rmse_cbr_s}")
'''

'\n# We evaluate the performance after an initial optimization\n\ny_pred_cbr_s = cbr_search.best_estimator_.predict(x_val)\n\nr2_cbr_s = r2_score(y_val, y_pred_cbr_s)\n\nrmse_cbr_s = np.sqrt(mean_squared_error(y_val, y_pred_cbr_s))\n\nprint(f"CatBoostRegressor\n\nR-squared score: {r2_cbr_s}\nRMSE: {rmse_cbr_s}")\n'

CatBoostRegressor First approach

R-squared score: 0.7651383208535669
RMSE: 13.146012462379785

# Final Model

In [42]:
# We define the final model
# (final_model is another name for the same cbr object, not a copy)

final_model = cbr

# We fit the best model
# NOTE(review): cbr was created with only loss_function and silent set, so this
# fit does not use the parameters reported for the disabled CatBoost search.

final_model.fit(x_train, y_train)

<catboost.core.CatBoostRegressor at 0x7ff6b068c950>

In [43]:
# Score the final model on the validation split using RMSE

final_model_ypred = final_model.predict(x_val)

final_model_mse = mean_squared_error(y_val, final_model_ypred)
final_model_rmse = np.sqrt(final_model_mse)

print("Final Model RMSE Score: %.3f" % final_model_rmse)

Final Model RMSE Score: 13.029


bagr Model RMSE Score: 13.337

bagr, estimator=xgbr_final Model RMSE Score: 12.992

In [44]:
# Disabled cell: the SHAP feature-importance plot below is wrapped in a string
# literal, so it is evaluated as text and never executed.
'''
# We create an explainer for the best estimator

explainer = shap.Explainer(final_model)
shap_values = explainer.shap_values(x_val)

# we visualize the importance

fig = shap.summary_plot(
    shap_values,
    x_val,
    show=False
)
plt.title("Feature Importance", fontsize=20, color='g', loc='left')
plt.xlabel("Mean SHAP Values", fontsize=20)
plt.ylabel("Features", fontsize=20)
plt.show()
'''

'\n# We create an explainer for the best estimator\n\nexplainer = shap.Explainer(final_model)\nshap_values = explainer.shap_values(x_val)\n\n# we visualize the importance\n\nfig = shap.summary_plot(\n    shap_values,\n    x_val,\n    show=False\n)\nplt.title("Feature Importance", fontsize=20, color=\'g\', loc=\'left\')\nplt.xlabel("Mean SHAP Values", fontsize=20)\nplt.ylabel("Features", fontsize=20)\nplt.show()\n'

# Test data

In [45]:
# Read the test set; `id` stays a regular column because the submission needs it later

df_test = pd.read_csv(
    "/kaggle/input/playground-series-s5e4/test.csv",
)

In [46]:
# Preprocess a copy so that df_test (and its `id` column) is not modified by the cells below
df_test_new = df_test.copy()

In [47]:
# Summarise missing data in the test set: absolute count and percentage of nulls per column

test_null_counts = df_test_new.isnull().sum()

null_values_test = pd.DataFrame({
    "Amount of Null Data": test_null_counts,
    "Percentage of Null Data": test_null_counts / len(df_test_new) * 100,
})

null_values_test.style.background_gradient(cmap="Greens")

Unnamed: 0,Amount of Null Data,Percentage of Null Data
id,0,0.0
Podcast_Name,0,0.0
Episode_Title,0,0.0
Episode_Length_minutes,28736,11.4944
Genre,0,0.0
Host_Popularity_percentage,0,0.0
Publication_Day,0,0.0
Publication_Time,0,0.0
Guest_Popularity_percentage,48832,19.5328
Number_of_Ads,0,0.0


In [48]:
# Drop the `id` column, which is not used as a feature (df_test still holds it)

df_test_new = df_test_new.drop(columns=["id"])

In [49]:
# We replace the erroneous (out-of-range) values, using the same rules as for the training data.
# Every comparison against NaN is False, so entries that are already missing stay missing.
# `np.nan` is used instead of the `np.NaN` alias, which NumPy 2.0 removed.

# More than 3 ads is treated as invalid and turned into a missing value
df_test_new["Number_of_Ads"] = np.where(
    df_test_new["Number_of_Ads"] > 3, np.nan, df_test_new["Number_of_Ads"]
)
# Episodes longer than 125 minutes are treated as invalid and turned into a missing value
df_test_new["Episode_Length_minutes"] = np.where(
    df_test_new["Episode_Length_minutes"] > 125.0, np.nan, df_test_new["Episode_Length_minutes"]
)
# Popularity percentages are capped at 100 and rounded to two decimals
df_test_new["Host_Popularity_percentage"] = np.where(
    df_test_new["Host_Popularity_percentage"] > 100, 100, df_test_new["Host_Popularity_percentage"]
).round(decimals=2)
df_test_new["Guest_Popularity_percentage"] = np.where(
    df_test_new["Guest_Popularity_percentage"] > 100, 100, df_test_new["Guest_Popularity_percentage"]
).round(decimals=2)

In [50]:
# Disabled cell: the KNNImputer alternative below is wrapped in a string
# literal, so it is never executed; the next cell fills the nulls instead.
'''
# We filled in null values with KNNImputer

test_object = df_test_new.select_dtypes(include="object")
test_impute = df_test_new.select_dtypes(include="number")

imputer = KNNImputer().set_output(transform="pandas")
after_imputation = imputer.fit_transform(test_impute)

df_test_new = pd.concat([test_object, after_imputation], axis=1)
'''

'\n# We filled in null values with KNNImputer\n\ntest_object = df_test_new.select_dtypes(include="object")\ntest_impute = df_test_new.select_dtypes(include="number")\n\nimputer = KNNImputer().set_output(transform="pandas")\nafter_imputation = imputer.fit_transform(test_impute)\n\ndf_test_new = pd.concat([test_object, after_imputation], axis=1)\n'

In [51]:
# We fill null values per podcast: the mode for Number_of_Ads and the median
# for Episode_Length_minutes and Guest_Popularity_percentage

# Series.mode() ignores NaN and returns an empty Series for a group without any
# non-null value; `.get(0, np.nan)` then yields NaN instead of raising KeyError
# the way `mode()[0]` does.
df_test_new["Number_of_Ads"] = (df_test_new["Number_of_Ads"].fillna(
    df_test_new.groupby("Podcast_Name")["Number_of_Ads"].transform(
        lambda v: v.mode().get(0, np.nan)
    )
))
df_test_new["Episode_Length_minutes"] = (df_test_new["Episode_Length_minutes"].fillna(
    df_test_new.groupby("Podcast_Name")["Episode_Length_minutes"].transform("median")
))
df_test_new["Guest_Popularity_percentage"] = (df_test_new["Guest_Popularity_percentage"].fillna(
    df_test_new.groupby("Podcast_Name")["Guest_Popularity_percentage"].transform("median")
))

In [52]:
# Feature engineering on test data: the same derived columns, in the same order, as for training

test_len = df_test_new["Episode_Length_minutes"]
test_host = df_test_new["Host_Popularity_percentage"]
test_guest = df_test_new["Guest_Popularity_percentage"]
test_ads = df_test_new["Number_of_Ads"]

# Episode length transforms
df_test_new["len_log"] = np.log1p(test_len)
df_test_new["len_sq"] = test_len ** 2
df_test_new["len_cubert"] = np.cbrt(test_len)

# Host / guest popularity transforms and interactions
df_test_new["host_log"] = np.log1p(test_host)
df_test_new["host_sq"] = test_host ** 2
df_test_new["guest_log"] = np.log1p(test_guest)
df_test_new["guest_sq"] = test_guest ** 2
df_test_new["pop_diff"] = test_host - test_guest
df_test_new["pop_sum"] = test_host + test_guest
df_test_new["pop_ratio"] = test_host / (test_guest + 1)

# Ad-count transforms
df_test_new["ads_log"] = np.log1p(test_ads)
df_test_new["ads_per_min"] = test_ads / (test_len + 1)

In [53]:
# We encode the categorical variables with the mappings learned on the training data

df_test_new["Episode_Sentiment"] = df_test_new["Episode_Sentiment"].map(eps_order)
df_test_new["Episode_Sentiment"] = df_test_new["Episode_Sentiment"].astype("float64")

# str.removeprefix drops only the literal "Episode " prefix, whereas
# str.strip("Episode ") treats its argument as a set of characters and strips
# any of them from both ends of the string.
df_test_new["Episode_Title"] = (
    df_test_new["Episode_Title"].str.removeprefix("Episode ").astype("float64")
)

test_num = df_test_new.select_dtypes(include="number")
test_cat = df_test_new.select_dtypes(include="object")
# Reuse the encoder fitted on the training data (transform, not fit_transform):
# refitting on the test set could assign different integer codes than the ones
# the model was trained on. Columns are passed in the order seen during fit.
test_enc = enc.transform(test_cat[enc.feature_names_in_])
test_encoded = pd.concat([test_enc, test_num], axis=1)

In [54]:
# We scale the test variables with the scalers fitted on the training data (transform only)

test_standard_cols = [
    "Podcast_Name",
    "Episode_Title",
    "Episode_Length_minutes",
    "Host_Popularity_percentage",
    "Guest_Popularity_percentage",
]
test_numeric = test_encoded[test_standard_cols]

test_scale_num = scaler.transform(test_numeric)

# Columns passed through without scaling

test_passthrough_cols = [
    "Genre",
    "Publication_Day",
    "Publication_Time",
    "Number_of_Ads",
    "Episode_Sentiment",
]
test_rest = test_encoded[test_passthrough_cols]

# Every remaining column (the engineered features) is min-max scaled

test_fe = test_encoded.drop(columns=test_standard_cols + test_passthrough_cols)

test_scale_mm = mm_scaler.transform(test_fe)

# Reassemble the feature matrix: passthrough, standardised, then min-max columns

test_end = pd.concat([test_rest, test_scale_num, test_scale_mm], axis=1)

In [55]:
# List the final test feature columns with their dtypes and non-null counts
test_end.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 250000 entries, 0 to 249999
Data columns (total 22 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Genre                        250000 non-null  float64
 1   Publication_Day              250000 non-null  float64
 2   Publication_Time             250000 non-null  float64
 3   Number_of_Ads                250000 non-null  float64
 4   Episode_Sentiment            250000 non-null  float64
 5   Podcast_Name                 250000 non-null  float64
 6   Episode_Title                250000 non-null  float64
 7   Episode_Length_minutes       250000 non-null  float64
 8   Host_Popularity_percentage   250000 non-null  float64
 9   Guest_Popularity_percentage  250000 non-null  float64
 10  len_log                      250000 non-null  float64
 11  len_sq                       250000 non-null  float64
 12  len_cubert                   250000 non-null  float64
 13 

In [56]:
# Summary statistics of the scaled test features (transposed so each feature is a row)
test_end.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Genre,250000.0,4.641336,2.959436,0.0,2.0,5.0,7.0,9.0
Publication_Day,250000.0,2.962788,1.993649,0.0,1.0,3.0,5.0,6.0
Publication_Time,250000.0,1.522652,1.11854,0.0,1.0,1.0,3.0,3.0
Number_of_Ads,250000.0,1.347248,1.112669,0.0,0.0,1.0,2.0,3.0
Episode_Sentiment,250000.0,0.997236,0.815666,0.0,0.0,1.0,2.0,2.0
Podcast_Name,250000.0,-0.001865,0.998106,-1.663351,-0.885282,-0.036479,0.883056,1.661125
Episode_Title,250000.0,-0.000884,0.998834,-1.796145,-0.834798,0.019732,0.838657,1.728793
Episode_Length_minutes,250000.0,0.001821,1.000222,-1.998496,-0.808364,-0.014297,0.84105,1.815733
Host_Popularity_percentage,250000.0,-0.006268,1.000309,-2.508262,-0.901071,0.001771,0.853897,1.754991
Guest_Popularity_percentage,250000.0,-0.002686,0.999432,-2.055025,-0.702267,0.032647,0.724491,1.860338


In [57]:
# We apply the trained model
# NOTE(review): assumes test_end has the same columns, in the same order, as x_train; verify.

listening_predictions = final_model.predict(test_end)

In [58]:
# We review the result: print the number of predictions produced for the test set

print("Total predictions: ", len(listening_predictions), "\n")

Total predictions:  250000 



In [59]:
# Build the submission frame: the ids from the test file paired with the predictions

listening_submission = pd.DataFrame(
    {
        "id": df_test["id"],
        "Listening_Time_minutes": listening_predictions,
    }
)

listening_submission.head()

Unnamed: 0,id,Listening_Time_minutes
0,750000,55.39128
1,750001,18.069467
2,750002,48.422379
3,750003,76.647511
4,750004,49.052827


In [60]:
# We convert the dataframe to a csv file.
# index=False leaves the DataFrame index out of the file, so only the two columns are written.

listening_submission.to_csv("submission.csv", index=False)