In [1]:
# We load the competition data

import warnings
warnings.filterwarnings("ignore")

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/playground-series-s5e4/sample_submission.csv
/kaggle/input/playground-series-s5e4/train.csv
/kaggle/input/playground-series-s5e4/test.csv


# Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import shap

from sklearn.preprocessing import (
    OrdinalEncoder,
    StandardScaler
)
from sklearn.feature_selection import (
    mutual_info_regression
)
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    KFold,
    RandomizedSearchCV,
    GridSearchCV
)
from sklearn.metrics import (
    r2_score, 
    mean_squared_error
)
from xgboost import XGBRegressor

# Primer análisis

https://www.kaggle.com/code/les1781/listening-time-prediction-playground-series-s5-e4

In [3]:
# We load the data

listening_new = pd.read_csv("/kaggle/input/playground-series-s5e4/train.csv", index_col="id")

# Data wrangling

In [4]:
# We check for null values

null_values = (
    pd.DataFrame(
        {f"Amount of Null Data": listening_new.isnull().sum(), 
         "Percentage of Null Data" : (
             listening_new.isnull().sum()) / (len(listening_new)) * (100)
        }))

null_values.style.background_gradient(cmap="Greens")

Unnamed: 0,Amount of Null Data,Percentage of Null Data
Podcast_Name,0,0.0
Episode_Title,0,0.0
Episode_Length_minutes,87093,11.6124
Genre,0,0.0
Host_Popularity_percentage,0,0.0
Publication_Day,0,0.0
Publication_Time,0,0.0
Guest_Popularity_percentage,146030,19.470667
Number_of_Ads,1,0.000133
Episode_Sentiment,0,0.0


In [5]:
# We replace the erroneous values with the mode of the Podcasts

listening_new["Number_of_Ads"] = (listening_new["Number_of_Ads"].apply(lambda x: np.NaN if x>3 else x))

listening_new["Number_of_Ads"] = (listening_new["Number_of_Ads"].fillna(
        listening_new.groupby("Podcast_Name")["Number_of_Ads"].transform(lambda v: v.mode()[0])))

print(
    "Number of null values: ", listening_new["Number_of_Ads"].isnull().sum(), "\n\n",
    "Distribution of values: \n", listening_new["Number_of_Ads"].value_counts()
)

Number of null values:  0 

 Distribution of values: 
 Number_of_Ads
0.0    217597
1.0    214074
3.0    160173
2.0    158156
Name: count, dtype: int64


In [6]:
# We replace the erroneous values

listening_new["Host_Popularity_percentage"] = np.where(
    listening_new["Host_Popularity_percentage"] > 100, 100, listening_new["Host_Popularity_percentage"]
).round(decimals=2)

print(
    "Number of Unique values: ", listening_new["Host_Popularity_percentage"].nunique(), "\n",
    "Distribution of values: \n", listening_new["Host_Popularity_percentage"].value_counts(normalize=True)
)

Number of Unique values:  8007 
 Distribution of values: 
 Host_Popularity_percentage
38.68    0.000747
26.72    0.000697
56.29    0.000653
30.14    0.000593
31.57    0.000585
           ...   
1.77     0.000003
1.30     0.000001
39.18    0.000001
1.73     0.000001
1.47     0.000001
Name: proportion, Length: 8007, dtype: float64


In [7]:
# We replace the erroneous values

listening_new["Guest_Popularity_percentage"] = np.where(
    listening_new["Guest_Popularity_percentage"] > 100, 100, listening_new["Guest_Popularity_percentage"]
)

In [8]:
listening_new.dropna(inplace=True)

In [9]:
'''
# We replace the erroneous values

listening_new["Guest_Popularity_percentage"] = np.where(
    listening_new["Guest_Popularity_percentage"] > 100, 100, listening_new["Guest_Popularity_percentage"]
)

# We fill null values with the mean groupby Podcast

listening_new["Guest_Popularity_percentage"] = (
    listening_new["Guest_Popularity_percentage"].fillna(
        listening_new.groupby("Podcast_Name")["Guest_Popularity_percentage"].transform("mean")
    )
).round(decimals=2)

print(
    "Number of Unique values: ", listening_new["Guest_Popularity_percentage"].nunique(), "\n",
    "Number of null values: ", listening_new["Guest_Popularity_percentage"].isnull().sum(), "\n\n",
    "Distribution of values: \n", listening_new["Guest_Popularity_percentage"].value_counts(normalize=True)
)
'''

'\n# We replace the erroneous values\n\nlistening_new["Guest_Popularity_percentage"] = np.where(\n    listening_new["Guest_Popularity_percentage"] > 100, 100, listening_new["Guest_Popularity_percentage"]\n)\n\n# We fill null values with the mean groupby Podcast\n\nlistening_new["Guest_Popularity_percentage"] = (\n    listening_new["Guest_Popularity_percentage"].fillna(\n        listening_new.groupby("Podcast_Name")["Guest_Popularity_percentage"].transform("mean")\n    )\n).round(decimals=2)\n\nprint(\n    "Number of Unique values: ", listening_new["Guest_Popularity_percentage"].nunique(), "\n",\n    "Number of null values: ", listening_new["Guest_Popularity_percentage"].isnull().sum(), "\n\n",\n    "Distribution of values: \n", listening_new["Guest_Popularity_percentage"].value_counts(normalize=True)\n)\n'

In [10]:
'''
# We fill null values with the mean groupby Podcast

listening_new["Episode_Length_minutes"] = (listening_new["Episode_Length_minutes"].fillna(
    listening_new.groupby("Podcast_Name")["Episode_Length_minutes"].transform("mean")
))
print(
    "Number of null values: ", listening_new["Episode_Length_minutes"].isnull().sum(), "\n\n",
    "Distribution of values: \n", listening_new["Episode_Length_minutes"].value_counts(normalize=True)
)
'''

'\n# We fill null values with the mean groupby Podcast\n\nlistening_new["Episode_Length_minutes"] = (listening_new["Episode_Length_minutes"].fillna(\n    listening_new.groupby("Podcast_Name")["Episode_Length_minutes"].transform("mean")\n))\nprint(\n    "Number of null values: ", listening_new["Episode_Length_minutes"].isnull().sum(), "\n\n",\n    "Distribution of values: \n", listening_new["Episode_Length_minutes"].value_counts(normalize=True)\n)\n'

In [11]:
# We changed the format for more efficient memory usage

listening_new[listening_new.select_dtypes(["object"]).columns] = (
    listening_new.select_dtypes(["object"]).apply(
        lambda x: x.astype("category"))
)

In [12]:
listening_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 539049 entries, 1 to 749999
Data columns (total 11 columns):
 #   Column                       Non-Null Count   Dtype   
---  ------                       --------------   -----   
 0   Podcast_Name                 539049 non-null  category
 1   Episode_Title                539049 non-null  category
 2   Episode_Length_minutes       539049 non-null  float64 
 3   Genre                        539049 non-null  category
 4   Host_Popularity_percentage   539049 non-null  float64 
 5   Publication_Day              539049 non-null  category
 6   Publication_Time             539049 non-null  category
 7   Guest_Popularity_percentage  539049 non-null  float64 
 8   Number_of_Ads                539049 non-null  float64 
 9   Episode_Sentiment            539049 non-null  category
 10  Listening_Time_minutes       539049 non-null  float64 
dtypes: category(6), float64(5)
memory usage: 27.8 MB


# Data preprocessing

In [13]:
listening_end = listening_new.copy()

In [14]:
# We map the variables and change the format

eps_order = {"Negative" : 0, "Neutral" : 1, "Positive" : 2}
listening_end["Episode_Sentiment"] = listening_end["Episode_Sentiment"].map(eps_order)
listening_end["Episode_Sentiment"] = listening_end["Episode_Sentiment"].astype("float64")

In [15]:
# We separate the categorical variables from the numerical ones

df_numerical = listening_end.select_dtypes(include="number")
df_categorical = listening_end.select_dtypes(include="category")

## Encode

In [16]:
# We apply OrdinalEncoder to the remaining categorical variables

enc = OrdinalEncoder(categories="auto").set_output(transform="pandas")

enc_data = enc.fit_transform(df_categorical)

df_listening = pd.concat([enc_data, df_numerical], axis=1)

In [17]:
df_listening.corr().style.background_gradient(cmap='Greens')

Unnamed: 0,Podcast_Name,Episode_Title,Genre,Publication_Day,Publication_Time,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
Podcast_Name,1.0,0.006888,0.17896,0.002305,-0.00392,0.005552,-0.002932,-0.004145,0.008332,0.002715,0.004926
Episode_Title,0.006888,1.0,0.006393,0.000483,0.000421,-0.020702,0.021065,0.038394,0.007482,-0.004185,-0.019772
Genre,0.17896,0.006393,1.0,0.003841,-0.001384,-0.003672,-0.008252,0.006257,-0.004431,-0.003389,0.004616
Publication_Day,0.002305,0.000483,0.003841,1.0,-0.000826,0.005649,-0.002642,0.000103,0.007247,0.003195,0.004828
Publication_Time,-0.00392,0.000421,-0.001384,-0.000826,1.0,0.010177,-0.001841,-0.006775,-0.008343,0.010675,0.011506
Episode_Length_minutes,0.005552,-0.020702,-0.003672,0.005649,0.010177,1.0,0.021038,-0.009699,-0.054865,0.020101,0.915462
Host_Popularity_percentage,-0.002932,0.021065,-0.008252,-0.002642,-0.001841,0.021038,1.0,0.023072,-0.021174,0.004831,0.046092
Guest_Popularity_percentage,-0.004145,0.038394,0.006257,0.000103,-0.006775,-0.009699,0.023072,1.0,0.009071,0.001149,-0.013246
Number_of_Ads,0.008332,0.007482,-0.004431,0.007247,-0.008343,-0.054865,-0.021174,0.009071,1.0,-0.021429,-0.122978
Episode_Sentiment,0.002715,-0.004185,-0.003389,0.003195,0.010675,0.020101,0.004831,0.001149,-0.021429,1.0,0.035618


## Scaling

In [18]:
# We separate the target variable from the features and data to scale

x_listening = df_listening.drop(columns="Listening_Time_minutes")
y_listening = df_listening["Listening_Time_minutes"]

In [19]:
# Numerical variables to scale

df_numeric = x_listening[[
    "Episode_Length_minutes",
    "Host_Popularity_percentage",
    "Guest_Popularity_percentage"
]]

scaler = StandardScaler().set_output(transform="pandas")
scale_num = scaler.fit_transform(df_numeric)

# We create a df with the remaining variables

df_rest = x_listening.drop(
    columns=["Episode_Length_minutes",
             "Host_Popularity_percentage",
             "Guest_Popularity_percentage"]
)

# We concatenate the dataframes

x_end = pd.concat([df_rest, scale_num], axis=1)

In [20]:
x_end.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Podcast_Name,539049.0,23.52814,14.111985,0.0,11.0,23.0,36.0,47.0
Episode_Title,539049.0,50.4388,28.106487,0.0,26.0,52.0,75.0,99.0
Genre,539049.0,4.646507,2.959459,0.0,2.0,5.0,7.0,9.0
Publication_Day,539049.0,2.956527,2.000536,0.0,1.0,3.0,5.0,6.0
Publication_Time,539049.0,1.529894,1.121581,0.0,1.0,2.0,3.0,3.0
Number_of_Ads,539049.0,1.323776,1.104495,0.0,0.0,1.0,2.0,3.0
Episode_Sentiment,539049.0,0.996364,0.815434,0.0,0.0,1.0,2.0,2.0
Episode_Length_minutes,539049.0,2.024665e-16,1.000001,-1.925046,-0.869745,-0.026965,0.895777,7.925639
Host_Popularity_percentage,539049.0,-7.782306e-17,1.000001,-2.557827,-0.891669,0.000948,0.859855,1.762978
Guest_Popularity_percentage,539049.0,-3.732976e-17,1.000001,-1.83213,-0.840894,0.048974,0.857817,1.675429


## Feature Selection

In [21]:
'''
mi_scores = mutual_info_regression(x_end, y_listening)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=x_end.columns)
mi_scores = mi_scores.sort_values(ascending=False)
mi_scores
'''

'\nmi_scores = mutual_info_regression(x_end, y_listening)\nmi_scores = pd.Series(mi_scores, name="MI Scores", index=x_end.columns)\nmi_scores = mi_scores.sort_values(ascending=False)\nmi_scores\n'

Basándome en la correlación y la importancia mutua me desprendo de algunas variables. Probar otras combinaciones y métodos de selección de variables.

In [22]:
#x_end = x_end.drop(columns=["Podcast_Name", "Episode_Title"])

In [23]:
x_end.info()

<class 'pandas.core.frame.DataFrame'>
Index: 539049 entries, 1 to 749999
Data columns (total 10 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Podcast_Name                 539049 non-null  float64
 1   Episode_Title                539049 non-null  float64
 2   Genre                        539049 non-null  float64
 3   Publication_Day              539049 non-null  float64
 4   Publication_Time             539049 non-null  float64
 5   Number_of_Ads                539049 non-null  float64
 6   Episode_Sentiment            539049 non-null  float64
 7   Episode_Length_minutes       539049 non-null  float64
 8   Host_Popularity_percentage   539049 non-null  float64
 9   Guest_Popularity_percentage  539049 non-null  float64
dtypes: float64(10)
memory usage: 45.2 MB


# Model Selection

In [24]:
# We separate the data into training and validation sets

x_train, x_val, y_train, y_val = (
    train_test_split(
        x_end, y_listening, test_size=0.2, random_state=42
    )
)

In [25]:
# Create the KFold object

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

## XGBRegressor

In [26]:
# We create the model instance

xgbr = XGBRegressor()
xgbr.get_params()

{'objective': 'reg:squarederror',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [27]:
# We establish the parameters to test

xgbr_param_grid = {
    "gamma" : [0.1, 0.5, 0.8, 0, 1],
    "max_depth" : [3, 4, 5, 6, 7],
    "learning_rate" : [0.2, 0.1, 0.01, 0.001],
    "subsample" : [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "n_estimators" : [50, 100, 150, 200]
}

xgbr_grid = RandomizedSearchCV(
    xgbr,
    xgbr_param_grid,
    cv=kfold,
    scoring="neg_root_mean_squared_error",
    return_train_score=True
)

xgbr_search = xgbr_grid.fit(x_train, y_train)

print(
    f'Parameters: {xgbr_search.best_params_}\nScore: {xgbr_search.best_score_}'
)

Parameters: {'subsample': 0.9, 'n_estimators': 200, 'max_depth': 6, 'learning_rate': 0.1, 'gamma': 0.8}
Score: -10.391169743543811


In [28]:
# We evaluate the performance after an initial optimization

y_pred_xgbr = xgbr_search.best_estimator_.predict(x_val)

r2_xgbr = r2_score(y_val, y_pred_xgbr)

rmse_xgbr = np.sqrt(mean_squared_error(y_val, y_pred_xgbr))

print(f"XGBRegressor optimization\n\nR-squared score: {r2_xgbr}\nRMSE: {rmse_xgbr}")

XGBRegressor optimization

R-squared score: 0.8547889998687307
RMSE: 10.363032662594255


# Final Model

In [29]:
final_model = xgbr_search.best_estimator_

# We fit the best model

final_model.fit(x_train, y_train)

In [30]:
# We evaluate the performance of the final model

final_model_ypred = final_model.predict(x_val)

final_model_rmse = np.sqrt(mean_squared_error(y_val, final_model_ypred))

print("Final Model RMSE Score: %.3f" % final_model_rmse)

Final Model RMSE Score: 10.363


In [31]:
'''
# We create an explainer for the best estimator

explainer = shap.Explainer(final_model)
shap_values = explainer.shap_values(x_val)

# we visualize the importance

fig = shap.summary_plot(
    shap_values,
    x_val,
    show=False
)
plt.title("Feature Importance", fontsize=20, color='g', loc='left')
plt.xlabel("Mean SHAP Values", fontsize=20)
plt.ylabel("Features", fontsize=20)
plt.show()
'''

'\n# We create an explainer for the best estimator\n\nexplainer = shap.Explainer(final_model)\nshap_values = explainer.shap_values(x_val)\n\n# we visualize the importance\n\nfig = shap.summary_plot(\n    shap_values,\n    x_val,\n    show=False\n)\nplt.title("Feature Importance", fontsize=20, color=\'g\', loc=\'left\')\nplt.xlabel("Mean SHAP Values", fontsize=20)\nplt.ylabel("Features", fontsize=20)\nplt.show()\n'

# Test data

In [32]:
# We load the test data

df_test = pd.read_csv("/kaggle/input/playground-series-s5e4/test.csv")

In [33]:
df_test_new = df_test.copy()

In [34]:
# We check the null values

null_values_test = (
    pd.DataFrame(
        {f'Amount of Null Data' : df_test_new.isnull().sum(), 
         'Percentage of Null Data' : (
             df_test_new.isnull().sum()) / (len(df_test_new)) * (100)
        }
    ))

null_values_test.style.background_gradient(cmap='Greens')

Unnamed: 0,Amount of Null Data,Percentage of Null Data
id,0,0.0
Podcast_Name,0,0.0
Episode_Title,0,0.0
Episode_Length_minutes,28736,11.4944
Genre,0,0.0
Host_Popularity_percentage,0,0.0
Publication_Day,0,0.0
Publication_Time,0,0.0
Guest_Popularity_percentage,48832,19.5328
Number_of_Ads,0,0.0


In [35]:
# We replace the erroneous values

df_test_new["Number_of_Ads"] = (df_test_new["Number_of_Ads"].apply(lambda x: np.NaN if x>3 else x))
df_test_new["Number_of_Ads"] = (df_test_new["Number_of_Ads"].fillna(
    df_test_new.groupby("Podcast_Name")["Number_of_Ads"].transform(lambda v: v.mode()[0])
))
df_test_new["Host_Popularity_percentage"] = np.where(
    df_test_new["Host_Popularity_percentage"] > 100, 100, df_test_new["Host_Popularity_percentage"]
).round(decimals=2)
df_test_new["Guest_Popularity_percentage"] = np.where(
    df_test_new["Guest_Popularity_percentage"] > 100, 100, df_test_new["Guest_Popularity_percentage"]
).round(decimals=2)

In [36]:
#df_test_new.dropna(inplace=True)
#df_test_new.info()

In [37]:
# We filled in null values

df_test_new["Episode_Length_minutes"] = (df_test_new["Episode_Length_minutes"].fillna(
    df_test_new.groupby("Podcast_Name")["Episode_Length_minutes"].transform("mean")
))
df_test_new["Guest_Popularity_percentage"] = (df_test_new["Guest_Popularity_percentage"].fillna(
    df_test_new.groupby("Podcast_Name")["Guest_Popularity_percentage"].transform("mean")
))

In [38]:
# We start by removing the variables that we will not use

df_test_new = df_test_new.drop(columns="id")
#df_test_new = df_test_new.drop(columns=["id", "Podcast_Name", "Episode_Title"])

In [39]:
# We encode the categorical variables

df_test_new["Episode_Sentiment"] = df_test_new["Episode_Sentiment"].map(eps_order)
df_test_new["Episode_Sentiment"] = df_test_new["Episode_Sentiment"].astype("float64")

test_num = df_test_new.select_dtypes(include="number")
test_cat = df_test_new.select_dtypes(include="object")
test_enc = enc.fit_transform(test_cat)
test_encoded = pd.concat([test_enc, test_num[["Number_of_Ads", "Episode_Sentiment"]]], axis=1)

# We transform the data

test_num = test_num.drop(columns=["Number_of_Ads", "Episode_Sentiment"])
test_sca = scaler.transform(test_num)

# We concatenate the dataframes

test_end = pd.concat([test_encoded, test_sca], axis=1)

In [40]:
test_end.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Podcast_Name,250000.0,23.489368,14.110784,0.0,11.0,23.0,36.0,47.0
Episode_Title,250000.0,50.209524,28.19198,0.0,26.0,51.0,74.0,99.0
Genre,250000.0,4.641336,2.959436,0.0,2.0,5.0,7.0,9.0
Publication_Day,250000.0,2.962788,1.993649,0.0,1.0,3.0,5.0,6.0
Publication_Time,250000.0,1.522652,1.11854,0.0,1.0,1.0,3.0,3.0
Number_of_Ads,250000.0,1.347248,1.112669,0.0,0.0,1.0,2.0,3.0
Episode_Sentiment,250000.0,0.997236,0.815666,0.0,0.0,1.0,2.0,2.0
Episode_Length_minutes,250000.0,10.762615,4772.564067,-1.88765,-0.765766,-0.00386,0.798182,2386243.0
Host_Popularity_percentage,250000.0,-0.000536,1.001589,-2.505733,-0.896484,0.007514,0.860731,1.762978
Guest_Popularity_percentage,250000.0,-0.001582,0.895071,-1.83213,-0.620268,0.000366,0.657886,1.675429


In [41]:
# We apply the trained model

listening_predictions = final_model.predict(test_end)

In [42]:
# We review the result

print("Total predictions: ", len(listening_predictions), "\n")

Total predictions:  250000 



In [43]:
# We create the dataframe

listening_submission = pd.DataFrame({
    "id" : df_test["id"], 
    "Listening_Time_minutes" : listening_predictions
})

listening_submission.head()

Unnamed: 0,id,Listening_Time_minutes
0,750000,55.02026
1,750001,17.521896
2,750002,49.45089
3,750003,78.285522
4,750004,48.812168


In [44]:
# We convert the dataframe to a csv file

listening_submission.to_csv("submission.csv", index=False)