In [1]:
# We list the competition data files available under /kaggle/input

import warnings
warnings.filterwarnings("ignore")

import os
# Collect every file path below the Kaggle input directory, then print them
# one per line in the order os.walk yields them.
input_paths = [
    os.path.join(root, name)
    for root, _subdirs, names in os.walk('/kaggle/input')
    for name in names
]
for path in input_paths:
    print(path)

/kaggle/input/playground-series-s5e4/sample_submission.csv
/kaggle/input/playground-series-s5e4/train.csv
/kaggle/input/playground-series-s5e4/test.csv


# Libraries

In [2]:
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import shap

from sklearn.preprocessing import (
    OrdinalEncoder,
    StandardScaler
)
from sklearn.feature_selection import (
    mutual_info_regression
)
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    KFold,
    RandomizedSearchCV
)
from sklearn.metrics import (
    r2_score, 
    mean_squared_error
)
from sklearn.impute import KNNImputer
from xgboost import XGBRegressor
from sklearn.ensemble import ( 
    BaggingRegressor
)

# First analysis

Full notebook and analysis:

https://www.kaggle.com/code/les1781/listening-time-prediction-playground-series-s5-e4

In [3]:
# Load the training data. The "id" column becomes the DataFrame index,
# so it is not part of the columns processed below.

listening_new = pd.read_csv("/kaggle/input/playground-series-s5e4/train.csv", index_col="id")

# Data wrangling

In [4]:
# Per-column count and percentage of missing values in the training data

null_counts = listening_new.isnull().sum()

null_values = pd.DataFrame({
    "Amount of Null Data": null_counts,
    "Percentage of Null Data": null_counts / len(listening_new) * 100,
})

null_values.style.background_gradient(cmap="Greens")

Unnamed: 0,Amount of Null Data,Percentage of Null Data
Podcast_Name,0,0.0
Episode_Title,0,0.0
Episode_Length_minutes,87093,11.6124
Genre,0,0.0
Host_Popularity_percentage,0,0.0
Publication_Day,0,0.0
Publication_Time,0,0.0
Guest_Popularity_percentage,146030,19.470667
Number_of_Ads,1,0.000133
Episode_Sentiment,0,0.0


In [5]:
# We replace the erroneous values

# Number_of_Ads: values above 3 are treated as erroneous and blanked out.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0, and
# np.where replaces the slow row-by-row apply.
listening_new["Number_of_Ads"] = np.where(
    listening_new["Number_of_Ads"] > 3, np.nan, listening_new["Number_of_Ads"]
)

# Blanked and originally missing ad counts are filled with the most frequent
# count of the same podcast. next(iter(...), np.nan) takes the first mode and
# falls back to NaN when a podcast has no valid count at all (mode() is then
# empty and indexing it with [0] would raise).
listening_new["Number_of_Ads"] = listening_new["Number_of_Ads"].fillna(
    listening_new.groupby("Podcast_Name")["Number_of_Ads"].transform(
        lambda v: next(iter(v.mode()), np.nan)
    )
)

# Popularity is a percentage: cap values above 100 at 100 and round to two
# decimals. Missing values stay missing (NaN > 100 is False).
for popularity_col in ("Host_Popularity_percentage", "Guest_Popularity_percentage"):
    listening_new[popularity_col] = np.where(
        listening_new[popularity_col] > 100, 100, listening_new[popularity_col]
    ).round(decimals=2)

In [6]:
# We drop null values

#listening_new.dropna(inplace=True)

In [7]:
# Fill the remaining missing values with the mean of the same podcast

for column in ("Guest_Popularity_percentage", "Episode_Length_minutes"):
    podcast_mean = listening_new.groupby("Podcast_Name")[column].transform("mean")
    listening_new[column] = listening_new[column].fillna(podcast_mean)

In [8]:
# Convert the text (object) columns to the category dtype to reduce memory usage

object_columns = listening_new.select_dtypes(["object"]).columns
listening_new[object_columns] = listening_new[object_columns].astype("category")

In [9]:
# Show dtypes, non-null counts and memory usage of the cleaned training frame
listening_new.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 11 columns):
 #   Column                       Non-Null Count   Dtype   
---  ------                       --------------   -----   
 0   Podcast_Name                 750000 non-null  category
 1   Episode_Title                750000 non-null  category
 2   Episode_Length_minutes       750000 non-null  float64 
 3   Genre                        750000 non-null  category
 4   Host_Popularity_percentage   750000 non-null  float64 
 5   Publication_Day              750000 non-null  category
 6   Publication_Time             750000 non-null  category
 7   Guest_Popularity_percentage  750000 non-null  float64 
 8   Number_of_Ads                750000 non-null  float64 
 9   Episode_Sentiment            750000 non-null  category
 10  Listening_Time_minutes       750000 non-null  float64 
dtypes: category(6), float64(5)
memory usage: 38.6 MB


# Data preprocessing

In [10]:
# Preprocess a copy so the cleaned frame listening_new is left unchanged
listening_end = listening_new.copy()

In [11]:
# Episode_Sentiment has a natural order, so it is mapped to 0/1/2 by hand
# and stored as float64 instead of going through the generic encoder.

eps_order = dict(Negative=0, Neutral=1, Positive=2)
listening_end["Episode_Sentiment"] = (
    listening_end["Episode_Sentiment"]
    .map(eps_order)
    .astype("float64")
)

In [12]:
# Split the columns by dtype: numeric ones first, category ones second

df_numerical, df_categorical = (
    listening_end.select_dtypes(include=dtype_group)
    for dtype_group in ("number", "category")
)

## Encode

In [13]:
# Ordinal-encode the remaining categorical columns. The encoder is configured
# to return a DataFrame so the column names are kept.

enc = OrdinalEncoder(categories="auto")
enc.set_output(transform="pandas")

enc_data = enc.fit_transform(df_categorical)

# Encoded categoricals first, numeric columns (including the target) after
df_listening = pd.concat(
    [enc_data, df_numerical],
    axis=1,
)

In [14]:
# Pairwise correlations between the encoded features and the target, shaded by value
df_listening.corr().style.background_gradient(cmap='Greens')

Unnamed: 0,Podcast_Name,Episode_Title,Genre,Publication_Day,Publication_Time,Episode_Length_minutes,Host_Popularity_percentage,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
Podcast_Name,1.0,0.006669,0.188052,0.003117,-0.002328,0.006335,-0.002424,-0.004752,0.008955,0.003115,0.004346
Episode_Title,0.006669,1.0,0.006847,0.0011,-0.000304,-0.019925,0.020208,0.034565,0.006395,-0.004845,-0.020842
Genre,0.188052,0.006847,1.0,0.003305,-0.001329,-0.000452,-0.008807,0.007003,-0.003429,-0.001588,0.00494
Publication_Day,0.003117,0.0011,0.003305,1.0,-0.00086,0.006598,-0.003698,0.000398,0.004986,0.004594,0.00403
Publication_Time,-0.002328,-0.000304,-0.001329,-0.00086,1.0,0.009908,0.000182,-0.005023,-0.006727,0.008923,0.013225
Episode_Length_minutes,0.006335,-0.019925,-0.000452,0.006598,0.009908,1.0,0.022221,-0.008353,-0.054821,0.024724,0.866202
Host_Popularity_percentage,-0.002424,0.020208,-0.008807,-0.003698,0.000182,0.022221,1.0,0.02006,-0.017831,0.007063,0.050854
Guest_Popularity_percentage,-0.004752,0.034565,0.007003,0.000398,-0.005023,-0.008353,0.02006,1.0,0.007658,0.000488,-0.014399
Number_of_Ads,0.008955,0.006395,-0.003429,0.004986,-0.006727,-0.054821,-0.017831,0.007658,1.0,-0.020627,-0.124201
Episode_Sentiment,0.003115,-0.004845,-0.001588,0.004594,0.008923,0.024724,0.007063,0.000488,-0.020627,1.0,0.03947


## Scaling

In [15]:
# Target column on one side, every other column as features on the other

y_listening = df_listening["Listening_Time_minutes"]
x_listening = df_listening.drop("Listening_Time_minutes", axis=1)

In [16]:
# Continuous columns that get standardised; the others are left as they are

scaled_columns = [
    "Episode_Length_minutes",
    "Host_Popularity_percentage",
    "Guest_Popularity_percentage",
]
df_numeric = x_listening[scaled_columns]

# The scaler is fitted here and kept for later reuse
scaler = StandardScaler().set_output(transform="pandas")
scale_num = scaler.fit_transform(df_numeric)

# Everything that was not scaled
df_rest = x_listening.drop(columns=scaled_columns)

# Final feature matrix: unscaled columns first, scaled columns last
x_end = pd.concat([df_rest, scale_num], axis=1)

In [17]:
# Summary statistics of the final feature matrix, one row per feature
x_end.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Podcast_Name,750000.0,23.51573,14.137577,0.0,11.0,23.0,37.0,47.0
Episode_Title,750000.0,50.286,28.186681,0.0,26.0,51.0,74.0,99.0
Genre,750000.0,4.648788,2.963073,0.0,2.0,5.0,7.0,9.0
Publication_Day,750000.0,2.962776,1.997399,0.0,1.0,3.0,5.0,6.0
Publication_Time,750000.0,1.522868,1.119361,0.0,1.0,1.0,3.0,3.0
Number_of_Ads,750000.0,1.347873,1.110966,0.0,0.0,1.0,2.0,3.0
Episode_Sentiment,750000.0,0.9979693,0.81544,0.0,0.0,1.0,2.0,2.0
Episode_Length_minutes,750000.0,2.563828e-16,1.000001,-2.080627,-0.809072,-0.002546,0.832465,8.410505
Host_Popularity_percentage,750000.0,-2.175208e-16,1.000001,-2.56029,-0.894076,0.00833,0.860018,1.754991
Guest_Popularity_percentage,750000.0,-3.267739e-16,1.000001,-2.045663,-0.692544,0.007787,0.736553,1.870745


## Feature Selection

In [18]:
'''
mi_scores = mutual_info_regression(x_end, y_listening)
mi_scores = pd.Series(mi_scores, name="MI Scores", index=x_end.columns)
mi_scores = mi_scores.sort_values(ascending=False)
mi_scores
'''

'\nmi_scores = mutual_info_regression(x_end, y_listening)\nmi_scores = pd.Series(mi_scores, name="MI Scores", index=x_end.columns)\nmi_scores = mi_scores.sort_values(ascending=False)\nmi_scores\n'

In [19]:
# We make a selection of variables

#x_end = x_end.drop(columns=["Podcast_Name", "Episode_Title"])

In [20]:
# Show dtypes, non-null counts and memory usage of the feature matrix
x_end.info()

<class 'pandas.core.frame.DataFrame'>
Index: 750000 entries, 0 to 749999
Data columns (total 10 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   Podcast_Name                 750000 non-null  float64
 1   Episode_Title                750000 non-null  float64
 2   Genre                        750000 non-null  float64
 3   Publication_Day              750000 non-null  float64
 4   Publication_Time             750000 non-null  float64
 5   Number_of_Ads                750000 non-null  float64
 6   Episode_Sentiment            750000 non-null  float64
 7   Episode_Length_minutes       750000 non-null  float64
 8   Host_Popularity_percentage   750000 non-null  float64
 9   Guest_Popularity_percentage  750000 non-null  float64
dtypes: float64(10)
memory usage: 62.9 MB


# Model Selection

In [21]:
# Hold out 20% of the rows for validation; the seed keeps the split repeatable

x_train, x_val, y_train, y_val = train_test_split(
    x_end,
    y_listening,
    test_size=0.2,
    random_state=42,
)

In [22]:
# 10-fold cross-validation splitter; rows are shuffled with a fixed seed
# so the folds are the same on every run.

kfold = KFold(n_splits=10, shuffle=True, random_state=42)

## XGBRegressor

In [23]:
# Baseline XGBRegressor created without any explicit hyperparameters.
# It is only instantiated here, not fitted.

xgbr = XGBRegressor()

# Fitting the baseline is currently disabled:

#xgbr.fit(x_train, y_train)

In [24]:
# Inspect the hyperparameters of the baseline XGBRegressor
xgbr.get_params()

{'objective': 'reg:squarederror',
 'base_score': None,
 'booster': None,
 'callbacks': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'device': None,
 'early_stopping_rounds': None,
 'enable_categorical': False,
 'eval_metric': None,
 'feature_types': None,
 'gamma': None,
 'grow_policy': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_bin': None,
 'max_cat_threshold': None,
 'max_cat_to_onehot': None,
 'max_delta_step': None,
 'max_depth': None,
 'max_leaves': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'multi_strategy': None,
 'n_estimators': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'sampling_method': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [25]:
'''
# We establish the parameters to test

xgbr_param_grid = {
    "gamma" : [0.1, 0.5, 0.8, 0, 1],
    "max_depth" : [3, 4, 5, 6, 7],
    "learning_rate" : [0.2, 0.1, 0.01, 0.001],
    "subsample" : [0.5, 0.6, 0.7, 0.8, 0.9, 1],
    "n_estimators" : [50, 100, 150, 200]
}

xgbr_grid = RandomizedSearchCV(
    xgbr,
    xgbr_param_grid,
    cv=kfold,
    scoring="neg_root_mean_squared_error",
    return_train_score=True
)

xgbr_search = xgbr_grid.fit(x_train, y_train)

print(
    f'Parameters: {xgbr_search.best_params_}\nScore: {xgbr_search.best_score_}'
)
'''

'\n# We establish the parameters to test\n\nxgbr_param_grid = {\n    "gamma" : [0.1, 0.5, 0.8, 0, 1],\n    "max_depth" : [3, 4, 5, 6, 7],\n    "learning_rate" : [0.2, 0.1, 0.01, 0.001],\n    "subsample" : [0.5, 0.6, 0.7, 0.8, 0.9, 1],\n    "n_estimators" : [50, 100, 150, 200]\n}\n\nxgbr_grid = RandomizedSearchCV(\n    xgbr,\n    xgbr_param_grid,\n    cv=kfold,\n    scoring="neg_root_mean_squared_error",\n    return_train_score=True\n)\n\nxgbr_search = xgbr_grid.fit(x_train, y_train)\n\nprint(\n    f\'Parameters: {xgbr_search.best_params_}\nScore: {xgbr_search.best_score_}\'\n)\n'

In [26]:
# We evaluate the performance after an initial optimization

#y_pred_xgbr = xgbr_search.best_estimator_.predict(x_val)

#r2_xgbr = r2_score(y_val, y_pred_xgbr)

#rmse_xgbr = np.sqrt(mean_squared_error(y_val, y_pred_xgbr))

#print(f"XGBRegressor optimization\n\nR-squared score: {r2_xgbr}\nRMSE: {rmse_xgbr}")

- With data dropped in variable management

      RandomizedSearchCV:

      Parameters: {'subsample':0.9, 'n_estimators':150, 'max_depth':6, 'learning_rate':0.2, 'gamma':0.1}

      Score: -10.385593205700733

      R-squared score: 0.8548779072393202

      RMSE: 10.359859724404574

In [27]:
# XGBR different approaches

# The hyperparameters are hard-coded instead of being read from a search object.
#xgbr_final = xgbr_search.best_estimator_
xgbr_best_params = {
    "subsample": 0.9,
    "n_estimators": 150,
    "max_depth": 6,
    "learning_rate": 0.2,
    "gamma": 0.1,
}
xgbr_final = XGBRegressor(**xgbr_best_params)
xgbr_final.fit(x_train, y_train)

## BaggingRegressor

In [28]:
# Baseline BaggingRegressor created without any explicit hyperparameters.
# It is only instantiated here, not fitted.

bagr = BaggingRegressor()

# Fitting the baseline is currently disabled:

#bagr.fit(x_train, y_train)

In [29]:
'''
y_pred_bagr = bagr.predict(x_val)

r2_bagr = r2_score(y_val, y_pred_bagr)

rmse_bagr = np.sqrt(mean_squared_error(y_val, y_pred_bagr))

print(f"BaggingRegressor\n\nR-squared score: {r2_bagr}\nRMSE: {rmse_bagr}")
'''

'\ny_pred_bagr = bagr.predict(x_val)\n\nr2_bagr = r2_score(y_val, y_pred_bagr)\n\nrmse_bagr = np.sqrt(mean_squared_error(y_val, y_pred_bagr))\n\nprint(f"BaggingRegressor\n\nR-squared score: {r2_bagr}\nRMSE: {rmse_bagr}")\n'

BaggingRegressor (Default)

R-squared score: 0.7586370826352582

RMSE: 13.326718743148652

In [30]:
# Inspect the hyperparameters of the baseline BaggingRegressor
bagr.get_params()

{'base_estimator': 'deprecated',
 'bootstrap': True,
 'bootstrap_features': False,
 'estimator': None,
 'max_features': 1.0,
 'max_samples': 1.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [31]:
'''
# We establish the parameters to test

bagr_param_grid = {
    "estimator" : [None, xgbr, xgbr_final],
    "max_features" : [0.2, 0.5, 1.0],
    "max_samples" : [0.2, 0.5, 1.0],
    "n_estimators" : [10, 20, 30]
}

bagr_grid = RandomizedSearchCV(
    bagr,
    bagr_param_grid,
    cv=kfold,
    scoring="neg_root_mean_squared_error",
    return_train_score=True
)

bagr_search = bagr_grid.fit(x_train, y_train)

print(
    f'Parameters: {bagr_search.best_params_}\nScore: {bagr_search.best_score_}'
)
'''

'\n# We establish the parameters to test\n\nbagr_param_grid = {\n    "estimator" : [None, xgbr, xgbr_final],\n    "max_features" : [0.2, 0.5, 1.0],\n    "max_samples" : [0.2, 0.5, 1.0],\n    "n_estimators" : [10, 20, 30]\n}\n\nbagr_grid = RandomizedSearchCV(\n    bagr,\n    bagr_param_grid,\n    cv=kfold,\n    scoring="neg_root_mean_squared_error",\n    return_train_score=True\n)\n\nbagr_search = bagr_grid.fit(x_train, y_train)\n\nprint(\n    f\'Parameters: {bagr_search.best_params_}\nScore: {bagr_search.best_score_}\'\n)\n'

Parameters: {'n_estimators': 20, 'max_samples': 1.0, 'max_features': 1.0, 'estimator': XGBRegressor(base_score=None, booster=None, callbacks=None,
             colsample_bylevel=None, colsample_bynode=None,
             colsample_bytree=None, device=None, early_stopping_rounds=None,
             enable_categorical=False, eval_metric=None, feature_types=None,
             gamma=0.1, grow_policy=None, importance_type=None,
             interaction_constraints=None, learning_rate=0.2, max_bin=None,
             max_cat_threshold=None, max_cat_to_onehot=None,
             max_delta_step=None, max_depth=6, max_leaves=None,
             min_child_weight=None, missing=nan, monotone_constraints=None,
             multi_strategy=None, n_estimators=150, n_jobs=None,
             num_parallel_tree=None, random_state=None, ...)}
             
Score: -13.031101049045333

In [32]:
'''
# We evaluate the performance after an initial optimization

y_pred_bagr = bagr_search.best_estimator_.predict(x_val)

r2_bagr = r2_score(y_val, y_pred_bagr)

rmse_bagr = np.sqrt(mean_squared_error(y_val, y_pred_bagr))

print(f"BaggingRegressor optimization\n\nR-squared score: {r2_bagr}\nRMSE: {rmse_bagr}")
'''

'\n# We evaluate the performance after an initial optimization\n\ny_pred_bagr = bagr_search.best_estimator_.predict(x_val)\n\nr2_bagr = r2_score(y_val, y_pred_bagr)\n\nrmse_bagr = np.sqrt(mean_squared_error(y_val, y_pred_bagr))\n\nprint(f"BaggingRegressor optimization\n\nR-squared score: {r2_bagr}\nRMSE: {rmse_bagr}")\n'

BaggingRegressor optimization

R-squared score: 0.7705749925584561

RMSE: 12.992967020835195

In [33]:
# BAGR different approaches

#bagr_final = bagr_search.best_estimator_

#bagr_final = BaggingRegressor(n_estimators=20, max_samples=1.0, max_features=1.0, estimator=xgbr_final)

#bagr_final.fit(x_train, y_train)

# Final Model

In [34]:
# We define the final model: a bagging ensemble of 20 XGBRegressors that use
# the hyperparameters of xgbr_final, each trained on a bootstrap sample of
# all rows (max_samples=1.0) with all features (max_features=1.0).
# random_state fixes the bootstrap resampling so the fitted model and its
# validation score are the same on every run; 42 matches the seed used for
# train_test_split and KFold in this notebook.

final_model = BaggingRegressor(
    estimator=xgbr_final,
    n_estimators=20,
    max_samples=1.0,
    max_features=1.0,
    random_state=42,
)

# We fit the best model

final_model.fit(x_train, y_train)

In [35]:
# Validation RMSE of the final model

final_model_ypred = final_model.predict(x_val)

validation_mse = mean_squared_error(y_val, final_model_ypred)
final_model_rmse = np.sqrt(validation_mse)

print("Final Model RMSE Score: %.3f" % (final_model_rmse,))

Final Model RMSE Score: 12.992


bagr Model RMSE Score: 13.337

bagr, estimator=xgbr_final Model RMSE Score: 12.992

In [36]:
'''
# We create an explainer for the best estimator

explainer = shap.Explainer(final_model)
shap_values = explainer.shap_values(x_val)

# we visualize the importance

fig = shap.summary_plot(
    shap_values,
    x_val,
    show=False
)
plt.title("Feature Importance", fontsize=20, color='g', loc='left')
plt.xlabel("Mean SHAP Values", fontsize=20)
plt.ylabel("Features", fontsize=20)
plt.show()
'''

'\n# We create an explainer for the best estimator\n\nexplainer = shap.Explainer(final_model)\nshap_values = explainer.shap_values(x_val)\n\n# we visualize the importance\n\nfig = shap.summary_plot(\n    shap_values,\n    x_val,\n    show=False\n)\nplt.title("Feature Importance", fontsize=20, color=\'g\', loc=\'left\')\nplt.xlabel("Mean SHAP Values", fontsize=20)\nplt.ylabel("Features", fontsize=20)\nplt.show()\n'

# Test data

In [37]:
# Load the test data. "id" is kept as a regular column here because it is
# needed later to build the submission file.

df_test = pd.read_csv("/kaggle/input/playground-series-s5e4/test.csv")

In [38]:
# Preprocess a copy so the raw test frame df_test (and its ids) stays intact
df_test_new = df_test.copy()

In [39]:
# Per-column count and percentage of missing values in the test data

null_counts_test = df_test_new.isnull().sum()

null_values_test = pd.DataFrame({
    'Amount of Null Data': null_counts_test,
    'Percentage of Null Data': null_counts_test / len(df_test_new) * 100,
})

null_values_test.style.background_gradient(cmap='Greens')

Unnamed: 0,Amount of Null Data,Percentage of Null Data
id,0,0.0
Podcast_Name,0,0.0
Episode_Title,0,0.0
Episode_Length_minutes,28736,11.4944
Genre,0,0.0
Host_Popularity_percentage,0,0.0
Publication_Day,0,0.0
Publication_Time,0,0.0
Guest_Popularity_percentage,48832,19.5328
Number_of_Ads,0,0.0


In [40]:
# Drop "id" from the test features: it was the index (not a column) in the
# training frame. The ids remain available in df_test for the submission.

df_test_new = df_test_new.drop(columns="id")
# Disabled alternative that also drops the podcast name and episode title:
#df_test_new = df_test_new.drop(columns=["id", "Podcast_Name", "Episode_Title"])

In [41]:
# We replace the erroneous values (same rules as for the training data)

# Number_of_Ads: values above 3 are treated as erroneous and blanked out.
# np.nan is used because the np.NaN alias was removed in NumPy 2.0, and
# np.where replaces the slow row-by-row apply.
df_test_new["Number_of_Ads"] = np.where(
    df_test_new["Number_of_Ads"] > 3, np.nan, df_test_new["Number_of_Ads"]
)

# Blanked and originally missing ad counts are filled with the most frequent
# count of the same podcast. next(iter(...), np.nan) takes the first mode and
# falls back to NaN when a podcast has no valid count at all (mode() is then
# empty and indexing it with [0] would raise).
df_test_new["Number_of_Ads"] = df_test_new["Number_of_Ads"].fillna(
    df_test_new.groupby("Podcast_Name")["Number_of_Ads"].transform(
        lambda v: next(iter(v.mode()), np.nan)
    )
)

# Popularity is a percentage: cap values above 100 at 100 and round to two
# decimals. Missing values stay missing (NaN > 100 is False).
for popularity_col in ("Host_Popularity_percentage", "Guest_Popularity_percentage"):
    df_test_new[popularity_col] = np.where(
        df_test_new[popularity_col] > 100, 100, df_test_new[popularity_col]
    ).round(decimals=2)

In [42]:
'''
# We filled in null values with KNNImputer

test_object = df_test_new.select_dtypes(include="object")
test_impute = df_test_new.select_dtypes(include="number")

imputer = KNNImputer().set_output(transform="pandas")
after_imputation = imputer.fit_transform(test_impute)

df_test_new = pd.concat([test_object, after_imputation], axis=1)
'''

'\n# We filled in null values with KNNImputer\n\ntest_object = df_test_new.select_dtypes(include="object")\ntest_impute = df_test_new.select_dtypes(include="number")\n\nimputer = KNNImputer().set_output(transform="pandas")\nafter_imputation = imputer.fit_transform(test_impute)\n\ndf_test_new = pd.concat([test_object, after_imputation], axis=1)\n'

In [43]:
# Fill missing values with the mean of the same podcast, computed on the test data

for column in ("Episode_Length_minutes", "Guest_Popularity_percentage"):
    podcast_mean = df_test_new.groupby("Podcast_Name")[column].transform("mean")
    df_test_new[column] = df_test_new[column].fillna(podcast_mean)

In [44]:
# We encode the categorical variables

# Episode_Sentiment uses the same manual 0/1/2 mapping as the training data
df_test_new["Episode_Sentiment"] = df_test_new["Episode_Sentiment"].map(eps_order)
df_test_new["Episode_Sentiment"] = df_test_new["Episode_Sentiment"].astype("float64")

test_num = df_test_new.select_dtypes(include="number")
test_cat = df_test_new.select_dtypes(include="object")

# Reuse the encoder fitted on the training data (transform, not fit_transform).
# Refitting on the test set would derive the integer codes from the test
# categories, which are not guaranteed to match the codes the model was
# trained on. With the encoder's default handle_unknown="error", a category
# that was not seen in training raises instead of being silently mis-encoded.
test_enc = enc.transform(test_cat)
test_encoded = pd.concat([test_enc, test_num[["Number_of_Ads", "Episode_Sentiment"]]], axis=1)

# We transform the data with the scaler that was fitted on the training data

test_num = test_num.drop(columns=["Number_of_Ads", "Episode_Sentiment"])
test_sca = scaler.transform(test_num)

# We concatenate the dataframes

test_end = pd.concat([test_encoded, test_sca], axis=1)

In [45]:
# Summary statistics of the processed test features, for comparison with the training features
test_end.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Podcast_Name,250000.0,23.489368,14.110784,0.0,11.0,23.0,36.0,47.0
Episode_Title,250000.0,50.209524,28.19198,0.0,26.0,51.0,74.0,99.0
Genre,250000.0,4.641336,2.959436,0.0,2.0,5.0,7.0,9.0
Publication_Day,250000.0,2.962788,1.993649,0.0,1.0,3.0,5.0,6.0
Publication_Time,250000.0,1.522652,1.11854,0.0,1.0,1.0,3.0,3.0
Number_of_Ads,250000.0,1.347248,1.112669,0.0,0.0,1.0,2.0,3.0
Episode_Sentiment,250000.0,0.997236,0.815666,0.0,0.0,1.0,2.0,2.0
Episode_Length_minutes,250000.0,11.420409,5063.475976,-2.000953,-0.810685,-0.002337,0.848594,2531697.0
Host_Popularity_percentage,250000.0,-0.006268,1.000309,-2.508262,-0.901071,0.001771,0.853897,1.754991
Guest_Popularity_percentage,250000.0,-0.001742,0.999403,-2.045663,-0.692544,0.000433,0.734595,1.870745


In [46]:
# Predict the listening time of every test row with the fitted final model

listening_predictions = final_model.predict(test_end)

In [47]:
# Sanity check: print the number of predictions produced

print("Total predictions: ", len(listening_predictions), "\n")

Total predictions:  250000 



In [48]:
# Build the submission frame: the original test ids next to their predictions

listening_submission = df_test[["id"]].assign(
    Listening_Time_minutes=listening_predictions
)

listening_submission.head()

Unnamed: 0,id,Listening_Time_minutes
0,750000,55.587147
1,750001,17.614498
2,750002,49.638718
3,750003,78.239273
4,750004,49.173161


In [49]:
# Write the submission to submission.csv in the working directory;
# index=False leaves the DataFrame index out of the file.

listening_submission.to_csv("submission.csv", index=False)