# Predict Podcast Listening Time
By Josh Houlding

<b>Competition Page:</b> [https://www.kaggle.com/competitions/playground-series-s5e4/overview](https://www.kaggle.com/competitions/playground-series-s5e4/overview)

The following is an AutoML solution to the April 2025 Kaggle competition on predicting a podcast episode's listening time from various episode attributes, implemented using the H2O library.

# Loading the data

In [75]:
#!pip install h2o
import h2o
import pandas as pd

# Read the competition's train and test splits from the working directory
train, test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [76]:
# Preview the first three training rows
train.head(n=3)

Unnamed: 0,id,Podcast_Name,Episode_Title,Episode_Length_minutes,Genre,Host_Popularity_percentage,Publication_Day,Publication_Time,Guest_Popularity_percentage,Number_of_Ads,Episode_Sentiment,Listening_Time_minutes
0,0,Mystery Matters,Episode 98,,True Crime,74.81,Thursday,Night,,0.0,Positive,31.41998
1,1,Joke Junction,Episode 26,119.8,Comedy,66.95,Saturday,Afternoon,75.95,2.0,Negative,88.01241
2,2,Study Sessions,Episode 16,73.9,Education,69.97,Tuesday,Evening,8.97,0.0,Negative,44.92531


# Cleaning the data

In [77]:
# Remove the columns listed below from both splits
columns_to_drop = ["Podcast_Name", "Episode_Title"]
train, test = (frame.drop(columns=columns_to_drop) for frame in (train, test))

In [78]:
# Remove duplicate training rows.
# The test set is deliberately left untouched: every test `id` needs a prediction
# in the submission file, so no test row may ever be dropped.
train = train.drop_duplicates()

In [79]:
# Function to fill missing values with the mode
def fill_na_with_mode(df):
    """Fill each column's missing values with that column's most frequent value.

    The frame is modified in place and also returned, so both
    ``fill_na_with_mode(df)`` and ``df = fill_na_with_mode(df)`` work.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns may contain missing values.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with missing values filled where a mode exists.
        A column with no mode (e.g. entirely missing) is left unchanged and a
        message is printed.
    """
    for col in df.columns:
        modes = df[col].mode()
        if modes.empty:  # e.g. every value in the column is missing
            print(f"No mode found for column '{col}'. Missing values not filled.")
            continue
        # Assign the filled column back explicitly. `df[col].fillna(..., inplace=True)`
        # operates on an intermediate Series and does not update `df` when pandas
        # Copy-on-Write is active. When several values tie, the first mode is used.
        df[col] = df[col].fillna(modes.iloc[0])
    return df

# Fill missing values
train = fill_na_with_mode(train)
test = fill_na_with_mode(test)

# Check missing value counts: number of rows that still contain at least one NaN
# (`.isna().sum().sum()` would count missing cells, not rows as the labels state)
print(f"Training rows with missing values: {train.isna().any(axis=1).sum()}")
print(f"Testing rows with missing values: {test.isna().any(axis=1).sum()}")

Training rows with missing values: 0
Testing rows with missing values: 0


In [80]:
# Summarise column dtypes and non-null counts for each split
print("TRAINING SET")
train.info()
print("________________________________________________________________ \n")
print("TESTING SET")
test.info()

TRAINING SET
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 750000 entries, 0 to 749999
Data columns (total 10 columns):
 #   Column                       Non-Null Count   Dtype  
---  ------                       --------------   -----  
 0   id                           750000 non-null  int64  
 1   Episode_Length_minutes       750000 non-null  float64
 2   Genre                        750000 non-null  object 
 3   Host_Popularity_percentage   750000 non-null  float64
 4   Publication_Day              750000 non-null  object 
 5   Publication_Time             750000 non-null  object 
 6   Guest_Popularity_percentage  750000 non-null  float64
 7   Number_of_Ads                750000 non-null  float64
 8   Episode_Sentiment            750000 non-null  object 
 9   Listening_Time_minutes       750000 non-null  float64
dtypes: float64(5), int64(1), object(4)
memory usage: 57.2+ MB
________________________________________________________________ 

TESTING SET
<class 'pandas.core.frame.

# H2O AutoML

In [None]:
import multiprocessing

# Set proportion of the CPU H2O is allowed to use
num_cores = multiprocessing.cpu_count()
cpu_percentage = 0.5
# int() truncates, so a single-core machine would otherwise request 0 threads;
# always ask for at least one.
allowed_threads = max(1, int(num_cores * cpu_percentage))

# Initialize H2O with custom computation resource allocations
allowed_memory = "23G" # My computer has 24GB of RAM; lower this on smaller machines
h2o.init(nthreads=allowed_threads, max_mem_size=allowed_memory)

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; OpenJDK 64-Bit Server VM (Temurin)(build 25.432-b06, mixed mode)
  Starting server from C:\Users\jdh10\anaconda3\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\jdh10\AppData\Local\Temp\tmpa14ix_yp
  JVM stdout: C:\Users\jdh10\AppData\Local\Temp\tmpa14ix_yp\h2o_jdh10_started_from_python.out
  JVM stderr: C:\Users\jdh10\AppData\Local\Temp\tmpa14ix_yp\h2o_jdh10_started_from_python.err


In [None]:
# Wrap both pandas DataFrames as H2OFrames (the names are rebound to the H2O objects)
train = h2o.H2OFrame(python_obj=train)
test = h2o.H2OFrame(python_obj=test)

In [None]:
# Label column, and the feature list: every column except the label and the row id
y = "Listening_Time_minutes"
x = train.columns
for non_feature in (y, "id"):
    x.remove(non_feature)

In [None]:
# Optional: libraries for multithreaded conversion of H2O dataframes to Pandas
# dataframes (used by `as_data_frame(use_multi_thread=True)`). If they are missing,
# install them once into the kernel's environment:
# %pip install polars pyarrow

In [None]:
from h2o.automl import H2OAutoML
import pandas as pd

# Fit AutoML on the training frame (time-boxed run with a fixed seed)
aml = H2OAutoML(seed=1, max_runtime_secs=3600)  # Adjust runtime as needed
aml.train(training_frame=train, y=y, x=x)

# Report how long the AutoML run took
training_info = aml.training_info
aml_elapsed_time = training_info["duration_secs"]
print(f"AutoML elapsed time from training info: {aml_elapsed_time} seconds \n")

# Display the model leaderboard
lb = aml.leaderboard
print(lb)

# Take AutoML's leader model
best_model = aml.leader

# Score the test frame with the leader
predictions = best_model.predict(test)

# Bring ids and predictions back into pandas and assemble the submission table
ids = test["id"].as_data_frame(use_multi_thread=True)
predicted_values = predictions["predict"].as_data_frame(use_multi_thread=True)
submission = pd.concat([ids, predicted_values], axis=1).set_axis(
    ["id", "Listening_Time_minutes"], axis=1
)

# Write the submission file
submission.to_csv("submission.csv", index=False)

In [None]:
# Spot-check five reproducibly sampled rows of the submission
submission.sample(n=5, random_state=42)

In [None]:
# Stop the H2O cluster without asking for confirmation
h2o.cluster().shutdown(prompt=False)