In [1]:
# H2O AutoML - All Features
import h2o
from h2o.automl import H2OAutoML
from sklearn.model_selection import train_test_split
import pandas as pd

In [2]:
# Start H2O
h2o.init()

Checking whether there is an H2O instance running at http://localhost:54321..... not found.
Attempting to start a local H2O server...
; Java HotSpot(TM) 64-Bit Server VM (build 25.441-b07, mixed mode)
  Starting server from C:\Users\John\anaconda3\envs\mlops_env\Lib\site-packages\h2o\backend\bin\h2o.jar
  Ice root: C:\Users\John\AppData\Local\Temp\tmp_ra6i9dt
  JVM stdout: C:\Users\John\AppData\Local\Temp\tmp_ra6i9dt\h2o_John_started_from_python.out
  JVM stderr: C:\Users\John\AppData\Local\Temp\tmp_ra6i9dt\h2o_John_started_from_python.err
  Server is running at http://127.0.0.1:54321
Connecting to H2O server at http://127.0.0.1:54321 ... successful.
Please download and install the latest version from: https://h2o-release.s3.amazonaws.com/h2o/latest_stable.html


0,1
H2O_cluster_uptime:,02 secs
H2O_cluster_timezone:,America/Chicago
H2O_data_parsing_timezone:,UTC
H2O_cluster_version:,3.46.0.7
H2O_cluster_version_age:,3 months and 27 days
H2O_cluster_name:,H2O_from_python_John_evz5fq
H2O_cluster_total_nodes:,1
H2O_cluster_free_memory:,3.485 Gb
H2O_cluster_total_cores:,24
H2O_cluster_allowed_cores:,24


In [3]:
# Load dataset
df = pd.read_csv("athletes_v2.csv")
df.head(2)

Unnamed: 0,region,gender,age,height,weight,candj,snatch,deadlift,backsq,eat,background,experience,schedule,howlong,total_lift
0,South East,Male,35,69,192,295,225,465,400,I eat quality foods but don't measure the amount|,I played youth or high school level sports|I p...,I began CrossFit by trying it alone (without a...,I do multiple workouts in a day 3+ times a week|,4+ years|,1385
1,Latin America,Male,27,68,164,254,187,397,397,I weigh and measure my food|I eat strict Paleo|,I played youth or high school level sports|I p...,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 3+ times a wee...,2-4 years|,1235


In [4]:
# Create mapping for gender column
gender_mapping = {'Male': 0, 'Female': 1, '--': 1}

# Apply mapping to create new column in both datasets
df['upd_gender'] = df['gender'].map(gender_mapping)

In [5]:
# Create mapping for howlong column with partial matching
def map_howlong(value):
    if '4+ years' in value:
        return 4
    elif '2-4 years' in value:
        return 3
    elif '1-2 years' in value:
        return 2
    elif '6-12 months' in value:
        return 1
    elif 'Less than 6 months' in value:
        return 0
    else:
        return None

# Apply mapping to create new column in both datasets
df['upd_howlong'] = df['howlong'].apply(map_howlong)

In [6]:
df.head(2)

Unnamed: 0,region,gender,age,height,weight,candj,snatch,deadlift,backsq,eat,background,experience,schedule,howlong,total_lift,upd_gender,upd_howlong
0,South East,Male,35,69,192,295,225,465,400,I eat quality foods but don't measure the amount|,I played youth or high school level sports|I p...,I began CrossFit by trying it alone (without a...,I do multiple workouts in a day 3+ times a week|,4+ years|,1385,0,4
1,Latin America,Male,27,68,164,254,187,397,397,I weigh and measure my food|I eat strict Paleo|,I played youth or high school level sports|I p...,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 3+ times a wee...,2-4 years|,1235,0,3


In [7]:
numeric_cols = ['age', 'upd_gender', 'height', 'weight', 'upd_howlong',"total_lift"] #leakage - 'candj', 'snatch', 'deadlift', 'backsq'
df = df[numeric_cols]

In [8]:
# Split manually
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)

# Convert to H2OFrame
train_h2o = h2o.H2OFrame(train_df)
test_h2o = h2o.H2OFrame(test_df)

Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%
Parse progress: |████████████████████████████████████████████████████████████████| (done) 100%


In [9]:
target = "total_lift"
features = [col for col in df.columns if col != target]


In [10]:

# Run AutoML
aml_all = H2OAutoML(
    max_runtime_secs=300, 
    exclude_algos=["StackedEnsemble"], 
    seed=1
)
aml_all.train(x=features, y=target, training_frame=train_h2o)

AutoML progress: |
17:25:17.493: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████████████| (done) 100%


Unnamed: 0,number_of_trees,number_of_internal_trees,model_size_in_bytes,min_depth,max_depth,mean_depth,min_leaves,max_leaves,mean_leaves
,54.0,54.0,44577.0,6.0,6.0,6.0,45.0,64.0,61.092594

Unnamed: 0,mean,sd,cv_1_valid,cv_2_valid,cv_3_valid,cv_4_valid,cv_5_valid
aic,,0.0,,,,,
loglikelihood,,0.0,,,,,
mae,118.980576,1.6897067,119.93327,120.83102,118.96871,116.32143,118.84846
mean_residual_deviance,23841.762,856.957,24496.244,24374.914,23523.758,22472.695,24341.195
mse,23841.762,856.957,24496.244,24374.914,23523.758,22472.695,24341.195
r2,0.689936,0.0074667,0.681656,0.6868949,0.6995604,0.6958405,0.6857281
residual_deviance,23841.762,856.957,24496.244,24374.914,23523.758,22472.695,24341.195
rmse,154.38753,2.7960691,156.51276,156.12468,153.37457,149.90897,156.01665
rmsle,0.1814408,0.0129814,0.1873305,0.1825851,0.1841862,0.1594818,0.1936203

Unnamed: 0,timestamp,duration,number_of_trees,training_rmse,training_mae,training_deviance
,2025-07-24 17:25:32,0.553 sec,0.0,277.3024258,229.8873903,76896.6353794
,2025-07-24 17:25:32,0.568 sec,5.0,213.1425108,173.3393692,45429.7299195
,2025-07-24 17:25:32,0.583 sec,10.0,181.0390175,144.4760982,32775.1258747
,2025-07-24 17:25:32,0.595 sec,15.0,164.7451263,129.5243249,27140.9566516
,2025-07-24 17:25:32,0.605 sec,20.0,157.8034845,123.0662308,24901.9397269
,2025-07-24 17:25:32,0.620 sec,25.0,154.1334058,119.630105,23757.106795
,2025-07-24 17:25:32,0.632 sec,30.0,152.1150369,117.7857386,23138.9844468
,2025-07-24 17:25:32,0.645 sec,35.0,150.847274,116.5906945,22754.9000804
,2025-07-24 17:25:32,0.660 sec,40.0,150.0918892,115.8838242,22527.5752166
,2025-07-24 17:25:32,0.677 sec,45.0,149.5192206,115.3582755,22355.9973282

variable,relative_importance,scaled_importance,percentage
upd_gender,3950272768.0,1.0,0.5996819
weight,1689991424.0,0.4278164,0.2565537
age,394642016.0,0.0999025,0.0599097
upd_howlong,341687840.0,0.0864973,0.0518709
height,210686144.0,0.0533346,0.0319838


In [11]:


# Leaderboard
lb_all = aml_all.leaderboard
print("Top Models (All Features):")
lb_all.head(rows=5)


Top Models (All Features):


model_id,rmse,mse,mae,rmsle,mean_residual_deviance
GBM_5_AutoML_1_20250724_172517,154.371,23830.4,118.972,0.181817,23830.4
GBM_grid_1_AutoML_1_20250724_172517_model_2,154.414,23843.8,119.049,0.181773,23843.8
GBM_grid_1_AutoML_1_20250724_172517_model_29,154.594,23899.4,119.081,0.181945,23899.4
GBM_2_AutoML_1_20250724_172517,154.677,23925.0,119.215,0.18199,23925.0
GBM_grid_1_AutoML_1_20250724_172517_model_34,154.745,23945.9,119.312,0.18212,23945.9


In [12]:
# Feature importance from top model
best_model = aml_all.leader
feat_importance = best_model.varimp(use_pandas=True)

if feat_importance is not None:
	top_3_feats = feat_importance["variable"].tolist()[:3]
	print("Top 3 Features:", top_3_feats)
else:
	top_3_feats = []
	print("Feature importance is not available for this model.")

Top 3 Features: ['upd_gender', 'weight', 'age']


In [None]:


# Run again with top 3 features only if available
if top_3_feats:
	aml_top3 = H2OAutoML(max_runtime_secs=300, seed=1)
	aml_top3.train(x=top_3_feats, y=target, training_frame=train_h2o)

	# Leaderboard
	lb_top3 = aml_top3.leaderboard
	print("Top Models (Top 3 Features):")
	lb_top3.head(rows=5)
else:
	print("No top features available. Skipping AutoML with top 3 features.")


AutoML progress: |
17:30:19.5: AutoML: XGBoost is not available; skipping it.

███████████████████████████████████████████████████████████████| (done) 100%
Top Models (Top 3 Features):
