In [1]:
#!pip install pandas scikit-learn mljar-supervised h2o

In [2]:
# MLJAR AutoML - All Features
from supervised.automl import AutoML
import pandas as pd
from sklearn.model_selection import train_test_split

In [3]:
# Load dataset
df = pd.read_csv("athletes_v2.csv")
df.head(2)

Unnamed: 0,region,gender,age,height,weight,candj,snatch,deadlift,backsq,eat,background,experience,schedule,howlong,total_lift
0,South East,Male,35,69,192,295,225,465,400,I eat quality foods but don't measure the amount|,I played youth or high school level sports|I p...,I began CrossFit by trying it alone (without a...,I do multiple workouts in a day 3+ times a week|,4+ years|,1385
1,Latin America,Male,27,68,164,254,187,397,397,I weigh and measure my food|I eat strict Paleo|,I played youth or high school level sports|I p...,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 3+ times a wee...,2-4 years|,1235


In [4]:
# Create mapping for gender column
gender_mapping = {'Male': 0, 'Female': 1, '--': 1}

# Apply mapping to create new column in both datasets
df['upd_gender'] = df['gender'].map(gender_mapping)

In [5]:
# Create mapping for howlong column with partial matching
def map_howlong(value):
    if '4+ years' in value:
        return 4
    elif '2-4 years' in value:
        return 3
    elif '1-2 years' in value:
        return 2
    elif '6-12 months' in value:
        return 1
    elif 'Less than 6 months' in value:
        return 0
    else:
        return None

# Apply mapping to create new column in both datasets
df['upd_howlong'] = df['howlong'].apply(map_howlong)

In [6]:
df.head(2)

Unnamed: 0,region,gender,age,height,weight,candj,snatch,deadlift,backsq,eat,background,experience,schedule,howlong,total_lift,upd_gender,upd_howlong
0,South East,Male,35,69,192,295,225,465,400,I eat quality foods but don't measure the amount|,I played youth or high school level sports|I p...,I began CrossFit by trying it alone (without a...,I do multiple workouts in a day 3+ times a week|,4+ years|,1385,0,4
1,Latin America,Male,27,68,164,254,187,397,397,I weigh and measure my food|I eat strict Paleo|,I played youth or high school level sports|I p...,I began CrossFit with a coach (e.g. at an affi...,I do multiple workouts in a day 3+ times a wee...,2-4 years|,1235,0,3


In [7]:
numeric_cols = ['age', 'upd_gender', 'height', 'weight', 'upd_howlong'] #leakage - 'candj', 'snatch', 'deadlift', 'backsq',

In [8]:
# Separate features and target
X = df[numeric_cols]
y = df["total_lift"]

In [9]:
# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [10]:
# Create AutoML model
automl_all = AutoML(
    mode="Compete",  # best for fast experiments
    total_time_limit=300,  # 5 minutes total
    eval_metric="rmse",
    algorithms=["Linear", "Random Forest", "Xgboost", "LightGBM", "Extra Trees"]
)
automl_all.fit(X_train, y_train)

Linear algorithm was disabled.
AutoML directory: AutoML_4
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['Random Forest', 'Xgboost', 'LightGBM', 'Extra Trees']
AutoML will stack models
AutoML will ensemble available models
AutoML steps: ['adjust_validation', 'simple_algorithms', 'default_algorithms', 'not_so_random', 'golden_features', 'kmeans_features', 'insert_random_feature', 'features_selection', 'hill_climbing_1', 'hill_climbing_2', 'boost_on_errors', 'ensemble', 'stack', 'ensemble_stacked']
* Step adjust_validation will try to check up to 1 model
1_DecisionTree rmse 177.319686 trained in 0.26 seconds
Adjust validation. Remove: 1_DecisionTree
Validation strategy: 5-fold CV Shuffle
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 4 models
1_Default_LightGBM rmse 155.020244 trained in 2.77 seconds
2_Default_Xgboost rmse 154.714226 trained in 1.57 seconds
3_Default_RandomForest rmse 167.090



14_LightGBM_KMeansFeatures rmse 154.701225 trained in 4.24 seconds




10_Xgboost_KMeansFeatures rmse 154.697026 trained in 4.23 seconds
* Step insert_random_feature will try to check up to 1 model
10_Xgboost_GoldenFeatures_RandomFeature rmse 154.357496 trained in 15.38 seconds
Drop features ['random_feature', 'height_multiply_upd_gender', 'upd_gender_ratio_age', 'upd_gender_multiply_age', 'height_ratio_upd_gender']
* Step features_selection will try to check up to 4 models
10_Xgboost_GoldenFeatures_SelectedFeatures rmse 154.279812 trained in 3.15 seconds
14_LightGBM_SelectedFeatures rmse 154.333714 trained in 3.41 seconds
* Step hill_climbing_1 will try to check up to 14 models
41_Xgboost_GoldenFeatures_SelectedFeatures rmse 154.36778 trained in 2.75 seconds
42_Xgboost_GoldenFeatures rmse 154.38936 trained in 2.76 seconds
43_LightGBM_SelectedFeatures rmse 154.453563 trained in 2.54 seconds
44_LightGBM rmse 154.453563 trained in 2.37 seconds
45_Xgboost rmse 154.358712 trained in 2.89 seconds
46_LightGBM_GoldenFeatures rmse 154.492772 trained in 2.72 secon

In [11]:
# Leaderboard
leaderboard_all = automl_all.get_leaderboard()
leaderboard_all_sorted = leaderboard_all.sort_values(by="metric_value", ascending=False)
print("Top Models (All Features) - Ordered by metric_value:")
print(leaderboard_all_sorted[["model_type", "metric_value", "train_time"]])

Top Models (All Features) - Ordered by metric_value:
     model_type  metric_value  train_time
11  Extra Trees    181.541539        3.73
7   Extra Trees    178.108156        2.93
31  Extra Trees    175.562429        2.93
35  Extra Trees    174.952081        3.10
3   Extra Trees    174.592279        3.65
..          ...           ...         ...
57      Xgboost    154.292496        3.76
46      Xgboost    154.279812        3.51
55      Xgboost    154.264619        3.75
61     Ensemble    153.948716        4.49
66     Ensemble    153.945619        5.03

[67 rows x 3 columns]


In [12]:
top_n = 5
# Show top n models as a table
top_models = leaderboard_all_sorted.nsmallest(top_n, "metric_value")
top_models[["model_type", "name", "metric_value", "train_time"]]


Unnamed: 0,model_type,name,metric_value,train_time
66,Ensemble,Ensemble_Stacked,153.945619,5.03
61,Ensemble,Ensemble,153.948716,4.49
55,Xgboost,48_Xgboost_GoldenFeatures_SelectedFeatures,154.264619,3.75
46,Xgboost,10_Xgboost_GoldenFeatures_SelectedFeatures,154.279812,3.51
57,Xgboost,50_Xgboost_GoldenFeatures,154.292496,3.76


In [22]:
# Get best model ID from automl_all
best_model_id = automl_all.get_best_model_id()
model_info = automl_all.get_model_info(best_model_id)

# Try different ways to access feature importance
try:
    # Method 1: Direct access to feature_importance in model_info
    feature_imp = model_info.get('feature_importance')
    if feature_imp and isinstance(feature_imp, list) and len(feature_imp) > 0:
        top_feats = [f[0] for f in feature_imp[:3]]
    else:
        # Method 2: Fallback to all features if feature importance not available
        top_feats = X_train.columns.tolist()[:3]
except Exception as e:
    print(f"Error accessing feature importance: {e}")
    # Use all available features as fallback
    top_feats = X_train.columns.tolist()[:3]

print(f"Top 3 features: {top_feats}")

AttributeError: 'AutoML' object has no attribute 'get_best_model_id'

In [None]:


automl_top3 = AutoML(
    mode="Compete", 
    total_time_limit=200,
    eval_metric="r2"
)
automl_top3.fit(X_train[top_feats], y_train)

In [None]:




leaderboard_top3 = automl_top3.get_leaderboard()
print("Top Models (Top 3 Features):")
print(leaderboard_top3[["model_type", "validation_score", "training_time"]])
