In [None]:
#!pip install pandas scikit-learn mljar-supervised h2o

In [1]:
# MLJAR AutoML - All Features
from supervised.automl import AutoML
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Read the pre-processed athlete table (path is relative to the working directory)
df = pd.read_csv(filepath_or_buffer="processed_athletes.csv")
# Preview the first two rows as a quick check on the parsed columns
df.head(n=2)

Unnamed: 0,upd_gender,upd_howlong,BMI,candj_rel,snatch_rel,deadlift_rel,backsq_rel,age,weight,total_lift
0,0,4,28.350347,1.536458,1.171875,2.421875,2.083333,35,192,1385
1,0,3,24.933391,1.54878,1.140244,2.420732,2.420732,27,164,1235


In [3]:
# `total_lift` is the prediction target; every remaining column is a predictor
y = df["total_lift"]
X = df.drop("total_lift", axis="columns")

In [4]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [7]:
# Configure an MLJAR AutoML run over the full feature set:
# RMSE is the evaluation metric and the search budget is 300 seconds.
automl_all = AutoML(
    algorithms=["Linear", "Random Forest", "Xgboost", "Neural Network"],
    eval_metric="rmse",
    mode="Explain",
    explain_level=2,
    start_random_models=3,
    total_time_limit=300,
)
# Fit on the training split only
automl_all.fit(X_train, y_train)

Linear algorithm was disabled.
AutoML directory: AutoML_7
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 17.389836 trained in 9.32 seconds
2_Default_NeuralNetwork rmse 130.178686 trained in 5.1 seconds
3_Default_RandomForest rmse 91.232815 trained in 7.25 seconds
* Step not_so_random will try to check up to 6 models
4_Xgboost rmse 17.408163 trained in 4.73 seconds
6_RandomForest rmse 85.023349 trained in 1.69 seconds
8_NeuralNetwork rmse 15.643056 trained in 0.75 seconds
5_Xgboost rmse 18.890863 trained in 3.0 seconds
7_RandomForest rmse 109.587572 trained in 2.98 seconds
9_NeuralNetwork rmse 32.634501 trained in 2.26 seconds
* Step ensemble wil

In [8]:
# Fetch the leaderboard of the models trained in the all-features run
leaderboard_all = automl_all.get_leaderboard()

# Descending sort on the metric: with RMSE (lower is better) the strongest
# model therefore ends up in the last row of the printed table.
leaderboard_all_sorted = leaderboard_all.sort_values("metric_value", ascending=False)

print("Top Models (All Features) - Ordered by metric_value:")
print(leaderboard_all_sorted.loc[:, ["model_type", "metric_value", "train_time"]])

Top Models (All Features) - Ordered by metric_value:
       model_type  metric_value  train_time
1  Neural Network    130.178686        5.49
7   Random Forest    109.587572        3.65
2   Random Forest     91.232815        7.78
4   Random Forest     85.023349        2.07
8  Neural Network     32.634501        2.71
6         Xgboost     18.890863        3.39
3         Xgboost     17.408163        5.21
0         Xgboost     17.389836        9.81
5  Neural Network     15.643056        1.03
9        Ensemble     11.620678        0.21


In [19]:
# Re-order the all-features leaderboard so the fastest-training models come first.
# This rebinds `leaderboard_all_sorted`, replacing whatever ordering it held before.
leaderboard_all_sorted = leaderboard_all.sort_values("train_time")

print("Top Models (All Features) - Ordered by train_time:")
print(leaderboard_all_sorted.loc[:, ["model_type", "metric_value", "train_time"]])

Top Models (All Features) - Ordered by train_time:
       model_type  metric_value  train_time
9        Ensemble     11.620678        0.21
5  Neural Network     15.643056        1.03
4   Random Forest     85.023349        2.07
8  Neural Network     32.634501        2.71
6         Xgboost     18.890863        3.39
7   Random Forest    109.587572        3.65
3         Xgboost     17.408163        5.21
1  Neural Network    130.178686        5.49
2   Random Forest     91.232815        7.78
0         Xgboost     17.389836        9.81


In [9]:
# How many of the best-scoring models to list
top_n = 5
# Keep the `top_n` rows with the lowest metric value and show them as a table
top_models = leaderboard_all_sorted.nsmallest(n=top_n, columns="metric_value")
top_models.loc[:, ["model_type", "name", "metric_value", "train_time"]]


Unnamed: 0,model_type,name,metric_value,train_time
9,Ensemble,Ensemble,11.620678,0.21
5,Neural Network,8_NeuralNetwork,15.643056,1.03
0,Xgboost,1_Default_Xgboost,17.389836,9.81
3,Xgboost,4_Xgboost,17.408163,5.21
6,Xgboost,5_Xgboost,18.890863,3.39


In [14]:
from sklearn.feature_selection import SelectKBest, f_regression

# Score every predictor against the target with `f_regression` and keep the 5 best.
# The selector is fitted on the training split only.
selector = SelectKBest(f_regression, k=5)
X_train_selected = selector.fit_transform(X_train, y_train)

# Names of the retained columns, taken from the selector's boolean support mask
selected_features = X_train.columns[selector.get_support()]
top_5_feats = list(selected_features)

# Per-feature scores for all columns, highest score first
feature_scores = (
    pd.DataFrame({"feature": X_train.columns, "score": selector.scores_})
    .sort_values(by="score", ascending=False)
)

print("\nTop 5 Features by Statistical Score:")
print(feature_scores.head(5))
print(f"\nTop 5 feature names: {top_5_feats}")
    


Top 5 Features by Statistical Score:
      feature         score
0  upd_gender  25761.161229
3   candj_rel  15884.356635
4  snatch_rel  15554.137279
8      weight  14826.818858
6  backsq_rel  11868.308456

Top 5 feature names: ['upd_gender', 'candj_rel', 'snatch_rel', 'backsq_rel', 'weight']


In [15]:
# Second AutoML run, restricted to the columns listed in `top_5_feats`.
# NOTE(review): the `top3` suffix in the variable name is historical; the
# model is trained on the `top_5_feats` columns. The time budget here is 200 seconds.
automl_top3 = AutoML(
    algorithms=["Linear", "Random Forest", "Xgboost", "Neural Network"],
    eval_metric="rmse",
    mode="Explain",
    explain_level=2,
    start_random_models=3,
    total_time_limit=200,
)
automl_top3.fit(X_train.loc[:, top_5_feats], y_train)

Linear algorithm was disabled.
AutoML directory: AutoML_1
The task is regression with evaluation metric rmse
AutoML will use algorithms: ['Random Forest', 'Xgboost', 'Neural Network']
AutoML will ensemble available models
AutoML steps: ['simple_algorithms', 'default_algorithms', 'not_so_random', 'ensemble']
Skip simple_algorithms because no parameters were generated.
* Step default_algorithms will try to check up to 3 models
1_Default_Xgboost rmse 41.337954 trained in 7.33 seconds
2_Default_NeuralNetwork rmse 84.748763 trained in 3.93 seconds
3_Default_RandomForest rmse 91.153681 trained in 4.38 seconds
* Step not_so_random will try to check up to 6 models
4_Xgboost rmse 40.899711 trained in 5.5 seconds
6_RandomForest rmse 88.332935 trained in 4.39 seconds
8_NeuralNetwork rmse 39.528099 trained in 0.8 seconds
5_Xgboost rmse 40.856129 trained in 2.58 seconds
7_RandomForest rmse 107.075861 trained in 1.29 seconds
9_NeuralNetwork rmse 45.709455 trained in 0.7 seconds
* Step ensemble will 

In [16]:
# Leaderboard for the AutoML run trained on the `top_5_feats` columns
leaderboard_top3 = automl_top3.get_leaderboard()
# Descending sort on the metric: with RMSE (lower is better) the best model is the last row
leaderboard_top3 = leaderboard_top3.sort_values(by="metric_value", ascending=False)
# The header names the top-5-feature run so this table is not mistaken for
# the all-features leaderboard printed earlier in the notebook.
print("Top Models (Top 5 Features) - Ordered by metric_value:")
print(leaderboard_top3[["model_type", "metric_value", "train_time"]])

Top Models (All Features) - Ordered by metric_value:
       model_type  metric_value  train_time
7   Random Forest    107.075861        1.69
2   Random Forest     91.153681        4.76
4   Random Forest     88.332935        4.77
1  Neural Network     84.748763        4.20
8  Neural Network     45.709455        0.96
0         Xgboost     41.337954        7.69
3         Xgboost     40.899711        5.86
6         Xgboost     40.856129        2.96
5  Neural Network     39.528099        1.09
9        Ensemble     38.526792        0.16


In [18]:
# Re-order the top-5-feature leaderboard so the fastest-training models come first
# (this rebinds `leaderboard_top3`, replacing its previous ordering).
leaderboard_top3 = leaderboard_top3.sort_values(by="train_time", ascending=True)
# The header names the top-5-feature run so this table is not mistaken for
# the all-features leaderboard.
print("Top Models (Top 5 Features) - Ordered by train_time:")
print(leaderboard_top3[["model_type", "metric_value", "train_time"]])

Top Models (All Features) - Ordered by train_time:
       model_type  metric_value  train_time
9        Ensemble     38.526792        0.16
8  Neural Network     45.709455        0.96
5  Neural Network     39.528099        1.09
7   Random Forest    107.075861        1.69
6         Xgboost     40.856129        2.96
1  Neural Network     84.748763        4.20
2   Random Forest     91.153681        4.76
4   Random Forest     88.332935        4.77
3         Xgboost     40.899711        5.86
0         Xgboost     41.337954        7.69


### 7. The top models in Assignment-3 perform better than those in Assignment-1 or Assignment-2, because AutoML both searches for the best type of ML algorithm and tunes the hyperparameters for best performance.

### 8. MLJAR AutoML is a full-code tool, and I chose it for maximum control over the experiments that are run.