In [10]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [1]:
%%capture
!pip install -q autogluon.tabular --force-reinstall;

In [5]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [13]:
# Load the competition train/test CSVs and discard the `id` column from both frames.
data = pd.read_csv('/kaggle/input/playground-series-s4e4/train.csv').drop(columns='id')
test = pd.read_csv('/kaggle/input/playground-series-s4e4/test.csv').drop(columns='id')

# Separate the 'Rings' target from the remaining feature columns.
y = data['Rings']
X = data.drop(columns='Rings')

# Hold out 20% of the rows for validation; the fixed seed keeps the split repeatable.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Re-attach the target to each split (features first, 'Rings' last) and wrap the
# resulting frames as TabularDataset objects.
train_data = TabularDataset(pd.concat([X_train, y_train], axis='columns'))
val_data = TabularDataset(pd.concat([X_val, y_val], axis='columns'))

# The test frame has no target column; it is wrapped as-is.
test_data = TabularDataset(test)

In [14]:
# Build an AutoGluon regressor for the 'Rings' target (scored by RMSE) and fit it on
# the training split with the 'best_quality' preset and a 10000-second time limit.
# fit() returns the predictor itself, so construction and fitting are chained.
predictor = TabularPredictor(
    label='Rings',
    problem_type='regression',
    eval_metric='root_mean_squared_error',
).fit(
    train_data,
    presets='best_quality',
    time_limit=10000,
)

# Predict on the held-out validation split.
predictions = predictor.predict(val_data)

# RMSE is the square root of the mean squared error against the true labels.
val_mse = mean_squared_error(y_val, predictions)
rmse = np.sqrt(val_mse)
print("Validation RMSE:", rmse)

No path specified. Models will be saved in: "AutogluonModels/ag-20240421_101738"
Presets specified: ['best_quality']
Setting dynamic_stacking from 'auto' to True. Reason: Enable dynamic_stacking when use_bag_holdout is disabled. (use_bag_holdout=False)
Stack configuration (auto_stack=True): num_stack_levels=1, num_bag_folds=8, num_bag_sets=1
Dynamic stacking is enabled (dynamic_stacking=True). AutoGluon will try to determine whether the input data is affected by stacked overfitting and enable or disable stacking as a consequence.
Detecting stacked overfitting by sub-fitting AutoGluon on the input data. That is, copies of AutoGluon will be sub-fit on subset(s) of the data. Then, the holdout validation data is used to detect stacked overfitting.
Sub-fit(s) time limit is: 10000 seconds.
Starting holdout-based sub-fit for dynamic stacking. Context path is: AutogluonModels/ag-20240421_101738/ds_sub_fit/sub_fit_ho.
2024-04-21 10:17:39,482	INFO util.py:124 -- Outdated packages:
  ipywidgets==

[1000]	valid_set's rmse: 1.83478
[2000]	valid_set's rmse: 1.82725
[3000]	valid_set's rmse: 1.82416
[4000]	valid_set's rmse: 1.82463
[1000]	valid_set's rmse: 1.82314
[2000]	valid_set's rmse: 1.8149
[3000]	valid_set's rmse: 1.81349
[4000]	valid_set's rmse: 1.81323
[1000]	valid_set's rmse: 1.83465
[2000]	valid_set's rmse: 1.8261
[3000]	valid_set's rmse: 1.82424
[4000]	valid_set's rmse: 1.82395
[5000]	valid_set's rmse: 1.82487
[1000]	valid_set's rmse: 1.85378
[2000]	valid_set's rmse: 1.84583
[3000]	valid_set's rmse: 1.84396
[4000]	valid_set's rmse: 1.8442
[1000]	valid_set's rmse: 1.89463
[2000]	valid_set's rmse: 1.88662
[3000]	valid_set's rmse: 1.88335
[4000]	valid_set's rmse: 1.88165
[5000]	valid_set's rmse: 1.88066
[6000]	valid_set's rmse: 1.88019
[7000]	valid_set's rmse: 1.88036
[1000]	valid_set's rmse: 1.86149
[2000]	valid_set's rmse: 1.8567
[1000]	valid_set's rmse: 1.89963
[2000]	valid_set's rmse: 1.88981
[3000]	valid_set's rmse: 1.88401
[4000]	valid_set's rmse: 1.87973
[5000]	valid_s

	-1.8437	 = Validation score   (-root_mean_squared_error)
	198.8s	 = Training   runtime
	26.63s	 = Validation runtime
Fitting model: LightGBM_BAG_L1 ... Training model for up to 1436.39s of the 2270.03s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy


[1000]	valid_set's rmse: 1.80048
[1000]	valid_set's rmse: 1.85819


	-1.8326	 = Validation score   (-root_mean_squared_error)
	27.05s	 = Training   runtime
	1.52s	 = Validation runtime
Fitting model: RandomForestMSE_BAG_L1 ... Training model for up to 1407.33s of the 2240.97s of remaining time.
	-1.8762	 = Validation score   (-root_mean_squared_error)
	46.7s	 = Training   runtime
	4.13s	 = Validation runtime
Fitting model: CatBoost_BAG_L1 ... Training model for up to 1353.7s of the 2187.34s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy
	-1.8299	 = Validation score   (-root_mean_squared_error)
	723.22s	 = Training   runtime
	0.16s	 = Validation runtime
Fitting model: ExtraTreesMSE_BAG_L1 ... Training model for up to 630.17s of the 1463.81s of remaining time.
	-1.8588	 = Validation score   (-root_mean_squared_error)
	12.71s	 = Training   runtime
	3.28s	 = Validation runtime
Fitting model: NeuralNetFastAI_BAG_L1 ... Training model for up to 611.57s of the 1445.21s of remaining time.
	Fitting 8 c

[1000]	valid_set's rmse: 1.82168
[2000]	valid_set's rmse: 1.81505
[3000]	valid_set's rmse: 1.81132
[4000]	valid_set's rmse: 1.81003
[5000]	valid_set's rmse: 1.80989
[1000]	valid_set's rmse: 1.88338
[2000]	valid_set's rmse: 1.87092
[3000]	valid_set's rmse: 1.86691
[4000]	valid_set's rmse: 1.86349
[5000]	valid_set's rmse: 1.86331
[6000]	valid_set's rmse: 1.86292
[7000]	valid_set's rmse: 1.86246
[8000]	valid_set's rmse: 1.86298
[9000]	valid_set's rmse: 1.86329
[1000]	valid_set's rmse: 1.86917
[2000]	valid_set's rmse: 1.86036
[3000]	valid_set's rmse: 1.85805
[4000]	valid_set's rmse: 1.85712
[5000]	valid_set's rmse: 1.85661
[6000]	valid_set's rmse: 1.85733
[1000]	valid_set's rmse: 1.83734
[2000]	valid_set's rmse: 1.82876
[3000]	valid_set's rmse: 1.82607
[4000]	valid_set's rmse: 1.82449
[5000]	valid_set's rmse: 1.82328
[6000]	valid_set's rmse: 1.82254
[7000]	valid_set's rmse: 1.82245
[8000]	valid_set's rmse: 1.82343
[1000]	valid_set's rmse: 1.87534
[2000]	valid_set's rmse: 1.86615
[3000]	val

	-1.8393	 = Validation score   (-root_mean_squared_error)
	318.09s	 = Training   runtime
	52.74s	 = Validation runtime
Fitting model: LightGBM_BAG_L1 ... Training model for up to 4594.79s of the 7082.57s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy


[1000]	valid_set's rmse: 1.84908


	-1.8323	 = Validation score   (-root_mean_squared_error)
	27.88s	 = Training   runtime
	1.54s	 = Validation runtime
Fitting model: RandomForestMSE_BAG_L1 ... Training model for up to 4564.88s of the 7052.67s of remaining time.
	-1.8735	 = Validation score   (-root_mean_squared_error)
	51.12s	 = Training   runtime
	4.8s	 = Validation runtime
Fitting model: CatBoost_BAG_L1 ... Training model for up to 4506.04s of the 6993.83s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy
	-1.8278	 = Validation score   (-root_mean_squared_error)
	861.88s	 = Training   runtime
	0.18s	 = Validation runtime
Fitting model: ExtraTreesMSE_BAG_L1 ... Training model for up to 3643.83s of the 6131.62s of remaining time.
	-1.855	 = Validation score   (-root_mean_squared_error)
	13.69s	 = Training   runtime
	4.02s	 = Validation runtime
Fitting model: NeuralNetFastAI_BAG_L1 ... Training model for up to 3623.59s of the 6111.38s of remaining time.
	Fitting 8

[1000]	valid_set's rmse: 1.79768
[2000]	valid_set's rmse: 1.7961
[1000]	valid_set's rmse: 1.85864
[2000]	valid_set's rmse: 1.85538
[3000]	valid_set's rmse: 1.85556
[1000]	valid_set's rmse: 1.84903
[2000]	valid_set's rmse: 1.84707
[1000]	valid_set's rmse: 1.81012
[2000]	valid_set's rmse: 1.80692
[3000]	valid_set's rmse: 1.80654
[1000]	valid_set's rmse: 1.84658
[2000]	valid_set's rmse: 1.84054
[1000]	valid_set's rmse: 1.83291
[2000]	valid_set's rmse: 1.83067
[1000]	valid_set's rmse: 1.79536
[1000]	valid_set's rmse: 1.84848
[2000]	valid_set's rmse: 1.84608


	-1.8265	 = Validation score   (-root_mean_squared_error)
	90.28s	 = Training   runtime
	8.77s	 = Validation runtime
Fitting model: NeuralNetFastAI_r191_BAG_L1 ... Training model for up to 671.83s of the 3159.62s of remaining time.
	Fitting 8 child models (S1F1 - S1F8) | Fitting with SequentialLocalFoldFittingStrategy
	Ran out of time, stopping training early. (Stopping on epoch 12)
	Ran out of time, stopping training early. (Stopping on epoch 13)
	Ran out of time, stopping training early. (Stopping on epoch 14)
	Ran out of time, stopping training early. (Stopping on epoch 14)
	Ran out of time, stopping training early. (Stopping on epoch 14)
	Ran out of time, stopping training early. (Stopping on epoch 15)
	Ran out of time, stopping training early. (Stopping on epoch 17)
No improvement since epoch 0: early stopping
	-1.885	 = Validation score   (-root_mean_squared_error)
	633.44s	 = Training   runtime
	2.06s	 = Validation runtime
Fitting model: CatBoost_r9_BAG_L1 ... Training model for

Validation RMSE: 1.8408106830016933


In [15]:
# Render the model leaderboard as a styled table: numbers shown with 5 digits of
# precision and a caption above the table.
leaderboard_df = predictor.leaderboard()
leaderboard_styled = leaderboard_df.style.format(precision=5)
leaderboard_styled = leaderboard_styled.set_caption("\nModel Leaderboard\n")
display(leaderboard_styled)

Unnamed: 0,model,score_val,eval_metric,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L3,-1.81547,root_mean_squared_error,87.48118,6322.09828,0.00219,0.32047,3,True,31
1,NeuralNetFastAI_BAG_L2,-1.81658,root_mean_squared_error,81.75044,5615.57218,1.42665,741.46814,2,True,23
2,WeightedEnsemble_L2,-1.81823,root_mean_squared_error,19.75928,3126.89637,0.00266,0.23613,2,True,17
3,CatBoost_r177_BAG_L2,-1.81973,root_mean_squared_error,80.39909,4904.4058,0.0753,30.30176,2,True,27
4,CatBoost_BAG_L2,-1.81995,root_mean_squared_error,80.41678,4962.87269,0.09299,88.76865,2,True,21
5,LightGBMXT_BAG_L2,-1.8208,root_mean_squared_error,81.06378,4903.3358,0.73999,29.23176,2,True,18
6,XGBoost_BAG_L2,-1.82184,root_mean_squared_error,80.63347,4887.89412,0.30968,13.79007,2,True,24
7,LightGBM_r131_BAG_L2,-1.82192,root_mean_squared_error,81.39913,4918.80662,1.07534,44.70257,2,True,29
8,LightGBM_BAG_L2,-1.82291,root_mean_squared_error,80.55905,4890.78126,0.23526,16.67722,2,True,19
9,LightGBM_r131_BAG_L1,-1.82647,root_mean_squared_error,8.77325,90.28317,8.77325,90.28317,1,True,14


In [16]:
# Use the trained predictor to make predictions on the test data
predictions = predictor.predict(test_data)
predictions

INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.predict: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU
INFO:sklearnex: sklearn.neighbors.KNeighborsRegressor.predict: running accelerated version on CPU
INFO:sklearnex: sklearn.utils.validation._assert_all_finite: running accelerated version on CPU


0         9.740144
1         9.761234
2        10.124355
3        10.520326
4         7.604733
           ...    
60406     6.502382
60407     9.381942
60408    12.836523
60409    13.339374
60410     8.577337
Name: Rings, Length: 60411, dtype: float32

In [19]:
# Build the submission file: one row per test `id` with its predicted 'Rings' value.
# Only the `id` column is parsed (usecols) instead of re-reading every column of test.csv.
ids = pd.read_csv('/kaggle/input/playground-series-s4e4/test.csv', usecols=['id'])['id']

# The DataFrame constructor aligns the two Series on their index; if the indexes ever
# differed, mismatched rows would silently become NaN in the submission. Fail loudly instead.
assert ids.index.equals(predictions.index), "test ids and predictions are not index-aligned"

submission_df = pd.DataFrame({'id': ids, 'Rings': predictions})

# index=False keeps the row index out of the CSV so it contains only `id` and `Rings`.
submission_df.to_csv('submission.csv', index=False)
submission_df

Unnamed: 0,id,Rings
0,90615,9.740144
1,90616,9.761234
2,90617,10.124355
3,90618,10.520326
4,90619,7.604733
...,...,...
60406,151021,6.502382
60407,151022,9.381942
60408,151023,12.836523
60409,151024,13.339374
