In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
from autosklearn.regression import AutoSklearnRegressor

In [13]:
data = pd.read_csv('../data/data_cleaned.csv')

In [14]:
data.dtypes

Unnamed: 0           int64
brand               object
model               object
price_in_pln       float64
mileage            float64
gearbox             object
engine_capacity    float64
fuel_type           object
city                object
voivodeship         object
year                 int64
dtype: object

In [15]:
# Dropping text columns
data.drop(columns=['model', 'city'], inplace=True)

In [16]:
x = data.drop(columns=["price_in_pln"])
y = data["price_in_pln"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [17]:


automl = AutoSklearnRegressor(
    time_left_for_this_task=1200,
    per_run_time_limit=60,
    seed=0,
)
automl.fit(x_train, y_train)

y_pred = automl.predict(x_test)

print('MAE: ', mean_absolute_error(y_test, y_pred))
print('R2: ', r2_score(y_test, y_pred))



MAE:  26662.165765025205
R2:  0.6563314189579534


In [18]:
automl.show_models()

{9: {'model_id': 9,
  'rank': 1,
  'cost': 0.3665352973733311,
  'ensemble_weight': 0.18,
  'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice at 0x7f20bef6abe0>,
  'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice at 0x7f20bef414f0>,
  'regressor': <autosklearn.pipeline.components.regression.RegressorChoice at 0x7f20bdd5a190>,
  'sklearn_regressor': HistGradientBoostingRegressor(l2_regularization=2.208787572338781e-05,
                                learning_rate=0.036087332404571744, max_iter=512,
                                max_leaf_nodes=64, min_samples_leaf=3,
                                n_iter_no_change=18, random_state=0,
                                validation_fraction=None, warm_start=True)},
 13: {'model_id': 13,
  'rank': 2,
  'cost': 0.36738939734581877,
  'ensemble_weight': 0.16,
  'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPrepr

In [19]:
automl.leaderboard()

Unnamed: 0_level_0,rank,ensemble_weight,type,cost,duration
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
22,1,0.62,gradient_boosting,0.352842,38.088313
9,2,0.18,gradient_boosting,0.366535,25.076687
13,3,0.16,gradient_boosting,0.367389,52.243327
49,4,0.04,mlp,0.580792,41.293739


In [20]:
automl.sprint_statistics()

'auto-sklearn results:\n  Dataset name: f844ca19-a733-11ef-9f63-f8a2d6b93135\n  Metric: r2\n  Best validation score: 0.647158\n  Number of target algorithm runs: 51\n  Number of successful target algorithm runs: 11\n  Number of crashed target algorithm runs: 24\n  Number of target algorithms that exceeded the time limit: 7\n  Number of target algorithms that exceeded the memory limit: 9\n'