In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import LabelEncoder
from autosklearn.regression import AutoSklearnRegressor

In [11]:
data = pd.read_csv('../data/data_cleaned.csv')

In [12]:
data.dtypes

Unnamed: 0           int64
brand               object
model               object
price_in_pln       float64
mileage            float64
gearbox             object
engine_capacity    float64
fuel_type           object
city                object
voivodeship         object
year                 int64
dtype: object

In [13]:
# Dropping text columns
data.drop(columns=['model', 'city'], inplace=True)

In [14]:
x = data.drop(columns=["price_in_pln"])
y = data["price_in_pln"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [15]:


automl = AutoSklearnRegressor(
    time_left_for_this_task=1200,
    per_run_time_limit=120,
    seed=0,
    memory_limit=4096
)
automl.fit(x_train, y_train)

y_pred = automl.predict(x_test)

print('MAE: ', mean_absolute_error(y_test, y_pred))
print('R2: ', r2_score(y_test, y_pred))



MAE:  26877.985358386948
R2:  0.6352222056059282


In [16]:
automl.show_models()

{9: {'model_id': 9,
  'rank': 1,
  'cost': 0.3665352973733311,
  'ensemble_weight': 0.52,
  'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice at 0x7fd471ac6970>,
  'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice at 0x7fd471459a30>,
  'regressor': <autosklearn.pipeline.components.regression.RegressorChoice at 0x7fd471459dc0>,
  'sklearn_regressor': HistGradientBoostingRegressor(l2_regularization=2.208787572338781e-05,
                                learning_rate=0.036087332404571744, max_iter=512,
                                max_leaf_nodes=64, min_samples_leaf=3,
                                n_iter_no_change=18, random_state=0,
                                validation_fraction=None, warm_start=True)},
 13: {'model_id': 13,
  'rank': 2,
  'cost': 0.36831033858815054,
  'ensemble_weight': 0.44,
  'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPrepr

In [17]:
automl.leaderboard()

Unnamed: 0_level_0,rank,ensemble_weight,type,cost,duration
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
9,1,0.52,gradient_boosting,0.366535,31.843557
13,2,0.44,gradient_boosting,0.36831,64.789374
21,3,0.04,gradient_boosting,0.860443,13.57437


In [18]:
automl.sprint_statistics()

'auto-sklearn results:\n  Dataset name: ad742fe7-a72a-11ef-8d01-f8a2d6b93135\n  Metric: r2\n  Best validation score: 0.633465\n  Number of target algorithm runs: 21\n  Number of successful target algorithm runs: 5\n  Number of crashed target algorithm runs: 5\n  Number of target algorithms that exceeded the time limit: 5\n  Number of target algorithms that exceeded the memory limit: 6\n'