In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from autosklearn.regression import AutoSklearnRegressor
from joblib import dump

In [3]:
data = pd.read_csv('../data/data_cleaned.csv')

In [4]:
data.dtypes

Unnamed: 0           int64
brand               object
model               object
price_in_pln       float64
mileage            float64
gearbox             object
engine_capacity    float64
fuel_type           object
city                object
voivodeship         object
year                 int64
dtype: object

In [5]:
# Dropping text columns
data.drop(columns=['model', 'city'], inplace=True)

In [6]:
x = data.drop(columns=["price_in_pln"])
y = data["price_in_pln"]

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=0)

In [7]:


automl = AutoSklearnRegressor(
    time_left_for_this_task=1200,
    per_run_time_limit=60,
    seed=0,
)
automl.fit(x_train, y_train)

y_pred = automl.predict(x_test)

print('MAE: ', mean_absolute_error(y_test, y_pred))
print('R2: ', r2_score(y_test, y_pred))



MAE:  26714.3671518407
R2:  0.6529843818518992


In [8]:
automl.show_models()

{10: {'model_id': 10,
  'rank': 1,
  'cost': 0.3665352973733311,
  'ensemble_weight': 0.16,
  'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPreprocessorChoice at 0x7f9eecadd7f0>,
  'feature_preprocessor': <autosklearn.pipeline.components.feature_preprocessing.FeaturePreprocessorChoice at 0x7f9eecad9520>,
  'regressor': <autosklearn.pipeline.components.regression.RegressorChoice at 0x7f9eecad9b50>,
  'sklearn_regressor': HistGradientBoostingRegressor(l2_regularization=2.208787572338781e-05,
                                learning_rate=0.036087332404571744, max_iter=512,
                                max_leaf_nodes=64, min_samples_leaf=3,
                                n_iter_no_change=18, random_state=0,
                                validation_fraction=None, warm_start=True)},
 16: {'model_id': 16,
  'rank': 2,
  'cost': 0.35956781987811315,
  'ensemble_weight': 0.42,
  'data_preprocessor': <autosklearn.pipeline.components.data_preprocessing.DataPre

In [9]:
automl.leaderboard()

Unnamed: 0_level_0,rank,ensemble_weight,type,cost,duration
model_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
16,1,0.42,gradient_boosting,0.359568,40.21018
62,2,0.24,gradient_boosting,0.363151,32.635219
10,3,0.16,gradient_boosting,0.366535,26.655734
64,4,0.06,gradient_boosting,0.368535,30.477064
36,5,0.06,gradient_boosting,0.40868,23.739277
77,6,0.04,mlp,0.524784,29.562296
46,7,0.02,gradient_boosting,0.535115,20.818397


In [10]:
automl.sprint_statistics()

'auto-sklearn results:\n  Dataset name: 307678b7-a820-11ef-9c39-f8a2d6b93135\n  Metric: r2\n  Best validation score: 0.640432\n  Number of target algorithm runs: 79\n  Number of successful target algorithm runs: 20\n  Number of crashed target algorithm runs: 41\n  Number of target algorithms that exceeded the time limit: 5\n  Number of target algorithms that exceeded the memory limit: 13\n'

In [11]:
best_model = automl.get_models_with_weights()[0][1]

best_model

SimpleRegressionPipeline({'data_preprocessor:__choice__': 'feature_type', 'feature_preprocessor:__choice__': 'no_preprocessing', 'regressor:__choice__': 'gradient_boosting', 'data_preprocessor:feature_type:numerical_transformer:imputation:strategy': 'mean', 'data_preprocessor:feature_type:numerical_transformer:rescaling:__choice__': 'standardize', 'data_preprocessor:feature_type:text_transformer:text_encoding:__choice__': 'tfidf_encoding', 'data_preprocessor:feature_type:text_transformer:text_feature_reduction:n_components': 100, 'regressor:gradient_boosting:early_stop': 'train', 'regressor:gradient_boosting:l2_regularization': 0.005746611563553693, 'regressor:gradient_boosting:learning_rate': 0.0913971028976721, 'regressor:gradient_boosting:loss': 'least_squares', 'regressor:gradient_boosting:max_bins': 255, 'regressor:gradient_boosting:max_depth': 'None', 'regressor:gradient_boosting:max_leaf_nodes': 9, 'regressor:gradient_boosting:min_samples_leaf': 2, 'regressor:gradient_boosting:s

In [12]:
dump(best_model, "gradient_boost_model.joblib")

['gradient_boost_model.joblib']