In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error

df = pd.read_csv("data/results_parsed.csv")

features = ['model', 'rok_produkcji', 'paliwo', 'przebieg', 
            'pojemnosc', 'skrzynia_biegow', 'kraj_pochodzenia', 'moc', 'prywatne']
target = 'cena'

X = df[features]
y = df[target]

X = X.fillna({
    'przebieg': X['przebieg'].median(),
    'pojemnosc': X['pojemnosc'].median(),
    'moc': X['moc'].median(),
})
X = X.fillna('Brak danych')

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

categorical = ['model', 'paliwo', 'skrzynia_biegow', 'kraj_pochodzenia', 'prywatne']
numeric = ['rok_produkcji', 'przebieg', 'pojemnosc', 'moc']

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical),
        ('num', StandardScaler(), numeric)
    ])

model = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(n_estimators=400, random_state=42))
])

model.fit(X_train, y_train)

y_pred = model.predict(X_test)

MAE: 7641.161495098038
R²: 0.7138405514182984


In [10]:
print("MAE:", mean_absolute_error(y_test, y_pred))
print("R²:", r2_score(y_test, y_pred))

MAE: 7641.161495098038
R²: 0.7138405514182984


In [11]:
import numpy as np
np.mean(y)

np.float64(22272.312992125986)