In [None]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import pandas as pd

# Load the car price dataset and immediately discard rows without a selling
# price: a row with no target value cannot be used for training or scoring.
df = pd.read_csv("/kaggle/input/vehicle-sales-data/car_prices.csv").dropna(
    subset=["sellingprice"]
)

# String-valued columns that will be passed to CatBoost as categorical features.
cat_features = [
    "make",
    "model",
    "trim",
    "body",
    "transmission",
    "color",
    "interior",
    "state",
    "seller",
]

# Feature matrix: everything except the target and the columns considered
# clearly unnecessary (vin, saledate).
X = df.drop(columns=["sellingprice", "vin", "saledate"])

# Regression target.
y = df["sellingprice"]

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Take explicit copies so the column assignments below operate on independent
# frames rather than on slices of X (avoids pandas' SettingWithCopyWarning).
X_train = X_train.copy()
X_test = X_test.copy()

# Replace missing categorical values with an explicit "Unknown" label.
# fillna has to run BEFORE astype(str): casting first can turn NaN into the
# literal string "nan", which leaves nothing for fillna to replace.
for col in cat_features:
    X_train[col] = X_train[col].fillna("Unknown").astype(str)
    X_test[col] = X_test[col].fillna("Unknown").astype(str)

# Baseline CatBoost regressor that uses the categorical columns natively.
# The model optimises MAE, the same metric that is reported below.
cat_model = CatBoostRegressor(
    loss_function="MAE",
    iterations=500,
    learning_rate=0.1,
    depth=6,
    verbose=100,
)

# Train on the training split, telling CatBoost which columns are categorical.
cat_model.fit(X_train, y_train, cat_features=cat_features)

# Predict on the held-out split and score the predictions.
y_pred = cat_model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)

print("CatBoost MAE с категориальными признаками:", mae)


In [None]:
import pandas as pd
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt

# ===============================
# 1. Load the data
# ===============================
df = pd.read_csv("/kaggle/input/vehicle-sales-data/car_prices.csv")

# String-valued columns that will be passed to CatBoost as categorical features.
cat_features = [
    "make",
    "model",
    "trim",
    "body",
    "transmission",
    "color",
    "interior",
    "state",
    "seller",
]

# Feature matrix: every column except the target, "vin" and "saledate".
X = df.drop(columns=["sellingprice", "vin", "saledate"])

# Regression target (no rows have been filtered out at this point).
y = df["sellingprice"]

# Replace missing categorical values with an explicit "Unknown" label.
# fillna has to run BEFORE astype(str): casting first can turn NaN into the
# literal string "nan", which leaves nothing for fillna to replace.
for col in cat_features:
    X[col] = X[col].fillna("Unknown").astype(str)

# Drop the rows whose target is missing, keeping X and y aligned.
mask = y.notna()
X = X[mask]
y = y[mask]

# Train/test split: 20% held out, fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# ===============================
# 2. Final model (best parameters from Trial 4)
# ===============================
# Logging is configured in exactly one place (the `verbose` argument of fit
# below). The former "silent": False entry was dropped: `silent` and `verbose`
# are overlapping verbosity switches, so setting both was redundant.
best_params = {
    "depth": 10,
    "iterations": 894,
    "learning_rate": 0.15903410295472148,
    "l2_leaf_reg": 5.276540089873177,
    "random_strength": 18.860360227609693,
    "loss_function": "MAE",
    "random_state": 42,
}

final_model = CatBoostRegressor(**best_params)
final_model.fit(X_train, y_train, cat_features=cat_features, verbose=100)

# Evaluate on the held-out test split.
y_pred = final_model.predict(X_test)
final_mae = mean_absolute_error(y_test, y_pred)
print("Финальный MAE на тесте:", final_mae)

# ===============================
# 3. Feature importance visualisation
# ===============================
# prettified=True returns a table with "Feature Id" and "Importances" columns.
feature_importances = final_model.get_feature_importance(prettified=True)
print(feature_importances)

plt.figure(figsize=(10, 6))
plt.barh(feature_importances["Feature Id"], feature_importances["Importances"])
# barh draws the first row at the bottom of the axis; flip the y-axis so the
# chart reads top-to-bottom in the same order as the table printed above.
plt.gca().invert_yaxis()
plt.xlabel("Важность признака")
plt.ylabel("Признак")
plt.title("Feature Importance")
plt.show()
