In [49]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler

In [51]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

In [53]:
# Load the car price dataset; the CSV uses ';' as the field delimiter.
cars_data = pd.read_csv(
    'Car_price_ds.csv',
    sep=';',
)

In [55]:
# Basic information about the dataset.
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
cars_data.info()
print('-------------------------')
print(cars_data.describe())
# Number of missing values per column.
print(cars_data.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Brand         10000 non-null  object 
 1   Model         10000 non-null  object 
 2   Year          10000 non-null  int64  
 3   Engine_Size   10000 non-null  float64
 4   Fuel_Type     10000 non-null  object 
 5   Transmission  10000 non-null  object 
 6   Mileage       10000 non-null  int64  
 7   Doors         10000 non-null  int64  
 8   Owner_Count   10000 non-null  int64  
 9   Price         10000 non-null  int64  
dtypes: float64(1), int64(5), object(4)
memory usage: 781.4+ KB
None
-------------------------
               Year   Engine_Size        Mileage         Doors   Owner_Count  \
count  10000.000000  10000.000000   10000.000000  10000.000000  10000.000000   
mean    2011.543700      3.000560  149239.111800      3.497100      2.991100   
std        6.897699      1.149324   86322

In [57]:
# Categorical columns that are converted to integer codes.
categorial_columns = ['Brand', 'Model', 'Fuel_Type', 'Transmission']

# Use one LabelEncoder per column and keep each fitted instance.
# A single shared encoder is refit on every iteration, so only the mapping of
# the last column would survive and earlier columns could not be decoded.
label_encoders = {}
for col in categorial_columns:
    label_encoder = LabelEncoder()
    cars_data[col] = label_encoder.fit_transform(cars_data[col])
    # Stored so the integer codes can later be mapped back to the original
    # labels with label_encoders[col].inverse_transform(...).
    label_encoders[col] = label_encoder
# After the loop, `label_encoder` still refers to the encoder fitted on the
# last column, as it did before this change.

In [59]:
# Target is the 'Price' column; every remaining column is used as a feature.
y = cars_data['Price']
X = cars_data.drop('Price', axis='columns')

# Hold out 25% of the rows for testing; the fixed random_state keeps the
# split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.25,
    random_state=42,
)

In [61]:
# Numeric feature columns to standardize.
numeric_cols = ['Year', 'Engine_Size', 'Mileage', 'Doors', 'Owner_Count']

# Take explicit copies before writing into the split frames. They were
# produced by train_test_split from X, and assigning columns on such derived
# frames can trigger pandas' SettingWithCopyWarning.
X_train = X_train.copy()
X_test = X_test.copy()

# Fit the scaler on the training rows only, then apply the same fitted
# transformation to the test rows so no test statistics enter the fit.
scaler = StandardScaler()
X_train[numeric_cols] = scaler.fit_transform(X_train[numeric_cols])
X_test[numeric_cols] = scaler.transform(X_test[numeric_cols])

In [63]:
# Build a 100-tree random forest with a fixed seed and train it on the
# training split (fit() returns the fitted estimator).
model = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
).fit(X_train, y_train)

# Predict the target for the held-out test rows.
y_pred = model.predict(X_test)

# Quality metrics on the test split: R^2 and root mean squared error.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
rmse = np.sqrt(mean_squared_error(y_true=y_test, y_pred=y_pred))

In [65]:
# Report the test-split metrics, one per line.
# NOTE(review): the second label reads "R", but the value printed is `r2`,
# which is computed with r2_score (i.e. R squared) - consider relabeling.
print(f"RMSE: {rmse}", f"R: {r2}", sep="\n")

RMSE: 480.0574596756517
R: 0.9754386835483809


In [67]:
# Feature names (columns of X) and the fitted model's importance scores.
features = X.columns
feature_importances = model.feature_importances_

# Print each feature with its importance, rounded to four decimals.
for name, score in zip(features, feature_importances):
    line = f"{name}: {score:.4f}"
    print(line)

Brand: 0.0041
Model: 0.0055
Year: 0.4397
Engine_Size: 0.1404
Fuel_Type: 0.0383
Transmission: 0.0501
Mileage: 0.3162
Doors: 0.0026
Owner_Count: 0.0030
