In [8]:
from sklearn.ensemble import RandomForestRegressor


In [2]:
import pandas as pd

# Read the pre-cleaned training data from the working directory.
df = pd.read_csv('Car_cleaned_train.csv')

# 'price' is the regression target; every remaining column is a predictor.
y = df['price']
X = df.drop(columns=['price'])

# Sanity check: print the feature-matrix shape followed by the target shape.
print(X.shape, y.shape)


(184706, 10) (184706,)


# Split Data into Training & Testing Sets

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the dimensions of each feature partition.
for label, part in (("Training set size:", X_train), ("Testing set size:", X_test)):
    print(label, part.shape)


Training set size: (147764, 10)
Testing set size: (36942, 10)


# Encode Categorical Features and Train the Random Forest Model

In [4]:
# Show the dtype pandas assigned to every feature column.
print(X.dtypes)

# Gather the names of the object-dtype columns; these are the ones treated as categorical.
categorical_cols = (
    X
    .select_dtypes(include=['object'])
    .columns
)
print("\nCategorical Columns:", categorical_cols)


brand           object
model           object
model_year       int64
milage           int64
fuel_type       object
engine          object
transmission    object
ext_col         object
int_col         object
accident        object
dtype: object

Categorical Columns: Index(['brand', 'model', 'fuel_type', 'engine', 'transmission', 'ext_col',
       'int_col', 'accident'],
      dtype='object')


In [5]:
from sklearn.preprocessing import LabelEncoder

# Integer-encode every categorical feature, keeping one fitted encoder per column.
# A single shared LabelEncoder is refit on each iteration, so only the mapping of the
# last column would survive. The dictionary below keeps every column's mapping so it can
# be inverted (inverse_transform) or applied to new rows (transform) later.
label_encoders = {}

# Bound up front so `le` exists even when there are no categorical columns.
le = LabelEncoder()

for col in categorical_cols:
    le = LabelEncoder()  # fresh encoder: this column's classes_ are not overwritten later
    X[col] = le.fit_transform(X[col])  # Convert text to numbers
    label_encoders[col] = le

In [None]:
# Re-create the train/test partitions from the label-encoded feature matrix,
# using the same test_size and random_state as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit a 100-tree random forest; n_jobs=-1 lets scikit-learn use all available cores.
rf_model = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf_model.fit(X_train, y_train)

# Test-set predictions are produced in the "Make Predictions" cell that follows,
# so the identical predict() call is not run twice.


# Make Predictions

In [None]:
# Predict on the held-out test features with the fitted random forest;
# y_pred_rf is what the evaluation cell compares against y_test.
y_pred_rf = rf_model.predict(X_test)


# Evaluate Model Performance

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Calculate performance metrics on the held-out test set
mae_rf = mean_absolute_error(y_test, y_pred_rf)
# RMSE is computed as the square root of the MSE rather than via squared=False:
# that keyword was deprecated in scikit-learn 1.4 and removed in 1.6, where passing it
# raises a TypeError. Taking the root explicitly works on every scikit-learn version.
rmse_rf = mean_squared_error(y_test, y_pred_rf) ** 0.5
r2_rf = r2_score(y_test, y_pred_rf)

# Print results
print("Random Forest Model Performance:")
print(f"Mean Absolute Error (MAE): {mae_rf:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse_rf:.2f}")
print(f"R² Score: {r2_rf:.4f}")


# Identify Most Important Features

In [None]:
import matplotlib.pyplot as plt

# Pair each training column with the importance score of the fitted forest,
# ordered from the highest score to the lowest.
feature_importance = (
    pd.DataFrame({'Feature': X_train.columns, 'Importance': rf_model.feature_importances_})
    .sort_values(by='Importance', ascending=False)
)

# Horizontal bar chart of the ten highest-ranked features.
top_features = feature_importance.head(10)
fig, ax = plt.subplots(figsize=(10, 6))
ax.barh(top_features['Feature'], top_features['Importance'], color='blue')
ax.set_xlabel("Feature Importance")
ax.set_ylabel("Feature")
ax.set_title("Top 10 Important Features in Random Forest Model")
ax.invert_yaxis()  # put the highest-ranked feature at the top of the chart
plt.show()
