In [2]:
import os

import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
# Car purchase amount regression.
# Loads the customer CSV, builds a preprocessing + RandomForestRegressor
# pipeline, tunes it with RandomizedSearchCV, scores the best model on a
# held-out test split, and finally runs one example prediction.

# --- Configuration: everything a reader might want to tune -----------------
RANDOM_STATE = 42      # seed shared by the split, the forest and the search
TEST_SIZE = 0.2        # fraction of rows held out for the final evaluation
CV_FOLDS = 5           # cross-validation folds evaluated per candidate
N_SEARCH_ITER = 100    # hyperparameter candidates sampled by the search
TARGET_COLUMN = 'car purchase amount'

# Dataset location. Set the CAR_PURCHASING_CSV environment variable to run
# this cell on another machine; the original hard-coded location is kept only
# as the fallback so existing runs behave exactly as before.
file_path = (
    os.environ.get("CAR_PURCHASING_CSV")
    or "C:/Users/VENUGOPAL BADRI/Downloads/car_purchasing.csv"
)
data = pd.read_csv(file_path, encoding='latin1')

# Drop the name and e-mail columns so they are not used as features
data = data.drop(columns=['customer name', 'customer e-mail'])

# Split data into features (X) and target (y)
X = data.drop(columns=[TARGET_COLUMN])
y = data[TARGET_COLUMN]

# Identify categorical and numerical columns
categorical_columns = ['country']
numerical_columns = ['gender', 'age', 'annual Salary', 'credit card debt', 'net worth']

# Column-wise preprocessing: standardise the numeric columns and one-hot
# encode the categorical one. handle_unknown='ignore' keeps predict() from
# raising on a category that was not present when the encoder was fitted.
# Columns listed in neither group are dropped (ColumnTransformer's default
# remainder behaviour).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_columns),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
    ]
)

# Preprocessing lives inside the pipeline so it is re-fitted on the training
# part of every cross-validation fold rather than on the full training set.
pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RandomForestRegressor(random_state=RANDOM_STATE))
])

# Split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE
)

# Hyperparameter search space; the 'regressor__' prefix routes each setting
# to the pipeline step named 'regressor'. The grid holds
# 3 * 4 * 3 * 3 * 2 = 216 combinations, of which N_SEARCH_ITER are sampled.
param_dist = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [10, 20, 30, None],
    'regressor__min_samples_split': [2, 5, 10],
    'regressor__min_samples_leaf': [1, 2, 4],
    'regressor__bootstrap': [True, False]
}

# Randomized search on hyperparameters (n_jobs=-1 uses all available cores)
random_search = RandomizedSearchCV(
    pipeline,
    param_distributions=param_dist,
    n_iter=N_SEARCH_ITER,
    cv=CV_FOLDS,
    verbose=2,
    random_state=RANDOM_STATE,
    n_jobs=-1,
)
random_search.fit(X_train, y_train)
print(f"Best Hyperparameters: {random_search.best_params_}")

# Best pipeline found by the search (refit on all of X_train by default)
best_model = random_search.best_estimator_

# Make predictions on the held-out test set
y_pred = best_model.predict(X_test)

# Evaluate the model. mean_absolute_percentage_error returns a fraction,
# hence the multiplication by 100 when printing.
r2 = r2_score(y_test, y_pred)
mape = mean_absolute_percentage_error(y_test, y_pred)
print(f"R^2 Score: {r2}")
print(f"Mean Absolute Percentage Error (MAPE): {mape * 100}%")

# Example prediction with the best model. The frame must carry the same
# feature column names the pipeline was trained on.
example_input = pd.DataFrame({
    'country': ['Brazil'],
    'gender': [1],
    'age': [40],
    'annual Salary': [60000],
    'credit card debt': [5000],
    'net worth': [300000]
})
example_prediction = best_model.predict(example_input)
print(f"Predicted Car Purchase Amount: {example_prediction[0]}")


Fitting 5 folds for each of 100 candidates, totalling 500 fits
Best Hyperparameters: {'regressor__n_estimators': 100, 'regressor__min_samples_split': 2, 'regressor__min_samples_leaf': 2, 'regressor__max_depth': None, 'regressor__bootstrap': True}
R^2 Score: 0.9513728282138564
Mean Absolute Percentage Error (MAPE): 3.5786892580321057%
Predicted Car Purchase Amount: 33607.045775389444
