In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# --- Configuration ----------------------------------------------------------
# Values a reader may want to tune are gathered here rather than scattered
# through the cell.
sales_file_path = "car_purchasing.csv"
DROPPED_COLUMNS = ['customer name', 'customer e-mail']
CATEGORICAL_COLUMN = 'country'
TARGET_COLUMN = 'car purchase amount'
TEST_FRACTION = 0.2
N_TREES = 100
SEED = 42

# --- Load -------------------------------------------------------------------
# Read the CSV and remove the name / e-mail columns (they are never used as
# features below) in a single expression instead of mutating the frame in place.
df = pd.read_csv(sales_file_path, encoding="ISO-8859-1").drop(columns=DROPPED_COLUMNS)

# --- One-hot encode the categorical column ----------------------------------
# drop='first' omits the first category's indicator column; sparse_output=False
# yields a dense array that can be wrapped in a DataFrame directly.
# NOTE(review): the encoder is fitted on every row, i.e. before the train/test
# split further down.
encoder = OneHotEncoder(drop='first', sparse_output=False)
country_encoded = encoder.fit_transform(df[[CATEGORICAL_COLUMN]])
country_df = pd.DataFrame(
    country_encoded,
    columns=encoder.get_feature_names_out([CATEGORICAL_COLUMN]),
    index=df.index,  # keep row alignment with df for the concat below
)
# Replace the raw categorical column with its indicator columns (appended last).
df = pd.concat([df.drop(columns=[CATEGORICAL_COLUMN]), country_df], axis=1)

# --- Features / target and hold-out split -----------------------------------
y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=TEST_FRACTION, random_state=SEED
)

# --- Feature scaling --------------------------------------------------------
# Scaling statistics are learned from the training rows only and then applied
# to both splits. After this point X_train / X_test are the scaler's array
# output rather than DataFrames.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# --- Model ------------------------------------------------------------------
# fit() returns the estimator itself, so construction and training are chained.
model = RandomForestRegressor(n_estimators=N_TREES, random_state=SEED).fit(X_train, y_train)

# --- Evaluation on the held-out split ---------------------------------------
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# --- Report -----------------------------------------------------------------
# One print call, one metric per line.
print(
    f"Mean Absolute Error: {mae:.2f}",
    f"Mean Squared Error: {mse:.2f}",
    f"R² Score: {r2:.4f}",
    sep="\n",
)



Mean Absolute Error: 1670.12
Mean Squared Error: 5691065.23
R² Score: 0.9473
