In [16]:
#import libraries
import pandas as pd
import numpy as np

In [17]:
#load dataset
from sklearn.datasets import fetch_california_housing

In [18]:
# Fetch the California housing dataset. The returned object's .data, .target and
# .feature_names attributes are read in the next cell to build X and y.
data = fetch_california_housing()

In [19]:
# Target vector first, then the feature matrix with labelled columns.
y = data.target
X = pd.DataFrame(data=data.data, columns=data.feature_names)

In [20]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [21]:
# Baseline: random forest fitted on the untouched (raw) features.
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# fit() returns the estimator itself, so construction and training can be chained.
model_raw = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score on the held-out split: RMSE is the square root of the mean squared error.
y_pred_raw = model_raw.predict(X_test)
mse_raw = mean_squared_error(y_test, y_pred_raw)
rmse_raw = np.sqrt(mse_raw)
print(f"RMSE (Raw Data): {rmse_raw:.4f}")

RMSE (Raw Data): 0.5053


In [22]:
# Log-transform the skewed features (standard scaling happens in the next cell).
from sklearn.preprocessing import StandardScaler
from scipy.stats import skew

# Features whose absolute skewness exceeds this value are candidates for log1p.
SKEW_THRESHOLD = 0.5

# Skewness is measured on the training split only, so the test split has no
# influence on which features get transformed.
skewed_features = X_train.apply(skew).sort_values(ascending=False)
skewed_features = skewed_features[skewed_features.abs() > SKEW_THRESHOLD].index

# np.log1p is only finite for inputs > -1 (NaN below -1, -inf at -1). The skew
# filter alone does not guarantee that, so keep only the features whose values
# are inside log1p's domain in both splits. This is a validity check, not a fit.
loggable_features = [
    feature
    for feature in skewed_features
    if X_train[feature].min() > -1 and X_test[feature].min() > -1
]

# Make any skipped feature visible instead of silently producing NaNs downstream.
skipped_features = [f for f in skewed_features if f not in loggable_features]
if skipped_features:
    print(f"Skipping log1p for features with values <= -1: {skipped_features}")

# Work on copies so the raw splits stay available for the baseline comparison.
X_train_transformed = X_train.copy()
X_test_transformed = X_test.copy()

for feature in loggable_features:
    X_train_transformed[feature] = np.log1p(X_train[feature])
    X_test_transformed[feature] = np.log1p(X_test[feature])

In [23]:
# Standardise the log-transformed features. The scaler is fitted on the training
# split only and then applied unchanged to the test split.
scaler = StandardScaler()

# The scaler output is wrapped straight back into DataFrames, keeping both the
# column labels and the original row index so scaled rows can still be aligned
# with X_train / X_test.
X_train_scaled = pd.DataFrame(
    scaler.fit_transform(X_train_transformed),
    columns=X_train_transformed.columns,
    index=X_train_transformed.index,
)
X_test_scaled = pd.DataFrame(
    scaler.transform(X_test_transformed),
    columns=X_test_transformed.columns,
    index=X_test_transformed.index,
)

In [24]:
# Same model configuration as the baseline, fitted on the log-transformed and
# standardised features so the two RMSE values are directly comparable.
model_transformed = RandomForestRegressor(
    n_estimators=100,
    random_state=42,
)
model_transformed.fit(X_train_scaled, y_train)

# Evaluate on the held-out split, which went through the same transformations.
y_pred_transformed = model_transformed.predict(X_test_scaled)
mse_transformed = mean_squared_error(y_test, y_pred_transformed)
rmse_transformed = np.sqrt(mse_transformed)
print(f"RMSE (Transformed Data): {rmse_transformed:.4f}")

RMSE (Transformed Data): 0.5049


In [25]:
# Side-by-side summary of both runs (lower RMSE is better).
print(f"RMSE (Raw Data): {rmse_raw:.4f}\nRMSE (Transformed Data): {rmse_transformed:.4f}")

# NOTE(review): a very small gap between the two scores may be within
# run-to-run noise - confirm with several seeds or cross-validation before
# concluding that the transformations help.
transformed_is_better = rmse_transformed < rmse_raw
if not transformed_is_better:
    print("Transformations did not improve performance.")
else:
    # Relative RMSE reduction, expressed as a percentage of the baseline.
    improvement = (rmse_raw - rmse_transformed) / rmse_raw * 100
    print(f"Transformations improved performance by {improvement:.2f}%")

RMSE (Raw Data): 0.5053
RMSE (Transformed Data): 0.5049
Transformations improved performance by 0.09%
