In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import mean_absolute_error, r2_score

# Load the raw housing data and discard rows whose target ('Price') is missing.
file = 'melbourne_housing_raw.csv'
data = pd.read_csv(file)
cleaned_data = data.dropna(subset=['Price'])

# Split into target and predictors; only non-object columns are kept as
# model features.
y_all = cleaned_data['Price']
X_all = cleaned_data.drop(columns=['Price'])
X_numeric = X_all.select_dtypes(exclude=['object'])

# Hold out 20% of the rows for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric,
    y_all,
    test_size=0.2,
    random_state=0,
)

# Column means are learned from the training split and reused to fill the
# test split.
imputer = SimpleImputer(strategy='mean')
X_train_filled = imputer.fit_transform(X_train)
X_test_filled = imputer.transform(X_test)


def _fit_and_score(model, train_features, test_features):
    """Fit ``model`` against ``y_train`` and score it against ``y_test``.

    Returns a ``(predictions, mae, r2)`` tuple computed on ``test_features``.
    The model is (re)fitted in place.
    """
    model.fit(train_features, y_train)
    predictions = model.predict(test_features)
    mae = mean_absolute_error(y_test, predictions)
    r2 = r2_score(y_test, predictions)
    return predictions, mae, r2


# Baseline: random forest on all imputed numeric features.
rf_model = RandomForestRegressor(random_state=0)
initial_pred, mae_initial, r2_initial = _fit_and_score(
    rf_model, X_train_filled, X_test_filled
)

# Drop features whose training-set variance does not exceed the threshold,
# applying the same column selection to the test split.
var_thresh = VarianceThreshold(threshold=0.01)
X_train_filtered = var_thresh.fit_transform(X_train_filled)
X_test_filtered = var_thresh.transform(X_test_filled)

# Refit the same estimator on the reduced feature set and score it again.
final_pred, mae_final, r2_final = _fit_and_score(
    rf_model, X_train_filtered, X_test_filtered
)

# Report both metrics, before and after filtering.
for label, value in (
    ("MAE before variance filtering:", mae_initial),
    ("MAE after variance filtering:", mae_final),
    ("R2 before variance filtering:", r2_initial),
    ("R2 after variance filtering:", r2_final),
):
    print(label, value)


MAE before variance filtering: 177684.04652441444
MAE after variance filtering: 180584.93886446857
R2 before variance filtering: 0.7692490565510182
R2 after variance filtering: 0.7635169483457628
