In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.metrics import mean_absolute_error, r2_score

file = 'melbourne_housing_raw.csv'
data = pd.read_csv(file)

clean_data = data.dropna(subset=['Price'])
X_all = clean_data.drop(['Price'], axis=1)
y_all = clean_data['Price']

X_num = X_all.select_dtypes(exclude=['object'])

imputer = SimpleImputer(strategy='mean')
X_filled = imputer.fit_transform(X_num)

X_train, X_test, y_train, y_test = train_test_split(X_filled, y_all, test_size=0.2, random_state=42)

reg_model = LinearRegression()

selector = SequentialFeatureSelector(reg_model, direction='forward', n_features_to_select='auto', scoring='neg_mean_absolute_error', cv=5)

selector.fit(X_train, y_train)

chosen_indices = selector.get_support(indices=True)

X_train_chosen = X_train[:, chosen_indices]
X_test_chosen = X_test[:, chosen_indices]
reg_model.fit(X_train_chosen, y_train)

predicted = reg_model.predict(X_test_chosen)
mae_val = mean_absolute_error(y_test, predicted)
r2_val = r2_score(y_test, predicted)

print("Chosen features:", chosen_indices)
print("Mean Absolute Error after selection:", mae_val)
print("R2 score:", r2_val)


Chosen features: [ 0  1  2  8  9 10]
Mean Absolute Error after selection: 310442.66414210177
R2 score: 0.4398394043768179
