In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error

# Load the dataset
file_path = '/content/melbourne_housing_raw.csv'
melbourne_data = pd.read_csv(file_path)

# Drop rows with missing target (Price) values
melbourne_data_clean = melbourne_data.dropna(subset=['Price'])

# Split features and target
X = melbourne_data_clean.drop(columns=['Price'])
y = melbourne_data_clean['Price']

# Drop object-type columns (categorical) for this numerical analysis
X_numerical = X.select_dtypes(include=['float64', 'int64'])

# Handle missing values with mean imputation
imputer = SimpleImputer(strategy='mean')
X_numerical_imputed = pd.DataFrame(imputer.fit_transform(X_numerical), columns=X_numerical.columns)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_numerical_imputed, y, test_size=0.2, random_state=42)

# Initialize linear regression model
lr = LinearRegression()

# Perform forward feature selection using SequentialFeatureSelector
sfs = SequentialFeatureSelector(lr, n_features_to_select="auto", direction='forward', scoring='neg_mean_absolute_error', cv=5)
sfs.fit(X_train, y_train)

# Get the selected feature names
selected_features = X_numerical_imputed.columns[sfs.get_support()]

print("Selected features for prediction:")
print(selected_features)

# Train the model using the selected features
X_train_selected = sfs.transform(X_train)
X_test_selected = sfs.transform(X_test)
lr.fit(X_train_selected, y_train)

# Make predictions and evaluate the model
y_pred = lr.predict(X_test_selected)
mae = mean_absolute_error(y_test, y_pred)

print("\nMean Absolute Error with selected features:", mae)


Selected features for prediction:
Index(['Rooms', 'Distance', 'Postcode', 'YearBuilt', 'Lattitude',
       'Longtitude'],
      dtype='object')

Mean Absolute Error with selected features: 310442.66414210177
