In [2]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.feature_selection import RFE
from sklearn.impute import SimpleImputer

# Load the dataset
melbourne_data = pd.read_csv('/content/melbourne_housing_raw.csv')  # Replace with your dataset path

# Drop rows with missing target (Price)
melbourne_data_cleaned = melbourne_data.dropna(subset=['Price'])

# Select numerical columns and drop the target column
X = melbourne_data_cleaned.select_dtypes(include=['float64', 'int64']).drop(columns=['Price'])
y = melbourne_data_cleaned['Price']

# Impute missing values using the mean strategy
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=0)

# Initialize the RandomForest model
model = RandomForestRegressor(random_state=0)

# Use Recursive Feature Elimination (RFE) to rank features by importance
rfe = RFE(estimator=model, n_features_to_select=1, step=1)
rfe.fit(X_train, y_train)

# Get feature ranking
ranking = rfe.ranking_
feature_names = melbourne_data_cleaned.select_dtypes(include=['float64', 'int64']).drop(columns=['Price']).columns

# Create a dataframe to show features and their rankings
feature_ranking = pd.DataFrame({'Feature': feature_names, 'Ranking': ranking})
feature_ranking_sorted = feature_ranking.sort_values(by='Ranking')

# Display feature ranking
print("Feature Rankings:")
print(feature_ranking_sorted)

# Evaluate model performance after removing least important features one by one
mae_list = []
features_left = list(feature_names)

for i in range(1, len(features_left)):
    # Select top i features (excluding least important features)
    selected_features = feature_ranking_sorted['Feature'].head(len(features_left) - i)
    X_train_selected = pd.DataFrame(X_train, columns=feature_names)[selected_features]
    X_test_selected = pd.DataFrame(X_test, columns=feature_names)[selected_features]

    # Train and evaluate model with selected features
    model.fit(X_train_selected, y_train)
    y_pred = model.predict(X_test_selected)
    mae = mean_absolute_error(y_test, y_pred)
    mae_list.append((len(selected_features), mae))

# Display MAE as features are eliminated
print("\nModel Performance after feature elimination:")
for num_features, mae in mae_list:
    print(f'Number of features left: {num_features}, MAE: {mae}')


Feature Rankings:
          Feature  Ranking
2        Postcode        1
1        Distance        2
0           Rooms        3
6        Landsize        4
10     Longtitude        5
7    BuildingArea        6
9       Lattitude        7
11  Propertycount        8
8       YearBuilt        9
4        Bathroom       10
5             Car       11
3        Bedroom2       12

Model Performance after feature elimination:
Number of features left: 11, MAE: 176998.19870524888
Number of features left: 10, MAE: 177714.15932807353
Number of features left: 9, MAE: 181258.39991172406
Number of features left: 8, MAE: 182413.51800820546
Number of features left: 7, MAE: 183782.7822219414
Number of features left: 6, MAE: 187781.76158525897
Number of features left: 5, MAE: 196097.33604028804
Number of features left: 4, MAE: 205806.3389602595
Number of features left: 3, MAE: 228167.78543471175
Number of features left: 2, MAE: 335833.91629570926
Number of features left: 1, MAE: 338353.7961801237
