In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error
import numpy as np

# Load the dataset
file_path = '/content/melbourne_housing_raw.csv'
melbourne_data = pd.read_csv(file_path)

# Drop rows with missing target (Price) values
melbourne_data_clean = melbourne_data.dropna(subset=['Price'])

# Split features and target
X = melbourne_data_clean.drop(columns=['Price'])
y = melbourne_data_clean['Price']

# Drop object-type columns (categorical) for this numerical analysis
X_numerical = X.select_dtypes(include=['float64', 'int64'])

# Handle missing values with mean imputation
imputer = SimpleImputer(strategy='mean')
X_numerical_imputed = pd.DataFrame(imputer.fit_transform(X_numerical), columns=X_numerical.columns)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_numerical_imputed, y, test_size=0.2, random_state=42)

# Train the random forest model
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# Make predictions and evaluate the model
y_pred = rf.predict(X_test)
mae_before = mean_absolute_error(y_test, y_pred)

print("Mean Absolute Error before removing least important features:", mae_before)

# Get feature importances
importances = rf.feature_importances_
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("\nFeature ranking:")
for f in range(X_numerical_imputed.shape[1]):
    print(f"{f + 1}. Feature {X_numerical_imputed.columns[indices[f]]} ({importances[indices[f]]})")

# Remove the least important features (let's say bottom 20% least important features)
threshold = np.percentile(importances, 20)
least_important_features = X_numerical_imputed.columns[importances < threshold]
X_train_reduced = X_train.drop(columns=least_important_features)
X_test_reduced = X_test.drop(columns=least_important_features)

# Train the model again after removing the least important features
rf_reduced = RandomForestRegressor(random_state=42)
rf_reduced.fit(X_train_reduced, y_train)

# Make predictions and evaluate the reduced model
y_pred_reduced = rf_reduced.predict(X_test_reduced)
mae_after = mean_absolute_error(y_test, y_pred_reduced)

print("\nLeast important features removed:", least_important_features)
print("Mean Absolute Error after removing least important features:", mae_after)


Mean Absolute Error before removing least important features: 179476.4312564365

Feature ranking:
1. Feature Distance (0.27642548087435304)
2. Feature Rooms (0.21859591163938233)
3. Feature Postcode (0.1891360409212323)
4. Feature Landsize (0.07666168167616283)
5. Feature BuildingArea (0.05089059548042887)
6. Feature Longtitude (0.043968289315635255)
7. Feature Lattitude (0.04358082967759252)
8. Feature Propertycount (0.03468505475918846)
9. Feature YearBuilt (0.025026372736625536)
10. Feature Bathroom (0.01958544012834771)
11. Feature Car (0.016252519882662745)
12. Feature Bedroom2 (0.005191782908388576)

Least important features removed: Index(['Bedroom2', 'Bathroom', 'Car'], dtype='object')
Mean Absolute Error after removing least important features: 183336.16241149075
