In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.impute import SimpleImputer

# Load the raw Melbourne housing dataset.
# NOTE(review): the '/content/...' prefix looks like a Colab-style absolute
# path — adjust it when running in another environment.
file_path = '/content/melbourne_housing_raw.csv'
melbourne_data = pd.read_csv(file_path)

# Rows without a target (Price) cannot be used for supervised training.
# dropna() keeps the surviving rows' original index labels, so the index of
# everything derived below has gaps.
melbourne_data_clean = melbourne_data.dropna(subset=['Price'])

# Split features and target
X = melbourne_data_clean.drop(columns=['Price'])
y = melbourne_data_clean['Price']

# Keep only float64/int64 columns; object-type (categorical) columns are
# left out of this numerical analysis.
X_numerical = X.select_dtypes(include=['float64', 'int64'])

# Display columns with missing values
missing_features = X_numerical.columns[X_numerical.isnull().any()]
print("Features with missing values:")
print(missing_features)

# Per-column variance of the raw (unscaled) numerical features.
variance = X_numerical.var()

# Columns whose variance falls below this cut-off are treated as low variance.
# NOTE: the variance is computed on unscaled values, so this absolute cut-off
# depends on each column's units. In the recorded run it removes the
# 'Lattitude' and 'Longtitude' columns, and the MAE is higher after filtering.
LOW_VARIANCE_THRESHOLD = 0.1
low_variance_cols = variance[variance < LOW_VARIANCE_THRESHOLD].index
print("\nLow variance features removed:")
print(low_variance_cols)

# Remove low variance features
X_numerical_filtered = X_numerical.drop(columns=low_variance_cols)

# Handle missing values with mean imputation.
# fit_transform() returns a bare ndarray, so the original row labels are
# passed back explicitly via index=. Without that the rebuilt frames would get
# a fresh RangeIndex while `y` keeps the post-dropna labels, and any
# label-aligned pandas operation between them would silently misalign rows.
# NOTE: the imputer is fit on all rows here, i.e. before train_evaluate_model
# splits off its test set, so test rows contribute to the imputed means.
imputer = SimpleImputer(strategy='mean')
X_numerical_imputed = pd.DataFrame(
    imputer.fit_transform(X_numerical),
    columns=X_numerical.columns,
    index=X_numerical.index,
)
X_numerical_filtered_imputed = pd.DataFrame(
    imputer.fit_transform(X_numerical_filtered),
    columns=X_numerical_filtered.columns,
    index=X_numerical_filtered.index,
)

# Function to train and evaluate model
def train_evaluate_model(X, y, test_size=0.2, random_state=42):
    """Fit a random forest on a train split and return its test-split MAE.

    Parameters
    ----------
    X : feature table passed to ``train_test_split`` and the regressor.
    y : target values, one per row of ``X``.
    test_size : value forwarded to ``train_test_split(test_size=...)``.
        Defaults to 0.2, the value that was previously hard-coded.
    random_state : seed used for both the split and the
        ``RandomForestRegressor``. Defaults to 42, as before.

    Returns
    -------
    The mean absolute error between the held-out targets and the model's
    predictions for the held-out features.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    # Seeding the forest as well as the split keeps repeated calls comparable.
    model = RandomForestRegressor(random_state=random_state)
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    return mean_absolute_error(y_test, predictions)

# Score the same model setup on both feature sets: first every imputed
# numerical column, then the set with the low variance columns removed.
mae_before_filtering, mae_after_filtering = (
    train_evaluate_model(X_numerical_imputed, y),
    train_evaluate_model(X_numerical_filtered_imputed, y),
)

# Report the two errors one after the other for comparison.
print(
    "\nMean Absolute Error (MAE) before filtering:",
    mae_before_filtering,
)
print(
    "Mean Absolute Error (MAE) after filtering:",
    mae_after_filtering,
)


Features with missing values:
Index(['Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car', 'Landsize',
       'BuildingArea', 'YearBuilt', 'Lattitude', 'Longtitude',
       'Propertycount'],
      dtype='object')

Low variance features removed:
Index(['Lattitude', 'Longtitude'], dtype='object')

Mean Absolute Error (MAE) before filtering: 179476.4312564365
Mean Absolute Error (MAE) after filtering: 185711.7544868934
