In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error

# Read the raw Melbourne housing CSV into a DataFrame
file_path = '/content/melbourne_housing_raw.csv'
melbourne_data = pd.read_csv(filepath_or_buffer=file_path)

# Step 1: Identify and Filter Columns
# Hand-maintained list of sparse columns to discard. The trailing figures are
# the missing-value shares noted by the author (all above the 20% cut-off);
# they are not recomputed here, so the list must be revisited if the data changes.
columns_to_remove = [
    'BuildingArea', 'YearBuilt',    # 60.58%, 55.39% missing
    'Car', 'Bathroom', 'Bedroom2',  # 25.04%, 23.60%, 23.57% missing
    'Landsize',                     # 33.88% missing
    'Lattitude', 'Longtitude',      # 22.88% missing each
]

# Report the discarded columns, one name per line
print("Columns with more than 20% missing data that are removed:")
print("\n".join(columns_to_remove))

# Build the working frame without those columns; drop() returns a new frame,
# so melbourne_data itself keeps every original column.
filtered_data = melbourne_data.drop(labels=columns_to_remove, axis="columns")

# Step 2: Handle Missing Values in Price
# Keep only the rows whose target ('Price') is present; a row without a
# target cannot be used for supervised training or for scoring.
filtered_data = filtered_data[filtered_data['Price'].notna()]

# Report how many rows survive the filter
print(f'\nNumber of rows after handling missing values in Price: {len(filtered_data)}')

# Step 3: Data Split
# Target is the sale price; every other remaining column is a feature.
y = filtered_data['Price']
X = filtered_data.drop(labels='Price', axis='columns')

# One-hot encode the categorical columns, dropping the first level of each
# to avoid a redundant indicator.
# NOTE(review): get_dummies encodes *every* object/category column. If any of
# them is high-cardinality (e.g. a free-text address or date field -- TODO
# confirm against the CSV schema) the encoded matrix becomes very wide.
# NOTE(review): missing values in the remaining feature columns are not
# handled here -- confirm the estimator in use accepts NaN.
X_encoded = pd.get_dummies(data=X, drop_first=True)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X_encoded,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 4: Model Training and Performance
# Baseline Random Forest with 100 trees; the seed is fixed so repeated runs
# build the same forest. fit() returns the estimator itself, so the fitted
# model can be bound in a single statement.
model = RandomForestRegressor(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the held-out rows: mean absolute difference between the actual and
# the predicted price.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_true=y_test, y_pred=y_pred)

print(f'Mean Absolute Error: {mae}')


Columns with more than 20% missing data that are removed:
BuildingArea
YearBuilt
Car
Bathroom
Bedroom2
Landsize
Lattitude
Longtitude

Number of rows after handling missing values in Price: 27247
Mean Absolute Error: 199084.30510074442
