In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import opendatasets as od
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, r2_score

# Load data: fetch the Melbourne housing snapshot from Kaggle, then read the CSV.
# od.download may prompt for Kaggle credentials when it runs.
dataset_url = "https://www.kaggle.com/datasets/dansbecker/melbourne-housing-snapshot"
od.download(dataset_url)

file = 'melbourne-housing-snapshot/melb_data.csv'
data = pd.read_csv(file)

# Discard the columns that play no part in the evaluation below.
unused_columns = [
    'Suburb', 'Address', 'Type', 'Method', 'SellerG', 'Date',
    'BuildingArea', 'YearBuilt', 'CouncilArea', 'Lattitude',
    'Longtitude', 'Regionname', 'Propertycount',
]
data = data.drop(columns=unused_columns)

# Trim the price tails: keep rows whose Price lies between the 5th and the
# 80th percentile (both ends inclusive).
# NOTE(review): the cut is asymmetric - the top 20% of sales are removed but
# only the bottom 5% - so everything downstream only sees prices in this band.
lower_bound = data['Price'].quantile(0.05)
upper_bound = data['Price'].quantile(0.8)
data = data[data['Price'].between(lower_bound, upper_bound)]

# Prepare features and target.
# .copy() makes `features` an independent frame instead of a view-like slice of `data`.
feature_columns = ['Rooms', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car', 'Landsize']
features = data[feature_columns].copy()
target = data['Price']

# Calculate average price by postcode
postcode_avg_price = data.groupby('Postcode')['Price'].mean().reset_index()
postcode_avg_price.columns = ['Postcode', 'avg_price']

# Save postcode averages to a CSV
postcode_avg_price.to_csv('postcode_avg_price.csv', index=False)

# Attach each row's postcode average as `avg_price`.
# DataFrame.join(on=...) keeps the left frame's index, whereas a plain merge
# returns a fresh 0..n-1 index; keeping the index means `features` and `target`
# stay aligned by label as well as by position.
# A missing Postcode has no group average to look up, so it cannot be filled
# from this table (and a price would not be a valid postcode anyway); missing
# values are handled by the imputation step after the split instead.
features = features.join(postcode_avg_price.set_index('Postcode')['avg_price'], on='Postcode')
assert features.index.equals(target.index), "features and target are no longer row-aligned"

# Splits the data into 80% train data and 20% test data.
# avg_price is derived from the target, so it is not used as a model input.
X_train, X_test, y_train, y_test = train_test_split(
    features.drop(columns=['avg_price']), target, test_size=0.20, random_state=22
)

# Fill missing feature values with medians computed on the training split only,
# so nothing from the test rows leaks into the fill values.
# scikit-learn releases before 1.4 raise a ValueError when a random forest is
# fitted on NaN.
# NOTE(review): `Car` is expected to contain some missing values in this
# dataset - confirm with data.isna().sum().
train_medians = X_train.median()
X_train = X_train.fillna(train_medians)
X_test = X_test.fillna(train_medians)

# Fit a 100-tree random forest on the training split (fixed seed for repeatability).
rf_model = RandomForestRegressor(n_estimators=100, random_state=22).fit(X_train, y_train)

# Score the fitted model on the held-out test split.
predictions = rf_model.predict(X_test)
r2 = r2_score(y_test, predictions)
mae = mean_absolute_error(y_test, predictions)
print(f'R-squared: {r2:.2f}')
print(f'Mean Absolute Error: {mae:.2f}')


# Charts: every figure below uses seaborn's "whitegrid" look.
sns.set_style("whitegrid")

# Histogram of sale prices in the filtered dataset (30 bins).
fig, ax = plt.subplots()
ax.hist(data['Price'], bins=30)
ax.set_title('Distribution of Property Prices')
ax.set_xlabel('Price')
ax.set_ylabel('Frequency')
plt.show()

# Actual vs predicted prices on the test split; the dashed diagonal marks
# where a perfect prediction would fall.
fig, ax = plt.subplots(figsize=(10, 6))
ax.scatter(y_test, predictions, alpha=0.3)
ax.set_xlabel('Actual Prices')
ax.set_ylabel('Predicted Prices')
ax.set_title('Actual vs Predicted Prices')
price_range = [y_test.min(), y_test.max()]
ax.plot(price_range, price_range, lw=2, linestyle='--', color='black')
plt.show()

# Histogram (with KDE overlay) of the residuals: actual minus predicted.
prediction_errors = y_test - predictions
fig, ax = plt.subplots(figsize=(12, 8))
sns.histplot(prediction_errors, kde=True, bins=40, ax=ax)
ax.set_xlabel('Prediction Error')
ax.set_title('Distribution of Prediction Errors')
plt.show()

# Pairwise correlations between all remaining columns of `data`
# (the feature columns and Price).
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(
    data.corr(),
    annot=True,
    fmt=".2f",
    cmap='coolwarm',
    square=True,
    cbar_kws={"shrink": .8},
    ax=ax,
)
ax.set_title('Correlation Heatmap of Features')
plt.xticks(rotation=45)
plt.yticks(rotation=45)
plt.show()

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:

In [None]:
# Show pandas' summary statistics for `data` as the cell's output.
# `data` here is whatever the cell above left in it (columns dropped and
# rows filtered by price), not the raw CSV.
data.describe()

In [None]:
def _ask_number(prompt, cast=float):
    """Prompt repeatedly until the reply can be converted by `cast`.

    Args:
        prompt: Text shown to the user.
        cast: Callable used to convert the reply (float by default).

    Returns:
        The converted value.
    """
    while True:
        raw = input(prompt)
        try:
            return cast(raw)
        except ValueError:
            # Re-ask instead of aborting the cell, so a single typo does not
            # force the user to re-enter every earlier answer.
            print(f"'{raw}' is not a valid number, please try again.")


# Get user input
rooms = _ask_number("Enter number of rooms: ")
# NOTE(review): assumes the Distance column is kilometres from the Melbourne
# CBD, as described in the dataset's documentation - confirm.
distance = _ask_number("Enter distance from CBD (km): ")
postcode = _ask_number("Enter postcode: ", int)
bedroom2 = _ask_number("Enter number of bedrooms: ")
bathroom = _ask_number("Enter number of bathrooms: ")
car = _ask_number("Enter number of car spaces: ")
landsize = _ask_number("Enter land size (sqm): ")

# Create a single-row DataFrame with the same column names, in the same order,
# as the feature columns selected for training.
user_input_df = pd.DataFrame([[rooms, distance, postcode, bedroom2, bathroom, car, landsize]],
                             columns=['Rooms', 'Distance', 'Postcode', 'Bedroom2', 'Bathroom', 'Car', 'Landsize'])

# Predict the price. The training data was restricted to a price band, so the
# forest cannot return a value outside the prices it was fitted on.
predicted_price = rf_model.predict(user_input_df)
print(f"Predicted Price: {predicted_price[0]:,.2f}")