In [None]:
import pandas as pd
import numpy as np
import math
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn import metrics
from sklearn.metrics import mean_squared_error

# Display all rows and columns 
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)

In [None]:
# Data source: the cleaned workbook. Per the author's note, 'City' and 'Type'
# were transformed to binary in Excel before this notebook reads the file.
# The original (uncleaned) CSV can be loaded instead with:
# aus_real_estate = pd.read_csv(r"C:\Users\melvj\Downloads\aus_real_estate.csv")
# NOTE(review): the path below is absolute and machine-specific.
aus_real_estate = (
    pd.read_excel(r"C:\Users\melvj\OneDrive\Desktop\VSP_Main_Cleaned.xlsx")
    .drop(columns=["State", "Garage"])  # initial cleaning: remove 'State' and 'Garage'
    .dropna()                           # discard every row that has a missing value
)

# Numeric columns (int64 and float64), so the "< 0" comparison is well defined
int_columns = aus_real_estate.select_dtypes(include=['int64', 'float64'])

# Rows holding at least one negative numeric entry
has_negative = int_columns.lt(0).any(axis=1)
negative_values = aus_real_estate.loc[has_negative]

# An empty frame printed here means the data contains no negative values
print(negative_values)

aus_real_estate.head()

In [None]:
# Split the cleaned frame into predictors (X) and response (y).
# X keeps every column except the response and the three columns listed here.
non_feature_cols = ['Price', 'City', 'Type', 'Year_Built']
X = aus_real_estate.drop(columns=non_feature_cols)

# y is the 'Price' column
y = aus_real_estate.loc[:, 'Price']

In [None]:
# Descriptive visualization 1: Count of property types per city

# Rows = City, columns = Type, cells = number of listings (0 where a pair is absent)
city_type_counts = (
    aus_real_estate
    .groupby(['City', 'Type'])
    .size()
    .unstack(fill_value=0)
)

# Stacked horizontal bars: one bar per city, one segment per property type
fig, ax = plt.subplots(figsize=(10, 8))
city_type_counts.plot.barh(stacked=True, ax=ax)
ax.set_title('Count of Property Types per City')
ax.set_xlabel('Count')
ax.set_ylabel('City')
# Anchor the legend outside the plotting area so it does not cover the bars
ax.legend(title='Property Type', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# Descriptive visualization 2: Count of bedroom types(#) per property type

# Rows = Type, columns = Bedrooms, cells = number of listings (0 where a pair is absent)
bedroom_counts = (
    aus_real_estate
    .groupby(['Type', 'Bedrooms'])
    .size()
    .unstack(fill_value=0)
)

# Grouped (non-stacked) horizontal bars: one group per property type
fig, ax = plt.subplots(figsize=(10, 8))
bedroom_counts.plot.barh(ax=ax)
ax.set_title('Count of Bedrooms per Property Type')
ax.set_xlabel('Count')
ax.set_ylabel('Property Type')
# Anchor the legend outside the plotting area so it does not cover the bars
ax.legend(title='Number of Bedrooms', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.show()

# Descriptive visualization 3: Average price per city

# Mean price for each city, sorted ascending by that mean
avg_price_city = aus_real_estate.groupby('City')['Price'].mean().sort_values()

fig, ax = plt.subplots(figsize=(10, 6))
bars = avg_price_city.plot.bar(ax=ax)  # pandas returns the Axes it drew on

# Annotate every bar with its height, centred just above the bar top
for patch in bars.patches:
    height = patch.get_height()
    x_center = patch.get_x() + patch.get_width()/2
    bars.text(x_center, height, f'{height:.2f}', ha='center', va='bottom')

ax.set_title('Average Price by City')
ax.set_xlabel('City')
ax.set_ylabel('Average Price')
plt.xticks(rotation=45)
plt.show()

# Descriptive visualization 4: Average price per property type

# Mean price for each property type, sorted ascending by that mean
avg_price_type = aus_real_estate.groupby('Type')['Price'].mean().sort_values()

fig, ax = plt.subplots(figsize=(10, 6))
bars = avg_price_type.plot.bar(ax=ax)  # pandas returns the Axes it drew on

# Annotate every bar with its height, centred just above the bar top
for patch in bars.patches:
    height = patch.get_height()
    x_center = patch.get_x() + patch.get_width()/2
    bars.text(x_center, height, f'{height:.2f}', ha='center', va='bottom')

ax.set_title('Average Price by Property Type')
ax.set_xlabel('Property Type')
ax.set_ylabel('Average Price')
plt.xticks(rotation=45)
plt.show()

In [None]:
# Multiple linear regression: fit y on all columns of X at once.
# LinearRegression.fit returns the fitted estimator, so construction and
# fitting can be written as one expression.
multi_lr_model = linear_model.LinearRegression().fit(X, y)

# Pair each predictor name with its fitted coefficient; the resulting dict
# is the cell's displayed output.
feature_names = X.columns
coefficients = multi_lr_model.coef_
{name: coef for name, coef in zip(feature_names, coefficients)}

In [None]:
# Mean-squared error of the multivariate model's predictions for X against y
fitted_values = multi_lr_model.predict(X)
mse_multivariate = metrics.mean_squared_error(y, fitted_values)
print(f"Our Multivariate model had a mean-squared error of {mse_multivariate:.4f}")

In [None]:
# Descriptive visualization: Average price per number of bedrooms

# Mean price for each bedroom count, ordered by bedroom count
avg_price_bedrooms = aus_real_estate.groupby('Bedrooms')['Price'].mean().sort_index()

# Line graph with a marker at each bedroom count
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(
    avg_price_bedrooms.index,
    avg_price_bedrooms.values,
    marker='o',
    linestyle='-',
    color='skyblue',
)
ax.set_title('Average Price by Number of Bedrooms')
ax.set_xlabel('Number of Bedrooms')
ax.set_ylabel('Average Price')
ax.grid(axis='y', linestyle='--', alpha=0.7)  # horizontal guide lines only
plt.show()

# Descriptive visualization: Average price per number of bathrooms

# Mean price for each bathroom count, ordered by bathroom count
avg_price_bathrooms = aus_real_estate.groupby('Bathrooms')['Price'].mean().sort_index()

# Plotting the line graph
plt.figure(figsize=(10, 6))
plt.plot(avg_price_bathrooms.index, avg_price_bathrooms.values, marker='o', linestyle='-', color='skyblue')
plt.title('Average Price by Number of Bathrooms')
# The x-axis holds bathroom counts (the groupby key), so label it accordingly
plt.xlabel('Number of Bathrooms')
plt.ylabel('Average Price')
plt.grid(axis='y', linestyle='--', alpha=0.7)  # horizontal guide lines only
plt.show()

In [None]:
# Simple linear regression of y on the single predictor 'SqFt'

sqft_lr_model = linear_model.LinearRegression()

# Fit the model, i.e. estimate the intercept (beta_0) and slope (beta_1) from the data.
# Double brackets keep the predictor as a one-column DataFrame.
sqft_lr_model.fit(X[["SqFt"]], y)

beta_0 = sqft_lr_model.intercept_
beta_1 = sqft_lr_model.coef_[0]

# Name the regressor after the column actually used ('SqFt'); this dataset
# has no 'sqft_living' variable.
print(f"Fitted model: price = {beta_0:.4f} + {beta_1:.4f} sqft")

In [None]:
# Simple linear regression of y on the single predictor 'Lot_Area'

# Double brackets keep the predictor as a one-column DataFrame;
# fit() returns the fitted estimator itself.
lot_lr_model = linear_model.LinearRegression().fit(X[["Lot_Area"]], y)

# Intercept and slope of the fitted line
beta_0, beta_1 = lot_lr_model.intercept_, lot_lr_model.coef_[0]

print(f"Fitted model: price = {beta_0:.4f} + {beta_1:.4f} lot_area")

In [None]:
# Simple linear regression of y on the single predictor 'Bedrooms'

# Double brackets keep the predictor as a one-column DataFrame;
# fit() returns the fitted estimator itself.
bedroom_lr_model = linear_model.LinearRegression().fit(X[["Bedrooms"]], y)

# Intercept and slope of the fitted line
beta_0, beta_1 = bedroom_lr_model.intercept_, bedroom_lr_model.coef_[0]

print(f"Fitted model: price = {beta_0:.4f} + {beta_1:.4f} bedroom")

In [None]:
# Simple linear regression of y on the single predictor 'Bathrooms'

# Double brackets keep the predictor as a one-column DataFrame;
# fit() returns the fitted estimator itself.
bathroom_lr_model = linear_model.LinearRegression().fit(X[["Bathrooms"]], y)

# Intercept and slope of the fitted line
beta_0, beta_1 = bathroom_lr_model.intercept_, bathroom_lr_model.coef_[0]

print(f"Fitted model: price = {beta_0:.4f} + {beta_1:.4f} bathroom")

In [None]:
# Regress SqFt on bedrooms and on bathrooms (two separate one-variable models)
# To see if an increase in one unit of bedrooms & bathrooms results in an increase or decrease in SqFt

# Use a dedicated target name instead of rebinding the notebook-wide `y`:
# cells that regress on Price read `y`, and would silently switch to SqFt
# if they were re-run after this cell.
sqft_target = aus_real_estate['SqFt']

# Likewise, give these models their own names so the Price-based
# `bedroom_lr_model` / `bathroom_lr_model` are not overwritten.
bedroom_sqft_lr_model = linear_model.LinearRegression()

bedroom_sqft_lr_model.fit(X[["Bedrooms"]], sqft_target)

beta_0_bed = bedroom_sqft_lr_model.intercept_
beta_1_bed = bedroom_sqft_lr_model.coef_[0]

print(f"Fitted model: SqFt = {beta_0_bed:.4f} + {beta_1_bed:.4f} bedroom")

bathroom_sqft_lr_model = linear_model.LinearRegression()

bathroom_sqft_lr_model.fit(X[["Bathrooms"]], sqft_target)

beta_0_bath = bathroom_sqft_lr_model.intercept_
beta_1_bath = bathroom_sqft_lr_model.coef_[0]

print(f"Fitted model: SqFt = {beta_0_bath:.4f} + {beta_1_bath:.4f} bathroom")

In [None]:
# Mean SqFt for each bedroom count, ordered by bedroom count
avg_sqft = aus_real_estate.groupby('Bedrooms')['SqFt'].mean().sort_index()

# Line graph with a marker at each bedroom count
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(avg_sqft.index, avg_sqft.values, marker='o', linestyle='-', color='skyblue')
ax.set_title('Average SqFt by Number of Bedrooms')
ax.set_xlabel('Number of Bedrooms')
ax.set_ylabel('Average SqFt')
ax.grid(axis='y', linestyle='--', alpha=0.7)  # horizontal guide lines only
plt.show()

# Mean SqFt for each bathroom count, ordered by bathroom count
avg_sqft1 = aus_real_estate.groupby('Bathrooms')['SqFt'].mean().sort_index()

# Line graph with a marker at each bathroom count
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(avg_sqft1.index, avg_sqft1.values, marker='o', linestyle='-', color='skyblue')
ax.set_title('Average SqFt by Number of Bathrooms')
ax.set_xlabel('Number of Bathrooms')
ax.set_ylabel('Average SqFt')
ax.grid(axis='y', linestyle='--', alpha=0.7)  # horizontal guide lines only
plt.show()