In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from random import randint
import ipynb.fs.full.preProcessing as preProcessing
# from ipynb.fs.full.eda_new import population_data as _population_data
from ipynb.fs.full.eda_new import population_data

In [13]:
df = pd.read_csv('./final.csv')

cleaned_egm = preProcessing.cleanedEgm
cleaned_offences = preProcessing.cleanedOffences
cleaned_housing_prices = preProcessing.cleanedHousingPrice
cleaned_communities = preProcessing.cleanedCommunities

# Restructure dataframe
cleaned_housing_prices_melted = pd.melt(cleaned_housing_prices.reset_index(), id_vars=['LGA'], var_name='Year', value_name='house_price')
cleaned_housing_prices_melted.head()

# Convert year to int and increment by 1 to lag the year
cleaned_housing_prices_melted['Year'] = cleaned_housing_prices_melted['Year'].astype(int) + 1

# Merge with housing prices data
merged = pd.merge(left=cleaned_communities, right=cleaned_housing_prices_melted, on="LGA")

# Merge with population data
merged = pd.merge(left=merged, right=population_data, left_on=["LGA", 'Year'], right_on=["lga", 'year'], how='left')

features = [
    # 'Median House Price',
    'Residential (km^2)',
    'Number of Households',
    'Occupied private dwellings',
    'Equivalent household income <$600/week',
    'Personal income <$400/week, persons',
    'Number of families',
    'Aged 75+ and lives alone, persons',
    'house_price',
    'Year'
]

# There are some missing values in the population data
# Turns out that the missing values all belong to the year 2024, which does not exist in offences dataset, but is in housing prices dataset
print(merged[merged['lga'].isnull()]['Year'].unique())

# Drop rows with year 2013, or use inner join above
merged = merged[merged['Year'] != 2024]

print(merged.shape)

print(list(merged.columns))  

[2024]
(560, 109)
['LGA', 'Travel time to GPO (minutes)', 'Area (km^2)', 'ARIA+ (min)', 'ARIA+ (max)', 'ARIA+ (avg)', 'Commercial (km^2)', 'Industrial (km^2)', 'Residential (km^2)', 'Rural (km^2)', 'Other (km^2)', '2012 ERP age 0-4, persons', '2012 ERP age 5-9, persons', '2012 ERP age 10-14, persons', '2012 ERP age 15-19, persons', '2012 ERP age 20-24, persons', '2012 ERP age 25-44, persons', '2012 ERP age 45-64, persons', '2012 ERP age 65-69, persons', '2012 ERP age 70-74, persons', '2012 ERP age 75-79, persons', '2012 ERP age 80-84, persons', '2012 ERP age 85+, persons', '2012 ERP, total', '2007 ERP age 0-4, persons', '2007 ERP age 5-9, persons', '2007 ERP age 10-14, persons', '2007 ERP age 15-19, persons', '2007 ERP age 20-24, persons', '2007 ERP age 25-44, persons', '2007 ERP age 45-64, persons', '2007 ERP age 65-69, persons', '2007 ERP age 70-74, persons', '2007 ERP age 75-79, persons', '2007 ERP age 80-84, persons', '2007 ERP age 85+, persons', '2007 ERP, total', 'Public Hospital

# The two models below are trained independently for each LGA, might not be accurate?

In [14]:
# Linear Regression using single test-train split for each LGA

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

model_data = merged.groupby('LGA')

model = LinearRegression()

mse_values = []
rmse_values = []

for lga, group in model_data:
    X = group[features]
    y = group['population'] # From population_data dataframe
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=randint(1, 10000000))
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # print("Predicted Population:", y_pred)

    mse = mean_squared_error(y_test, y_pred)
    # print("Mean Squared Error (MSE):", mse)
    mse_values.append(mse)

    rmse = np.sqrt(mse)
    # print("Root Mean Squared Error (RMSE):", rmse)
    rmse_values.append(rmse)


print("Mean Squared Error (MSE):", np.mean(mse_values))
print("Root Mean Squared Error (RMSE):", np.mean(rmse_values))

rmse_percentage = np.mean(rmse_values) / np.mean(merged['population']) * 100
print(rmse_percentage, "%")

Mean Squared Error (MSE): 9058215.463370716
Root Mean Squared Error (RMSE): 1694.0343714456988
1.5717583026624442 %


In [15]:
# Linear Regression using KFold for each LGA

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold

model_data = merged.groupby('LGA')

model = LinearRegression()

mse_values = []
rmse_values = []

kf = KFold(n_splits=10, shuffle=True, random_state=randint(1, 10000000))

for lga, group in model_data:
    X = group[features]
    y = group['population'] # From population_data dataframe

    for train_index, test_index in kf.split(X):
        # Split the data into training and testing sets for the current fold
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model.fit(X_train, y_train)

        y_pred = model.predict(X_test)
        # print("Predicted Population:", y_pred)

        mse = mean_squared_error(y_test, y_pred)
        # print("Mean Squared Error (MSE):", mse)
        mse_values.append(mse)

        rmse = np.sqrt(mse)
        # print("Root Mean Squared Error (RMSE):", rmse)
        rmse_values.append(rmse)


print("Mean Squared Error (MSE):", np.mean(mse_values))
print("Root Mean Squared Error (RMSE):", np.mean(rmse_values))

rmse_percentage = np.mean(rmse_values) / np.mean(merged['population']) * 100
print(rmse_percentage, "%")

Mean Squared Error (MSE): 7522020.5945827225
Root Mean Squared Error (RMSE): 1561.5282565369391
1.4488165313667118 %


# Train on across all LGA (using one-hot encoding)

In [16]:
# Linear Regression using KFold for each LGA

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.preprocessing import OneHotEncoder

model_data = merged.copy()

# One-hot encode LGA column
unique_lga_names = model_data['LGA'].apply(lambda x: 'LGA_' + x).unique()
model_data_encoded = pd.get_dummies(model_data, columns=['LGA'])

# Combine one-hot encoded LGA columns with other features
features_new = [
    *features,
    *unique_lga_names
]

model = LinearRegression()

mse_values = []
rmse_values = []

kf = KFold(n_splits=10, shuffle=True, random_state=randint(1, 10000000))

# for lga, group in model_data:
X = group[features]
y = group['population'] # From population_data dataframe

for train_index, test_index in kf.split(X):
    # Split the data into training and testing sets for the current fold
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    # print("Predicted Population:", y_pred)

    mse = mean_squared_error(y_test, y_pred)
    # print("Mean Squared Error (MSE):", mse)
    mse_values.append(mse)

    rmse = np.sqrt(mse)
    # print("Root Mean Squared Error (RMSE):", rmse)
    rmse_values.append(rmse)


print("Mean Squared Error (MSE):", np.mean(mse_values))
print("Root Mean Squared Error (RMSE):", np.mean(rmse_values))

rmse_percentage = np.mean(rmse_values) / np.mean(merged['population']) * 100
print(rmse_percentage, "%")

Mean Squared Error (MSE): 3400098.874310495
Root Mean Squared Error (RMSE): 1703.6696748297632
1.5806981260496467 %
