# Generate training dataset

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

# make all output interactive
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import helper

In [2]:
# Define file structure constants
# ================================
# Root of the data folder, expressed relative to the current working directory.
DATA_PATH = os.path.join(os.getcwd(), "../data/")
# Sub-folders of the data folder: "raw" and "derived".
RAW_DATA_PATH, DERIVED_DATA_PATH = (
    os.path.join(DATA_PATH, subfolder) for subfolder in ("raw", "derived")
)

In [90]:
# Read datasets
# ================================
# main suburb data (loaded by the project-local `helper` module)
suburb_data = helper.getSuburbData()
# generated preference data (loaded by the project-local `helper` module)
preference_data = helper.getPreferenceData()
# suburb campus distance matrix -- path built from DERIVED_DATA_PATH so that the
# data location is defined in exactly one place
suburb_campus_dist = pd.read_csv(os.path.join(DERIVED_DATA_PATH, "SuburbCampusDist.csv"))
# suburb raw data
suburb_rawdata = pd.read_csv(os.path.join(DERIVED_DATA_PATH, "SuburbRawdata.csv"))

In [4]:
# All the predictor variables in training data, grouped by their name prefix
attributes = [
    # acc_* : dwelling types
    "acc_house_onebed", "acc_house_twobed", "acc_house_threebed", "acc_house_fourplusbed",
    "acc_apartment_onebed", "acc_apartment_twobed", "acc_apartment_threebed", "acc_apartment_fourplusbed",
    "acc_rented_house_relative", "acc_rented_apartment_relative", "acc_shared_relative",
    # acc_rent_* : rent brackets
    "acc_rent_1_74", "acc_rent_75_99", "acc_rent_100_149", "acc_rent_150_199",
    "acc_rent_200_224", "acc_rent_225_274", "acc_rent_275_349", "acc_rent_350_449",
    "acc_rent_450_549", "acc_rent_550_649", "acc_rent_650_749", "acc_rent_750_849",
    "acc_rent_850_949", "acc_rent_950_plus",
    # env_*
    "env_retail", "env_accomodation_food", "env_public_admin", "env_healthcare_social_assist",
    "env_arts_recreation", "env_rental_hiring_realestate", "env_parks",
    # dem_*
    "dem_students_relative",
    # tra_*
    "tra_train", "tra_bus", "tra_tram",
    # saf_*
    "saf_crime_person", "saf_crime_property", "saf_drug_offences", "saf_order_security",
    "saf_justice_procedure", "saf_other",
    # com_* : maximum commute distance followed by one entry per campus
    "com_max_dist",
    "com_catholic_ballarat", "com_catholic_melbourne",
    "com_deakin_burwood", "com_deakin_geelong", "com_deakin_warrnambool",
    "com_federation_ballarat", "com_federation_berwick", "com_federation_churchill", "com_federation_wimmera",
    "com_latrobe_bendigo", "com_latrobe_melbourne", "com_latrobe_mildura", "com_latrobe_shepparton", "com_latrobe_wodonga",
    "com_monash_caulfield", "com_monash_clayton", "com_monash_parkville", "com_monash_peninsula",
    "com_rmit_melbourne",
    # NOTE(review): both "swinbourne" and "swinburne" spellings are listed -- confirm against the data columns
    "com_swinbourne_croydon", "com_swinbourne_hawthorn", "com_swinbourne_wantirna",
    "com_swinburne_croydon", "com_swinburne_hawthorne", "com_swinburne_wantirna",
    "com_torrens_melbourne",
    "com_unimelb_burnley", "com_unimelb_creswick", "com_unimelb_dookie", "com_unimelb_parkville",
    "com_unimelb_shepparton", "com_unimelb_southbank", "com_unimelb_werribee",
    "com_vicuni_footscray", "com_vicuni_melbourne", "com_vicuni_stalbans", "com_vicuni_sunshine", "com_vicuni_werribee",
]

## Normalisation

The values/scores for each 'quality' of the suburbs must be normalised between all suburbs to ensure that one attribute/'quality' does not dominate others. For example, an attribute such as `dem_students_relative` is very small (due to the relative nature of the attribute) but another attribute such as `com_monash_clayton` can be very large (since this attribute is determined by the distance in kilometers from the campus to the suburb).

Normalisation is done using the following equation: $$X_{normalised} = \frac{X - min(X)}{max(X) - min(X)}$$

In [482]:
def normalise_column(column):
    """Min-max scale a column: (x - min) / (max - min).

    Parameters
    ----------
    column : pd.Series
        Numeric values to rescale.

    Returns
    -------
    pd.Series
        The rescaled values. When the column has at least two distinct values
        the result spans 0 to 1. A constant column (max == min) is mapped to
        0.0 instead of the NaN that a 0/0 division would produce; missing
        values stay missing.
    """
    lowest = column.min()
    span = column.max() - lowest
    # A zero range would turn every entry into NaN (0/0). Dividing by 1 instead
    # leaves (x - min) == 0 for every entry. The scalar check keeps the original
    # behaviour for any input whose min/max are not scalars.
    if np.ndim(span) == 0 and span == 0:
        span = 1
    return (column - lowest) / span

def normalise_dataframe(df: pd.DataFrame, columns: list = None, inplace: bool = False) -> pd.DataFrame:
    """Apply `normalise_column` to the chosen columns of a dataframe.

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe to normalise.
    columns : list, optional
        Labels of the columns to normalise; every column of `df` when None.
    inplace : bool
        When True the columns of `df` itself are overwritten and `df` is
        returned; otherwise a copy is modified and returned.

    Returns
    -------
    pd.DataFrame
        The dataframe holding the normalised columns.
    """
    selected = df.columns if columns is None else columns
    # write into the caller's frame only when explicitly asked to
    target = df if inplace is True else df.copy()
    for name in selected:
        target[name] = normalise_column(target[name])
    return target

In [6]:
# Normalise every column from position 3 onwards; the leading columns are left untouched.
normalised_suburb_data = normalise_dataframe(suburb_data, suburb_data.columns[3:].tolist())
normalised_suburb_data.head()

Unnamed: 0,postcode,locality,coordinates,acc_house_onebed,acc_house_twobed,acc_house_threebed,acc_house_fourplusbed,acc_apartment_onebed,acc_apartment_twobed,acc_apartment_threebed,...,com_unimelb_dookie,com_unimelb_parkville,com_unimelb_shepparton,com_unimelb_southbank,com_unimelb_werribee,com_vicuni_footscray,com_vicuni_melbourne,com_vicuni_stalbans,com_vicuni_sunshine,com_vicuni_werribee
0,3737.0,Abbeyard,"(-36.986557, 146.7708948)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.337638,0.337638,0.293617,0.337615,0.386492,0.348794,0.347584,0.364839,0.361423,0.385768
1,3067.0,Abbotsford,"(-37.8023601, 144.9983623)",0.006712,0.049681,0.042386,0.020136,0.6141,0.511209,0.320312,...,0.00369,0.00369,0.344681,0.005505,0.04878,0.014842,0.01487,0.034026,0.02809,0.046816
2,3040.0,Aberfeldie,"(-37.7603346, 144.8956625)",0.0,0.026369,0.070471,0.141401,0.063228,0.050335,0.256866,...,0.01107,0.01107,0.340426,0.016514,0.037523,0.007421,0.005576,0.015123,0.011236,0.037453
3,3825.0,Aberfeldy,"(-37.6979736, 146.3609099)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.226937,0.226937,0.359574,0.223853,0.272045,0.237477,0.237918,0.258979,0.252809,0.271536
4,3714.0,Acheron,"(-37.2587906, 145.702904)",0.0,0.041096,0.090411,0.157534,0.0,0.0,0.0,...,0.162362,0.162362,0.214894,0.163303,0.206379,0.170686,0.171004,0.181474,0.179775,0.205993


## Inversion

Some of the suburb attributes must be inverted due to differing definitions of 'goodness'. For example, the attributes for commute themed qualities are 'worse' for a suburb when the value is large (i.e., a large distance is not desired), whereas for another attribute such as `dem_students_relative`, having a large value is good since the user is looking to have a higher density of students.

The attributes which must be inverted are the attributes from the commute theme and the safety theme (since the safety values are determined as high values = more crimes).

Inversion is done by taking the reciprocal of each value in these columns ($1/x$). Any previously high values (which we want to become small) will do so when inverted, and likewise low values (which we want to become large) will become large. Note that a value of 0 inverts to infinity and that the inverted values are no longer between 0 and 1, so infinite values must be handled and the columns normalised again before the inverted attributes are combined with the other attributes.

In [360]:
def invert_dataframe(df: pd.DataFrame, columns: list = None, inplace: bool = False) -> pd.DataFrame:
    """Replace the chosen columns of a dataframe with their reciprocal (1 / x).

    Parameters
    ----------
    df : pd.DataFrame
        The dataframe whose columns are inverted.
    columns : list, optional
        Labels of the columns to invert; every column of `df` when None.
    inplace : bool
        When True the columns of `df` itself are overwritten and `df` is
        returned; otherwise a copy is modified and returned.

    Returns
    -------
    pd.DataFrame
        The dataframe holding the inverted columns. Zero entries of a numeric
        column become infinite, so callers have to deal with those.
    """
    selected = df.columns if columns is None else columns
    # write into the caller's frame only when explicitly asked to
    target = df if inplace is True else df.copy()
    for name in selected:
        target[name] = 1 / target[name]
    return target

In [361]:
# Columns of the safety ("saf") and commute ("com") themes, identified by the
# text before the first underscore of the column name.
invertable_columns = [
    name
    for name in normalised_suburb_data.columns
    if name.partition("_")[0] in ("saf", "com")
]

inv_norm_suburb_data = invert_dataframe(normalised_suburb_data, invertable_columns)

inv_norm_suburb_data.head()

# store the normalised and inverted suburb data in the derived data folder
inv_norm_suburb_data.to_csv(
    os.path.join(DERIVED_DATA_PATH, "NormalizedSuburbData.csv"),
    index=False,
)

Unnamed: 0,postcode,locality,coordinates,acc_house_onebed,acc_house_twobed,acc_house_threebed,acc_house_fourplusbed,acc_apartment_onebed,acc_apartment_twobed,acc_apartment_threebed,...,com_unimelb_dookie,com_unimelb_parkville,com_unimelb_shepparton,com_unimelb_southbank,com_unimelb_werribee,com_vicuni_footscray,com_vicuni_melbourne,com_vicuni_stalbans,com_vicuni_sunshine,com_vicuni_werribee
0,3737.0,Abbeyard,"(-36.986557, 146.7708948)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.961749,2.961749,3.405797,2.961957,2.587379,2.867021,2.877005,2.740933,2.766839,2.592233
1,3067.0,Abbotsford,"(-37.8023601, 144.9983623)",0.006712,0.049681,0.042386,0.020136,0.6141,0.511209,0.320312,...,271.0,271.0,2.901235,181.666667,20.5,67.375,67.25,29.388889,35.6,21.36
2,3040.0,Aberfeldie,"(-37.7603346, 144.8956625)",0.0,0.026369,0.070471,0.141401,0.063228,0.050335,0.256866,...,90.333333,90.333333,2.9375,60.555556,26.65,134.75,179.333333,66.125,89.0,26.7
3,3825.0,Aberfeldy,"(-37.6979736, 146.3609099)",0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.406504,4.406504,2.781065,4.467213,3.675862,4.210938,4.203125,3.861314,3.955556,3.682759
4,3714.0,Acheron,"(-37.2587906, 145.702904)",0.0,0.041096,0.090411,0.157534,0.0,0.0,0.0,...,6.159091,6.159091,4.653465,6.123596,4.845455,5.858696,5.847826,5.510417,5.5625,4.854545


## Additional suburb data processing

We are going to remove all suburbs that don't have a university campus within 35 km. Remember, the preferences only allow the user to set a maximum distance of 30 km.

In [88]:
# keep only the commute columns (names starting with "com_")
is_commute_column = suburb_data.columns.str.contains("^com_", regex=True)
only_commute = suburb_data[suburb_data.columns[is_commute_column]]

# True for every suburb whose smallest commute value is at most 35
only_commute_dist_mask = only_commute.apply(lambda distances: min(distances) <= 35, axis=1)

# rows failing the mask are blanked by `where` and then removed by `dropna`
# (which also removes any row that already contained a missing value)
new_suburb_data = suburb_data.where(only_commute_dist_mask, axis=0).dropna()

new_suburb_data

Unnamed: 0,postcode,locality,coordinates,acc_house_onebed,acc_house_twobed,acc_house_threebed,acc_house_fourplusbed,acc_apartment_onebed,acc_apartment_twobed,acc_apartment_threebed,...,com_unimelb_dookie,com_unimelb_parkville,com_unimelb_shepparton,com_unimelb_southbank,com_unimelb_werribee,com_vicuni_footscray,com_vicuni_melbourne,com_vicuni_stalbans,com_vicuni_sunshine,com_vicuni_werribee
1,3067.0,Abbotsford,"(-37.8023601, 144.9983623)",0.006712,0.066241,0.070643,0.020136,0.106404,0.165603,0.022557,...,3.0,3.0,164.0,4.0,29.0,9.0,9.0,19.0,15.0,28.0
2,3040.0,Aberfeldie,"(-37.7603346, 144.8956625)",0.000000,0.035159,0.117452,0.141401,0.010955,0.016306,0.018089,...,7.0,7.0,162.0,10.0,23.0,5.0,4.0,9.0,6.0,23.0
7,3352.0,Addington,"(-37.3858942, 143.6850371)",0.000000,0.046154,0.153846,0.138462,0.000000,0.000000,0.000000,...,122.0,122.0,191.0,123.0,105.0,117.0,116.0,106.0,111.0,106.0
10,3336.0,Aintree,"(-37.7215428, 144.6625687)",0.000000,0.001002,0.028564,0.238286,0.000000,0.000000,0.000000,...,28.0,28.0,165.0,29.0,19.0,23.0,22.0,12.0,16.0,20.0
14,3042.0,Airport West,"(-37.726234, 144.8813526)",0.002569,0.086382,0.247400,0.072434,0.000000,0.003181,0.002936,...,11.0,11.0,158.0,13.0,25.0,9.0,8.0,8.0,7.0,24.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2863,3869.0,Yinnar,"(-38.3258006, 146.324683)",0.002938,0.036239,0.211557,0.134182,0.000000,0.000000,0.000000,...,133.0,133.0,233.0,131.0,151.0,138.0,138.0,148.0,144.0,150.0
2864,3869.0,Yinnar South,"(-38.3774747, 146.34308)",0.011852,0.023704,0.148148,0.180741,0.000000,0.000000,0.000000,...,137.0,137.0,239.0,135.0,154.0,142.0,142.0,152.0,148.0,154.0
2866,3646.0,Youanmite,"(-36.1567319, 145.6916906)",0.000000,0.000000,0.232143,0.142857,0.000000,0.000000,0.000000,...,193.0,193.0,34.0,196.0,212.0,196.0,195.0,194.0,195.0,212.0
2870,3063.0,Yuroke,"(-37.5896434, 144.8779217)",0.000000,0.032520,0.113821,0.186992,0.000000,0.000000,0.000000,...,24.0,24.0,144.0,27.0,37.0,24.0,23.0,19.0,21.0,37.0


Now we remove any suburbs whose total renting population (`Tot_Tot_G40` in Raw Dataset) is less than 100 (since it is unlikely that a user would be able to find rented accommodation in this region) and where the population is less than 100 (since these are likely too small for comfortable living).

In [93]:
# Ratio of Tot_Tot_G40 to Tot_P_P per suburb, rescaled by the mean of Tot_P_P;
# undefined ratios (NaN) are set to 0.
# NOTE(review): the accompanying text describes a plain population threshold,
# whereas this is a scaled per-capita figure -- confirm which one is intended.
suburb_rawdata["rentable_per_capita"] = (
    suburb_rawdata["Tot_Tot_G40"]
    .div(suburb_rawdata["Tot_P_P"])
    .mul(suburb_rawdata["Tot_P_P"].mean())
    .fillna(0)
)

# localities passing both thresholds
pop_locals = suburb_rawdata.query("rentable_per_capita > 100 and Tot_Tot_G40 > 100")["locality"]

Combine these two filters together.

In [95]:
# inner join on the locality name: keeps only suburbs that passed both filters
# (the merge also resets the row index)
new_suburb_data = new_suburb_data.merge(pop_locals, how="inner", on="locality")

## Generate output suburb

Using `new_suburb_data` with the smaller refined list of suburbs, we now determine which suburb is best for each instance of generated preference data in `preference_data`.

- Ensure that the distance to the selected campus is within the `com_max_dist` input.
- Remove the bottom 10% of suburbs for selected transport options.

In [483]:
def get_campus(pref_row):
    """Return the campus column that a preference row gives a positive value to.

    Campus columns are the entries of `pref_row` whose label starts with
    "com_", except "com_max_dist", which holds the distance limit rather than
    a campus. The limit is excluded by name, so the result does not depend on
    its value or on where it sits in the row.

    Parameters
    ----------
    pref_row : pd.Series
        One row of preference data, indexed by attribute name.

    Returns
    -------
    str or None
        Label of the first campus column with a value greater than 0, or None
        when no campus column has a positive value.
    """
    labels = pref_row.index
    is_campus = labels.str.contains("^com_", regex=True) & (labels != "com_max_dist")
    campus_values = pref_row[is_campus]
    # missing values compare as False and are therefore never selected
    selected = campus_values[campus_values > 0].index
    if len(selected) > 0:
        return selected[0]
    return None

In [487]:
def get_suburb(pref_row):
    """Return the locality of the best-scoring suburb for one preference row.

    Steps:
      1. If the row selects a campus (see `get_campus`), keep only the suburbs
         whose value in that campus column is at most `com_max_dist`. When even
         the closest suburb is further away than that, the limit is relaxed to
         ten times the closest value so at least one suburb remains.
      2. Invert the columns listed in `invertable_columns`, replace infinities
         by 0 and min-max normalise every column from position 3 onwards.
      3. Multiply each suburb row by the preference values (aligned on column
         name), sum per suburb and pick the largest total.

    `pref_row` is only read: the relaxed limit is held in a local variable, so
    the caller's preference data is never altered.

    Parameters
    ----------
    pref_row : pd.Series
        One row of preference data, indexed by attribute name.

    Returns
    -------
    The `locality` value of the suburb with the highest score.
    """
    # copy suburb dataframe so that the shared table is never edited
    candidates = new_suburb_data.copy()

    # filter by campus distance when that is required (a campus was selected)
    campus = get_campus(pref_row)
    if campus is not None:
        max_dist = pref_row.com_max_dist
        # ensure the user max distance is not smaller than the closest suburb
        closest = candidates[campus].min()
        if closest > max_dist:
            max_dist = closest * 10
        candidates = candidates[candidates[campus] <= max_dist]

    # invert and normalise the data
    candidates = invert_dataframe(candidates, invertable_columns)
    # NOTE(review): a value of 0 inverts to inf and is set to 0 here, so such a
    # suburb ends up with the lowest score for that attribute -- confirm intent.
    candidates = candidates.replace(np.inf, 0)
    candidates = normalise_dataframe(candidates, candidates.columns[3:])

    # weight each suburb attribute by the matching preference and total per suburb
    scores = candidates.iloc[:, 3:].multiply(pref_row).sum(axis=1)
    best_suburb_index = scores.nlargest(1).index[0]
    # return the name of the locality which is 'best'
    return candidates.locality[best_suburb_index]

In [505]:
# label every preference row with the suburb chosen by `get_suburb`
training_data = preference_data.assign(
    suburb=preference_data.apply(get_suburb, axis=1)
)

In [507]:
# how often each suburb was selected as the best match
training_data["suburb"].value_counts()

Burnley             14368
Collingwood          9202
Carlton              8306
Heidelberg West      7418
Flemington           6324
                    ...  
Golden Square           1
Waurn Ponds             1
Noble Park North        1
Oakleigh East           1
Avondale Heights        1
Name: suburb, Length: 225, dtype: int64

In [None]:
# write the labelled training data to the derived data folder
training_data.to_csv(
    os.path.join(DERIVED_DATA_PATH, "TrainingData.csv"),
    index=False,
)