# Merge census data with panel data by suburb
- This could be moved into the preprocess forecast notebook


In [298]:
import pandas as pd

In [299]:
# Load the curated rent-growth panel data from CSV
data_path = "../data/curated/rent_growth/panel_data.csv"
panel_data = pd.read_csv(filepath_or_buffer=data_path)



In [300]:
def merge_on_suburb(
    column: str, data: pd.DataFrame, destination: pd.DataFrame
) -> pd.DataFrame:
    """
    Merge ``data`` onto ``destination`` by suburb when the names differ in form.

    Suburb names in ``data`` are normalised (bracketed suffixes removed,
    lower-cased, a few spellings remapped) before merging. For every
    hyphenated suburb in ``destination`` (e.g. ``"a-b"``) whose parts are all
    present in ``data``, an extra row is synthesised whose ``column`` value is
    the mean of the parts - so this only works for numeric data.

    Args:
        column: Name of the numeric column in ``data`` to average for
            combined suburbs.
        data: Incoming table with a ``"suburb"`` column and ``column``.
            It is not modified; a copy is normalised internally.
        destination: Table with a ``"suburb"`` column to merge onto.

    Returns:
        The inner join of ``destination`` and the normalised ``data`` on
        ``"suburb"``. Destination suburbs with no match are dropped.
        Synthesised combined-suburb rows carry only ``"suburb"`` and
        ``column``; any other columns of ``data`` are NaN for them.
    """
    # Work on a copy so the caller's DataFrame is left untouched
    data = data.copy()

    # incoming dataset needs brackets / extra info removed, and lower case
    data["suburb"] = (
        data["suburb"]
        .str.replace(r"\s*\([^)]*\)", "", regex=True)
        .str.lower()
    )

    # extra adjustments needed to match the destination's specific formatting
    # NOTE(review): "wanagaratta" and "newcombe" look like misspellings that
    # presumably mirror the destination data - confirm before "fixing" them.
    suburb_renames = {
        "brunswick west": "west brunswick",
        "brunswick east": "east brunswick",
        "st kilda east": "east st kilda",
        "st kilda west": "west st kilda",
        "hawthorn east": "east hawthorn",
        "east bendigo": "bendigo east",
        "mount martha": "mt martha",
        "mount eliza": "mt eliza",
        "wangaratta": "wanagaratta",
        "newcomb": "newcombe",
    }
    data["suburb"] = data["suburb"].replace(suburb_renames)

    # Build rows for conjoined (hyphenated) destination suburbs.
    # The set of known suburbs is computed once: synthesised rows always
    # contain a hyphen, so they can never satisfy a later parts lookup.
    known_suburbs = set(data["suburb"])
    combined_rows = []
    for suburb in destination["suburb"].unique():
        # only hyphenated suburbs need to be averaged
        if "-" not in suburb:
            continue
        parts = suburb.split("-")
        # every part must be present, otherwise no average can be taken
        if not set(parts).issubset(known_suburbs):
            continue
        average_value = data.loc[data["suburb"].isin(parts), column].mean()
        combined_rows.append({"suburb": suburb, column: average_value})

    # Append all synthesised rows in one go instead of once per suburb
    if combined_rows:
        data = pd.concat(
            [data, pd.DataFrame(combined_rows)], ignore_index=True
        )

    # now do a merge on suburb
    merged = pd.merge(destination, data, on="suburb", how="inner")

    # save the csv - MAYBE
    # merged.to_csv(f"../data/curated/check_{column}.csv", index = False)
    return merged
        

In [324]:
# open census population data
data_path = "../data/landing/population_breakdown.csv"
census_pop = pd.read_csv(data_path)

# Drop individual age years: keep only rows whose "Age group" contains
# "-", "years" or "Total" (case-insensitive)
mask = census_pop["Age group"].str.contains(r"-|years|Total", case=False, na=False)
pop_filtered = census_pop[mask].reset_index(drop=True)
pop_filtered["Suburb"] = pop_filtered["Suburb"].str.lower()
# rename the suburb column to match destination df
pop_filtered = pop_filtered.rename(columns={"Suburb": "suburb"})

# Only care about suburb totals for now
subset = pop_filtered[pop_filtered["Age group"] == "Total"]
subset = subset.rename(columns={"Persons": "population size"}).drop(columns=["Age group"])
panel_data_1 = merge_on_suburb("population size", subset, panel_data)
# TODO: repeat the merge per age group rather than totals only, e.g.
# for age_cat in pop_filtered["Age group"].unique():
#     subset = pop_filtered[pop_filtered["Age group"] == age_cat]
#     subset = subset.rename(columns={"Persons": age_cat}).drop(columns=["Age group"])
#     merge_on_suburb(age_cat, subset, moving_annual)


# Use median stats to add median personal income
data_path = "../data/landing/median_stats.csv"
census_medians = pd.read_csv(data_path)
# extract median personal income
subset = census_medians[census_medians["Statistic"] == "Median total personal income ($/weekly)"]
subset = subset.rename(columns={"Value":
                                "median personal income"}).drop(columns=["Statistic"])
subset = subset.rename(columns={"Suburb": "suburb"})
# Merge income onto the population-augmented panel. Without this step
# panel_data_2 is never defined and the income subset is never used.
panel_data_2 = merge_on_suburb("median personal income", subset, panel_data_1)

# extract median age
subset_age = census_medians[census_medians["Statistic"] == "Median age of persons"]
subset_age = subset_age.rename(columns={"Value":
                                "median age"}).drop(columns=["Statistic"])
subset_age = subset_age.rename(columns={"Suburb": "suburb"})
panel_data_3 = merge_on_suburb("median age", subset_age, panel_data_2)

# save the updated panel data
panel_data_3.to_csv("../data/curated/rent_growth/panel_data_updates.csv", index=False)


In [302]:
# Report the panel-data suburbs that are missing after the merges
og = panel_data["suburb"].unique()
reduced = panel_data_2["suburb"].unique()

# suburbs present originally but absent from the merged result
lost = set(og).difference(reduced)
print(lost)


{'cbd-st kilda rd', 'yarra ranges', 'ballarat'}
