In [1]:
import pandas as pd
import numpy as np
import os 

# Load the Excel file and examine its structure
file_path = '../data/landing/moving_annual_rent/moving_annual_median_weekly_rent_by_suburb.xlsx'
# Get all sheet names
xl_file = pd.ExcelFile(file_path)
sheet_names = xl_file.sheet_names
print("Available sheets:", sheet_names)


Available sheets: ['1 bedroom flat', '2 bedroom flat', '3 bedroom flat', '2 bedroom house', '3 bedroom house', '4 bedroom house', 'All properties']


In [2]:
def extract_median_data(sheet_name):
    """
    Extract median rent data from a specific sheet.
    Returns a DataFrame with suburbs as rows and quarters as columns.
    """
    # Read the raw data
    df = pd.read_excel(file_path, sheet_name=sheet_name, header=None)
    
    # Find the header row (row 1 contains quarter names)
    quarter_row = 1
    median_row = 2
    
    # Extract quarter names and identify median columns
    quarters = []
    median_cols = []
    
    for col in range(df.shape[1]):
        if pd.notna(df.iloc[quarter_row, col]) and df.iloc[quarter_row, col] != 'Lease commenced in year ending ':
            quarter_name = df.iloc[quarter_row, col]
            if df.iloc[median_row, col] == 'Median':
                quarters.append(quarter_name)
                median_cols.append(col)
    
    # Extract suburb names (starting from row 3)
    suburb_names = []
    data_rows = []
    
    for row in range(3, df.shape[0]):
        if pd.notna(df.iloc[row, 1]):  # Suburb name is in column 1
            suburb_names.append(df.iloc[row, 1])
            # Extract median values for this suburb
            median_values = []
            for col in median_cols:
                value = df.iloc[row, col]
                # Convert '-' to NaN and other values to numeric
                if value == '-' or pd.isna(value):
                    median_values.append(np.nan)
                else:
                    try:
                        median_values.append(float(value))
                    except (ValueError, TypeError):
                        median_values.append(np.nan)
            data_rows.append(median_values)
    
    # Create DataFrame
    result_df = pd.DataFrame(data_rows, index=suburb_names, columns=quarters)
    result_df.index.name = 'Suburb'
    
    return result_df

# Test with the first sheet
test_df = extract_median_data('1 bedroom flat')
print(f"Shape: {test_df.shape}")
print(f"Columns (first 10): {test_df.columns[:10].tolist()}")
print(f"Index (first 10): {test_df.index[:10].tolist()}")
print("\nFirst few rows and columns:")
print(test_df.iloc[:5, :5])


Shape: (159, 101)
Columns (first 10): ['Mar 2000', 'Jun 2000', 'Sep 2000', 'Dec 2000', 'Mar 2001', 'Jun 2001', 'Sep 2001', 'Dec 2001', 'Mar 2002', 'Jun 2002']
Index (first 10): ['Albert Park-Middle Park-West St Kilda', 'Armadale', 'Carlton North', 'Carlton-Parkville', 'CBD-St Kilda Rd', 'Collingwood-Abbotsford', 'Docklands', 'East Melbourne', 'East St Kilda', 'Elwood']

First few rows and columns:
                                       Mar 2000  Jun 2000  Sep 2000  Dec 2000  \
Suburb                                                                          
Albert Park-Middle Park-West St Kilda     165.0     165.0     170.0     175.0   
Armadale                                  150.0     150.0     155.0     160.0   
Carlton North                             150.0     155.0     150.0     150.0   
Carlton-Parkville                         165.0     170.0     175.0     180.0   
CBD-St Kilda Rd                           250.0     250.0     250.0     250.0   

                               

In [3]:
# Process all property type sheets (excluding 'All properties')
property_types = [sheet for sheet in sheet_names if sheet != 'All properties']
print(f"Processing {len(property_types)} property types: {property_types}")

# Combine all property types into one DataFrame
all_dataframes = []

for prop_type in property_types:
    print(f"Processing {prop_type}...")
    df = extract_median_data(prop_type)
    # Add property type column
    df['Property_Type'] = prop_type
    # Reset index to make suburb a regular column
    df = df.reset_index()
    all_dataframes.append(df)

# Combine all DataFrames
combined_df = pd.concat(all_dataframes, ignore_index=True)
print(f"\nCombined DataFrame shape: {combined_df.shape}")
print(f"Columns: {combined_df.columns.tolist()}")
print(f"Property types: {combined_df['Property_Type'].unique()}")
print(f"Number of suburbs per property type:")
print(combined_df['Property_Type'].value_counts())


# convert columns to lower case
combined_df.columns = combined_df.columns.str.lower()

# convert suburb column to lower case
combined_df['suburb'] = combined_df['suburb'].str.lower()

# remove rows with suburb = 'group total'
combined_df = combined_df[combined_df['suburb'] != 'group total']

# sort the dataframe by suburb, property_type
combined_df = combined_df.sort_values(['suburb', 'property_type'])


Processing 6 property types: ['1 bedroom flat', '2 bedroom flat', '3 bedroom flat', '2 bedroom house', '3 bedroom house', '4 bedroom house']
Processing 1 bedroom flat...
Processing 2 bedroom flat...
Processing 3 bedroom flat...
Processing 2 bedroom house...
Processing 3 bedroom house...
Processing 4 bedroom house...

Combined DataFrame shape: (954, 103)
Columns: ['Suburb', 'Mar 2000', 'Jun 2000', 'Sep 2000', 'Dec 2000', 'Mar 2001', 'Jun 2001', 'Sep 2001', 'Dec 2001', 'Mar 2002', 'Jun 2002', 'Sep 2002', 'Dec 2002', 'Mar 2003', 'Jun 2003', 'Sep 2003', 'Dec 2003', 'Mar 2004', 'Jun 2004', 'Sep 2004', 'Dec 2004', 'Mar 2005', 'Jun 2005', 'Sep 2005', 'Dec 2005', 'Mar 2006', 'Jun 2006', 'Sep 2006', 'Dec 2006', 'Mar 2007', 'Jun 2007', 'Sep 2007', 'Dec 2007', 'Mar 2008', 'Jun 2008', 'Sep 2008', 'Dec 2008', 'Mar 2009', 'Jun 2009', 'Sep 2009', 'Dec 2009', 'Mar 2010', 'Jun 2010', 'Sep 2010', 'Dec 2010', 'Mar 2011', 'Jun 2011', 'Sep 2011', 'Dec 2011', 'Mar 2012', 'Jun 2012', 'Sep 2012', 'Dec 2012', 

In [4]:
# Display sample of the final DataFrame
print("Sample of the combined DataFrame:")
print(combined_df[['suburb', 'property_type'] + list(combined_df.columns[1:6])].head(10))

print("\nData types:")
print(combined_df.dtypes.head(10))

# Check for missing values
print(f"\nMissing values per column (first 10 quarters):")
quarter_cols = [col for col in combined_df.columns if col not in ['suburb', 'property_type']]
print(combined_df[quarter_cols[:10]].isnull().sum())

# Summary statistics
print(f"\nSummary statistics for recent quarters:")
recent_quarters = [col for col in quarter_cols if '2024' in col or '2025' in col]
if recent_quarters:
    print(combined_df[recent_quarters].describe())
else:
    print("No recent quarters found, showing last available quarters:")
    print(combined_df[quarter_cols[-5:]].describe())


Sample of the combined DataFrame:
                                    suburb    property_type  mar 2000  \
0    albert park-middle park-west st kilda   1 bedroom flat     165.0   
159  albert park-middle park-west st kilda   2 bedroom flat     250.0   
477  albert park-middle park-west st kilda  2 bedroom house     300.0   
318  albert park-middle park-west st kilda   3 bedroom flat     350.0   
636  albert park-middle park-west st kilda  3 bedroom house     390.0   
795  albert park-middle park-west st kilda  4 bedroom house     500.0   
55                                  altona   1 bedroom flat      95.0   
214                                 altona   2 bedroom flat     155.0   
532                                 altona  2 bedroom house     160.0   
373                                 altona   3 bedroom flat     180.0   

     jun 2000  sep 2000  dec 2000  mar 2001  
0       165.0     170.0     175.0     180.0  
159     250.0     250.0     255.0     260.0  
477     310.0     320.0 

In [5]:
# Save the processed data

#create moving_rent folder if it doesn't exist
if not os.path.exists('../data/processed/moving_rent'):
    os.makedirs('../data/processed/moving_rent')

output_path = '../data/processed/moving_rent/moving_annual_rent_long.csv'
combined_df.to_csv(output_path, index=False)
print(f"Data saved to: {output_path}")

# Also create a pivot version for easier analysis
pivot_df = combined_df.set_index(['suburb', 'property_type']).stack().reset_index()
pivot_df.columns = ['suburb', 'property_type', 'quarter', 'median_rent']
pivot_df = pivot_df.sort_values(['suburb', 'property_type', 'quarter'])

# convert columns to lowercase
pivot_df.columns = pivot_df.columns.str.lower()


# remove rows with suburb = 'Group Total'
pivot_df = pivot_df[pivot_df['suburb'] != 'Group Total']

# convert suburb column to lowercase
pivot_df['suburb'] = pivot_df['suburb'].str.lower()

# convert the quarter column to datetime object and then year, quarter
pivot_df['quarter'] = pd.to_datetime(pivot_df['quarter'])
pivot_df['year'] = pivot_df['quarter'].dt.year
pivot_df['quarter'] = pivot_df['quarter'].dt.quarter

# sort the dataframe by suburb, property_type, year quarter
pivot_df = pivot_df.sort_values(['suburb', 'property_type', 'year', 'quarter'])

pivot_df.head()


Data saved to: ../data/processed/moving_rent/moving_annual_rent_long.csv


  pivot_df['quarter'] = pd.to_datetime(pivot_df['quarter'])


Unnamed: 0,suburb,property_type,quarter,median_rent,year
0,albert park-middle park-west st kilda,1 bedroom flat,1,165.0,2000
1,albert park-middle park-west st kilda,1 bedroom flat,2,165.0,2000
2,albert park-middle park-west st kilda,1 bedroom flat,3,170.0,2000
3,albert park-middle park-west st kilda,1 bedroom flat,4,175.0,2000
4,albert park-middle park-west st kilda,1 bedroom flat,1,180.0,2001


In [6]:
# Save the result
# create moving_rent folder if it doesn't exist
if not os.path.exists('../data/processed/moving_rent'):
    os.makedirs('../data/processed/moving_rent')

pivot_output_path = '../data/processed/moving_rent/moving_annual_rent_pivot.csv'
pivot_df.to_csv(pivot_output_path, index=False)
print(f"Pivot data saved to: {pivot_output_path}")

print(f"\nFinal DataFrame Summary:")
print(f"- Total records: {len(combined_df)}")
print(f"- Unique suburbs: {combined_df['suburb'].nunique()}")
print(f"- Property types: {combined_df['property_type'].nunique()}")
print(f"- Quarters covered: {len(quarter_cols)}")
print(f"- Date range: {quarter_cols[0]} to {quarter_cols[-1]}")

Pivot data saved to: ../data/processed/moving_rent/moving_annual_rent_pivot.csv

Final DataFrame Summary:
- Total records: 876
- Unique suburbs: 146
- Property types: 6
- Quarters covered: 101
- Date range: mar 2000 to mar 2025


In [7]:
import plotly.graph_objects as go
import plotly.express as px
from plotly.subplots import make_subplots
import datetime

# Select 3 interesting suburbs for plotting
selected_suburbs = ['cbd-st kilda rd', 'carlton-parkville', 'richmond-burnley']
print(f"Selected suburbs for time series analysis: {selected_suburbs}")

# Check data availability for selected suburbs
print(f"\nData availability for selected suburbs:")
for suburb in selected_suburbs:
    suburb_data = pivot_df[pivot_df['suburb'] == suburb]
    if len(suburb_data) > 0:
        print(f"{suburb}: {suburb_data['property_type'].nunique()} property types, {suburb_data['quarter'].nunique()} quarters")
        print(f"  Latest quarter: {suburb_data['quarter'].max()}")
        print(f"  Rent range: ${suburb_data['median_rent'].min():.0f} - ${suburb_data['median_rent'].max():.0f}")
    else:
        print(f"{suburb}: No data found")


Selected suburbs for time series analysis: ['cbd-st kilda rd', 'carlton-parkville', 'richmond-burnley']

Data availability for selected suburbs:
cbd-st kilda rd: 3 property types, 4 quarters
  Latest quarter: 4
  Rent range: $250 - $1100
carlton-parkville: 6 property types, 4 quarters
  Latest quarter: 4
  Rent range: $165 - $1200
richmond-burnley: 6 property types, 4 quarters
  Latest quarter: 4
  Rent range: $140 - $1300


In [8]:
pivot_df.head()

Unnamed: 0,suburb,property_type,quarter,median_rent,year
0,albert park-middle park-west st kilda,1 bedroom flat,1,165.0,2000
1,albert park-middle park-west st kilda,1 bedroom flat,2,165.0,2000
2,albert park-middle park-west st kilda,1 bedroom flat,3,170.0,2000
3,albert park-middle park-west st kilda,1 bedroom flat,4,175.0,2000
4,albert park-middle park-west st kilda,1 bedroom flat,1,180.0,2001
