In [47]:
import polars as pl
import csv
import glob

In [48]:
extract_folder = 'extract'

In [49]:
dat_files = glob.glob(f"{extract_folder}/*.dat")
print(f"Number of .dat files found: {len(dat_files)}")

Number of .dat files found: 19218


In [50]:
df_list = []
df_schema = {
    'Record Type':pl.String(),'District Code':pl.String(),
    'Property Id.':pl.String(),'Sale Counter':pl.String(),
    'Download Datetime':pl.String(),'Property Name':pl.String(),
    'Property Unit Number':pl.String(),'Property House Number':pl.String(),
    'Property Street Name':pl.String(), 'Property Locality':pl.String(),
    'Property Post Code':pl.String(), 'Area':pl.String(),
    'Area Type':pl.String(), 'Contract Date':pl.String(),
    'Settlement Date':pl.String(), 'Purchase Price':pl.String(),
    'Zoning':pl.String(), 'Nature of Property':pl.String(),
    'Primary Purpose':pl.String(), 'Strata Lot Number':pl.String(),
    'Component code':pl.String(), 'Sale Code':pl.String(), 
    '% Interest of Sale':pl.String(), 'Dealing Number':pl.String()
    }

In [51]:
for file in dat_files:
    df = pl.read_csv(file,has_header=False, schema=df_schema, truncate_ragged_lines=True, separator=';', new_columns=['Record Type','District Code','Property Id.','Sale Counter','Download Datetime','Property Name','Property Unit Number','Property House Number','Property Street Name', 'Property Locality', 'Property Post Code', 'Area', 'Area Type', 'Contract Date', 'Settlement Date', 'Purchase Price', 'Zoning', 'Nature of Property', 'Primary Purpose', 'Strata Lot Number', 'Component code', 'Sale Code', '% Interest of Sale', 'Dealing Number'])
    df_list.append(df)

In [52]:
for df in df_list:
  df = df.filter(pl.col("Record Type") == "B")

In [53]:
df_combined = pl.concat(df_list)

In [56]:
df_combined = df_combined.filter(pl.col("Record Type") == "B")

In [66]:
output_file = "./unclean.csv"

# Export DataFrame to CSV
df_combined.write_csv(output_file)

In [75]:
df_combined.head()

Record Type,District Code,Property Id.,Sale Counter,Download Datetime,Property Name,Property Unit Number,Property House Number,Property Street Name,Property Locality,Property Post Code,Area,Area Type,Contract Date,Settlement Date,Purchase Price,Zoning,Nature of Property,Primary Purpose,Strata Lot Number,Component code,Sale Code,% Interest of Sale,Dealing Number
str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str,str
"""B""","""001""","""2962131""","""1""","""20210201 01:00""",,,"""11""","""BARNHILL ACCS""","""ROTHBURY""","""2320""","""609.8""","""M""","""20201211""","""20210127""","""845000""","""SP3""","""R""","""RESIDENCE""",,"""SAV""",,"""0""","""AQ754912"""
"""B""","""001""","""4156561""","""2""","""20210201 01:00""",,,"""111""","""TALLEYRAND CCT""","""GRETA""","""2334""","""612""","""M""","""20201211""","""20210127""","""210000""","""R2""","""R""","""RESIDENCE""",,"""AAJ""",,"""0""","""AQ751923"""
"""B""","""001""","""4224453""","""3""","""20210201 01:00""",,,"""15""","""GUY CL""","""GRETA""","""2334""","""657.5""","""M""","""20201222""","""20210122""","""205000""","""R2""","""R""","""RESIDENCE""",,"""AAJ""",,"""0""","""AQ744227"""
"""B""","""001""","""4224457""","""4""","""20210201 01:00""",,,"""10""","""GUY CL""","""GRETA""","""2334""","""725.5""","""M""","""20201203""","""20210122""","""200000""","""R2""","""V""","""VACANT LAND""",,"""AAJ""",,"""0""","""AQ742328"""
"""B""","""001""","""4225342""","""5""","""20210201 01:00""",,,"""90""","""TALLEYRAND CCT""","""GRETA""","""2334""","""640.6""","""M""","""20201211""","""20210122""","""210000""","""R2""","""V""","""VACANT LAND""",,"""AAJ""",,"""0""","""AQ744713"""


In [82]:
df_combined = df_combined.with_columns(
    pl.col("Contract Date").str.strptime(pl.Date, "%Y%m%d", strict=False).alias("Contract Date")
)

In [96]:
df_cleaned_col = df_combined.select('District Code',
                            'Property Id.',
                            'Property Name',
                            'Property Unit Number',
                            'Property House Number',
                            'Property Street Name',
                            'Property Locality',
                            'Property Post Code',
                            'Area',
                            'Area Type',
                            'Contract Date',
                            'Settlement Date',
                            'Purchase Price',
                            'Primary Purpose',
                            'Strata Lot Number')

In [97]:
df_cleaned_col = df_cleaned_col.with_columns(pl.col("Area").cast(pl.Float64).alias("Area"))

In [98]:
df_cleaned_col = df_cleaned_col.with_columns(pl.col("Purchase Price").cast(pl.Int64).alias("Purchase Price"))

In [102]:
df_cleaned_col = df_cleaned_col.with_columns(pl.col("Property Id.").cast(pl.Int64).alias("Property Id."))

In [101]:
df_cleaned_col = df_cleaned_col.with_columns(
    pl.col("Settlement Date").str.strptime(pl.Date, "%Y%m%d", strict=False).alias("Settlement Date")
)

In [113]:
df_cleaned_col = df_cleaned_col.with_columns((pl.col("Purchase Price") / pl.col("Area")).alias("Price per Unit Area"))

In [116]:
df_cleaned_col.head()

District Code,Property Id.,Property Name,Property Unit Number,Property House Number,Property Street Name,Property Locality,Property Post Code,Area,Area Type,Contract Date,Settlement Date,Purchase Price,Primary Purpose,Strata Lot Number,Price per Unit Area
str,i64,str,str,str,str,str,str,f64,str,date,date,i64,str,str,f64
"""001""",2962131,,,"""11""","""BARNHILL ACCS""","""ROTHBURY""","""2320""",609.8,"""M""",2020-12-11,2021-01-27,845000,"""RESIDENCE""",,1385.70023
"""001""",4156561,,,"""111""","""TALLEYRAND CCT""","""GRETA""","""2334""",612.0,"""M""",2020-12-11,2021-01-27,210000,"""RESIDENCE""",,343.137255
"""001""",4224453,,,"""15""","""GUY CL""","""GRETA""","""2334""",657.5,"""M""",2020-12-22,2021-01-22,205000,"""RESIDENCE""",,311.787072
"""001""",4224457,,,"""10""","""GUY CL""","""GRETA""","""2334""",725.5,"""M""",2020-12-03,2021-01-22,200000,"""VACANT LAND""",,275.67195
"""001""",4225342,,,"""90""","""TALLEYRAND CCT""","""GRETA""","""2334""",640.6,"""M""",2020-12-11,2021-01-22,210000,"""VACANT LAND""",,327.817671


In [122]:
df_cleaned_col.n_unique

<bound method DataFrame.n_unique of shape: (625_210, 16)
┌──────────┬──────────┬───────────┬──────────────────────┬───┬──────────┬─────────────┬────────────┬────────────────┐
│ District ┆ Property ┆ Property  ┆ Property Unit Number ┆ … ┆ Purchase ┆ Primary     ┆ Strata Lot ┆ Price per Unit │
│ Code     ┆ Id.      ┆ Name      ┆ ---                  ┆   ┆ Price    ┆ Purpose     ┆ Number     ┆ Area           │
│ ---      ┆ ---      ┆ ---       ┆ str                  ┆   ┆ ---      ┆ ---         ┆ ---        ┆ ---            │
│ str      ┆ i64      ┆ str       ┆                      ┆   ┆ i64      ┆ str         ┆ str        ┆ f64            │
╞══════════╪══════════╪═══════════╪══════════════════════╪═══╪══════════╪═════════════╪════════════╪════════════════╡
│ 001      ┆ 2962131  ┆ null      ┆ null                 ┆ … ┆ 845000   ┆ RESIDENCE   ┆ null       ┆ 1385.70023     │
│ 001      ┆ 4156561  ┆ null      ┆ null                 ┆ … ┆ 210000   ┆ RESIDENCE   ┆ null       ┆ 343.137255     │

In [117]:
unit_df = df_cleaned_col.filter((pl.col("Property Unit Number") != "null") | (pl.col("Strata Lot Number") != "null"))

In [118]:
unit_df.head()

District Code,Property Id.,Property Name,Property Unit Number,Property House Number,Property Street Name,Property Locality,Property Post Code,Area,Area Type,Contract Date,Settlement Date,Purchase Price,Primary Purpose,Strata Lot Number,Price per Unit Area
str,i64,str,str,str,str,str,str,f64,str,date,date,i64,str,str,f64
"""001""",3029751,"""GOLDEN DOOR HEALTH RETREAT AND…","""16""","""165""","""THOMPSONS RD""","""POKOLBIN""","""2320""",,,2020-11-25,2021-01-22,100000,"""RESIDENCE""","""16""",
"""001""",3830378,,"""11""","""5""","""STONEBRIDGE DR""","""CESSNOCK""","""2325""",,,2020-12-16,2021-01-27,335000,"""RESIDENCE""","""11""",
"""001""",12688,,"""18""","""52""","""RAILWAY PDE""","""KURRI KURRI""","""2327""",,,2021-01-14,2021-02-25,230000,"""RESIDENCE""","""18""",
"""001""",4111319,,"""1""","""39""","""ASHTON DR""","""HEDDON GRETA""","""2321""",336.0,"""M""",2021-01-20,2021-02-24,455000,"""RESIDENCE""",,1354.166667
"""001""",3278183,,"""2""","""33""","""MCMULLINS RD""","""EAST BRANXTON""","""2335""",358.3,"""M""",2021-01-11,2021-02-22,345000,"""RESIDENCE""",,962.880268


In [123]:
unit_df_filter = unit_df.filter(pl.col("Primary Purpose").is_in(["UNIT", "RESIDENCE", "HOUSE UNIT", "VACANT LAND"]))

In [124]:
unit_df_filter.head()

District Code,Property Id.,Property Name,Property Unit Number,Property House Number,Property Street Name,Property Locality,Property Post Code,Area,Area Type,Contract Date,Settlement Date,Purchase Price,Primary Purpose,Strata Lot Number,Price per Unit Area
str,i64,str,str,str,str,str,str,f64,str,date,date,i64,str,str,f64
"""001""",3029751,"""GOLDEN DOOR HEALTH RETREAT AND…","""16""","""165""","""THOMPSONS RD""","""POKOLBIN""","""2320""",,,2020-11-25,2021-01-22,100000,"""RESIDENCE""","""16""",
"""001""",3830378,,"""11""","""5""","""STONEBRIDGE DR""","""CESSNOCK""","""2325""",,,2020-12-16,2021-01-27,335000,"""RESIDENCE""","""11""",
"""001""",12688,,"""18""","""52""","""RAILWAY PDE""","""KURRI KURRI""","""2327""",,,2021-01-14,2021-02-25,230000,"""RESIDENCE""","""18""",
"""001""",4111319,,"""1""","""39""","""ASHTON DR""","""HEDDON GRETA""","""2321""",336.0,"""M""",2021-01-20,2021-02-24,455000,"""RESIDENCE""",,1354.166667
"""001""",3278183,,"""2""","""33""","""MCMULLINS RD""","""EAST BRANXTON""","""2335""",358.3,"""M""",2021-01-11,2021-02-22,345000,"""RESIDENCE""",,962.880268


In [126]:
output_file2 = "./units.csv"

unit_df_filter.write_csv(output_file2)

In [127]:
house_df = df_cleaned_col.filter(pl.col("Property Unit Number").is_null() & pl.col("Strata Lot Number").is_null())


In [128]:
house_df.head()

District Code,Property Id.,Property Name,Property Unit Number,Property House Number,Property Street Name,Property Locality,Property Post Code,Area,Area Type,Contract Date,Settlement Date,Purchase Price,Primary Purpose,Strata Lot Number,Price per Unit Area
str,i64,str,str,str,str,str,str,f64,str,date,date,i64,str,str,f64
"""001""",2962131,,,"""11""","""BARNHILL ACCS""","""ROTHBURY""","""2320""",609.8,"""M""",2020-12-11,2021-01-27,845000,"""RESIDENCE""",,1385.70023
"""001""",4156561,,,"""111""","""TALLEYRAND CCT""","""GRETA""","""2334""",612.0,"""M""",2020-12-11,2021-01-27,210000,"""RESIDENCE""",,343.137255
"""001""",4224453,,,"""15""","""GUY CL""","""GRETA""","""2334""",657.5,"""M""",2020-12-22,2021-01-22,205000,"""RESIDENCE""",,311.787072
"""001""",4224457,,,"""10""","""GUY CL""","""GRETA""","""2334""",725.5,"""M""",2020-12-03,2021-01-22,200000,"""VACANT LAND""",,275.67195
"""001""",4225342,,,"""90""","""TALLEYRAND CCT""","""GRETA""","""2334""",640.6,"""M""",2020-12-11,2021-01-22,210000,"""VACANT LAND""",,327.817671


In [129]:
output_file3 = "./house.csv"

house_df.write_csv(output_file3)