## Processing Original Dataset

In [1]:
import pandas as pd
# Show every column when a DataFrame is displayed (None = no column limit)
pd.set_option('display.max_columns', None)

In [2]:
# Load the raw CSV export and preview its first rows
raw_csv_path = 'data/au_data.csv'
df = pd.read_csv(raw_csv_path)
df.head()

Unnamed: 0,index,TID,breadcrumb,category_name,property_type,building_size,land_size,preferred_size,open_date,listing_agency,price,location_number,location_type,location_name,address,address_1,city,state,zip_code,phone,latitude,longitude,product_depth,bedroom_count,bathroom_count,parking_count,RunDate
0,0,1350988,Buy>NT>DARWIN CITY,Real Estate & Property for sale in DARWIN CITY...,House,,,,Added 2 hours ago,Professionals - DARWIN CITY,"$435,000",139468611,Buy,"$435,000","44 Woods Street, Darwin City, NT 0800",44 Woods Street,Darwin City,NT,800,08 8941 8289,,,premiere,2.0,1.0,1.0,2022-05-27 15:54:05
1,1,1350989,Buy>NT>DARWIN CITY,Real Estate & Property for sale in DARWIN CITY...,Apartment,171m²,,171m²,Added 7 hours ago,Nick Mousellis Real Estate - Eview Group Member,"Offers Over $320,000",139463755,Buy,"Offers Over $320,000","14/14 Dashwood Place, Darwin City, NT 0800",14/14 Dashwood Place,Darwin City,NT,800,0411724000,,,premiere,3.0,2.0,2.0,2022-05-27 15:54:05
2,2,1350990,Buy>NT>DARWIN CITY,Real Estate & Property for sale in DARWIN CITY...,Unit,,,,Added 22 hours ago,Habitat Real Estate - THE GARDENS,"$310,000",139462495,Buy,"$310,000","13/86 Woods Street, Darwin City, NT 0800",13/86 Woods Street,Darwin City,NT,800,08 8981 0080,,,premiere,2.0,1.0,1.0,2022-05-27 15:54:05
3,3,1350991,Buy>NT>DARWIN CITY,Real Estate & Property for sale in DARWIN CITY...,House,,,,Added yesterday,Ray White - NIGHTCLIFF,"$259,000",139451679,Buy,"$259,000","1309/43B Knuckey Street, Darwin City, NT 0800",1309/43B Knuckey Street,Darwin City,NT,800,08 8982 2403,,,premiere,1.0,1.0,0.0,2022-05-27 15:54:05
4,4,1350992,Buy>NT>DARWIN CITY,Real Estate & Property for sale in DARWIN CITY...,Unit,201m²,,201m²,Added yesterday,Carol Need Real Estate - Fannie Bay,"$439,000",139433803,Buy,"$439,000","3/10 McLachlan Street, Darwin City, NT 0800",3/10 McLachlan Street,Darwin City,NT,800,0418885966,,,premiere,3.0,2.0,2.0,2022-05-27 15:54:05


In [3]:
# Summarise column dtypes and non-null counts of the freshly loaded data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 27 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   index            1000 non-null   int64  
 1   TID              1000 non-null   int64  
 2   breadcrumb       1000 non-null   object 
 3   category_name    1000 non-null   object 
 4   property_type    1000 non-null   object 
 5   building_size    280 non-null    object 
 6   land_size        533 non-null    object 
 7   preferred_size   609 non-null    object 
 8   open_date        302 non-null    object 
 9   listing_agency   1000 non-null   object 
 10  price            1000 non-null   object 
 11  location_number  1000 non-null   int64  
 12  location_type    1000 non-null   object 
 13  location_name    1000 non-null   object 
 14  address          988 non-null    object 
 15  address_1        988 non-null    object 
 16  city             1000 non-null   object 
 17  state          

In [4]:
# Make an independent copy of the raw data.
# Plain assignment (og_data = df) would only bind a second name to the same
# DataFrame, so the in-place edits applied to df in the following cells would
# also change og_data. DataFrame.copy() gives og_data its own data.
og_data = df.copy()

In [5]:
# Drop the columns that are not carried forward into the cleaned dataset
unwanted_cols = [
    'index', 'TID', 'breadcrumb', 'open_date',
    'location_type', 'location_name',
    'latitude', 'longitude',
]

# columns=... is the keyword form of drop(labels, axis=1); df is modified in place
df.drop(columns=unwanted_cols, inplace=True)

In [6]:
# Edit size datatype - remove commas and unit, convert to float (square metres)

SQM_PER_HECTARE = 10000.0  # 1 ha = 10,000 m²


def _size_to_sqm(sizes):
    """Convert size strings such as '1,012m²' or '2.5ha' to floats in m².

    Parameters
    ----------
    sizes : pandas.Series of str
        Raw size text; missing entries (NaN) are allowed.

    Returns
    -------
    pandas.Series of float
        Sizes in square metres; missing entries stay NaN.

    Values suffixed with 'ha' are multiplied by 10,000. Simply stripping the
    suffix would leave hectare figures mixed in with square-metre figures in
    the same column.
    """
    text = sizes.str.replace(',', '', regex=False).str.strip()
    # na=False so that missing sizes are not treated as hectares
    in_hectares = text.str.endswith('ha', na=False)
    number = (
        text.str.replace('m²', '', regex=False)
        .str.replace('ha', '', regex=False)
        .astype(float)
    )
    # Keep m² values as they are, scale only the hectare rows
    return number.where(~in_hectares, number * SQM_PER_HECTARE)


for size_col in ('building_size', 'land_size', 'preferred_size'):
    df[size_col] = _size_to_sqm(df[size_col])

In [7]:
# Remove non number characters from price and convert to float.
# Prices left with no digits at all (text-only prices) become NaN.
# mask() is used instead of .replace('', None): with a scalar to_replace and
# value=None, some pandas versions forward-fill the previous row's value rather
# than inserting a missing value, which would silently invent prices.
# NOTE(review): all digits are concatenated, so a range such as
# "$300,000 - $350,000" would parse as one very large number - confirm whether
# such values occur in the data.
price_digits = df['price'].str.replace(r'[^0-9]', '', regex=True)
df['price'] = price_digits.mask(price_digits == '').astype(float)

In [8]:
# Parse the RunDate strings into pandas datetime values
run_dates = pd.to_datetime(df['RunDate'])
df['RunDate'] = run_dates

In [9]:
# Check transformed data: preview the first rows after the column drops and type conversions
df.head()

Unnamed: 0,category_name,property_type,building_size,land_size,preferred_size,listing_agency,price,location_number,address,address_1,city,state,zip_code,phone,product_depth,bedroom_count,bathroom_count,parking_count,RunDate
0,Real Estate & Property for sale in DARWIN CITY...,House,,,,Professionals - DARWIN CITY,435000.0,139468611,"44 Woods Street, Darwin City, NT 0800",44 Woods Street,Darwin City,NT,800,08 8941 8289,premiere,2.0,1.0,1.0,2022-05-27 15:54:05
1,Real Estate & Property for sale in DARWIN CITY...,Apartment,171.0,,171.0,Nick Mousellis Real Estate - Eview Group Member,320000.0,139463755,"14/14 Dashwood Place, Darwin City, NT 0800",14/14 Dashwood Place,Darwin City,NT,800,0411724000,premiere,3.0,2.0,2.0,2022-05-27 15:54:05
2,Real Estate & Property for sale in DARWIN CITY...,Unit,,,,Habitat Real Estate - THE GARDENS,310000.0,139462495,"13/86 Woods Street, Darwin City, NT 0800",13/86 Woods Street,Darwin City,NT,800,08 8981 0080,premiere,2.0,1.0,1.0,2022-05-27 15:54:05
3,Real Estate & Property for sale in DARWIN CITY...,House,,,,Ray White - NIGHTCLIFF,259000.0,139451679,"1309/43B Knuckey Street, Darwin City, NT 0800",1309/43B Knuckey Street,Darwin City,NT,800,08 8982 2403,premiere,1.0,1.0,0.0,2022-05-27 15:54:05
4,Real Estate & Property for sale in DARWIN CITY...,Unit,201.0,,201.0,Carol Need Real Estate - Fannie Bay,439000.0,139433803,"3/10 McLachlan Street, Darwin City, NT 0800",3/10 McLachlan Street,Darwin City,NT,800,0418885966,premiere,3.0,2.0,2.0,2022-05-27 15:54:05


In [10]:
# Re-check dtypes and non-null counts after the conversions above
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 19 columns):
 #   Column           Non-Null Count  Dtype         
---  ------           --------------  -----         
 0   category_name    1000 non-null   object        
 1   property_type    1000 non-null   object        
 2   building_size    280 non-null    float64       
 3   land_size        533 non-null    float64       
 4   preferred_size   609 non-null    float64       
 5   listing_agency   1000 non-null   object        
 6   price            787 non-null    float64       
 7   location_number  1000 non-null   int64         
 8   address          988 non-null    object        
 9   address_1        988 non-null    object        
 10  city             1000 non-null   object        
 11  state            1000 non-null   object        
 12  zip_code         1000 non-null   int64         
 13  phone            1000 non-null   object        
 14  product_depth    1000 non-null   object  

In [11]:
# Remove the og_data name from the namespace; it is not used for the export
del og_data
# Write the cleaned frame to CSV; index=False leaves the row index out of the file
df.to_csv('data/au_new.csv', index=False)