# Preprocessing the current listing property from the domain website
This notebook is to preprocess the listing property data that were scraped from the script.py.


### Import packages

In [1]:
import pandas as pd
import re
import numpy as np

# Read the Listing Data

In [2]:
listings_df = pd.read_parquet('../data/landing/listings.parquet')

In [3]:
listings_df.head()

Unnamed: 0,Location,type_property,price,school_names,school_distance,LT_resident_pcg,owner_pcg,family_pcg,state,rooms,bath,parking
https://www.domain.com.au/1507-220-spencer-street-melbourne-vic-3000-16659362,1507/220 Spencer Street Melbourne VIC 3000,Apartment / Unit / Flat,$430pw,"[Docklands Primary School, University High Sch...","[1 km away, 2.1 km away, 0.3 km away]",12%,28%,27%,VIC,[0 Beds],[1 Bath],[− Parking]
https://www.domain.com.au/4506-33-rose-lane-melbourne-vic-3000-16658749,4506/33 Rose Lane Melbourne VIC 3000,Apartment / Unit / Flat,$520 Per Week,"[Docklands Primary School, University High Sch...","[1.1 km away, 2 km away, 0.3 km away]",12%,28%,27%,VIC,[1 Bed],[1 Bath],[− Parking]
https://www.domain.com.au/1715-220-spencer-street-melbourne-vic-3000-16071670,1715/220 Spencer Street Melbourne VIC 3000,Apartment / Unit / Flat,$750,"[Docklands Primary School, University High Sch...","[1 km away, 2.1 km away, 0.3 km away]",12%,28%,27%,VIC,[2 Beds],[2 Baths],[1 Parking]
https://www.domain.com.au/3504-220-spencer-street-melbourne-vic-3000-16657664,3504/220 Spencer Street Melbourne VIC 3000,Apartment / Unit / Flat,$400 per week,"[Docklands Primary School, University High Sch...","[1 km away, 2.1 km away, 0.3 km away]",12%,28%,27%,VIC,[0 Beds],[1 Bath],[− Parking]
https://www.domain.com.au/512-118-franklin-street-melbourne-vic-3000-16383891,512/118 Franklin Street Melbourne VIC 3000,Apartment / Unit / Flat,$430,"[Carlton Gardens Primary School, University Hi...","[1.2 km away, 1.3 km away, 0.5 km away]",,29%,16%,VIC,[1 Bed],[1 Bath],[− Parking]


In [4]:
# Set a index for each row
listings_df['link'] = listings_df.index
listings_df.index = range(listings_df.shape[0])

In [5]:
# check the size of the data
listings_df.shape

(12297, 13)

# For coulmns as lists, retrieve the actual values

In [6]:
for i in range(listings_df.shape[0]):
    
    # number of beds
    num_beds_str = listings_df.loc[i, 'rooms']
    if num_beds_str:
        num_beds = re.findall('\d+', num_beds_str[0])
        if num_beds:
            listings_df.loc[i, 'rooms'] = num_beds[0]
        else:
            listings_df.loc[i, 'rooms'] = np.nan
    else:
        listings_df.loc[i, 'rooms'] = np.nan

    
    # number of baths
    num_bath_str = listings_df.loc[i, 'bath']
    if num_bath_str:
        num_bath = re.findall('\d+', num_bath_str[0])
        if num_bath:
            listings_df.loc[i, 'bath'] = num_bath[0]
        else:
            listings_df.loc[i, 'bath'] = np.nan
    else:
        listings_df.loc[i, 'bath'] = np.nan


    # number of parking
    num_parking_str = listings_df.loc[i, 'parking']
    if num_parking_str: 
        num_parking = re.findall('\d+', num_parking_str[0])
        if num_parking:
            listings_df.loc[i, 'parking'] = num_parking[0]
        else:
            listings_df.loc[i, 'parking'] = np.nan        
    else:
        listings_df.loc[i, 'parking'] = np.nan


  if num_beds_str:
  if num_bath_str:
  if num_parking_str:


In [7]:
# Column type changing
# Since int type is not supported for columns with nan, changed to float instead
listings_df['rooms'] = listings_df['rooms'].astype(float)
listings_df['bath'] = listings_df['bath'].astype(float)
listings_df['parking'] = listings_df['parking'].astype(float)

## Explode the school columns

The school column contain three schools in order, the first one being the closest public school, the second one being the second closest public school, the third one being the closest private school, the school distance contain the distance of the three schools in order 

In [8]:
def school_filler(df, i, col, name, dis):
    ''' fill the the distance and name of the ith listing's related school'''
    df.loc[i, col] = name
    df.loc[i, f'{col}_dis'] = dis

In [9]:
for i in range(listings_df.shape[0]):
    schools = listings_df.loc[i, 'school_names']
    distance = listings_df.loc[i, 'school_distance']
    if type(schools)!=type(None):
        try:
            school_filler(listings_df, i, 'public_1', schools[0], re.findall('(.+)\skm\saway', distance[0])[0])
            school_filler(listings_df, i, 'public_2', schools[1], re.findall('(.+)\skm\saway', distance[1])[0])
            school_filler(listings_df, i, 'private_1', schools[2], re.findall('(.+)\skm\saway', distance[2])[0])
        except IndexError:
            '''when an index error happens, the there's no information about the closest 1st/2nd 
            public and 1st private school and we just leave the cell being nan'''
            pass


In [10]:
listings_df.head()

Unnamed: 0,Location,type_property,price,school_names,school_distance,LT_resident_pcg,owner_pcg,family_pcg,state,rooms,bath,parking,link,public_1,public_1_dis,public_2,public_2_dis,private_1,private_1_dis
0,1507/220 Spencer Street Melbourne VIC 3000,Apartment / Unit / Flat,$430pw,"[Docklands Primary School, University High Sch...","[1 km away, 2.1 km away, 0.3 km away]",12%,28%,27%,VIC,0.0,1.0,,https://www.domain.com.au/1507-220-spencer-str...,Docklands Primary School,1.0,University High School,2.1,Eltham College - King Street Campus,0.3
1,4506/33 Rose Lane Melbourne VIC 3000,Apartment / Unit / Flat,$520 Per Week,"[Docklands Primary School, University High Sch...","[1.1 km away, 2 km away, 0.3 km away]",12%,28%,27%,VIC,1.0,1.0,,https://www.domain.com.au/4506-33-rose-lane-me...,Docklands Primary School,1.1,University High School,2.0,Ozford College - Ozford College Campus,0.3
2,1715/220 Spencer Street Melbourne VIC 3000,Apartment / Unit / Flat,$750,"[Docklands Primary School, University High Sch...","[1 km away, 2.1 km away, 0.3 km away]",12%,28%,27%,VIC,2.0,2.0,1.0,https://www.domain.com.au/1715-220-spencer-str...,Docklands Primary School,1.0,University High School,2.1,Eltham College - King Street Campus,0.3
3,3504/220 Spencer Street Melbourne VIC 3000,Apartment / Unit / Flat,$400 per week,"[Docklands Primary School, University High Sch...","[1 km away, 2.1 km away, 0.3 km away]",12%,28%,27%,VIC,0.0,1.0,,https://www.domain.com.au/3504-220-spencer-str...,Docklands Primary School,1.0,University High School,2.1,Eltham College - King Street Campus,0.3
4,512/118 Franklin Street Melbourne VIC 3000,Apartment / Unit / Flat,$430,"[Carlton Gardens Primary School, University Hi...","[1.2 km away, 1.3 km away, 0.5 km away]",,29%,16%,VIC,1.0,1.0,,https://www.domain.com.au/512-118-franklin-str...,Carlton Gardens Primary School,1.2,University High School,1.3,River Nile School,0.5


In [11]:
# change the type of distance columns to float
listings_df['public_1_dis'] = listings_df['public_1_dis'].astype(float)
listings_df['public_2_dis'] = listings_df['public_2_dis'].astype(float)
listings_df['private_1_dis'] = listings_df['private_1_dis'].astype(float)

In [12]:
# drop the original column
listings_df.drop(columns = ['school_names', 'school_distance'], inplace = True)

# Percentage Features value extracting

In [13]:
for i in range(listings_df.shape[0]):
    # extract the numbers in the percentages
    
    try:
        listings_df.loc[i, 'LT_resident_pcg'] = re.findall('(.+)\%', listings_df.loc[i, 'LT_resident_pcg'])[0]
    except IndexError:
        listings_df.loc[i, 'LT_resident_pcg'] = np.nan
    except TypeError:
        listings_df.loc[i, 'LT_resident_pcg'] = np.nan

    
    try:
        listings_df.loc[i, 'owner_pcg'] = re.findall('(.+)\%', listings_df.loc[i, 'owner_pcg'])[0]
    except IndexError:
        listings_df.loc[i, 'owner_pcg'] = np.nan
    except TypeError:
        listings_df.loc[i, 'owner_pcg'] = np.nan


    try:
        listings_df.loc[i, 'family_pcg'] = re.findall('(.+)\%', listings_df.loc[i, 'family_pcg'])[0]
    except IndexError:
        listings_df.loc[i, 'family_pcg'] = np.nan   
    except TypeError:
        listings_df.loc[i, 'family_pcg'] = np.nan

In [14]:
# change the type of the percentage columns to float
listings_df['LT_resident_pcg'] = listings_df['LT_resident_pcg'].astype(float)
listings_df['owner_pcg'] = listings_df['owner_pcg'].astype(float)
listings_df['family_pcg'] = listings_df['family_pcg'].astype(float)

In [15]:
listings_df.head(10)

Unnamed: 0,Location,type_property,price,LT_resident_pcg,owner_pcg,family_pcg,state,rooms,bath,parking,link,public_1,public_1_dis,public_2,public_2_dis,private_1,private_1_dis
0,1507/220 Spencer Street Melbourne VIC 3000,Apartment / Unit / Flat,$430pw,12.0,28.0,27.0,VIC,0.0,1.0,,https://www.domain.com.au/1507-220-spencer-str...,Docklands Primary School,1.0,University High School,2.1,Eltham College - King Street Campus,0.3
1,4506/33 Rose Lane Melbourne VIC 3000,Apartment / Unit / Flat,$520 Per Week,12.0,28.0,27.0,VIC,1.0,1.0,,https://www.domain.com.au/4506-33-rose-lane-me...,Docklands Primary School,1.1,University High School,2.0,Ozford College - Ozford College Campus,0.3
2,1715/220 Spencer Street Melbourne VIC 3000,Apartment / Unit / Flat,$750,12.0,28.0,27.0,VIC,2.0,2.0,1.0,https://www.domain.com.au/1715-220-spencer-str...,Docklands Primary School,1.0,University High School,2.1,Eltham College - King Street Campus,0.3
3,3504/220 Spencer Street Melbourne VIC 3000,Apartment / Unit / Flat,$400 per week,12.0,28.0,27.0,VIC,0.0,1.0,,https://www.domain.com.au/3504-220-spencer-str...,Docklands Primary School,1.0,University High School,2.1,Eltham College - King Street Campus,0.3
4,512/118 Franklin Street Melbourne VIC 3000,Apartment / Unit / Flat,$430,,29.0,16.0,VIC,1.0,1.0,,https://www.domain.com.au/512-118-franklin-str...,Carlton Gardens Primary School,1.2,University High School,1.3,River Nile School,0.5
5,5801/648 Lonsdale Street Melbourne VIC 3000,Apartment / Unit / Flat,$800,12.0,20.0,34.0,VIC,2.0,2.0,,https://www.domain.com.au/5801-648-lonsdale-st...,Docklands Primary School,1.0,University High School,1.9,Ozford College - Ozford College Campus,0.2
6,1.2/187 Collins Street Melbourne VIC 3000,Apartment / Unit / Flat,$475 pw,37.0,49.0,51.0,VIC,1.0,1.0,,https://www.domain.com.au/1-2-187-collins-stre...,Carlton Gardens Primary School,1.5,University High School,2.3,Stott's Colleges,0.5
7,521/422 Collins St Melbourne VIC 3000,Apartment / Unit / Flat,$500 weekly,18.0,28.0,39.0,VIC,1.0,1.0,,https://www.domain.com.au/521-422-collins-st-m...,Docklands Primary School,1.7,University High School,2.3,Eltham College - King Street Campus,0.4
8,603/199 William Street Melbourne VIC 3000,Apartment / Unit / Flat,$700,22.0,29.0,24.0,VIC,2.0,2.0,,https://www.domain.com.au/603-199-william-stre...,Docklands Primary School,1.4,University High School,2.0,Ozford College - Ozford College Campus,0.3
9,1111/568 St Kilda Road Melbourne VIC 3000,Apartment / Unit / Flat,$450,,47.0,56.0,VIC,1.0,1.0,,https://www.domain.com.au/1111-568-st-kilda-ro...,South Yarra Primary School,0.9,Prahran High School,1.2,Port Phillip Specialist School,3.4


# Extract the weekly price

In [16]:
lease_type = []
for i in range(listings_df.shape[0]):
    # extract the weekly price and lower all the other letters in the price column
    try:
        listings_df.loc[i, 'rent'] = re.findall('\$([\d\,]+)', listings_df.loc[i, 'price'])[0]
        listings_df.loc[i, 'rent'] = listings_df.loc[i, 'rent'].replace(',','')
        listings_df.loc[i, 'price'] = listings_df.loc[i, 'price'].lower()
        if re.findall('[A-z]+',listings_df.loc[i, 'price']) not in lease_type:
            lease_type.append(re.findall('[a-z]+',listings_df.loc[i, 'price']))
    except TypeError:
        pass
    except IndexError:
        pass

In [17]:
# change the type of the column to float
listings_df['rent'] = listings_df['rent'].astype(float)

In [18]:
# number of weeks per month constant 
wpm = 4.35
# number of week per year
wpy = wpm * 12
# number of week per season
wps = wpm * 3
for i in range(listings_df.shape[0]):
    # check if the price is weekly or monthly, weekly is assumed if unspecified
    # It has been checked there is no annualy contracts available
    try:
        text = listings_df.loc[i, 'price']
        if 'month' in text or 'mth' in text or 'pm' in text and 'pw' not in text and 'week' not in text:
            listings_df.loc[i, 'rent'] = listings_df.loc[i, 'rent'] / wpm

        elif 'season' in text:
            listings_df.loc[i, 'rent'] = listings_df.loc[i, 'rent'] / wps

        elif 'p.a' in text:
            listings_df.loc[i, 'rent'] = listings_df.loc[i, 'rent'] / wpy
            
        if ('furnished' in text or 'furniture' in text or 'furn' in text) and 'not' not in text:
            listings_df.loc[i, 'is_furnished'] = True
        else:
            listings_df.loc[i, 'is_furnished'] = False 
    except TypeError:
        pass

In [19]:
listings_df.head()

Unnamed: 0,Location,type_property,price,LT_resident_pcg,owner_pcg,family_pcg,state,rooms,bath,parking,link,public_1,public_1_dis,public_2,public_2_dis,private_1,private_1_dis,rent,is_furnished
0,1507/220 Spencer Street Melbourne VIC 3000,Apartment / Unit / Flat,$430pw,12.0,28.0,27.0,VIC,0.0,1.0,,https://www.domain.com.au/1507-220-spencer-str...,Docklands Primary School,1.0,University High School,2.1,Eltham College - King Street Campus,0.3,430.0,False
1,4506/33 Rose Lane Melbourne VIC 3000,Apartment / Unit / Flat,$520 per week,12.0,28.0,27.0,VIC,1.0,1.0,,https://www.domain.com.au/4506-33-rose-lane-me...,Docklands Primary School,1.1,University High School,2.0,Ozford College - Ozford College Campus,0.3,520.0,False
2,1715/220 Spencer Street Melbourne VIC 3000,Apartment / Unit / Flat,$750,12.0,28.0,27.0,VIC,2.0,2.0,1.0,https://www.domain.com.au/1715-220-spencer-str...,Docklands Primary School,1.0,University High School,2.1,Eltham College - King Street Campus,0.3,750.0,False
3,3504/220 Spencer Street Melbourne VIC 3000,Apartment / Unit / Flat,$400 per week,12.0,28.0,27.0,VIC,0.0,1.0,,https://www.domain.com.au/3504-220-spencer-str...,Docklands Primary School,1.0,University High School,2.1,Eltham College - King Street Campus,0.3,400.0,False
4,512/118 Franklin Street Melbourne VIC 3000,Apartment / Unit / Flat,$430,,29.0,16.0,VIC,1.0,1.0,,https://www.domain.com.au/512-118-franklin-str...,Carlton Gardens Primary School,1.2,University High School,1.3,River Nile School,0.5,430.0,False


In [21]:
# save the file to raw
listings_df.to_csv('../data/raw/listings.csv', index = False)