## I.3 INITIAL DATA PREPARATION

The complete dataset of the Population: Properties in B28 Area (Birmingham - UK)

In [1]:
# import packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy
from geopy.geocoders import Nominatim
from geopy.extra.rate_limiter import RateLimiter

In [2]:
# Load the cleaned B28 property listings into the main dataframe
df = pd.read_csv(filepath_or_buffer='B28_properties_dataset_cleaned.csv')

# Preview the first five rows
df.head(n=5)

Unnamed: 0,address,price,type,no_bed,no_bath,agent,url,garden,parking,new_home,postcode
0,"121 Brook Lane, Birmingham B13 0AB",99500,Terraced,2,1,"SDL Property Auctions, Nationwide",https://www.rightmove.co.uk/properties/1545741...,False,True,False,B13 0AB
1,"14 Priory Gardens, Birmingham, B28 0TQ",160000,Apartment,2,2,"Keogh Estates, Coventry",https://www.rightmove.co.uk/properties/1511360...,False,True,False,B28 0TQ
2,"294 Haslucks Green Road, Solihull, B90",190000,Apartment,2,2,"Purplebricks, covering Birmingham",https://www.rightmove.co.uk/properties/1459222...,True,True,False,B90
3,"40 Pegasus Court, Union Road, Solihull, West M...",45000,Flat,1,1,"Allsop, Auction",https://www.rightmove.co.uk/properties/1540831...,True,False,False,B90
4,"59 Woodstock Road, Moseley, Birmingham",450000,Detached,4,1,"Rice Chamberlains LLP, Moseley",https://www.rightmove.co.uk/properties/1538591...,True,True,False,


In [3]:
# Summarise the dataframe: row count, column dtypes and non-null counts
# (the output below shows `postcode` is the only column with missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 719 entries, 0 to 718
Data columns (total 11 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   address   719 non-null    object
 1   price     719 non-null    int64 
 2   type      719 non-null    object
 3   no_bed    719 non-null    int64 
 4   no_bath   719 non-null    int64 
 5   agent     719 non-null    object
 6   url       719 non-null    object
 7   garden    719 non-null    bool  
 8   parking   719 non-null    bool  
 9   new_home  719 non-null    bool  
 10  postcode  329 non-null    object
dtypes: bool(3), int64(3), object(5)
memory usage: 47.2+ KB


- Population size:   834
- Initial attributes: 10

To-do(s):
1. Get a sample of 120 properties
2. Get the location (longitude, latitude) of these properties for geospatial visualisations
3. Scrape the nearest Station/School -> Go back to add the property URLs in the scraping step

### 1. Get a sample of 120 by simple random sampling:

In [None]:
# Draw a simple random sample of 120 properties (fixed seed so the draw is reproducible)
df_sample = df.sample(
    n=120,
    random_state=42,
)

### 2. Get the location (longitude, latitude) of these properties for geospatial visualisations

In [5]:
# Nominatim geocoder wrapped in a rate limiter (at least 1 second between requests)
geolocator = Nominatim(user_agent="property_locator")
geocode = RateLimiter(geolocator.geocode, min_delay_seconds=1)


def _geocode_address(address):
    """Geocode one address; null or blank addresses are skipped and give None."""
    if pd.isnull(address) or address.strip() == "":
        return None
    return geocode(address)


def _coordinate(loc, attribute):
    """Read a coordinate attribute from a geocoding result, or None when there is no result."""
    if not loc:
        return None
    return getattr(loc, attribute)


# Geocode every sampled address, then split the result into latitude / longitude columns
df_sample['location'] = df_sample['address'].apply(_geocode_address)
df_sample['latitude'] = df_sample['location'].apply(_coordinate, args=('latitude',))
df_sample['longitude'] = df_sample['location'].apply(_coordinate, args=('longitude',))

In [10]:
# Properties that couldn't be resolved by the geolocator (no location returned)
df_sample.loc[df_sample['location'].isna()]

Unnamed: 0,address,price,type,no_bed,no_bath,agent,url,garden,parking,new_home,postcode,location,latitude,longitude
596,"Tenchlee Place, Hall Green, Birmingham",330000,Semi-Detached,3,2,"Connells, Birmingham City",https://www.rightmove.co.uk/properties/1411881...,True,True,True,,,,
155,"Croome Close, Sparkhill, Birmingham, B11",345000,Semi-Detached,3,1,"Robert Oulsnam & Company, Moseley",https://www.rightmove.co.uk/properties/1522360...,True,True,False,B11,,,
412,"Park Road, Birmingham, B11",400000,Terraced,5,2,"Purplebricks, covering Birmingham",https://www.rightmove.co.uk/properties/1212506...,True,False,False,B11,,,
599,"Tenchlee Place, Hall Green, Birmingham",520000,Detached,4,3,"Connells, Birmingham City",https://www.rightmove.co.uk/properties/1351126...,True,True,True,,,,
622,"Trinity Court, Moseley",215000,Apartment,2,1,"Rice Chamberlains LLP, Moseley",https://www.rightmove.co.uk/properties/1532423...,True,True,False,,,,
76,"Brandon Court, 365 Wake Green Road, Moseley, B...",199950,Duplex,2,2,"Nicholas George Ltd, Moseley",https://www.rightmove.co.uk/properties/1539448...,False,True,False,B13,,,
331,"Loxley Court, Baldwins Lane, Birmingham, B28 0FH",210000,Flat,2,1,"Arden Estates, Solihull",https://www.rightmove.co.uk/properties/1546199...,False,True,False,B28 0FH,,,
324,"Littlemead Road, Shirley, Solihull",495000,Semi-Detached,4,2,"Sterling Homes, Birmingham",https://www.rightmove.co.uk/properties/1532839...,True,True,False,,,,
30,"Ashdale Drive, Nr Hollywood",250000,Terraced,3,1,"Melvyn Danes, Wythall",https://www.rightmove.co.uk/properties/1464147...,True,True,False,,,,


In [12]:
def adding_location(row, location, latitude, longitude, df=None):
    """
    Fill in the missing location and coordinates of a single property.

    Only cells that are currently null are written; values that are already
    present in the row are left untouched, so calling this twice is harmless.

    Arguments:
        row -- index label of the row to update
        location -- location/address to store if 'location' is missing
        latitude -- latitude coordinate to store if 'latitude' is missing
        longitude -- longitude coordinate to store if 'longitude' is missing
        df -- dataframe to update in place; defaults to the notebook-level
              `df_sample` when omitted (the original behaviour)

    Raises:
        KeyError -- if `row` is not an index label of the target dataframe
    """
    # Fall back to the global sample so existing four-argument calls keep working
    target = df_sample if df is None else df

    if row not in target.index:
        raise KeyError(f"Row {row!r} is not in the dataframe index")

    replacements = {
        'location': location,
        'latitude': latitude,
        'longitude': longitude,
    }
    for column, value in replacements.items():
        # Never overwrite a value that is already present
        if pd.isna(target.loc[row, column]):
            target.loc[row, column] = value


In [13]:
# Manually add the location and coordinates (looked up on Google Maps) for the
# 9 sampled properties that the geocoder could not resolve.
adding_location(30, 'Ashdale Drive, Maypole, Hollywood, Birmingham, West Midlands, England, B14, United Kingdom', 52.402974437344255, -1.8791687366426464)
adding_location(599, 'Shaftmoor Lane, Hall Green, Birmingham, West Midlands, England, B28 8SW, United Kingdom', 52.442364101124284, -1.8427314180797243)
adding_location(596, 'Shaftmoor Lane, Hall Green, Birmingham, West Midlands, England, B28 8SW, United Kingdom', 52.442364101124284, -1.8427314180797243)
# NOTE(review): the listing address for row 76 reads "365 Wake Green Road" but the
# string below says "356" - confirm which house number is correct.
adding_location(76, '356 Wake Green Road, Moseley, Birmingham, West Midlands, England, B13, United Kingdom', 52.431594828004734, -1.8572746283589694)
adding_location(155, 'Croome Close, Sparkhill, Birmingham, West Midlands, England, B11, United Kingdom', 52.44882446111353, -1.870650026528147)
adding_location(412, 'Park Road, Birmingham, West Midlands, England, B11, United Kingdom', 52.44853865010483, -1.868405114984758)
adding_location(622, '53 Wake Green Road, Moseley, Birmingham, West Midlands, England, B13 9HW, United Kingdom', 52.44380463429303, -1.878170617197836)
adding_location(331, 'Loxley Court, Baldwins Lane, Birmingham, West Midlands, England, B28 0FH,	United Kingdom', 52.42095047459805, -1.8383903929671765)
# Latitude fixed: it was entered as 2.3968... (leading "5" dropped), which would place
# this Solihull property near the equator; every other property here is at ~52.4.
adding_location(324, 'Littlemead Road, Shirley, Solihull, Birmingham, West Midlands, England, B90 , United Kingdom', 52.39688028331134, -1.8501598271600033)

In [15]:
# Inspect the location column after the manual fixes: in the output below, geocoded
# rows show a parenthesised geocoder result while manually filled rows (e.g. row 30)
# show a plain string
df_sample['location']

120    (Chamberlain Crescent, Shirley, Solihull, West...
164    (Dolphin Lane, Acocks Green, Fox Hollies, Birm...
39     (Barrington Road, Olton, Solihull, West Midlan...
551    (St Bernards Road, Kineton Green, Solihull, We...
199    (Fircroft, Ulverley Green, Solihull, West Midl...
                             ...                        
60     (Blenheim Road, Kings Heath, Wake Green, Birmi...
165    (Douglas Road, Acocks Green, Tyseley, Birmingh...
30     Ashdale Drive, Maypole, Hollywood, Birmingham,...
69     (Braceby Avenue, Billesley, Yardley Wood, Birm...
408    (Pailton Road, Shirley, Solihull, West Midland...
Name: location, Length: 120, dtype: object

In [None]:
# Export the sampled dataset (with location, latitude and longitude) for later analysis.
# Writes `df_sample`, not the full `df`: only the sample carries the geocoded columns.
df_sample.to_csv('[120sample]B28_properties.csv')

### 3. Get a separate dataset for testing the Trained Model:

In [3]:
# Draw a simple random sample of 30 properties to test the trained model
# NOTE(review): this is drawn from the full `df`, so it may share rows with the
# 120-row sample drawn from the same dataframe - confirm that overlap is acceptable.
df_test = df.sample(
    n=30,
    random_state=23,
)

In [4]:
# Preview the first five rows of the test sample
df_test.head(n=5)

Unnamed: 0,address,price,type,no_bed,no_bath,agent,url,garden,parking,new_home,postcode
25,"Arlington Grove, Birmingham",200000,Terraced,2,1,"Shipways, Shirley",https://www.rightmove.co.uk/properties/1539041...,True,True,False,
435,"Prospect Lane, Solihull, B91",795000,Detached,4,1,"Shepherd Cullen, Covering Solihull & Warwickshire",https://www.rightmove.co.uk/properties/1536784...,True,True,False,B91
302,"James Court, Wake Green Park",100000,Flat,1,1,"Rice Chamberlains LLP, Moseley",https://www.rightmove.co.uk/properties/1530129...,False,True,False,
19,"Allcroft Road, Birmingham",220000,Terraced,3,1,"Manny Klarico, Hall Green",https://www.rightmove.co.uk/properties/1541421...,True,True,False,
603,"The Avenue, Acocks Green, Birmingham",235000,Terraced,3,1,"Oakmans Estate Agents, Shirley",https://www.rightmove.co.uk/properties/1483967...,True,False,False,


In [5]:
# Save the 30-row test sample to CSV so it can be used to test the trained model
df_test.to_csv('[Test]B28_30_Properties.csv')