# **Australian Cities dataset with Suburbs** 
## **Author: Khizer Rehman**
### ***Date: 25-11-2024***
### Dataset from [opendatasoft.com](https://public.opendatasoft.com/explore/dataset/georef-australia-state-suburb-millesime/table/?disjunctive.ste_code&disjunctive.ste_name&disjunctive.lga_code&disjunctive.lga_name&disjunctive.scc_code&disjunctive.scc_name&refine.year=2021)

# Import libraries

In [1]:
# importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ydata_profiling import ProfileReport

  from .autonotebook import tqdm as notebook_tqdm


**Load the dataset**

In [2]:
df = pd.read_excel('georef-australia-state-suburb-millesime.xlsx')

# Dataset basic Info 

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15334 entries, 0 to 15333
Data columns (total 11 columns):
 #   Column                               Non-Null Count  Dtype 
---  ------                               --------------  ----- 
 0   Geo Point                            15333 non-null  object
 1   Geo Shape                            15333 non-null  object
 2   Year                                 15334 non-null  int64 
 3   Official Code State                  15334 non-null  int64 
 4   Official Name State                  15334 non-null  object
 5   Official Code Local Government Area  15334 non-null  object
 6   Official Name Local Government Area  15334 non-null  object
 7   Official Code Suburb                 15334 non-null  int64 
 8   Official Name Suburb                 15334 non-null  object
 9   Iso 3166-3 Area Code                 15334 non-null  object
 10  Type                                 15334 non-null  object
dtypes: int64(3), object(8)
memory usage: 1.3+

# Drop these columns from the dataset 'Iso 3166-3 Area Code' , 'Year', 'Type'

In [4]:
df['Iso 3166-3 Area Code'].nunique()

1

In [5]:
df['Year'].nunique()

1

In [6]:
df['Type'].nunique()

1

In [7]:
df.drop(['Iso 3166-3 Area Code', 'Year', 'Type'], axis=1, inplace=True)

In [8]:
df.columns

Index(['Geo Point', 'Geo Shape', 'Official Code State', 'Official Name State',
       'Official Code Local Government Area',
       'Official Name Local Government Area', 'Official Code Suburb',
       'Official Name Suburb'],
      dtype='object')

In [9]:
df.head()

Unnamed: 0,Geo Point,Geo Shape,Official Code State,Official Name State,Official Code Local Government Area,Official Name Local Government Area,Official Code Suburb,Official Name Suburb
0,"-36.01193158762528, 148.786320441595","{""coordinates"":[[[148.71675360000006,-36.06054...",1,New South Wales,17040,Snowy Monaro Regional,10015,Adaminaby
1,"-36.07369799464406, 146.91346780765934","{""coordinates"":[[[146.92431042300007,-36.08614...",1,New South Wales,10050,Albury,10027,Albury
2,"-32.91425516102453, 148.26802074020546","{""coordinates"":[[[148.19998799900006,-32.96692...",1,New South Wales,16200,Parkes,10029,Alectown
3,"-34.40869358826243, 150.53677066431447","{""coordinates"":[[[150.5122384670001,-34.422367...",1,New South Wales,18350,Wingecarribee,10044,Alpine
4,"-34.91797895821861, 145.6572484339588","{""coordinates"":[[[145.66848513900004,-35.01702...",1,New South Wales,15560,Murrumbidgee,10081,Argoon


In [10]:
df['Official Name Suburb'].value_counts()

Official Name Suburb
Adaminaby             1
Safety Beach (NSW)    1
Peacock Creek         1
Peak Hill (NSW)       1
Peak View             1
                     ..
Kudla                 1
Kybybolite            1
Lake Alexandrina      1
Leasingham            1
O'Connor (ACT)        1
Name: count, Length: 15334, dtype: int64

In [11]:
df['Official Name Suburb'].count()

15334

In [12]:
df['Official Name Local Government Area'].nunique()

1294

In [13]:
df.head()

Unnamed: 0,Geo Point,Geo Shape,Official Code State,Official Name State,Official Code Local Government Area,Official Name Local Government Area,Official Code Suburb,Official Name Suburb
0,"-36.01193158762528, 148.786320441595","{""coordinates"":[[[148.71675360000006,-36.06054...",1,New South Wales,17040,Snowy Monaro Regional,10015,Adaminaby
1,"-36.07369799464406, 146.91346780765934","{""coordinates"":[[[146.92431042300007,-36.08614...",1,New South Wales,10050,Albury,10027,Albury
2,"-32.91425516102453, 148.26802074020546","{""coordinates"":[[[148.19998799900006,-32.96692...",1,New South Wales,16200,Parkes,10029,Alectown
3,"-34.40869358826243, 150.53677066431447","{""coordinates"":[[[150.5122384670001,-34.422367...",1,New South Wales,18350,Wingecarribee,10044,Alpine
4,"-34.91797895821861, 145.6572484339588","{""coordinates"":[[[145.66848513900004,-35.01702...",1,New South Wales,15560,Murrumbidgee,10081,Argoon


In [14]:
columns = df.columns
for i in columns:
    print(i)

Geo Point
Geo Shape
Official Code State
Official Name State
Official Code Local Government Area
Official Name Local Government Area
Official Code Suburb
Official Name Suburb


# Rename all the columns according to mysql database 

In [15]:
df.rename(columns={'Geo Point':"geo_point", 
                   "Geo Shape":"geo_shape", 
                   "Official Code State":"stateid", 
                   "Official Name State":"statename", 
                   "Official Code Local Government Area":"cityid", 
                   "Official Name Local Government Area":"cityname", 
                   "Official Code Suburb":"suburbid", 
                   "Official Name Suburb":"suburbname"}, inplace=True)

In [16]:
columns = df.columns
for i in columns:
    print(i)

geo_point
geo_shape
stateid
statename
cityid
cityname
suburbid
suburbname


In [17]:
df.dtypes

geo_point     object
geo_shape     object
stateid        int64
statename     object
cityid        object
cityname      object
suburbid       int64
suburbname    object
dtype: object

# Rearrange the dataset

In [18]:
df = df[['stateid','statename','cityid' , 'cityname','suburbid', 'suburbname', 'geo_point', 'geo_shape']] 
df

Unnamed: 0,stateid,statename,cityid,cityname,suburbid,suburbname,geo_point,geo_shape
0,1,New South Wales,17040,Snowy Monaro Regional,10015,Adaminaby,"-36.01193158762528, 148.786320441595","{""coordinates"":[[[148.71675360000006,-36.06054..."
1,1,New South Wales,10050,Albury,10027,Albury,"-36.07369799464406, 146.91346780765934","{""coordinates"":[[[146.92431042300007,-36.08614..."
2,1,New South Wales,16200,Parkes,10029,Alectown,"-32.91425516102453, 148.26802074020546","{""coordinates"":[[[148.19998799900006,-32.96692..."
3,1,New South Wales,18350,Wingecarribee,10044,Alpine,"-34.40869358826243, 150.53677066431447","{""coordinates"":[[[150.5122384670001,-34.422367..."
4,1,New South Wales,15560,Murrumbidgee,10081,Argoon,"-34.91797895821861, 145.6572484339588","{""coordinates"":[[[145.66848513900004,-35.01702..."
...,...,...,...,...,...,...,...,...
15329,8,Australian Capital Territory,89399,Unincorporated ACT,80023,Beard,"-35.341729402139414, 149.2105302094079","{""coordinates"":[[[149.21876130200005,-35.34132..."
15330,8,Australian Capital Territory,89399,Unincorporated ACT,80028,Bruce (ACT),"-35.24548030806253, 149.09155194266856","{""coordinates"":[[[149.09199952500012,-35.23483..."
15331,8,Australian Capital Territory,89399,Unincorporated ACT,80037,Chisholm (ACT),"-35.42196479042136, 149.12491232833983","{""coordinates"":[[[149.114427619,-35.4162914309..."
15332,8,Australian Capital Territory,89399,Unincorporated ACT,80038,City,"-35.281329376797565, 149.12913206614766","{""coordinates"":[[[149.12790347300006,-35.28601..."


# Drope the geo_shape column from the dataset

In [19]:
df.drop('geo_shape', axis=1, inplace = True)

# Change the statename with its short form

In [20]:
names = df['statename'].unique()
for i in names:
    print(i)

New South Wales
Victoria
Queensland
South Australia
Western Australia
Northern Territory, Western Australia
Tasmania
Northern Territory
Australian Capital Territory
New South Wales, Victoria
South Australia, Victoria
South Australia, Western Australia
Other Territories
Northern Territory, Queensland
Australian Capital Territory, New South Wales
New South Wales, South Australia
New South Wales, Queensland
Northern Territory, South Australia, Western Australia


In [21]:
state_short_form = {
    'New South Wales': 'NSW',
    'Victoria': 'Vic',
    'Queensland': 'Qld',
    'South Australia': 'SA',
    'Western Australia': 'WA',
    'Northern Territory, Western Australia': 'NT, WA',
    'Tasmania': 'Tas',
    'Northern Territory': 'NT',
    'Australian Capital Territory': 'ACT',
    'New South Wales, Victoria': 'NSW, Vic',
    'South Australia, Victoria': 'SA, Vic',
    'South Australia, Western Australia': 'SA, WA',
    'Other Territories': 'OT',
    'Northern Territory, Queensland': 'NT, Qld',
    'Australian Capital Territory, New South Wales': 'ACT, NSW',
    'New South Wales, South Australia': 'NSW, SA',
    'New South Wales, Queensland': 'NSW, Qld',
    'Northern Territory, South Australia, Western Australia': 'NT, SA, WA'

}

In [22]:
df['statename'] = df['statename'].map(state_short_form)

In [23]:
df['statename'].value_counts()

statename
NSW           4525
Qld           3229
Vic           2940
WA            1697
SA            1692
Tas            776
NT             302
ACT            136
NSW, Vic         9
NSW, Qld         7
OT               5
ACT, NSW         4
NT, WA           3
SA, Vic          3
NT, Qld          2
NSW, SA          2
SA, WA           1
NT, SA, WA       1
Name: count, dtype: int64

In [24]:
df['stateid'].value_counts()

stateid
1    4542
3    3233
2    2945
5    1700
4    1694
6     776
7     303
8     136
9       5
Name: count, dtype: int64

In [25]:
df['cityid'].isnull().sum()

0

In [26]:
df['cityid'].value_counts()

cityid
36910                                215
15240                                191
31000                                190
11650                                149
11730                                147
                                    ... 
62410, 64610, 65010                    1
10300, 11600, 13850                    1
10180, 13660, 14220, 17310, 17650      1
12150, 15750, 17900                    1
70420                                  1
Name: count, Length: 1319, dtype: int64

In [27]:
df['cityid'] = df['cityid'].astype(str).str.split(',').str[0]
df['cityid'].astype('Int64')

0        17040
1        10050
2        16200
3        18350
4        15560
         ...  
15329    89399
15330    89399
15331    89399
15332    89399
15333    89399
Name: cityid, Length: 15334, dtype: Int64

In [28]:
df['cityid'].isnull().sum()

0

In [29]:
df['suburbid'].nunique()

15334

In [30]:
df[df['suburbname'] == 'Angaston']

Unnamed: 0,stateid,statename,cityid,cityname,suburbid,suburbname,geo_point
7055,4,SA,40310,Barossa,40029,Angaston,"-34.52952213716274, 139.0614373105911"


In [31]:
values = df['cityid'].unique()
for x in values:
    print(x)

17040
10050
16200
18350
15560
10300
16380
17640
11650
13910
15050
15240
10800
15270
14920
11400
17080
15900
19399
14550
12750
15850
12850
14170
14650
17200
10950
11350
14850
13660
15950
12160
12390
16700
16260
16400
10850
14400
17550
15990
10550
11150
13850
15700
14300
16100
16490
11730
16950
11720
18450
10500
12930
18400
10900
12730
18250
12870
13010
13550
11450
10600
17000
10750
13800
17350
10470
16610
17150
12700
13310
17750
14220
18200
17400
15520
17850
11600
13450
14500
13340
14950
20830
22830
23940
25990
25340
22250
22750
26980
26260
21010
24900
23270
22620
23430
20570
22110
25620
25710
21670
20260
24850
26890
26170
24780
23810
24250
21750
20660
26490
22980
26610
21610
24330
22310
26700
26810
27450
21370
25250
22490
25490
20740
21110
25810
24970
24600
24210
21270
32270
33220
31820
34770
35760
36720
34590
36630
36910
34860
30760
36660
35010
36510
32260
31000
33980
34580
33610
36250
32310
32810
37010
33360
33620
32080
32250
34880
33960
31900
37310
36580
34420
35740
33800
33430
3637

In [32]:

df_city_suburbs = pd.DataFrame(df.groupby('cityname')['suburbname'].value_counts())
df_city_suburbs


Unnamed: 0_level_0,Unnamed: 1_level_0,count
cityname,suburbname,Unnamed: 2_level_1
Adelaide,Adelaide,1
Adelaide,North Adelaide,1
Adelaide Hills,Aldgate,1
Adelaide Hills,Ashton,1
Adelaide Hills,Balhannah,1
...,...,...
Yorke Peninsula,White Hut,1
Yorke Peninsula,Winulta,1
Yorke Peninsula,Wool Bay,1
Yorke Peninsula,Yorke Valley,1


# Feature Engineering the 'geo_point' and 'geo_shape'

In [33]:
max_geo_point_chars = df['geo_point'].str.len().max()
max_geo_point_chars

39.0

In [34]:
df.columns

Index(['stateid', 'statename', 'cityid', 'cityname', 'suburbid', 'suburbname',
       'geo_point'],
      dtype='object')

In [35]:
df1 =df.iloc[0:2000, 0:]

In [36]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   stateid     2000 non-null   int64 
 1   statename   2000 non-null   object
 2   cityid      2000 non-null   object
 3   cityname    2000 non-null   object
 4   suburbid    2000 non-null   int64 
 5   suburbname  2000 non-null   object
 6   geo_point   2000 non-null   object
dtypes: int64(2), object(5)
memory usage: 109.5+ KB


In [37]:
df_city_suburbs = pd.DataFrame(df.groupby('cityname')['suburbname'].value_counts())
df_city_suburbs

Unnamed: 0_level_0,Unnamed: 1_level_0,count
cityname,suburbname,Unnamed: 2_level_1
Adelaide,Adelaide,1
Adelaide,North Adelaide,1
Adelaide Hills,Aldgate,1
Adelaide Hills,Ashton,1
Adelaide Hills,Balhannah,1
...,...,...
Yorke Peninsula,White Hut,1
Yorke Peninsula,Winulta,1
Yorke Peninsula,Wool Bay,1
Yorke Peninsula,Yorke Valley,1


In [38]:
df.to_csv('australian_cities.csv')

In [39]:
import pandas as pd
import re

# Load the dataset
df = pd.read_csv('australian_cities.csv')

# Define a function to extract the town name from the suburb name
def extract_town(suburb):
    # Use regular expressions to extract the town name
    town = re.search(r'([A-Za-z]+) ?(North|South|East|West)?', suburb)
    if town:
        return town.group(0)
    else:
        return None

# Apply the function to the suburb column
df['townname'] = df['suburbname'].apply(extract_town)

# Drop the suburbs that don't have a corresponding town
df = df.dropna(subset=['townname'])

# Create a new column for the suburb name without the town
df['suburbname'] = df.apply(lambda row: row['suburbname'].replace(row['townname'], ''), axis=1)

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15334 entries, 0 to 15333
Data columns (total 9 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  15334 non-null  int64 
 1   stateid     15334 non-null  int64 
 2   statename   15334 non-null  object
 3   cityid      15334 non-null  int64 
 4   cityname    15334 non-null  object
 5   suburbid    15334 non-null  int64 
 6   suburbname  15334 non-null  object
 7   geo_point   15333 non-null  object
 8   townname    15334 non-null  object
dtypes: int64(4), object(5)
memory usage: 1.1+ MB


In [41]:
df.sample(10)

Unnamed: 0.1,Unnamed: 0,stateid,statename,cityid,cityname,suburbid,suburbname,geo_point,townname
9503,9503,1,NSW,15240,"Mid-Coast, Port Stephens",12084,,"-32.63223446393546, 151.95708361528494",Karuah
2020,2020,3,Qld,36510,Scenic Rim,31628,(Qld),"-28.267623627223003, 153.01702026033266",Lamington
3242,3242,5,WA,51540,Carnarvon,50763,(WA),"-24.863948874317437, 113.69553069143667",Kingsford
7906,7906,1,NSW,14900,Liverpool,12431,,"-33.935784504778816, 150.89705805689786",Lurnea
10951,10951,5,WA,54900,Lake Grace,51378,Lake Grace,"-33.21285902029818, 118.4456075084287",South
13392,13392,2,Vic,21270,Buloke,22324,(Buloke - Vic.),"-35.57471947411776, 143.11431553328774",Springfield
783,783,2,Vic,21450,Cardinia,22221,,"-38.13954604867074, 145.46265263440387",Rythdale
4692,4692,2,Vic,26810,Wellington,20675,,"-37.484076407543554, 146.62143834922665",Crookayan
1294,1294,1,NSW,16950,Shoalhaven,14469,,"-35.39063888615545, 150.17649107583705",Yadboro
13942,13942,3,Qld,31000,Brisbane,30910,Park,"-27.49556708731966, 153.02616560021676",Dutton


In [42]:
import re
import pandas as pd

# Input data
data = """
{{short description|None}}

{{Use Australian English|date=March 2018}}
{{Use dmy dates|date=May 2024}}
This is a list of towns in [[Australia]] by state.

==Australian Capital Territory==
{{see also|Category:Towns in the Australian Capital Territory}}
* [[Hall, Australian Capital Territory|Hall]]
* [[Canberra]]
* [[Tharwa, Australian Capital Territory|Tharwa]]

==New South Wales==
{{see also|Category:Towns in New South Wales}}
{{As of|2019|2|21}} the Geographical Names Register (GNR) of NSW, which is maintained by the [[Geographical Names Board of New South Wales]], lists 265 places that are assigned or recorded as towns in New South Wales.<ref>{{cite web|url=http://www.gnb.nsw.gov.au/__gnbfile?transaction=savefile&placename=&status=ASSIGNED&designation=TOWN&lga=None&map=&parish=|title=List of placenames ASSIGNED as TOWNs|work=Geographical Names Register (GNR) of NSW|publisher=[[Geographical Names Board of New South Wales]]|accessdate=21 February 2019}}</ref><ref>{{cite web|url=http://www.gnb.nsw.gov.au/__gnbfile?transaction=savefile&placename=&status=RECORDED&designation=TOWN&lga=None&map=&parish=|title=List of placenames RECORDED as TOWNs|work=Geographical Names Register (GNR) of NSW|publisher=[[Geographical Names Board of New South Wales]]|accessdate=21 February 2019}}</ref><ref>{{cite web|url=http://www.gnb.nsw.gov.au/__gnbfile?transaction=savefile&placename=&status=VARIANT&designation=TOWN&lga=None&map=&parish=|title=List of placenames recorded as being VARIANTS of TOWNs that are not listed as ASSIGNED|work=Geographical Names Register (GNR) of NSW|publisher=[[Geographical Names Board of New South Wales]]|accessdate=21 February 2019}}</ref><!-- Multiple references are needed to accurately identify all locations as some of the places ASSIGNED as towns are discontinued, while some of the current variants reflect valid towns that are not listed as ASSIGNED. For example, "Town of Raymond Terrace" is a variant for "Raymond Terrace", which is recorded on the GNR as a suburb instead of a town. -->

{{Div col|colwidth=15em}}
* [[Aberdare, New South Wales|Aberdare]]
* [[Abermain, New South Wales|Abermain]]
* [[Adaminaby]]
* [[Adelong, New South Wales|Adelong]]
* [[Agnes Banks, New South Wales|Agnes Banks]]
* [[Anna Bay]]
* [[Ardlethan]]
* [[Ariah Park]]
* [[Ashford, New South Wales|Ashford]]
* [[Austinmer, New South Wales|Austinmer]]
* [[Avoca Beach]]
* [[Ballina, New South Wales|Ballina]]
* [[Balranald]]
* [[Bangalow]]
* [[Baradine]]
* [[Bargo, New South Wales|Bargo]]
* [[Barham, New South Wales|Barham]]
* [[Barraba, New South Wales|Barraba]]
* [[Batemans Bay]]
* [[Batlow, New South Wales|Batlow]]
* [[Bega, New South Wales|Bega]]
* [[Bellbird, New South Wales|Bellbird]]
* [[Bellingen, New South Wales|Bellingen]]
* [[Berkeley Vale, New South Wales|Berkeley Vale]]
* [[Bermagui, New South Wales|Bermagui]]
* [[Berridale, New South Wales|Berridale]]
* [[Berrigan, New South Wales|Berrigan]]
* [[Berrima, New South Wales|Berrima]]
* [[Berry, New South Wales|Berry]]
* [[Bilpin, New South Wales|Bilpin]]
* [[Binalong]]
* [[Bingara, New South Wales|Bingara]]
* [[Binnaway, New South Wales|Binnaway]]
* [[Blackheath, New South Wales|Blackheath]]
* [[Blaxland, New South Wales|Blaxland]]
* [[Blayney, New South Wales|Blayney]]
* [[Boggabilla]]
* [[Boggabri]]
* [[Bolwarra, New South Wales|Bolwarra]]
* [[Bomaderry, New South Wales|Bomaderry]]
* [[Bombala]]
* [[Bonalbo]]
* [[Bonnells Bay, New South Wales|Bonnells Bay]]
* [[Bowenfels, New South Wales|Bowenfels]]
* [[Bowraville, New South Wales|Bowraville]]
* [[Braidwood, New South Wales|Braidwood]]
* [[Branxton, New South Wales|Branxton]]
* [[Brewarrina]]
* [[Brooklyn, New South Wales|Brooklyn]]
* [[Brunswick Heads]]
* [[Bulahdelah, New South Wales|Bulahdelah]]
* [[Bullaburra, New South Wales|Bullaburra]]
* [[Bulli, New South Wales|Bulli]]
* [[Bundarra, New South Wales|Bundarra]]
* [[Bungendore]]
* [[Buronga, New South Wales|Buronga]]
* [[Burradoo, New South Wales|Burradoo]]
* [[Canowindra]]
* [[Captains Flat]]
* [[Clarence Town, New South Wales|Clarence Town]]
* [[Cobar]]
* [[Coledale, New South Wales|Coledale]]
* [[Collarenebri]]
* [[Condobolin]]
* [[Coolamon, New South Wales|Coolamon]]
* [[Cooma]]
* [[Coonabarabran]]
* [[Coonamble]]
* [[Cooranbong, New South Wales|Cooranbong]]
* [[Cootamundra]]
* [[Corowa]]
* [[Cowra]]
* [[Crescent Head, New South Wales|Crescent Head]]
* [[Crookwell, New South Wales|Crookwell]]
* [[Cudal, New South Wales|Cudal]]
* [[Cumnock, New South Wales|Cumnock]]
* [[Dapto, New South Wales|Dapto]]
* [[Dareton, New South Wales|Dareton]]
* [[Deepwater, New South Wales|Deepwater]]
* [[Delegate, New South Wales|Delegate]]
* [[Delungra, New South Wales|Delungra]]
* [[Deniliquin]]
* [[Denman, New South Wales|Denman]]
* [[Dorrigo, New South Wales|Dorrigo]]
* [[Dunedoo]]
* [[Dungog, New South Wales|Dungog]]
* [[Edgeworth, New South Wales|Edgeworth]]
* [[Ellalong, New South Wales|Ellalong]]
* [[Emu Plains, New South Wales|Emu Plains]]
* [[Eugowra]]
* [[Euston, New South Wales|Euston]]
* [[Evans Head, New South Wales|Evans Head]]
* [[Faulconbridge, New South Wales|Faulconbridge]]
* [[Fingal Head, New South Wales|Fingal Head]]
* [[Forbes, New South Wales|Forbes]]
* [[Forster, New South Wales|Forster]]
* [[Frederickton, New South Wales|Frederickton]]
* [[Galong, New South Wales|Galong]]
* [[Ganmain]]
* [[Gerringong, New South Wales|Gerringong]]
* [[Geurie, New South Wales|Geurie]]
* [[Gilgandra, New South Wales|Gilgandra]]
* [[Gillieston Heights]]
* [[Gladstone, New South Wales|Gladstone]]
* [[Glen Innes, New South Wales|Glen Innes]]
* [[Glenbrook, New South Wales|Glenbrook]]
* [[Gol Gol, New South Wales|Gol Gol]]
* [[Gorokan, New South Wales|Gorokan]]
* [[Greenhill, New South Wales|Greenhill]]
* [[Greenwell Point]]
* [[Grenfell, New South Wales|Grenfell]]
* [[Greta, New South Wales|Greta]]
* [[Grose Vale, New South Wales|Grose Vale]]
* [[Gulargambone]]
* [[Gundagai]]
* [[Gunnedah]]<ref>{{NSW GNR|id=MaqwoesEuj|title=Gunnedah|accessdate=21 February 2019}}</ref>
* [[Gunning, New South Wales|Gunning]]
* [[Guyra, New South Wales|Guyra]]
* [[Harden, New South Wales|Harden]]
* [[Hawks Nest, New South Wales|Hawks Nest]]
* [[Hay, New South Wales|Hay]]
* [[Hazelbrook, New South Wales|Hazelbrook]]
* [[Heddon Greta, New South Wales|Heddon Greta]]
* [[Helensburgh, New South Wales|Helensburgh]]
* [[Henty, New South Wales|Henty]]
* [[Hexham, New South Wales|Hexham]]
* [[Hillston, New South Wales|Hillston]]
* [[Holbrook, New South Wales|Holbrook]]
* [[Huskisson, New South Wales|Huskisson]]
* [[Inverell]]
* [[Ivanhoe, New South Wales|Ivanhoe]]
* [[Jennings, New South Wales|Jennings]]
* [[Jerilderie]]
* [[Jindabyne, New South Wales|Jindabyne]]
* [[Jugiong]]
* [[Junee]]
* [[Kandos, New South Wales|Kandos]]
* [[Katoomba, New South Wales|Katoomba]]
* [[Kearsley, New South Wales|Kearsley]]
* [[Kempsey, New South Wales|Kempsey]]
* [[Kimovale, New South Wales|Kimovale]]
* [[Kinchela, New South Wales|Kinchela]]
* [[Kurrajong, New South Wales|Kurrajong]]
* [[Kurri Kurri, New South Wales|Kurri Kurri]]
* [[Kyogle]]
* [[Lake Cargelligo, New South Wales|Lake Cargelligo]]
* [[Lake Illawarra, New South Wales|Lake Illawarra]]
* [[Lapstone, New South Wales|Lapstone]]
* [[Lawson, New South Wales|Lawson]]
* [[Leura, New South Wales|Leura]]
* [[Lightning Ridge, New South Wales|Lightning Ridge]]
* [[Macksville, New South Wales|Macksville]]
* [[Mallanganee, New South Wales|Mallanganee]]
* [[Manildra, New South Wales|Manildra]]
* [[Manilla, New South Wales|Manilla]]
* [[Marulan]]
* [[Mathoura]]
* [[Medlow Bath, New South Wales|Medlow Bath]]
* [[Mendooran]]
* [[Menindee, New South Wales|Menindee]]
* [[Merimbula]]
* [[Millfield, New South Wales|Millfield]]
* [[Milton, New South Wales|Milton]]
* [[Moama]]
* [[Molong]]
* [[Moree, New South Wales|Moree]]
* [[Moulamein]]
* [[Mount Riverview, New South Wales|Mount Riverview]]
* [[Mount Victoria, New South Wales|Mount Victoria]]
* [[Mudgee]]
* [[Murrumburrah]]
* [[Murwillumbah]]
* [[Muswellbrook, New South Wales|Muswellbrook]]
* [[Nabiac, New South Wales|Nabiac]]
* [[Nambucca Heads]]
* [[Narooma, New South Wales|Narooma]]
* [[Narrandera]]
* [[Narromine]]
* [[Neath, New South Wales|Neath]]
* [[Nelson Bay, New South Wales|Nelson Bay]]
* [[Nimmitabel]]
* [[Noraville]]
* [[North Richmond, New South Wales|North Richmond]]
* [[Nowra]]
* [[Numbaa, New South Wales|Numbaa]]
* [[Nundle, New South Wales|Nundle]]
* [[Nyngan]]
* [[Old Junee]]
* [[Ourimbah]]
* [[Oxley Vale, New South Wales|Oxley Vale]]
* [[Pallamallawa]]
* [[Pambula, New South Wales|Pambula]]
* [[Parkes, New South Wales|Parkes]]
* [[Paterson, New South Wales|Paterson]]
* [[Paxton, New South Wales|Paxton]]
* [[Peak Hill, New South Wales|Peak Hill]]
* [[Picton, New South Wales|Picton]]
* [[Port Kembla, New South Wales|Port Kembla]]
* [[Port Macquarie]]<ref>{{NSW GNR|id=ujjtvqtLJP|title=Port Macquarie|accessdate=21 February 2019}}</ref>
* [[Raleigh, New South Wales|Raleigh]]
* [[Raymond Terrace]]
* [[Rylstone, New South Wales|Rylstone]]
* [[Scarborough, New South Wales|Scarborough]]
* [[Smithtown, New South Wales|Smithtown]]
* [[Gundagai|South Gundagai]]
* [[South West Rocks, New South Wales|South West Rocks]]
* [[Springwood, New South Wales|Springwood]]
* [[Stanwell Park, New South Wales|Stanwell Park]]
* [[Stuart Town, New South Wales|Stuart Town]]
* [[Sussex Inlet, New South Wales|Sussex Inlet]]
* [[Talbingo, New South Wales|Talbingo]]
* [[Tarcutta]]
* [[Taree]]<ref>{{NSW GNR|id=TRjtvqKmSX|title=Taree|accessdate=21 February 2019}}</ref>
* [[Tea Gardens, New South Wales|Tea Gardens]]
* [[Tenterfield, New South Wales|Tenterfield]]
* [[Terrigal]]
* [[The Oaks, New South Wales|The Oaks]]
* [[The Rock, New South Wales|The Rock]]
* [[Thirlmere, New South Wales|Thirlmere]]
* [[Thirroul, New South Wales|Thirroul]]
* [[Thurgoona, New South Wales|Thurgoona]]
* [[Tingha, New South Wales|Tingha]]
* [[Tomerong]]
* [[Tottenham, New South Wales|Tottenham]]
* [[Toukley, New South Wales|Toukley]]
* [[Trangie]]
* [[Trundle, New South Wales|Trundle]]
* [[Tullamore, New South Wales|Tullamore]]
* [[Tumbarumba]]
* [[Tumut]]
* [[Tuncurry, New South Wales|Tuncurry]]
* [[Tuntable Creek, New South Wales|Tuntable Creek]]
* [[Tweed Heads, New South Wales|Tweed Heads]]
* [[Ulladulla, New South Wales|Ulladulla]]
* [[Ungarie]]
* [[Uralla, New South Wales|Uralla]]
* [[Urana]]
* [[Uranquinty]]
* [[Urbenville, New South Wales|Urbenville]]
* [[Urunga, New South Wales|Urunga]]
* [[Valley Heights, New South Wales|Valley Heights]]
* [[Walcha, New South Wales|Walcha]]
* [[Walgett, New South Wales|Walgett]]
* [[Warialda]]
* [[Warrell Creek, New South Wales|Warrell Creek]]
* [[Warren, New South Wales|Warren]]
* [[Warrimoo, New South Wales|Warrimoo]]
* [[Waterfall, New South Wales|Waterfall]]
* [[Wellington, New South Wales|Wellington]]
* [[Wentworth, New South Wales|Wentworth]]
* [[Wentworth Falls, New South Wales|Wentworth Falls]]
* [[West Wallsend]]
* [[West Wyalong]]
* [[Weston, New South Wales|Weston]]
* [[Wilcannia]]
* [[Wingham, New South Wales|Wingham]]<ref>{{NSW GNR|id=SXYbBKxOJP|title=Wingham|accessdate=21 February 2019}}</ref>
* [[Wombarra, New South Wales|Wombarra]]
* [[Woodburn, New South Wales|Woodburn]]
* [[Woodenbong]]
* [[Woodford, New South Wales|Woodford]]
* [[Woodstock, New South Wales|Woodstock]]
* [[Wyalong]]
* [[Wyong, New South Wales|Wyong]]
* [[Yass, New South Wales|Yass]]
* [[Yeoval, New South Wales|Yeoval]]
* [[Young, New South Wales|Young]]
* [[Scone, New South Wales|Scone]]
{{div col end}}

==Northern Territory==
{{see also|Category:Towns in the Northern Territory}}
{{Div col|colwidth=15em}}
* [[Adelaide River, Northern Territory|Adelaide River]]
* [[Ali Curung]]
* [[Alice Springs]]
* [[Alpurrurulam, Northern Territory|Alpurrurulam]]
* [[Alyangula, Northern Territory|Alyangula]]
* [[Amoonguna]]
* [[Angurugu, Northern Territory|Angurugu]]
* [[Aputula]]
* [[Areyonga, Northern Territory|Areyonga]]
* [[Atitjere, Northern Territory|Atitjere]]
* [[Barrow Creek, Northern Territory|Barrow Creek]]
* [[Barunga, Northern Territory|Barunga]]
* [[Batchelor, Northern Territory|Batchelor]]
* [[Birdum, Northern Territory|Birdum]]
* [[Borroloola]]
* [[Bulman, Northern Territory|Bulman]]
* [[Daly River, Northern Territory|Daly River]]
* [[Daly Waters, Northern Territory|Daly Waters]]
* [[Elliott, Northern Territory|Elliott]]
* [[Fleming, Northern Territory|Fleming]]
* [[Gunbalanya, Northern Territory|Gunbalanya]]
* [[Haasts Bluff, Northern Territory|Haasts Bluff]]
* [[Hart Range, Northern Territory|Harts Range]]
* [[Hermannsburg, Northern Territory|Hermannsburg]]
* [[Humpty Doo]]
* [[Imanpa, Northern Territory|Imanpa]]
* [[Jabiru, Northern Territory|Jabiru]]
* [[Kalkarindji]]
* [[Kaltukatjara]]
* [[Katherine, Northern Territory|Katherine]]
* [[Kintore, Northern Territory|Kintore]]
* [[Kulgera, Northern Territory|Kulgera]]
* [[Lajamanu, Northern Territory|Lajamanu]]
* [[Larrimah, Northern Territory|Larrimah]]
* [[Maningrida, Northern Territory|Maningrida]]
* [[Mataranka, Northern Territory|Mataranka]]
* [[Milikapiti, Northern Territory|Milikapiti]]
* [[Minjilang, Northern Territory|Minjilang]]
* [[Mutitjulu]]
* [[Newcastle Waters]]
* [[Nganmarriyanga, Northern Territory|Nganmarriyanga]]
* [[Ngukurr]]
* [[Nhulunbuy]]
* [[Numbulwar]]
* [[Nyirripi, Northern Territory|Nyirripi]]
* [[Maningrida, Northern Territory|Maningrida]]
* [[Papunya]]
* [[Peppimenarti, Northern Territory|Peppimenarti]]
* [[Pine Creek, Northern Territory|Pine Creek]]
* [[Tennant Creek]]
* [[Ti-Tree, Northern Territory|Ti-Tree]]
* [[Timber Creek, Northern Territory|Timber Creek]]
* [[Wadeye]]
* [[Warruwi, Northern Territory|Warruwi]]
* [[Wurrumiyanga]]
* [[Yarralin, Northern Territory|Yarralin]]
* [[Yirrkala]]
* [[Yuendumu]]
* [[Yulara, Northern Territory|Yulara]]
{{Div col end}}

==Queensland==
{{see also|Category:Towns in Queensland}}

{{Div col|colwidth=15em}}
* [[Adavale]]
* [[Allora, Queensland|Allora]]
* [[Aramac, Queensland|Aramac]]
* [[Atherton, Queensland|Atherton]]
* [[Augathella]]
* [[Ayr, Queensland|Ayr]]
* [[Banana, Queensland|Banana]]
* [[Barcaldine, Queensland|Barcaldine]]
* [[Beaudesert, Queensland|Beaudesert]]
* [[Bedourie, Queensland|Bedourie]]
* [[Biloela]]
* [[Birdsville]]
* [[Blackall, Queensland|Blackall]]
* [[Blackbutt, Queensland|Blackbutt]]
* [[Blackwater, Queensland|Blackwater]]
* [[Bogantungan]]
* [[Boonah, Queensland|Boonah]]
* [[Bouldercombe]]
* [[Boulia, Queensland|Boulia]]
* [[Bowen, Queensland|Bowen]]
* [[Bray Park, Queensland|Bray Park]]
* [[Calliope, Queensland|Calliope]]
* [[Cambooya, Queensland|Cambooya]]
* [[Camooweal]]
* [[Capella, Queensland|Capella]]
* [[Cardwell, Queensland|Cardwell]]
* [[Cashmere, Queensland|Cashmere]]
* [[Charleville, Queensland|Charleville]]
* [[Childers, Queensland|Childers]]
* [[Chillagoe, Queensland|Chillagoe]]
* [[Chinchilla, Queensland|Chinchilla]]
* [[Clermont, Queensland|Clermont]]
* [[Clifton, Queensland|Clifton]]
* [[Cloncurry, Queensland|Cloncurry]]
* [[Coen, Queensland|Coen]]
* [[Collinsville, Queensland|Collinsville]]
* [[Cooktown, Queensland|Cooktown]]
* [[Cooroy, Queensland|Cooroy]]
* [[Crows Nest, Queensland|Crows Nest]]
* [[Cunnamulla]]
* [[Dysart, Queensland]]
* [[Dalby, Queensland|Dalby]]
* [[Dirranbandi, Queensland|Dirranbandi]]
* [[Duaringa]]
* [[Eidsvold, Queensland|Eidsvold]]
* [[Emerald, Queensland|Emerald]]
* [[Emu Park, Queensland|Emu Park]]
* [[Esk, Queensland|Esk]]
* [[Gatton, Queensland|Gatton]]
* [[Gayndah]]
* [[Georgetown, Queensland|Georgetown]]
* [[Goombungee, Queensland|Goombungee]]
* [[Goondiwindi]]
* [[Gordonvale, Queensland|Gordonvale]]
* [[Gracemere, Queensland|Gracemere]]
* [[Grantham, Queensland|Grantham]]
* [[Greenvale, Queensland|Greenvale]]
* [[Hebel, Queensland|Hebel]]
* [[Herberton, Queensland|Herberton]]
* [[Highfields, Queensland|Highfields]]
* [[Hughenden, Queensland|Hughenden]]
* [[Ingham, Queensland|Ingham]]
* [[Inglewood, Queensland|Inglewood]]
* [[Injune]]
* [[Innisfail, Queensland|Innisfail]]
* [[Isisford, Queensland|Isisford]]
* [[Jimboomba]]
* [[Kilcoy, Queensland|Kilcoy]]
* [[Kilkivan, Queensland|Kilkivan]]
* [[Kingaroy]]
* [[Laidley, Queensland|Laidley]]
* [[Longreach, Queensland|Longreach]]
* [[Maleny, Queensland|Maleny]]
* [[Mareeba]]
* [[Marlborough, Queensland|Marlborough]]
* [[Miles, Queensland|Miles]]
* [[Mitchell, Queensland|Mitchell]]
* [[Monto, Queensland|Monto]]
* [[Mount Morgan, Queensland|Mount Morgan]]
* [[Mount Samson, Queensland|Mount Samson]] 
* [[Moura, Queensland|Moura]]
* [[Mundubbera]]
* [[Murgon]]
* [[Muttaburra]]
* [[Nambour]]
* [[Nobby, Queensland|Nobby]]
* [[Normanton, Queensland|Normanton]]
* [[Oakey, Queensland|Oakey]]
* [[Proserpine, Queensland|Proserpine]]
* [[Quilpie, Queensland|Quilpie]]
* [[Rathdowney, Queensland|Rathdowney]]
* [[Richmond, Queensland|Richmond]]
* [[Rolleston, Queensland|Rolleston]]
* [[Roma, Queensland|Roma]]
* [[Rosewood, Queensland|Rosewood]]
* [[St George, Queensland|St George]]
* [[Samford, Queensland|Samford]]
* [[Samsonvale, Queensland|Samsonvale]]
* [[Sapphire, Queensland|Sapphire]]
* [[Sarina, Queensland|Sarina]]
* [[Springsure]]
* [[Stanthorpe, Queensland|Stanthorpe]]
* [[Strathpine, Queensland|Strathpine]]
* [[Surat, Queensland|Surat]]
* [[Tambo, Queensland|Tambo]]
* [[Taroom]]
* [[Texas, Queensland|Texas]]
* [[Tolga, Queensland|Tolga]]
* [[Toowoomba]]
* [[Thargomindah]]
* [[Theodore, Queensland|Theodore]]
* [[Tully, Queensland|Tully]]
* [[Wallangarra, Queensland|Wallangarra]]
* [[Warwick, Queensland|Warwick]]
* [[Warner, Queensland|Warner]]
* [[Weipa]]
* [[Westwood, Queensland|Westwood]]
* [[Windorah]]
* [[Winton, Queensland|Winton]]
* [[Yeppoon]]
* [[Yungaburra]]
{{Div col end}}

==South Australia==
{{main|List of cities and towns in South Australia}}
{{see also|List of towns in the Adelaide Hills|Category:Towns in South Australia}}

==Tasmania==
{{main|List of localities in Tasmania}}
{{see also|Category:Towns in Tasmania}}

==Victoria==
{{main|List of localities in Victoria}}
{{see also|Category:Towns in Victoria (state)}}

==Western Australia==
{{main|List of towns in Western Australia}}
{{see also|Category:Towns in Western Australia}}

== See also ==
* [[List of towns and cities in Australia by year of settlement]]

== References ==
{{Reflist}}

{{Oceania topic|List of towns in}}

[[Category:Towns in Australia by state or territory| ]]
[[Category:Lists of towns in Australia| ]]

"""

# Extract state and town information
sections = re.split(r"==([a-zA-Z\s]+)==", data)
states_towns = []

for i in range(1, len(sections), 2):
    state = sections[i].strip()
    towns_raw = sections[i + 1]
    # Match town names (ignore templates and non-town-related data)
    towns = re.findall(r"\[\[([\w\s,]+)\|?[\w\s,]*\]\]", towns_raw)
    for town in towns:
        # Remove the state name after a comma, if present
        clean_town = town.split(",")[0].strip()
        states_towns.append((state, clean_town))

# Create a DataFrame
df1 = pd.DataFrame(states_towns, columns=["State", "Town"])

# Save to Excel
output_file = "towns_by_state_cleaned.xlsx"
df1.to_excel(output_file, index=False)
print(f"Data saved to {output_file}")


Data saved to towns_by_state_cleaned.xlsx


In [43]:
import pandas as pd

# Load the main dataset and the towns list
main_df = pd.read_csv("australian_cities.csv")  # Replace with your main dataset filename
towns_df = pd.read_excel("towns_by_state_cleaned.xlsx")  # Replace with your towns list filename

# Assuming the relevant columns are named 'Suburb Name' in main_df and 'Town Name' in towns_df
towns_list = towns_df['Town'].str.strip().unique()  # Get unique town names

# Add a column to indicate if the row is a town or a suburb
main_df['Type'] = main_df['suburbname'].str.strip().apply(
    lambda name: 'Town' if name in towns_list else 'Suburb'
)

# Save the updated DataFrame
main_df.to_excel("updated_dataset.xlsx", index=False)

print("Classification complete. Updated dataset saved to 'updated_dataset.xlsx'.")


Classification complete. Updated dataset saved to 'updated_dataset.xlsx'.


In [44]:
main_df[main_df['suburbname'] == 'Bangalow']

Unnamed: 0.1,Unnamed: 0,stateid,statename,cityid,cityname,suburbid,suburbname,geo_point,Type
11182,11182,1,NSW,11350,Byron,10175,Bangalow,"-28.68325067650936, 153.5219952341208",Town


In [45]:
towns_list.size

449

In [46]:
towns_df['Town'].nunique()

449

In [47]:
df_type = pd.read_excel('updated_dataset.xlsx')


In [48]:
df_type.head()

Unnamed: 0.1,Unnamed: 0,stateid,statename,cityid,cityname,suburbid,suburbname,geo_point,Type
0,0,1,NSW,17040,Snowy Monaro Regional,10015,Adaminaby,"-36.01193158762528, 148.786320441595",Town
1,1,1,NSW,10050,Albury,10027,Albury,"-36.07369799464406, 146.91346780765934",Suburb
2,2,1,NSW,16200,Parkes,10029,Alectown,"-32.91425516102453, 148.26802074020546",Suburb
3,3,1,NSW,18350,Wingecarribee,10044,Alpine,"-34.40869358826243, 150.53677066431447",Suburb
4,4,1,NSW,15560,Murrumbidgee,10081,Argoon,"-34.91797895821861, 145.6572484339588",Suburb


In [49]:
df_type['Type'].value_counts()

Type
Suburb    14958
Town        376
Name: count, dtype: int64

In [50]:
matching_entries = towns_df[towns_df['Town'].str.strip().isin(main_df['suburbname'].str.strip())]
matching_entries

Unnamed: 0,State,Town
0,Australian Capital Territory,Hall
2,Australian Capital Territory,Tharwa
7,New South Wales,Aberdare
8,New South Wales,Abermain
9,New South Wales,Adaminaby
...,...,...
444,Queensland,Wallangarra
446,Queensland,Warner
449,Queensland,Windorah
451,Queensland,Yeppoon


In [51]:
# removing the max limit in vscode
pd.set_option('display.max_rows', None)

In [52]:
towns_not_in_main_df = towns_df[~towns_df['Town'].str.strip().isin(main_df['suburbname'].str.strip())]
towns_not_in_main_df

Unnamed: 0,State,Town
1,Australian Capital Territory,Canberra
3,New South Wales,Geographical Names Board of New South Wales
4,New South Wales,Geographical Names Board of New South Wales
5,New South Wales,Geographical Names Board of New South Wales
6,New South Wales,Geographical Names Board of New South Wales
15,New South Wales,Ashford
41,New South Wales,Blaxland
45,New South Wales,Bolwarra
55,New South Wales,Brooklyn
72,New South Wales,Cooma
