# Preprocess shopping center data
This notebook is to preprocess the shopping center data, at the end, we will get the number of shopping centers in each suburb and their geo location

### Import packages

In [1]:
import re
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point

### Read the files

In [2]:
shopping_centers = pd.read_csv('../data/landing/shopping_center.csv')

In [3]:
shopping_centers.head()

Unnamed: 0,name,information
0,206 Bourke Street,"Victoria, Melbourne, GPS: -37.812733, 144.9669..."
1,670 Chapel,"Victoria, Melbourne, GPS: -37.837395, 144.9961..."
2,Acland Court Shopping Centre,"Victoria, St Kilda, GPS: -37.868967, 144.98061..."
3,Altona Gate Shopping Centre,"Victoria, Melbourne, GPS: -37.828989, 144.8462..."
4,Arena Shopping Centre,"Victoria, Officer, GPS: -38.064493171914, 145...."


# Retrieve information for each shopping center 

In [4]:
def fill_shopping_info (i, state, city, lantitude, longitude, num_store):
    '''fill in the ith row of a dataframe with relevant information about a shopping center'''
    shopping_centers.loc[i, 'state'] = state
    shopping_centers.loc[i, 'city'] = city 
    shopping_centers.loc[i, 'longitude'] = longitude
    shopping_centers.loc[i, 'latitude'] = lantitude
    shopping_centers.loc[i, 'num_store'] = num_store

In [5]:
for i in range(shopping_centers.shape[0]):
    info = shopping_centers.loc[i, 'information']
    # match the informations
    try:
        retrieved_info= re.findall('(.+),\s(.+),\sGPS:\s(.+),\s(.+)\s\|\sphone:\s(?:\+61\s)?.+\s\|\sstores:\s(.+)', info)[0]
        fill_shopping_info(i, *retrieved_info)

    # some shopping contain information in a different format, we deal with them seperately
    except IndexError:
        if i in [30,174,189,198]:
            retrieved_info = re.findall('(.+),\s(.+),\sGPS:\s(.+),\s(.+)\s\|\sphone:.+', info)[0]
            fill_shopping_info(i, *retrieved_info, num_store=np.nan)

        else:
            retrieved_info = re.findall('(.+),\s(.+),\sGPS:\s(.+),\s(.+)\s\s\|\sstores:\s(.+)', info)[0]
            fill_shopping_info(i, *retrieved_info)           

In [6]:
shopping_centers.drop('information', inplace=True, axis=1)
shopping_centers.head()

Unnamed: 0,name,state,city,longitude,latitude,num_store
0,206 Bourke Street,Victoria,Melbourne,144.966947,-37.812733,14
1,670 Chapel,Victoria,Melbourne,144.996158,-37.837395,28
2,Acland Court Shopping Centre,Victoria,St Kilda,144.980617,-37.868967,11
3,Altona Gate Shopping Centre,Victoria,Melbourne,144.84627,-37.828989,63
4,Arena Shopping Centre,Victoria,Officer,145.43517539621,-38.064493171914,30


In [7]:
# number of store should be an integer however, since it contain nan, we leave it as float
shopping_centers['num_store'] = shopping_centers['num_store'].astype(float)

# the longitude and lantitude info should be float instead of string
shopping_centers['longitude'] = shopping_centers['longitude'].astype(float)
shopping_centers['latitude'] = shopping_centers['latitude'].astype(float)

In [8]:
shopping_centers.dtypes

name          object
state         object
city          object
longitude    float64
latitude     float64
num_store    float64
dtype: object

# Get the SA2 region for each shopping center

In [9]:
# read the shpae file that contain SA2 info
suburbs = gpd.read_file("../data/curated/SA2_2021_AUST_GDA2020.shp")

In [10]:
# get the relevant columns
suburbs = suburbs.loc[suburbs['STE_NAME21']=='Victoria']
suburbs = suburbs[['SA2_CODE21','SA2_NAME21','geometry']]

In [11]:
# Convert the shopping center dataframe to a gdf
geometry = [Point(xy) for xy in zip(shopping_centers['longitude'], shopping_centers['latitude'])]
shopping_gdf = gpd.GeoDataFrame(shopping_centers, geometry=geometry)

In [12]:
# Use sjoin to find out which suburb each point belongs to
joined = gpd.sjoin(shopping_gdf, suburbs, how="left", op="within")

# If your suburbs shapefile has a column named 'suburb_name' that specifies the name of the suburb
shopping_gdf['SA2_NAME'] = joined['SA2_NAME21']
shopping_gdf['SA2_CODE'] = joined['SA2_CODE21']

  if await self.run_code(code, result, async_=asy):
Use `to_crs()` to reproject one of the input geometries to match the CRS of the other.

Left CRS: None
Right CRS: EPSG:7844

  joined = gpd.sjoin(shopping_gdf, suburbs, how="left", op="within")


In [16]:
# saved the shopping centers to curated data file
shopping_gdf.to_csv('../data/curated/shopping_centers.csv', index=False)