In [36]:
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect

**bold text**### Store  ABS ERP CSV into DataFrame

In [70]:
csv_file = "Resources/ABS_ERP_COMP.csv"
erp_data_df = pd.read_csv(csv_file)
erp_data_df.head()

Unnamed: 0,DATAFLOW,MEASURE: Measure,REGION: Region,FREQ: Frequency,TIME_PERIOD: Time Period,OBS_VALUE,UNIT_MEASURE: Unit of Measure,UNIT_MULT: Unit of Multiplier,OBS_STATUS: Observation Status,OBS_COMMENT: Observation Comment
0,ABS:ERP_COMP_Q(1.0.0),3: Natural Increase,3: Queensland,Q: Quarterly,1981-Q2,6191.0,NUM: Number,0: Units,,
1,ABS:ERP_COMP_Q(1.0.0),3: Natural Increase,3: Queensland,Q: Quarterly,1981-Q3,4920.0,NUM: Number,0: Units,,
2,ABS:ERP_COMP_Q(1.0.0),3: Natural Increase,3: Queensland,Q: Quarterly,1981-Q4,4756.0,NUM: Number,0: Units,,
3,ABS:ERP_COMP_Q(1.0.0),3: Natural Increase,3: Queensland,Q: Quarterly,1982-Q1,6331.0,NUM: Number,0: Units,,
4,ABS:ERP_COMP_Q(1.0.0),3: Natural Increase,3: Queensland,Q: Quarterly,1982-Q2,6081.0,NUM: Number,0: Units,,


Check Data Types

In [71]:
erp_data_df.dtypes

DATAFLOW                             object
MEASURE: Measure                     object
REGION: Region                       object
FREQ: Frequency                      object
TIME_PERIOD: Time Period             object
OBS_VALUE                           float64
UNIT_MEASURE: Unit of Measure        object
UNIT_MULT: Unit of Multiplier        object
OBS_STATUS: Observation Status       object
OBS_COMMENT: Observation Comment    float64
dtype: object

### Create new data with select columns

In [72]:
# Delete column we don't want
del erp_data_df['DATAFLOW']
del erp_data_df['OBS_STATUS: Observation Status']
del erp_data_df['OBS_COMMENT: Observation Comment']
del erp_data_df['UNIT_MEASURE: Unit of Measure']
del erp_data_df['UNIT_MULT: Unit of Multiplier']
del erp_data_df['FREQ: Frequency']
erp_data_df.head()

Unnamed: 0,MEASURE: Measure,REGION: Region,TIME_PERIOD: Time Period,OBS_VALUE
0,3: Natural Increase,3: Queensland,1981-Q2,6191.0
1,3: Natural Increase,3: Queensland,1981-Q3,4920.0
2,3: Natural Increase,3: Queensland,1981-Q4,4756.0
3,3: Natural Increase,3: Queensland,1982-Q1,6331.0
4,3: Natural Increase,3: Queensland,1982-Q2,6081.0


In [76]:
# Rename columns 
erp_data_df = erp_data_df.rename(columns={"MEASURE: Measure": "measure_id","REGION: Region": "state_id", "TIME_PERIOD: Time Period": "quarter_id",
                                              "OBS_VALUE": "net_change"})
erp_data_df

Unnamed: 0,measure_id,state_id,quarter_id,net_change
0,3: Natural Increase,3: Queensland,1981-Q2,6191.0
1,3: Natural Increase,3: Queensland,1981-Q3,4920.0
2,3: Natural Increase,3: Queensland,1981-Q4,4756.0
3,3: Natural Increase,3: Queensland,1982-Q1,6331.0
4,3: Natural Increase,3: Queensland,1982-Q2,6081.0
...,...,...,...,...
7210,13: Change Over Previous Quarter,6: Tasmania,2021-Q1,1529.0
7211,13: Change Over Previous Quarter,6: Tasmania,2021-Q2,823.0
7212,13: Change Over Previous Quarter,6: Tasmania,2021-Q3,-197.0
7213,13: Change Over Previous Quarter,6: Tasmania,2021-Q4,2115.0


In [77]:
# Identify incomplete rows
erp_data_df.count()

measure_id    7215
state_id      7215
quarter_id    7215
net_change    7081
dtype: int64

In [78]:
# Drop all rows with missing information
#erp_data_df = erp_data_df.dropna(how='any')

In [81]:
# Verify dropped rows
erp_data_df.count()

measure_id    7215
state_id      7215
quarter_id    7215
net_change    7081
dtype: int64

In [83]:
new_erp_data_df = erp_data_df[['measure_id', 'state_id','quarter_id','net_change']].copy()
new_erp_data_df.head()

Unnamed: 0,measure_id,state_id,quarter_id,net_change
0,3: Natural Increase,3: Queensland,1981-Q2,6191.0
1,3: Natural Increase,3: Queensland,1981-Q3,4920.0
2,3: Natural Increase,3: Queensland,1981-Q4,4756.0
3,3: Natural Increase,3: Queensland,1982-Q1,6331.0
4,3: Natural Increase,3: Queensland,1982-Q2,6081.0


In [84]:
new_erp_data_df.to_csv("Resources/johan_erp_clean.csv", index=False)

**bold text**### Store  ABS AWE CSV into DataFrame

In [112]:
csv_file = "Resources/ABS_AWE.csv"
awe_data_df = pd.read_csv(csv_file)
awe_data_df.head()

Unnamed: 0,DATAFLOW,MEASURE: Measure,ESTIMATE_TYPE: Estimate Type,SEX: Sex,SECTOR: Sector,INDUSTRY: Industry,TSEST: Adjustment Type,REGION: Region,FREQ: Frequency,TIME_PERIOD: Time Period,OBS_VALUE,UNIT_MEASURE: Unit of Measure,OBS_STATUS: Observation Status,OBS_COMMENT: Observation Comment
0,ABS:AWE(1.0.0),1: All employees average weekly total earnings,1: Earnings,3: Persons,7: Private and Public,TOT: All Industries,10: Original,7: Northern Territory,Q: Quarterly,1994-Q4,564.4,AUD: Australian Dollars,,
1,ABS:AWE(1.0.0),1: All employees average weekly total earnings,1: Earnings,3: Persons,7: Private and Public,TOT: All Industries,10: Original,7: Northern Territory,Q: Quarterly,1995-Q2,564.7,AUD: Australian Dollars,,
2,ABS:AWE(1.0.0),1: All employees average weekly total earnings,1: Earnings,3: Persons,7: Private and Public,TOT: All Industries,10: Original,7: Northern Territory,Q: Quarterly,1995-Q4,567.6,AUD: Australian Dollars,,
3,ABS:AWE(1.0.0),1: All employees average weekly total earnings,1: Earnings,3: Persons,7: Private and Public,TOT: All Industries,10: Original,7: Northern Territory,Q: Quarterly,1996-Q2,571.0,AUD: Australian Dollars,,
4,ABS:AWE(1.0.0),1: All employees average weekly total earnings,1: Earnings,3: Persons,7: Private and Public,TOT: All Industries,10: Original,7: Northern Territory,Q: Quarterly,1996-Q4,581.6,AUD: Australian Dollars,,


Check Data Types

In [113]:
awe_data_df.dtypes

DATAFLOW                             object
MEASURE: Measure                     object
ESTIMATE_TYPE: Estimate Type         object
SEX: Sex                             object
SECTOR: Sector                       object
INDUSTRY: Industry                   object
TSEST: Adjustment Type               object
REGION: Region                       object
FREQ: Frequency                      object
TIME_PERIOD: Time Period             object
OBS_VALUE                           float64
UNIT_MEASURE: Unit of Measure        object
OBS_STATUS: Observation Status       object
OBS_COMMENT: Observation Comment    float64
dtype: object

In [114]:
# Delete column we don't want
del awe_data_df['DATAFLOW']
del awe_data_df['OBS_STATUS: Observation Status']
del awe_data_df['OBS_COMMENT: Observation Comment']
del awe_data_df['UNIT_MEASURE: Unit of Measure']
del awe_data_df['TSEST: Adjustment Type']
del awe_data_df['SEX: Sex']
del awe_data_df['FREQ: Frequency']
awe_data_df.head()

Unnamed: 0,MEASURE: Measure,ESTIMATE_TYPE: Estimate Type,SECTOR: Sector,INDUSTRY: Industry,REGION: Region,TIME_PERIOD: Time Period,OBS_VALUE
0,1: All employees average weekly total earnings,1: Earnings,7: Private and Public,TOT: All Industries,7: Northern Territory,1994-Q4,564.4
1,1: All employees average weekly total earnings,1: Earnings,7: Private and Public,TOT: All Industries,7: Northern Territory,1995-Q2,564.7
2,1: All employees average weekly total earnings,1: Earnings,7: Private and Public,TOT: All Industries,7: Northern Territory,1995-Q4,567.6
3,1: All employees average weekly total earnings,1: Earnings,7: Private and Public,TOT: All Industries,7: Northern Territory,1996-Q2,571.0
4,1: All employees average weekly total earnings,1: Earnings,7: Private and Public,TOT: All Industries,7: Northern Territory,1996-Q4,581.6


In [115]:
# Identify incomplete rows
awe_data_df.count()

MEASURE: Measure                27726
ESTIMATE_TYPE: Estimate Type    27726
SECTOR: Sector                  27726
INDUSTRY: Industry              27726
REGION: Region                  27726
TIME_PERIOD: Time Period        27726
OBS_VALUE                       27726
dtype: int64

In [116]:
# Rename columns 
awe_data_df = awe_data_df.rename(columns={"MEASURE: Measure": "measure_id","REGION: Region": "state_id", "TIME_PERIOD: Time Period": "quarter_id",
                                              "OBS_VALUE": "net_change","ESTIMATE_TYPE: Estimate Type":"type_id","SECTOR: Sector":"sector_id","INDUSTRY: Industry":"industry_id"})
awe_data_df

Unnamed: 0,measure_id,type_id,sector_id,industry_id,state_id,quarter_id,net_change
0,1: All employees average weekly total earnings,1: Earnings,7: Private and Public,TOT: All Industries,7: Northern Territory,1994-Q4,564.4
1,1: All employees average weekly total earnings,1: Earnings,7: Private and Public,TOT: All Industries,7: Northern Territory,1995-Q2,564.7
2,1: All employees average weekly total earnings,1: Earnings,7: Private and Public,TOT: All Industries,7: Northern Territory,1995-Q4,567.6
3,1: All employees average weekly total earnings,1: Earnings,7: Private and Public,TOT: All Industries,7: Northern Territory,1996-Q2,571.0
4,1: All employees average weekly total earnings,1: Earnings,7: Private and Public,TOT: All Industries,7: Northern Territory,1996-Q4,581.6
...,...,...,...,...,...,...,...
27721,3: Full-time adult average weekly ordinary tim...,2: Standard Error,7: Private and Public,R: Arts and Recreation Services,AUS: Australia,2020-Q2,45.4
27722,3: Full-time adult average weekly ordinary tim...,2: Standard Error,7: Private and Public,R: Arts and Recreation Services,AUS: Australia,2020-Q4,35.9
27723,3: Full-time adult average weekly ordinary tim...,2: Standard Error,7: Private and Public,R: Arts and Recreation Services,AUS: Australia,2021-Q2,26.7
27724,3: Full-time adult average weekly ordinary tim...,2: Standard Error,7: Private and Public,R: Arts and Recreation Services,AUS: Australia,2021-Q4,30.9


In [117]:
# Display a statistical overview
# We can infer the maximum allowable individual contribution from 'max'
awe_data_df.describe()

Unnamed: 0,net_change
count,27726.0
mean,812.616212
std,625.392315
min,2.3
25%,43.5
50%,837.15
75%,1305.7
max,2873.4


In [118]:
awe_data_df.to_csv("Resources/johan_awe_employment_clean.csv", index=False)

### Clean DataFrame

In [119]:
new_awe_data_df = awe_data_df[["MEASURE: Measure", "REGION: Region", "OBS_VALUE", "TIME_PERIOD: Time Period"]].copy()
new_awe_data_df.head()

KeyError: "None of [Index(['MEASURE: Measure', 'REGION: Region', 'OBS_VALUE',\n       'TIME_PERIOD: Time Period'],\n      dtype='object')] are in the [columns]"

### Connect to local database

In [None]:
protocol = 'postgresql'
username = '<user name>'
password = '<password>'
host = 'localhost'
port = 5432
database_name = 'customer_db'
rds_connection_string = f'{protocol}://{username}:{password}@{host}:{port}/{database_name}'
engine = create_engine(rds_connection_string)
insp = inspect(engine)

### Check for tables

In [None]:
insp.get_table_names()

['customer_name', 'customer_location']

### Use pandas to load csv converted DataFrame into database

In [None]:
new_customer_data_df.to_sql(name='customer_name', con=engine, if_exists='append', index=False)

### Use pandas to load json converted DataFrame into database

In [None]:
new_customer_location_df.to_sql(name='customer_location', con=engine, if_exists='append', index=False)

### Confirm data has been added by querying the customer_name table
* NOTE: can also check using pgAdmin

In [None]:
pd.read_sql_query('select * from customer_name', con=engine).head()

Unnamed: 0,id,first_name,last_name
0,1,Benetta,Cancott
1,2,Lilyan,Cherry
2,3,Ezekiel,Benasik
3,4,Kennedy,Atlay
4,5,Sanford,Salmen


### Confirm data has been added by querying the customer_location table

In [None]:
pd.read_sql_query('select * from customer_location', con=engine).head()

Unnamed: 0,id,address,city,aus_state
0,1,043 Mockingbird Place,Ballarat,VIC
1,2,4 Prentice Point,Glenore Grove,QLD
2,3,46 Derek Junction,Bankstown,NSW
3,4,11966 Old Shore Place,Darwin,NT
4,5,5 Evergreen Circle,Horsham,VIC


In [None]:
sql_join = r"""SELECT customer_name.id, customer_name.first_name, customer_name.last_name, 
customer_location.address, customer_location.city, customer_location.aus_state
FROM customer_name
JOIN customer_location
ON customer_name.id = customer_location.id"""
pd.read_sql_query(sql_join, con=engine).head()

Unnamed: 0,id,first_name,last_name,address,city,aus_state
0,1,Benetta,Cancott,043 Mockingbird Place,Ballarat,VIC
1,2,Lilyan,Cherry,4 Prentice Point,Glenore Grove,QLD
2,3,Ezekiel,Benasik,46 Derek Junction,Bankstown,NSW
3,4,Kennedy,Atlay,11966 Old Shore Place,Darwin,NT
4,5,Sanford,Salmen,5 Evergreen Circle,Horsham,VIC
