# Project 2 - Team 6

## ETL 

### Import Dependencies

In [1]:
import os
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect

--------------------------------

# ABS_ERP_COMP.csv

### Load CSV

In [2]:
csv_file = os.path.join('..', 'Resources', 'ABS_ERP_COMP_CLEAN.csv')
# csv_file = "Resources/ABS_ERP_COMP_CLEAN.csv"

erp_data_df = pd.read_csv(csv_file)
erp_data_df.head()

Unnamed: 0,DATAFLOW,MEASURE: Measure,REGION: Region,FREQ: Frequency,TIME_PERIOD: Time Period,OBS_VALUE,UNIT_MEASURE: Unit of Measure,UNIT_MULT: Unit of Multiplier,OBS_STATUS: Observation Status,OBS_COMMENT: Observation Comment
0,ABS:ERP_COMP_Q(1.0.0),Internal Arrivals,Australia,Q: Quarterly,1981-Q2,78.0,NUM: Number,3: Thousands,,
1,ABS:ERP_COMP_Q(1.0.0),Internal Departures,Australia,Q: Quarterly,1981-Q2,78.0,NUM: Number,3: Thousands,,
2,ABS:ERP_COMP_Q(1.0.0),Change Over Previous Quarter,Australian Capital Territory,Q: Quarterly,1981-Q2,,NUM: Number,0: Units,u: not applicable,
3,ABS:ERP_COMP_Q(1.0.0),Net Internal Migration,Australian Capital Territory,Q: Quarterly,1981-Q2,565.0,NUM: Number,0: Units,,
4,ABS:ERP_COMP_Q(1.0.0),Net Internal Migration,New South Wales,Q: Quarterly,1981-Q2,-6330.0,NUM: Number,0: Units,,


----------------------------------------

# ABS_AWE.csv

### Load CSV

In [3]:
csv_file = os.path.join('..', 'Resources', 'ABS_ERP_COMP_CLEAN.csv')
# csv_file = "Resources/ABS_AWE_CLEAN.csv"

awe_data_df = pd.read_csv(csv_file)
awe_data_df.head()

Unnamed: 0,DATAFLOW,MEASURE: Measure,REGION: Region,FREQ: Frequency,TIME_PERIOD: Time Period,OBS_VALUE,UNIT_MEASURE: Unit of Measure,UNIT_MULT: Unit of Multiplier,OBS_STATUS: Observation Status,OBS_COMMENT: Observation Comment
0,ABS:ERP_COMP_Q(1.0.0),Internal Arrivals,Australia,Q: Quarterly,1981-Q2,78.0,NUM: Number,3: Thousands,,
1,ABS:ERP_COMP_Q(1.0.0),Internal Departures,Australia,Q: Quarterly,1981-Q2,78.0,NUM: Number,3: Thousands,,
2,ABS:ERP_COMP_Q(1.0.0),Change Over Previous Quarter,Australian Capital Territory,Q: Quarterly,1981-Q2,,NUM: Number,0: Units,u: not applicable,
3,ABS:ERP_COMP_Q(1.0.0),Net Internal Migration,Australian Capital Territory,Q: Quarterly,1981-Q2,565.0,NUM: Number,0: Units,,
4,ABS:ERP_COMP_Q(1.0.0),Net Internal Migration,New South Wales,Q: Quarterly,1981-Q2,-6330.0,NUM: Number,0: Units,,


----------------------------------------

# States Table

<!-- ![states_table.png](attachment:states_table.png) -->

<img src="../Images/states_table.png" 
     align="left" 
     width="250" />

### Copy Region Colum to New DF - For Table 'States'

In [4]:
# Copy column to new DF
new_test_region_df = erp_data_df[['REGION: Region']].copy()
new_test_region_df.head()

Unnamed: 0,REGION: Region
0,Australia
1,Australia
2,Australian Capital Territory
3,Australian Capital Territory
4,New South Wales


In [5]:
# Drop duplicates
new_test_region_df = new_test_region_df.sort_values('REGION: Region').drop_duplicates('REGION: Region', keep='last')

In [6]:
# Create Index Column with Unique Values
new_test_region_df.reset_index(inplace=True)
new_test_region_df

Unnamed: 0,index,REGION: Region
0,303,Australia
1,553,Australian Capital Territory
2,2854,New South Wales
3,747,Northern Territory
4,2326,Queensland
5,2841,South Australia
6,2273,Tasmania
7,1287,Victoria
8,3114,Western Australia


In [7]:
# Rename Column Headers to match Schemata
new_test_region_df.rename(columns = {'index':'state_id', 'REGION: Region': 'state_name'}, inplace=True)
new_test_region_df

Unnamed: 0,state_id,state_name
0,303,Australia
1,553,Australian Capital Territory
2,2854,New South Wales
3,747,Northern Territory
4,2326,Queensland
5,2841,South Australia
6,2273,Tasmania
7,1287,Victoria
8,3114,Western Australia


In [8]:
new_test_region_df_final = new_test_region_df.set_index('state_id')
new_test_region_df_final

Unnamed: 0_level_0,state_name
state_id,Unnamed: 1_level_1
303,Australia
553,Australian Capital Territory
2854,New South Wales
747,Northern Territory
2326,Queensland
2841,South Australia
2273,Tasmania
1287,Victoria
3114,Western Australia


-------------------

# Quarters Table

<!-- ![states_table.png](attachment:states_table.png) -->
<img src="../Images/quarters_table.png" 
     align="left" 
     width="250" />

### Copy Time Period Colum to New DF - For Table 'Quarters'

In [9]:
# Copy column to new DF
new_test_quarter_df = erp_data_df[['TIME_PERIOD: Time Period']].copy()
new_test_quarter_df.head()

Unnamed: 0,TIME_PERIOD: Time Period
0,1981-Q2
1,1981-Q2
2,1981-Q2
3,1981-Q2
4,1981-Q2


In [10]:
# Drop duplicates
new_test_quarter_df = new_test_quarter_df.sort_values('TIME_PERIOD: Time Period').drop_duplicates('TIME_PERIOD: Time Period', keep='last')

In [11]:
# Create Index Column with Unique Values
new_test_quarter_df.reset_index(level=0, inplace=True)
new_test_quarter_df.head()

Unnamed: 0,index,TIME_PERIOD: Time Period
0,8,1981-Q2
1,25,1981-Q3
2,45,1981-Q4
3,59,1982-Q1
4,83,1982-Q2


In [12]:
# Rename Column Headers to match Schemata
new_test_quarter_df.rename(columns = {'index':'quarter_id', 'TIME_PERIOD: Time Period': 'date_quarter'}, inplace=True)
new_test_quarter_df.head()

Unnamed: 0,quarter_id,date_quarter
0,8,1981-Q2
1,25,1981-Q3
2,45,1981-Q4
3,59,1982-Q1
4,83,1982-Q2


In [13]:
new_test_quarter_df_final = new_test_quarter_df.set_index('quarter_id')
new_test_quarter_df_final

Unnamed: 0_level_0,date_quarter
quarter_id,Unnamed: 1_level_1
8,1981-Q2
25,1981-Q3
45,1981-Q4
59,1982-Q1
83,1982-Q2
...,...
3023,2021-Q1
3047,2021-Q2
3067,2021-Q3
3085,2021-Q4


----------

# Population_Change Table

<!-- ![states_table.png](attachment:states_table.png) -->
<img src="../Images/population_change_t.png" 
     align="left" 
     width="250" />

In [14]:
# Copy column to new DF
population_df = erp_data_df[['TIME_PERIOD: Time Period', 'REGION: Region', 'OBS_VALUE']].copy()
population_df.head()

Unnamed: 0,TIME_PERIOD: Time Period,REGION: Region,OBS_VALUE
0,1981-Q2,Australia,78.0
1,1981-Q2,Australia,78.0
2,1981-Q2,Australian Capital Territory,
3,1981-Q2,Australian Capital Territory,565.0
4,1981-Q2,New South Wales,-6330.0


In [15]:
# Change column headers to match shemata
population_df.rename(columns = {'REGION: Region':'state_id', 'TIME_PERIOD: Time Period': 'quarter_id', 'OBS_VALUE':'net_change'}, inplace=True)
population_df.head()

Unnamed: 0,quarter_id,state_id,net_change
0,1981-Q2,Australia,78.0
1,1981-Q2,Australia,78.0
2,1981-Q2,Australian Capital Territory,
3,1981-Q2,Australian Capital Territory,565.0
4,1981-Q2,New South Wales,-6330.0


In [16]:
# replace quarter_id names with keys from quarter_df
key_list = list(population_df['quarter_id'])
dict_lookup = dict(zip(new_test_quarter_df['date_quarter'], new_test_quarter_df['quarter_id']))
population_df['quarter_id'] = [dict_lookup[item] for item in key_list]
population_df.head()

Unnamed: 0,quarter_id,state_id,net_change
0,8,Australia,78.0
1,8,Australia,78.0
2,8,Australian Capital Territory,
3,8,Australian Capital Territory,565.0
4,8,New South Wales,-6330.0


In [17]:
# replace state_id names with keys from region_df
key_list = list(population_df['state_id'])
dict_lookup = dict(zip(new_test_region_df['state_name'], new_test_region_df['state_id']))
population_df['state_id'] = [dict_lookup[item] for item in key_list]
population_df.head()

Unnamed: 0,quarter_id,state_id,net_change
0,8,303,78.0
1,8,303,78.0
2,8,553,
3,8,553,565.0
4,8,2854,-6330.0


In [18]:
# replace NaN with 0
population_df['net_change'] = population_df['net_change'].fillna(0)
population_df.head()

Unnamed: 0,quarter_id,state_id,net_change
0,8,303,78.0
1,8,303,78.0
2,8,553,0.0
3,8,553,565.0
4,8,2854,-6330.0


-------------------

# Average_Earnings Table

<!-- ![states_table.png](attachment:states_table.png) -->
<img src="../Images/average_earnings_t.png" 
     align="left" 
     width="250" />

In [19]:
# Copy column to new DF
average_earnings_df = awe_data_df[['TIME_PERIOD: Time Period', 'REGION: Region', 'OBS_VALUE']].copy()
average_earnings_df.head()

Unnamed: 0,TIME_PERIOD: Time Period,REGION: Region,OBS_VALUE
0,1981-Q2,Australia,78.0
1,1981-Q2,Australia,78.0
2,1981-Q2,Australian Capital Territory,
3,1981-Q2,Australian Capital Territory,565.0
4,1981-Q2,New South Wales,-6330.0


In [20]:
# Change column headers to match shemata
average_earnings_df.rename(columns = {'REGION: Region':'state_id', 'TIME_PERIOD: Time Period': 'quarter_id', 'OBS_VALUE':'avg_earnings'}, inplace=True)
average_earnings_df.head()

Unnamed: 0,quarter_id,state_id,avg_earnings
0,1981-Q2,Australia,78.0
1,1981-Q2,Australia,78.0
2,1981-Q2,Australian Capital Territory,
3,1981-Q2,Australian Capital Territory,565.0
4,1981-Q2,New South Wales,-6330.0


In [21]:
# replace quarter_id names with keys from quarter_df
key_list = list(average_earnings_df['quarter_id'])
dict_lookup = dict(zip(new_test_quarter_df['date_quarter'], new_test_quarter_df['quarter_id']))
average_earnings_df['quarter_id'] = [dict_lookup[item] for item in key_list]
average_earnings_df.head()

Unnamed: 0,quarter_id,state_id,avg_earnings
0,8,Australia,78.0
1,8,Australia,78.0
2,8,Australian Capital Territory,
3,8,Australian Capital Territory,565.0
4,8,New South Wales,-6330.0


In [22]:
# replace state_id names with keys from region_df
key_list = list(average_earnings_df['state_id'])
dict_lookup = dict(zip(new_test_region_df['state_name'], new_test_region_df['state_id']))
average_earnings_df['state_id'] = [dict_lookup[item] for item in key_list]
average_earnings_df.head()

Unnamed: 0,quarter_id,state_id,avg_earnings
0,8,303,78.0
1,8,303,78.0
2,8,553,
3,8,553,565.0
4,8,2854,-6330.0


In [23]:
# replace NaN with 0
average_earnings_df['avg_earnings'] = average_earnings_df['avg_earnings'].fillna(0)
average_earnings_df.head()

Unnamed: 0,quarter_id,state_id,avg_earnings
0,8,303,78.0
1,8,303,78.0
2,8,553,0.0
3,8,553,565.0
4,8,2854,-6330.0


In [24]:
# reset index with final data 
average_earnings_df_final = average_earnings_df.reset_index(drop=True)
average_earnings_df_final

Unnamed: 0,quarter_id,state_id,avg_earnings
0,8,303,78.0
1,8,303,78.0
2,8,553,0.0
3,8,553,565.0
4,8,2854,-6330.0
...,...,...,...
3110,3114,2273,1338.0
3111,3114,1287,33373.0
3112,3114,1287,-3350.0
3113,3114,3114,2375.0


------------

# Internal_Net_Change Table

<!-- ![states_table.png](attachment:internal_net_change_t.png) -->
<img src="../Images/internal_net_change_t.png" 
     align="left" 
     width="250" />

This one will need to filter values in MEASURE to  Net Internal Migration before copy

In [25]:
erp_data_filtered = erp_data_df[erp_data_df['MEASURE: Measure'] == ' Net Internal Migration']

In [26]:
# Copy column to new DF
internal_net_change_df = erp_data_filtered[['TIME_PERIOD: Time Period', 'REGION: Region', 'OBS_VALUE']].copy()
internal_net_change_df

Unnamed: 0,TIME_PERIOD: Time Period,REGION: Region,OBS_VALUE
3,1981-Q2,Australian Capital Territory,565.0
4,1981-Q2,New South Wales,-6330.0
6,1981-Q2,Northern Territory,1806.0
8,1981-Q2,Queensland,8558.0
11,1981-Q2,South Australia,-1675.0
...,...,...,...
3105,2022-Q1,Queensland,11071.0
3108,2022-Q1,South Australia,432.0
3109,2022-Q1,Tasmania,156.0
3112,2022-Q1,Victoria,-3350.0


In [27]:
# Change column headers to match shemata
internal_net_change_df.rename(columns = {'REGION: Region':'state_id', 'TIME_PERIOD: Time Period': 'quarter_id', 'OBS_VALUE':'net_change'}, inplace=True)
internal_net_change_df.head()

Unnamed: 0,quarter_id,state_id,net_change
3,1981-Q2,Australian Capital Territory,565.0
4,1981-Q2,New South Wales,-6330.0
6,1981-Q2,Northern Territory,1806.0
8,1981-Q2,Queensland,8558.0
11,1981-Q2,South Australia,-1675.0


In [28]:
# replace quarter_id names with keys from quarter_df
key_list = list(internal_net_change_df['quarter_id'])
dict_lookup = dict(zip(new_test_quarter_df['date_quarter'], new_test_quarter_df['quarter_id']))
internal_net_change_df['quarter_id'] = [dict_lookup[item] for item in key_list]
internal_net_change_df.head()

Unnamed: 0,quarter_id,state_id,net_change
3,8,Australian Capital Territory,565.0
4,8,New South Wales,-6330.0
6,8,Northern Territory,1806.0
8,8,Queensland,8558.0
11,8,South Australia,-1675.0


In [29]:
# replace state_id names with keys from region_df
key_list = list(internal_net_change_df['state_id'])
dict_lookup = dict(zip(new_test_region_df['state_name'], new_test_region_df['state_id']))
internal_net_change_df['state_id'] = [dict_lookup[item] for item in key_list]
internal_net_change_df.head()

Unnamed: 0,quarter_id,state_id,net_change
3,8,553,565.0
4,8,2854,-6330.0
6,8,747,1806.0
8,8,2326,8558.0
11,8,2841,-1675.0


In [30]:
# replace NaN with 0
internal_net_change_df['net_change'] = internal_net_change_df['net_change'].fillna(0)
internal_net_change_df.head()

Unnamed: 0,quarter_id,state_id,net_change
3,8,553,565.0
4,8,2854,-6330.0
6,8,747,1806.0
8,8,2326,8558.0
11,8,2841,-1675.0


In [31]:
# reset index with final data 
internal_net_change_final = internal_net_change_df.reset_index(drop=True)
internal_net_change_final

Unnamed: 0,quarter_id,state_id,net_change
0,8,553,565.0
1,8,2854,-6330.0
2,8,747,1806.0
3,8,2326,8558.0
4,8,2841,-1675.0
...,...,...,...
1307,3114,2326,11071.0
1308,3114,2841,432.0
1309,3114,2273,156.0
1310,3114,1287,-3350.0


----------

# Internal_Arrivals Table

<!-- ![states_table.png](attachment:internal_net_change_t.png) -->
<img src="../Images/internal_arrivals_t.png" 
     align="left" 
     width="250" />

In [32]:
erp_data_filtered = erp_data_df[erp_data_df['MEASURE: Measure'] == ' Internal Arrivals']

In [33]:
# Copy column to new DF
internal_arrivals_df = erp_data_filtered[['TIME_PERIOD: Time Period', 'REGION: Region', 'OBS_VALUE']].copy()
internal_arrivals_df

Unnamed: 0,TIME_PERIOD: Time Period,REGION: Region,OBS_VALUE
0,1981-Q2,Australia,78.0
19,1981-Q3,Australia,76.6
38,1981-Q4,Australia,65.9
57,1982-Q1,Australia,70.3
76,1982-Q2,Australia,81.7
...,...,...,...
3021,2021-Q1,Australia,94.7
3040,2021-Q2,Australia,121.7
3059,2021-Q3,Australia,145.6
3078,2021-Q4,Australia,121.9


In [34]:
# Change column headers to match shemata
internal_arrivals_df.rename(columns = {'REGION: Region':'state_id', 'TIME_PERIOD: Time Period': 'quarter_id', 'OBS_VALUE':'net_arrivals'}, inplace=True)
internal_arrivals_df.head()

Unnamed: 0,quarter_id,state_id,net_arrivals
0,1981-Q2,Australia,78.0
19,1981-Q3,Australia,76.6
38,1981-Q4,Australia,65.9
57,1982-Q1,Australia,70.3
76,1982-Q2,Australia,81.7


In [35]:
# replace quarter_id names with keys from quarter_df
key_list = list(internal_arrivals_df['quarter_id'])
dict_lookup = dict(zip(new_test_quarter_df['date_quarter'], new_test_quarter_df['quarter_id']))
internal_arrivals_df['quarter_id'] = [dict_lookup[item] for item in key_list]
internal_arrivals_df.head()

Unnamed: 0,quarter_id,state_id,net_arrivals
0,8,Australia,78.0
19,25,Australia,76.6
38,45,Australia,65.9
57,59,Australia,70.3
76,83,Australia,81.7


In [36]:
# replace state_id names with keys from region_df
key_list = list(internal_arrivals_df['state_id'])
dict_lookup = dict(zip(new_test_region_df['state_name'], new_test_region_df['state_id']))
internal_arrivals_df['state_id'] = [dict_lookup[item] for item in key_list]
internal_arrivals_df.head()

Unnamed: 0,quarter_id,state_id,net_arrivals
0,8,303,78.0
19,25,303,76.6
38,45,303,65.9
57,59,303,70.3
76,83,303,81.7


In [37]:
# replace NaN with 0
internal_arrivals_df['net_arrivals'] = internal_arrivals_df['net_arrivals'].fillna(0)
internal_arrivals_df.head()

Unnamed: 0,quarter_id,state_id,net_arrivals
0,8,303,78.0
19,25,303,76.6
38,45,303,65.9
57,59,303,70.3
76,83,303,81.7


In [38]:
# reset index with final data 
internal_arrivals_final = internal_arrivals_df.reset_index(drop=True)
internal_arrivals_final

Unnamed: 0,quarter_id,state_id,net_arrivals
0,8,303,78.0
1,25,303,76.6
2,45,303,65.9
3,59,303,70.3
4,83,303,81.7
...,...,...,...
159,3023,303,94.7
160,3047,303,121.7
161,3067,303,145.6
162,3085,303,121.9


---------------------------

# Internal_Departures Table

<!-- ![states_table.png](attachment:internal_net_change_t.png) -->
<img src="../Images/internal_departures_t.png" 
     align="left" 
     width="250" />

In [39]:
erp_data_filtered = erp_data_df[erp_data_df['MEASURE: Measure'] == ' Internal Departures']

In [40]:
# Copy column to new DF
internal_departures_df = erp_data_filtered[['TIME_PERIOD: Time Period', 'REGION: Region', 'OBS_VALUE']].copy()
internal_departures_df

Unnamed: 0,TIME_PERIOD: Time Period,REGION: Region,OBS_VALUE
1,1981-Q2,Australia,78.0
20,1981-Q3,Australia,76.6
39,1981-Q4,Australia,65.9
58,1982-Q1,Australia,70.3
77,1982-Q2,Australia,81.7
...,...,...,...
3022,2021-Q1,Australia,94.7
3041,2021-Q2,Australia,121.7
3060,2021-Q3,Australia,145.6
3079,2021-Q4,Australia,121.9


In [41]:
# Change column headers to match shemata
internal_departures_df.rename(columns = {'REGION: Region':'state_id', 'TIME_PERIOD: Time Period': 'quarter_id', 'OBS_VALUE':'net_departures'}, inplace=True)
internal_departures_df.head()

Unnamed: 0,quarter_id,state_id,net_departures
1,1981-Q2,Australia,78.0
20,1981-Q3,Australia,76.6
39,1981-Q4,Australia,65.9
58,1982-Q1,Australia,70.3
77,1982-Q2,Australia,81.7


In [42]:
# replace quarter_id names with keys from quarter_df
key_list = list(internal_departures_df['quarter_id'])
dict_lookup = dict(zip(new_test_quarter_df['date_quarter'], new_test_quarter_df['quarter_id']))
internal_departures_df['quarter_id'] = [dict_lookup[item] for item in key_list]
internal_departures_df.head()

Unnamed: 0,quarter_id,state_id,net_departures
1,8,Australia,78.0
20,25,Australia,76.6
39,45,Australia,65.9
58,59,Australia,70.3
77,83,Australia,81.7


In [43]:
# replace state_id names with keys from region_df
key_list = list(internal_departures_df['state_id'])
dict_lookup = dict(zip(new_test_region_df['state_name'], new_test_region_df['state_id']))
internal_departures_df['state_id'] = [dict_lookup[item] for item in key_list]
internal_departures_df.head()

Unnamed: 0,quarter_id,state_id,net_departures
1,8,303,78.0
20,25,303,76.6
39,45,303,65.9
58,59,303,70.3
77,83,303,81.7


In [44]:
# replace NaN with 0
internal_departures_df['net_departures'] = internal_departures_df['net_departures'].fillna(0)
internal_departures_df.head()

Unnamed: 0,quarter_id,state_id,net_departures
1,8,303,78.0
20,25,303,76.6
39,45,303,65.9
58,59,303,70.3
77,83,303,81.7


In [45]:
# reset index with final data 
internal_departures_final = internal_departures_df.reset_index(drop=True)
internal_departures_final

Unnamed: 0,quarter_id,state_id,net_departures
0,8,303,78.0
1,25,303,76.6
2,45,303,65.9
3,59,303,70.3
4,83,303,81.7
...,...,...,...
159,3023,303,94.7
160,3047,303,121.7
161,3067,303,145.6
162,3085,303,121.9


---------------------------

# Analysis Table

<!-- ![states_table.png](attachment:internal_net_change_t.png) -->
<img src="../Images/analysis_t.png" 
     align="left" 
     width="250" />

---------------------------

-----------------

---------------------

### Reset Index's of Finished Tables

In [None]:
# reset index with final data QUARTERS
new_test_quarter_df.reset_index(drop=True)
new_test_quarter_df.head()

In [None]:
# reset index with final data STATES
new_test_region_df.reset_index(drop=True)
new_test_region_df.head()

In [None]:
population_df.reset_index(drop=True)
population_df.tail()

In [None]:
# Test Output - DELETE
# population_df.to_csv('../jon_clean_test.csv')

-----

# Export to Postgres To Go Below