### Import Dependencies

In [1]:
import os
import pandas as pd
from sqlalchemy import create_engine
from sqlalchemy import inspect

--------------------------------

# ABS_ERP_COMP

### Load CSV

In [2]:
csv_file = os.path.join('..', 'Resources', 'ABS_ERP_COMP_CLEAN.csv')
# csv_file = "Resources/ABS_ERP_COMP_CLEAN.csv"

erp_data_df = pd.read_csv(csv_file)
erp_data_df.head()

Unnamed: 0,DATAFLOW,MEASURE: Measure,REGION: Region,FREQ: Frequency,TIME_PERIOD: Time Period,OBS_VALUE,UNIT_MEASURE: Unit of Measure,UNIT_MULT: Unit of Multiplier,OBS_STATUS: Observation Status,OBS_COMMENT: Observation Comment
0,ABS:ERP_COMP_Q(1.0.0),Net Internal Migration,Western Australia,Q: Quarterly,1981-Q2,680.0,NUM: Number,0: Units,,
1,ABS:ERP_COMP_Q(1.0.0),Net Internal Migration,Western Australia,Q: Quarterly,1981-Q3,2002.0,NUM: Number,0: Units,,
2,ABS:ERP_COMP_Q(1.0.0),Net Internal Migration,Western Australia,Q: Quarterly,1981-Q4,441.0,NUM: Number,0: Units,,
3,ABS:ERP_COMP_Q(1.0.0),Net Internal Migration,Western Australia,Q: Quarterly,1982-Q1,494.0,NUM: Number,0: Units,,
4,ABS:ERP_COMP_Q(1.0.0),Net Internal Migration,Western Australia,Q: Quarterly,1982-Q2,621.0,NUM: Number,0: Units,,


----------------------------------------

### Copy Region Colum to New DF - For Table 'States'

In [3]:
# Copy column to new DF
new_test_region_df = erp_data_df[['REGION: Region']].copy()
new_test_region_df.head()

Unnamed: 0,REGION: Region
0,Western Australia
1,Western Australia
2,Western Australia
3,Western Australia
4,Western Australia


In [4]:
# Drop duplicates
new_test_region_df = new_test_region_df.sort_values('REGION: Region').drop_duplicates('REGION: Region', keep='last')

In [39]:
# Create Index Column with Unique Values
new_test_region_df.reset_index(inplace=True)
new_test_region_df

Unnamed: 0,index,state_id,state_name
0,0,214,Australia
1,1,1420,Australian Capital Territory
2,2,1309,New South Wales
3,3,2838,Northern Territory
4,4,2292,Queensland
5,5,868,South Australia
6,6,983,Tasmania
7,7,2456,Victoria
8,8,0,Western Australia


In [6]:
# Rename Column Headers to match Schemata
new_test_region_df.rename(columns = {'index':'state_id', 'REGION: Region': 'state_name'}, inplace=True)
new_test_region_df

Unnamed: 0,state_id,state_name
0,214,Australia
1,1420,Australian Capital Territory
2,1309,New South Wales
3,2838,Northern Territory
4,2292,Queensland
5,868,South Australia
6,983,Tasmania
7,2456,Victoria
8,0,Western Australia


### Copy Time Period Colum to New DF - For Table 'Quarters'

In [7]:
# Copy column to new DF
new_test_quarter_df = erp_data_df[['TIME_PERIOD: Time Period']].copy()
new_test_quarter_df.head()

Unnamed: 0,TIME_PERIOD: Time Period
0,1981-Q2
1,1981-Q3
2,1981-Q4
3,1982-Q1
4,1982-Q2


In [8]:
# Drop duplicates
new_test_quarter_df = new_test_quarter_df.sort_values('TIME_PERIOD: Time Period').drop_duplicates('TIME_PERIOD: Time Period', keep='last')

In [9]:
# Create Index Column with Unique Values
new_test_quarter_df.reset_index(level=0, inplace=True)
new_test_quarter_df

Unnamed: 0,index,TIME_PERIOD: Time Period
0,1639,1981-Q2
1,2788,1981-Q3
2,1969,1981-Q4
3,1314,1982-Q1
4,2955,1982-Q2
...,...,...
159,322,2021-Q1
160,2947,2021-Q2
161,324,2021-Q3
162,2293,2021-Q4


In [10]:
# Rename Column Headers to match Schemata
new_test_quarter_df.rename(columns = {'index':'quarter_id', 'TIME_PERIOD: Time Period': 'date_quarter'}, inplace=True)
new_test_quarter_df

Unnamed: 0,quarter_id,date_quarter
0,1639,1981-Q2
1,2788,1981-Q3
2,1969,1981-Q4
3,1314,1982-Q1
4,2955,1982-Q2
...,...,...
159,322,2021-Q1
160,2947,2021-Q2
161,324,2021-Q3
162,2293,2021-Q4


### Population_Change Table

In [11]:
# Copy column to new DF
population_df = erp_data_df[['TIME_PERIOD: Time Period', 'REGION: Region', 'OBS_VALUE']].copy()
population_df.head()

Unnamed: 0,TIME_PERIOD: Time Period,REGION: Region,OBS_VALUE
0,1981-Q2,Western Australia,680.0
1,1981-Q3,Western Australia,2002.0
2,1981-Q4,Western Australia,441.0
3,1982-Q1,Western Australia,494.0
4,1982-Q2,Western Australia,621.0


In [12]:
# Change column headers to match shemata
population_df.rename(columns = {'REGION: Region':'state_id', 'TIME_PERIOD: Time Period': 'quarter_id', 'OBS_VALUE':'net_change'}, inplace=True)
population_df

Unnamed: 0,quarter_id,state_id,net_change
0,1981-Q2,Western Australia,680.0
1,1981-Q3,Western Australia,2002.0
2,1981-Q4,Western Australia,441.0
3,1982-Q1,Western Australia,494.0
4,1982-Q2,Western Australia,621.0
...,...,...,...
3110,2021-Q1,Tasmania,1529.0
3111,2021-Q2,Tasmania,823.0
3112,2021-Q3,Tasmania,-197.0
3113,2021-Q4,Tasmania,2115.0


In [13]:
# Create Index Column with Unique Values
new_test_quarter_df.reset_index(level=0, inplace=True)
new_test_quarter_df

Unnamed: 0,index,quarter_id,date_quarter
0,0,1639,1981-Q2
1,1,2788,1981-Q3
2,2,1969,1981-Q4
3,3,1314,1982-Q1
4,4,2955,1982-Q2
...,...,...,...
159,159,322,2021-Q1
160,160,2947,2021-Q2
161,161,324,2021-Q3
162,162,2293,2021-Q4


In [14]:
new_test_quarter_df.dtypes

index            int64
quarter_id       int64
date_quarter    object
dtype: object

In [15]:
population_df.dtypes

quarter_id     object
state_id       object
net_change    float64
dtype: object

In [16]:
# # convert_dict = {'index': object,
#                 'quarter_id': object 
#                 }
 
# new_test_quarter_df = new_test_quarter_df.astype(convert_dict)

In [41]:

# Loop through the values in df1
for index, row in new_test_region_df.iterrows():
    # Check if the value in column 1 of df1 is present in column 1 of df2
    if row['state_name'] in population_df['state_id'].values:
        # Get the index of the matching row in df2
        index2 = population_df[population_df['state_id'] == row['state_name']].index[-1]
        # Replace the value in column 3 of df2 with the value in column 3 of df1
        population_df.at[index2, 'state_id'] = row['state_id']

In [42]:

# Loop through the values in df1
for index, row in new_test_quarter_df.iterrows():
    # Check if the value in column 1 of df1 is present in column 1 of df2
    if row['date_quarter'] in population_df['quarter_id'].values:
        # Get the index of the matching row in df2
        index2 = population_df[population_df['quarter_id'] == row['date_quarter']].index[1]
        # Replace the value in column 3 of df2 with the value in column 3 of df1
        population_df.at[index2, 'quarter_id'] = row['quarter_id']

In [43]:
new_test_quarter_df

Unnamed: 0,index,quarter_id,date_quarter
0,0,1639,1981-Q2
1,1,2788,1981-Q3
2,2,1969,1981-Q4
3,3,1314,1982-Q1
4,4,2955,1982-Q2
...,...,...,...
159,159,322,2021-Q1
160,160,2947,2021-Q2
161,161,324,2021-Q3
162,162,2293,2021-Q4


In [46]:
population_df.tail(20)

Unnamed: 0,quarter_id,state_id,net_change
3095,2017-Q2,Tasmania,2543.0
3096,2017-Q3,Tasmania,2877.0
3097,2017-Q4,Tasmania,2628.0
3098,2018-Q1,Tasmania,3408.0
3099,2018-Q2,Tasmania,2875.0
3100,2018-Q3,Tasmania,3364.0
3101,2018-Q4,Tasmania,3107.0
3102,2019-Q1,Tasmania,3282.0
3103,2019-Q2,Tasmania,2445.0
3104,2019-Q3,Tasmania,2618.0


In [47]:
# Test Output - DELETE
population_df.to_csv('../jon_clean_test.csv')