In [114]:
# dependencies
import pandas as pd
import numpy as np
from datetime import datetime


In [115]:
# Location of the source Excel workbook, given as a path relative to the working directory.

file_path = 'Resources/underemployed_data.xlsx'


In [117]:
# 
# Load the 'Data2' worksheet; the first, unnamed column is handed to
# parse_dates and is given a descriptive name straight afterwards.
dateCols = ['Unnamed: 0']
raw_sheet = pd.read_excel(
    file_path,
    sheet_name='Data2',
    engine='openpyxl',
    parse_dates=dateCols,
)

# Rename the date column and discard the rows labelled 0 through 8.
# NOTE(review): presumably those nine rows are series metadata rather than
# observations — confirm against the workbook.
df = (
    raw_sheet
    .rename(columns={'Unnamed: 0': 'datetime_col'})
    .drop(index=list(range(9)))
)


# Keep only the rows whose date falls inside the analysis window
# (both endpoints are included).
# NOTE(review): the bounds are plain strings, so how they compare depends on
# the dtype of 'datetime_col' — confirm it is what you expect.
start_date = '2015-01-01 00:00:00'
end_date = '2020-12-01 00:00:00'
in_window = df['datetime_col'].between(start_date, end_date)
time_range_df = df.loc[in_window]

# time_range_df.columns

# Step 1: keep the columns whose header contains 'Underemployment rate'.
rate_columns = time_range_df.filter(like='Underemployment rate')

# Step 2: of those, keep the headers containing ';.1'.
# NOTE(review): the original comment says this picks the "Seasonally Adjusted"
# series type — confirm against the workbook's column layout.
adjusted_columns = rate_columns.filter(like=';.1', axis=1)

# Step 3: remove the columns sitting at positions 9 through 26.
surplus_labels = adjusted_columns.columns[9:27]
underemployed_df = adjusted_columns.drop(columns=surplus_labels)



# Give the first nine columns short labels.
# The labels are applied by assigning a fresh list to `.columns` instead of
# writing into `columns.values[i]`: a pandas Index is meant to be immutable, and
# in-place writes to its backing array bypass the public API and can leave
# cached label lookups out of date.
region_names = ['Total', 'NSW', 'VIC', 'QLD', 'SA', 'WA', 'TAS', 'NT', 'ACT']

# Any columns beyond the ninth keep their existing labels. If the frame has
# fewer than nine columns the assignment below raises a length-mismatch
# ValueError rather than silently mislabelling.
relabelled = list(underemployed_df.columns)
relabelled[:len(region_names)] = region_names
underemployed_df.columns = relabelled

# Build a two-column frame: the date column plus the calendar year derived from it.
date_data = time_range_df['datetime_col']
date_data_df = date_data.to_frame().assign(
    Year=pd.to_datetime(date_data).dt.year
)

# Place the date/year columns beside the renamed rate columns (aligned on the
# shared row index), then use 'Year' as the index of the final table.
underemployed_cleaned_df = pd.concat(
    [date_data_df, underemployed_df],
    axis=1,
).set_index('Year')

# Show the result as the cell output.
underemployed_cleaned_df

Unnamed: 0_level_0,datetime_col,Total,NSW,VIC,QLD,SA,WA,TAS,NT,ACT
Year,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
2015,2015-01-01 00:00:00,8.52206,7.76488,9.54355,9.09478,8.84669,7.91932,10.5126,4.05151,6.2067
2015,2015-02-01 00:00:00,8.53424,8.00674,9.04442,9.01985,9.19244,7.42915,10.6901,4.2798,6.87868
2015,2015-03-01 00:00:00,8.1567,7.85422,8.53502,8.68641,8.37027,7.01305,10.2365,5.31318,6.92977
2015,2015-04-01 00:00:00,8.32123,8.27281,8.09019,8.9182,8.66695,7.20859,10.5346,4.05161,6.37993
2015,2015-05-01 00:00:00,8.38091,8.18829,8.7666,8.50894,9.75677,7.52186,10.8981,4.26351,5.80763
...,...,...,...,...,...,...,...,...,...,...
2020,2020-08-01 00:00:00,11.274,10.3138,14.5103,9.96735,11.7359,9.93334,12.2911,7.65649,5.63856
2020,2020-09-01 00:00:00,11.4021,10.1931,14.9416,10.6801,11.0275,9.44405,10.7413,5.9622,6.13643
2020,2020-10-01 00:00:00,10.3631,9.73728,12.919,9.47834,10.0898,8.63401,10.4769,7.76838,7.37123
2020,2020-11-01 00:00:00,9.29105,8.8311,10.2063,9.35109,10.6315,7.93988,10.1187,5.9385,5.911


In [111]:
# Write the cleaned table to a CSV file next to the notebook.
# NOTE(review): index=False leaves the frame's index out of the file; if the
# index holds the 'Year' labels, they are not written — confirm that is intended.
output_csv = './underemployed_cleaned_df.csv'
underemployed_cleaned_df.to_csv(output_csv, index=False)