## Init

In [1]:
import re
import numpy as np
import pandas as pd
import plotly.express as px

## Import Data

In [2]:
# Read the three raw labor extracts. In each file the first six CSV columns form the
# row MultiIndex and the first row supplies the column labels.
labor93 = pd.read_csv(
    "./data/KRLabor1.csv",
    index_col=[0, 1, 2, 3, 4, 5],
    header=[0],
)
labor00 = pd.read_csv(
    "./data/KRLabor2.csv",
    index_col=[0, 1, 2, 3, 4, 5],
    header=[0],
)
labor09 = pd.read_csv(
    "./data/KRLabor3.csv",
    index_col=[0, 1, 2, 3, 4, 5],
    header=[0],
)

In [3]:
# Load the GDP deflator table: skip the first 26 lines of the file, read the first
# column as text, relabel the two columns and index the table by 'Year'.
gdp_deflator = (
    pd.read_csv('./data/KRGDPDeflator.csv', skiprows=26, dtype={0:'object'})
    .set_axis(['Year', 'GDPDeflator'], axis=1)
    .set_index('Year')
)


## Tidy Data

In [4]:
# Reshape the first labor extract: years move from the columns into the row index and
# each 'Item' becomes its own column.
labor93 = \
(labor93
 # columns and format: strip every non-digit character from the column labels
 .rename(columns=lambda x: re.sub('[^0-9]', "", x))
 .droplevel('UNIT')
 .unstack('Item')
 # put 'Item' above 'Year' in the column MultiIndex, then move 'Year' into the rows
 .reorder_levels([1,0], axis=1)
 .sort_index(axis=1)
 .rename_axis(['Item', 'Year'], axis=1)
 .stack('Year')
 # drop rows whose Industry, Education, Sex or Age label is "Total"
 .query('Industry != "Total" &\
         Education != "Total" &\
         Sex != "Total" &\
         Age != "Total"')
 # tidy: remove spaces from the item column names and make 'Year' the first index level
 .rename(columns=lambda x: x.replace(" ", ""))
 .reorder_levels([4,0,1,2,3])
 # axis is passed by keyword: sort_index arguments are keyword-only in pandas 2.x,
 # so the former positional `.sort_index(0)` raises TypeError there
 .sort_index(axis=0)
)

In [5]:
# Reshape the second labor extract: years move from the columns into the row index and
# each 'Item' becomes its own column.
labor00 =\
(labor00
 # columns and format: strip every non-digit character from the column labels
 .rename(columns=lambda x: re.sub('[^0-9]', "", x))
 .droplevel('UNIT')
 .unstack('Item')
 # put 'Item' above 'Year' in the column MultiIndex, then move 'Year' into the rows
 .reorder_levels([1,0], axis=1)
 .sort_index(axis=1)
 .rename_axis(['Item', 'Year'], axis=1)
 .stack('Year')
 # no total in the data
 # tidy: remove spaces from the item column names and make 'Year' the first index level
 .rename(columns=lambda x: x.replace(" ", ""))
 .reorder_levels([4,0,1,2,3])
 # axis is passed by keyword: sort_index arguments are keyword-only in pandas 2.x,
 # so the former positional `.sort_index(0)` raises TypeError there
 .sort_index(axis=0)
)

In [6]:
def change_colnames(df, colnames):
    """Return ``df`` relabelled so that its columns are named ``colnames``.

    Written for use inside a method chain via ``DataFrame.pipe``. The frame passed in
    is left untouched; the relabelled frame is returned as a new object.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose columns should be relabelled.
    colnames : list-like
        New column labels, one per existing column, in column order.

    Returns
    -------
    pandas.DataFrame
        ``df`` with ``colnames`` as its column labels.

    Raises
    ------
    ValueError
        If ``colnames`` does not have exactly one label per column.
    """
    # set_axis returns a relabelled object instead of assigning to df.columns in place,
    # so the caller's frame is never modified as a side effect.
    return df.set_axis(colnames, axis=1)

# Reshape the third labor extract and harmonise its names, dtypes and wage scale with
# the other two extracts.
labor09 =\
(labor09
 # columns and format: strip every non-digit character from the column labels
 .rename(columns=lambda x: re.sub('[^0-9]', "", x))
 .droplevel('UNIT')
 .unstack('Item')
 # put 'Item' above 'Year' in the column MultiIndex, then move 'Year' into the rows
 .reorder_levels([1,0], axis=1)
 .sort_index(axis=1)
 .rename_axis(['Item', 'Year'], axis=1)
 .stack('Year')
 #tidy and harmonise with other dataframes
 # NOTE(review): the relabelling is positional, so it relies on the item columns
 # arriving in exactly this order after the sort above - confirm against the raw file
 .pipe(change_colnames, ['Employment', 'MonthlyTotalwage', 'Totalhoursworked'])
 .rename_axis(index={'INDUSTRY_9S':'Industry'})
 # "-" cells become missing values; np.nan is used because the np.NaN alias
 # was removed in NumPy 2.0
 .replace("-", np.nan)
 .astype('float')
 # NOTE(review): presumably converts the wage to the unit used by the other extracts
 # (factor 10000) - confirm against the data documentation
 .assign(MonthlyTotalwage=lambda x: x.MonthlyTotalwage*10000)
 .rename(columns=lambda x: x.replace(" ", ""))
 .reorder_levels([4,0,1,2,3])
 # axis is passed by keyword: sort_index arguments are keyword-only in pandas 2.x,
 # so the former positional `.sort_index(0)` raises TypeError there
 .sort_index(axis=0)
)

In [7]:
# Stack the three tidied extracts row-wise into one frame and sort its row index.
labor = (
    pd.concat(
        [labor09, labor00, labor93],
        axis=0,
    )
    .sort_index()
)

## Transform Data

In [8]:
# Aggregate the cell-level data to one row per (Education, Sex, Age, Year):
#   W  - mean over the grouped rows of rounded(MonthlyTotalwage/1000) / Totalhoursworked
#   H  - sum over the grouped rows of Employment * Totalhoursworked
#   TW - W * H
# and attach the GDP deflator.
labor_agg =\
(labor
 #filter: exclude the agriculture, hunting and forestry industry
 .query('Industry != "Agriculture,hunting and forestry(01-02)"')
 # "-" cells become missing values; np.nan is used because the np.NaN alias
 # was removed in NumPy 2.0
 .replace("-", np.nan)
 .astype('float')
 #mutate
 .assign(W=lambda x: (x.MonthlyTotalwage/1000).round(0)/x.Totalhoursworked,
         H=lambda x: x.Employment*x.Totalhoursworked)
 #aggregate
 .groupby(['Education', 'Sex', 'Age', 'Year'])
 .agg({'W':'mean',
       'H': 'sum'})
 .assign(TW=lambda x: x.W*x.H)
 .join(gdp_deflator)
)

In [9]:
# merge and create new columns
# Row-to-row change in log W within each (Education, Sex, Age) group.
dlnw =\
(labor_agg
 .assign(lnW=lambda x: x.W.apply(np.log))
 .groupby(['Education', 'Sex', 'Age'])
 ['lnW']
 .diff()
 .to_frame()
 .rename(columns={'lnW':'dlnW'})
 # log(0) is -inf, so a difference can be +inf or -inf; treat both as missing
 # (np.nan is used because the np.NaN alias was removed in NumPy 2.0)
 .replace([np.inf, -np.inf], np.nan)
)

# Sum of TW over all rows of each year; used as the denominator of the weights.
tw_yr = labor_agg['TW'].groupby('Year').sum()

# Row-to-row change in the log of the GDP deflator, stored in a column named 'Pi'.
log_to_pi = {'GDPDeflator': 'Pi'}
pi = np.log(gdp_deflator).diff().rename(columns=log_to_pi)
del log_to_pi

# Row-to-row change in log H within each (Education, Sex, Age) group.
dlnH =\
(labor_agg
 ['H']
 .apply(np.log)
 .groupby(['Education', 'Sex', 'Age'])
 .diff()
 .to_frame()
 .rename(columns={'H':'dlnH'})
 # log(0) is -inf, so a difference can be +inf or -inf; treat both as missing
 # (np.nan is used because the np.NaN alias was removed in NumPy 2.0)
 .replace([np.inf, -np.inf], np.nan)
)

# Per-year sum of Weight * (dlnW - Pi), where Weight is each row's TW divided by
# that year's total TW.
# NOTE(review): 1993, 2000 and 2009 are dropped, presumably because each is the first
# year of one source extract so its difference is undefined or spans a break - confirm.
wdlnw = (
    labor_agg
    .assign(Weight=lambda frame: frame.TW/tw_yr)
    .join(dlnw)
    .join(pi)
    .assign(wdlnw=lambda frame: frame.Weight*(frame.dlnW-frame.Pi))
    .groupby('Year')[['wdlnw']]
    .sum()
    .drop(['1993','2000','2009'])
)

# Per-year sum of Weight * dlnH, where Weight is each row's TW divided by that
# year's total TW.
# NOTE(review): 1993, 2000 and 2009 are dropped, presumably because each is the first
# year of one source extract so its difference is undefined or spans a break - confirm.
wdlnh = (
    labor_agg
    .assign(Weight=lambda frame: frame.TW/tw_yr)
    .join(dlnH)
    .assign(wdlnH=lambda frame: frame.Weight*frame.dlnH)
    .groupby('Year')[['wdlnH']]
    .sum()
    .drop(['1993','2000','2009'])
)

  out_arr[res_indexer] = arr[res_indexer] - arr[lag_indexer]


In [10]:
# Persist both weighted series as CSV files in the data directory
# (the two writes are independent of each other).
wdlnw.to_csv(
    './data/KRWdlnw.csv'
)
wdlnh.to_csv(
    './data/KRWdlnH.csv'
)

In [11]:
# Line chart with one line per series: the two weighted series are joined on Year and
# melted to long format (columns 'Year', 'variable', 'value') before plotting.
plt = px.line(
    wdlnw.join(wdlnh).reset_index().melt(id_vars='Year'),
    x='Year',
    y='value',
    color='variable',
)


In [12]:
# Save the figure as an HTML file in the output directory.
plt.write_html(
    file='./output/WeightedPercentageChangeinWageandManHour.html',
)

## Long Term

In [39]:

# Endpoints of the long-term comparison (year labels are strings in the index).
start_year = '2009'
end_year = '2019'

# Percentage change of every labor_agg column between the two endpoint years,
# computed within each (Education, Sex, Age) group; columns get a "dpct" prefix.
dpctagg = (
    labor_agg
    .query(f'Year == "{start_year}" | Year == "{end_year}"')
    .groupby(['Education', 'Sex', 'Age'])
    .pct_change()
    .rename(columns=lambda col: "dpct" + col)
)

# Total TW in each of the two endpoint years.
yr_agg = (
    labor_agg
    .query(f'Year == "{start_year}" | Year == "{end_year}"')
    ['TW']
    .groupby('Year')
    .sum()
)

# Per-year sum of Weight * (1 + dpctW) / (1 + dpctGDPDeflator), where Weight is each
# row's TW divided by that year's total TW.
dual = (
    labor_agg
    .query(f'Year == "{start_year}" | Year == "{end_year}"')
    .join(dpctagg)
    .assign(
        Weight=lambda frame: frame.TW/yr_agg,
        Wdpctw=lambda frame: frame.Weight*((1+frame.dpctW)/(1+frame.dpctGDPDeflator)),
    )
    .groupby('Year')
    .Wdpctw
    .sum()
)