In [1]:
import re
from pathlib import Path

import numpy as np
import pandas as pd

from functions import *

%load_ext autoreload
%autoreload 2

In [2]:
# Folder holding the raw thesis data (local Google Drive sync location).
path = '/Users/johan/Library/CloudStorage/GoogleDrive-johan.oelgaard@gmail.com/My Drive/04 Økonomi/10 Thesis/Data'

# Oxford Economics workbook: file name first, then the frame read from it
# (read_excel loads the first sheet by default).
oxford = 'oxford_economics.xlsx'
oxford_df = pd.read_excel(f'{path}/{oxford}')

In [3]:
# industry data
# Drop the last 11 columns of the raw sheet and take an explicit copy, so the
# column assignments below modify an independent frame rather than a slice of
# `oxford_df` (assigning to the slice raises SettingWithCopyWarning on pandas < 3).
oxford = oxford_df.iloc[:, :-11].copy()

# Extract the NACE codes for every row from the columns listed in `nace_cols`.
# NOTE(review): `extract_nace` comes from functions.py; the next cell explodes
# this column, so it presumably returns a list of codes per row - confirm.
nace_cols = ['Indicator', 'Sector', 'Nace code']
oxford['NACE'] = oxford.apply(lambda row: extract_nace(row, nace_cols), axis=1)

# Normalise 'Sector': strip commas (literal match, not a regex) and lower-case.
oxford['Sector'] = oxford['Sector'].str.replace(',', '', regex=False).str.lower()

# One frame per location, each with a fresh 0..n-1 index.
# reset_index already returns a new frame, so no extra .copy() is needed.
dk = oxford.loc[oxford['Location'] == 'Denmark'].reset_index(drop=True)
eu = oxford.loc[oxford['Location'] == 'Europe'].reset_index(drop=True)
world = oxford.loc[oxford['Location'] == 'World'].reset_index(drop=True)

In [4]:
# danish data
# Reshape the Danish rows into one row per (NACE industry, quarter) with one
# column per indicator, then write the result to data/dk_industry.csv.
#
# NOTE: this cell rebinds `dk` and drops its 'NACE' column, so it cannot be
# re-run on its own - re-run the previous cell first to rebuild `dk`.

# One row per NACE code.
dk = dk.explode('NACE')
# Industry level = the part of the code before the first '.'.
dk['NACE industry'] = dk['NACE'].str.split('.', expand=True)[0]
# Vectorised numeric conversion; codes that cannot be parsed become NaN.
dk['NACE'] = pd.to_numeric(dk['NACE'], errors='coerce')

# Split off the 'whole economy' aggregate ('Sector' was lower-cased in the
# previous cell) and keep only the individual sectors in `dk`.
is_whole_economy = dk['Sector'] == 'whole economy'
whole_dk = dk[is_whole_economy].reset_index(drop=True)
dk = dk[~is_whole_economy].reset_index(drop=True)

# Drop the columns that are not needed for the industry-level aggregation.
dk = dk.drop(columns=['Location', 'Sector', 'Nace code', 'NACE'])

# Indicators kept in the output file.
indicators = ['Production index',
              'Gross output (sales)',
              'Intermediate consumption',
              'Investment',
              'Value-added output']

# Average every numeric (quarter) column within each industry/indicator pair.
dk_filtered = dk[dk['Indicator'].isin(indicators)].copy()
dk_filtered = (
    dk_filtered
    .groupby(['NACE industry', 'Indicator'])
    .mean(numeric_only=True)
    .reset_index()
)

# Set the identifying columns as the index ...
dk_indexed = dk_filtered.set_index(['NACE industry', 'Indicator'])

# ... and stack the quarter columns into a single long column.
dk_stacked = dk_indexed.stack().reset_index()
dk_stacked.columns = ['NACE industry', 'Indicator', 'quarter', 'value']

# Pivot so that each indicator becomes its own column.
dk_wide = dk_stacked.pivot_table(
    index=['NACE industry', 'quarter'],
    columns='Indicator',
    values='value'
).reset_index()
dk_wide.columns.name = None

# Rename NACE industry to NACE.
dk_wide = dk_wide.rename(columns={'NACE industry': 'NACE'})

# Turn the quarter labels into datetimes by swapping the 'Qn' tag for a
# month-day suffix. The timestamps are the FIRST day of each quarter
# (Q1 -> Jan 1, Q2 -> Apr 1, ...).
# NOTE(review): the previous comment here said "END OF QUARTER dates", which
# does not match this mapping - confirm which convention downstream code expects.
quarter_start = {'Q1': '-01-01', 'Q2': '-04-01', 'Q3': '-07-01', 'Q4': '-10-01'}
for quarter_tag, month_day in quarter_start.items():
    dk_wide['quarter'] = dk_wide['quarter'].str.replace(quarter_tag, month_day, regex=False)
dk_wide['quarter'] = pd.to_datetime(dk_wide['quarter'], format='%Y-%m-%d')

# Short, code-friendly column names for the output file.
dk_wide = dk_wide.rename(columns={
    'quarter': 'timestamp',
    'Production index': 'prodind',
    'Gross output (sales)': 'grossoutput',
    'Intermediate consumption': 'intercons',
    'Investment': 'invest',
    'Value-added output': 'valaddoutput'
})

# Save the data; create the output folder first so a fresh checkout without
# a data/ directory does not fail in to_csv.
output_path = Path('data/dk_industry.csv')
output_path.parent.mkdir(parents=True, exist_ok=True)
dk_wide.to_csv(output_path, index=False)