In [1]:
import pandas as pd
from pathlib import Path
import regex as re

In [2]:
root_folderpath = Path(r"C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau")
matrices_folderpath = Path(root_folderpath / r"data\matrices")
vectors_individual_folderpath = Path(root_folderpath / r"data\vectors_individual")
vectors_consumer_flows_folderpath = Path(root_folderpath / r"data\vectors_consumer_flows")
vectors_producer_flows_folderpath = Path(root_folderpath / r"data\vectors_producer_flows")
muni_names_filepath = Path(root_folderpath / r"data\crosswalk\dk_municipalities_code_to_name.xlsx")
output_folderpath = Path(root_folderpath / r"output_csv")

In [3]:
muni_names_crosswalk = pd.read_excel(muni_names_filepath)
muni_names_crosswalk = muni_names_crosswalk.convert_dtypes()
muni_names_crosswalk.dtypes

code    string
name    string
dtype: object

# Common Functions

In [4]:
def convert_munis_to_names(df, muni_names_crosswalk):
    df = df.copy()
    print("Initial state:")
    print(df.shape, df.columns)
        
    # Remove columns like 'prod_cell' and 'cust_cell'
    for col in df.columns:
        if 'cell' in col:
            df = df.drop(col, axis=1)
    print("After removing 'cell' variables:")
    print(df.shape, df.columns)
        
    # Convert 'muni' columns into string
    for col in df.columns:
        if 'muni' in col:
            df[col] = df[col].astype('string')
    print(df.dtypes)

    # Convert values in sectors to integers, i.e. remove the 's' in 's01'
    for col in df.columns:
        if 'sector' in col:
            df[col] = df[col].str.replace('s','')

    # Convert munis into muni names, and drop the original munis.
    df_merge = df.copy()
    for col in df_merge.columns:
        if '_muni' in col:
            muni_name = col + '_name'
            print("Processing:", col)
            print("Before merge:", df.shape, df_merge.columns)
            df_merge = df_merge.merge(muni_names_crosswalk, left_on=col, right_on='code', how='inner', validate='m:1')
            assert(df_merge.shape[0] == df.shape[0])
            print("After merge:", df.shape, df_merge.columns)
            df_merge = df_merge.rename(columns={'name':muni_name})
            df_merge = df_merge.drop([col, 'code'], axis=1)
            print("After processing:", df.shape, df_merge.columns)
    
    return df_merge

In [5]:
def save_df_to_csv(df, input_filepath, output_folderpath, suffix=''):
    output_filename = input_filepath.stem + suffix + '.csv'
    output_filepath = Path(output_folderpath / output_filename)
    df.to_csv(output_filepath, index=False)
    print("Saved as:", output_filepath)

# Process matrices

In [6]:
for matrix_filepath in matrices_folderpath.iterdir():
    print(matrix_filepath)
    df = pd.read_csv(matrix_filepath, low_memory=False)
    
    # Create a new df where munis are converted to names, then save as csv
    df_merge = convert_munis_to_names(df, muni_names_crosswalk)
    save_df_to_csv(df_merge, matrix_filepath, output_folderpath)
    
    print('\n-')

C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\matrices\Domestic consumer spending matrix (predicted).csv
Initial state:
(7260624, 7) Index(['cust_muni', 'cust_sector', 'cust_cell', 'spend_muni', 'spend_sector',
       'spend_cell', 's_'],
      dtype='object')
After removing 'cell' variables:
(7260624, 5) Index(['cust_muni', 'cust_sector', 'spend_muni', 'spend_sector', 's_'], dtype='object')
cust_muni        string
cust_sector      object
spend_muni       string
spend_sector     object
s_              float64
dtype: object
Processing: cust_muni
Before merge: (7260624, 5) Index(['cust_muni', 'cust_sector', 'spend_muni', 'spend_sector', 's_'], dtype='object')
After merge: (7260624, 5) Index(['cust_muni', 'cust_sector', 'spend_muni', 'spend_sector', 's_', 'code',
       'name'],
      dtype='object')
After processing: (7260624, 5) Index(['cust_sector', 'spend_muni', 'spend_sector', 's_', 'cust_muni_name'], dtype='object')
Processing: spend_muni
Be

# Process vectors

## Process individual vectors (foreign consumer spending, surplus of owner-occupied housing to consumers)
Note: surplus of owner-occupied housing to consumers has both cust and prod cell in the raw data, but the prod cell is always cust muni + s88, so we want to remove it. This effectively reduces the matrix into a vector.

In [7]:
for vector_individual_filepath in vectors_individual_folderpath.iterdir():
    print(vector_individual_filepath)
    df = pd.read_csv(vector_individual_filepath, low_memory=False)
    
    # Create a new df where munis are converted to names, then save as csv
    df_merge = convert_munis_to_names(df, muni_names_crosswalk)
    save_df_to_csv(df_merge, vector_individual_filepath, output_folderpath)
    
    print('\n-')

C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vectors_individual\Foreign consumer spending (predicted).csv
Initial state:
(2744, 4) Index(['cust_muni', 'cust_sector', 'cust_cell', 's_foreign_'], dtype='object')
After removing 'cell' variables:
(2744, 3) Index(['cust_muni', 'cust_sector', 's_foreign_'], dtype='object')
cust_muni       string
cust_sector     object
s_foreign_     float64
dtype: object
Processing: cust_muni
Before merge: (2744, 3) Index(['cust_muni', 'cust_sector', 's_foreign_'], dtype='object')
After merge: (2744, 3) Index(['cust_muni', 'cust_sector', 's_foreign_', 'code', 'name'], dtype='object')
After processing: (2744, 3) Index(['cust_sector', 's_foreign_', 'cust_muni_name'], dtype='object')
Saved as: C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\output_csv\Foreign consumer spending (predicted).csv

-
C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\ve

## Process consumer flows vectors

In [8]:
def process_vector(vector_filepath, player):
    # The csv files come in different types. Process each accordingly.
    # The goal is to get 3 columns per df: cust_muni, cust_sector, and the variable of interest.
    print(vector_filepath)
    df = pd.read_csv(vector_filepath, low_memory=False, header=None)
    var_name = vector_filepath.stem

    if df.shape[0] < df.shape[1]:
        df = df.transpose()

    if df.shape[1] > 2:
        df = pd.read_csv(vector_filepath, low_memory=False)
        df = df.drop(f'{player}_cell', axis=1)
        var_name_in_table = df.columns[-1]
        df[f'{player}_muni'] = df[f'{player}_muni'].astype('str')
        df[f'{player}_sector'] = df[f'{player}_sector'].str.replace('s','').astype('str')
        df['var_name'] = var_name
        df = df.rename(columns={var_name_in_table:'value'})
    elif df.shape[1] == 2:
        df.columns = [f'{player}_cell', 'value'] 
        df['var_name'] = var_name 
        df[f'{player}_muni'] = df[f'{player}_cell'].str[1:4].astype('str')
        df[f'{player}_sector'] = df[f'{player}_cell'].str[-2:].astype('str')
        df = df.drop(f'{player}_cell', axis=1)
    
    print("Processed df_vector.shape:", df.shape)
    return df

In [9]:
def compile_vectors_into_df(vectors_consumer_flows_folderpath, player):
    df = pd.DataFrame()
    for vector_filepath in vectors_consumer_flows_folderpath.iterdir():
        if vector_filepath.suffix == '.csv':
            df_vector = process_vector(vector_filepath, player)
            if df.shape == (0,0):
                df = df_vector
                print("df.shape:", df.shape)
            else:
                df = pd.concat([df, df_vector], axis=0)
                print("df.shape:", df.shape)
    return df

In [10]:
df = compile_vectors_into_df(vectors_consumer_flows_folderpath, player='cust')
df_merge = convert_munis_to_names(df, muni_names_crosswalk)
save_df_to_csv(df_merge, Path(output_folderpath / 'Additional consumer flows.csv'), output_folderpath)
df_merge

C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vectors_consumer_flows\Consumer adjustment for pension entitlements received.csv
Processed df_vector.shape: (2744, 4)
df.shape: (2744, 4)
C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vectors_consumer_flows\Consumer gross saving.csv
Processed df_vector.shape: (2744, 4)
df.shape: (5488, 4)
C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vectors_consumer_flows\Consumer interest paid.csv
Processed df_vector.shape: (2744, 4)
df.shape: (8232, 4)
C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vectors_consumer_flows\Consumer interest received.csv
Processed df_vector.shape: (2744, 4)
df.shape: (10976, 4)
C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vectors_consumer_flows\Consumer natural resource rents paid.csv
Processed df_vector.shape: (2744,

Unnamed: 0,value,var_name,cust_sector,cust_muni_name
0,197366365.711175,Consumer adjustment for pension entitlements r...,01,København
1,308340477.702592,Consumer adjustment for pension entitlements r...,02,København
2,199690524.2474,Consumer adjustment for pension entitlements r...,03,København
3,1347486676.418711,Consumer adjustment for pension entitlements r...,04,København
4,223900554.813007,Consumer adjustment for pension entitlements r...,05,København
...,...,...,...,...
35667,341570823.1017025,Consumer social contributions paid,24,Hjørring
35668,355249.3077520026,Consumer social contributions paid,25,Hjørring
35669,188851304.60838416,Consumer social contributions paid,26,Hjørring
35670,106507543.97043432,Consumer social contributions paid,27,Hjørring


## Process producer flows vectors

In [11]:
df = compile_vectors_into_df(vectors_producer_flows_folderpath, player='prod')
df_merge = convert_munis_to_names(df, muni_names_crosswalk)
save_df_to_csv(df_merge, Path(output_folderpath / 'Additional producer flows.csv'), output_folderpath)
df_merge

C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vectors_producer_flows\Producer exports.csv
Processed df_vector.shape: (2646, 4)
df.shape: (2646, 4)
C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vectors_producer_flows\Producer imports.csv
Processed df_vector.shape: (2646, 4)
df.shape: (5292, 4)
C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vectors_producer_flows\Producer product taxes paid.csv
Processed df_vector.shape: (2646, 4)
df.shape: (7938, 4)
C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vectors_producer_flows\Producer sales to capital accumulation cell.csv
Processed df_vector.shape: (2646, 4)
df.shape: (10584, 4)
C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vectors_producer_flows\Producer sales to government.csv
Processed df_vector.shape: (2646, 4)
df.shape: (13230, 4)
Ini

Unnamed: 0,value,var_name,prod_sector,prod_muni_name
0,1612641024.0,Producer exports,01,København
1,1526094720.0,Producer exports,02,København
2,1520103040.0,Producer exports,03,København
3,5996545023.999999,Producer exports,04,København
4,2439924117.431438,Producer exports,05,København
...,...,...,...,...
13225,1267504128.0,Producer sales to government,26,Hjørring
13226,1637282176.0,Producer sales to government,27,Hjørring
13227,0.0,Producer sales to government,77,Hjørring
13228,0.0,Producer sales to government,88,Hjørring
