In [1]:
import pandas as pd
from pathlib import Path
import regex as re

In [2]:
root_folderpath = Path(r"C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau")
matrices_folderpath = Path(root_folderpath / r"data\matrices")
vectors_foreign_consumer_spending_filepath = Path(root_folderpath / r"data\vector_foreign_consumer_spending\02_Foreign consumer spending (predicted).csv")
vectors_consumer_flows_folderpath = Path(root_folderpath / r"data\vectors_consumer_flows")
vectors_producer_flows_folderpath = Path(root_folderpath / r"data\vectors_producer_flows")
muni_names_filepath = Path(root_folderpath / r"data\crosswalk\dk_municipalities_code_to_name.xlsx")
output_folderpath = Path(root_folderpath / r"output_csv")

In [3]:
muni_names_crosswalk = pd.read_excel(muni_names_filepath)
muni_names_crosswalk = muni_names_crosswalk.convert_dtypes()
muni_names_crosswalk.dtypes

code    string
name    string
dtype: object

# Common Functions

In [4]:
def convert_munis_to_names(df, muni_names_crosswalk):
    df = df.copy()
    print("Initial state:")
    print(df.shape, df.columns)
        
    # Remove columns like 'prod_cell' and 'cust_cell'
    for col in df.columns:
        if 'cell' in col:
            df = df.drop(col, axis=1)
    print("After removing 'cell' variables:")
    print(df.shape, df.columns)
        
    # Convert 'muni' columns into string
    for col in df.columns:
        if 'muni' in col:
            df[col] = df[col].astype('string')
    print(df.dtypes)

    # Convert values in sectors to integers, i.e. remove the 's' in 's01'
    for col in df.columns:
        if 'sector' in col:
            df[col] = df[col].str.replace('s','')

    # Convert munis into muni names, and drop the original munis.
    df_merge = df.copy()
    for col in df_merge.columns:
        if '_muni' in col:
            muni_name = col + '_name'
            print("Processing:", col)
            print("Before merge:", df.shape, df_merge.columns)
            df_merge = df_merge.merge(muni_names_crosswalk, left_on=col, right_on='code', how='inner', validate='m:1')
            assert(df_merge.shape[0] == df.shape[0])
            print("After merge:", df.shape, df_merge.columns)
            df_merge = df_merge.rename(columns={'name':muni_name})
            df_merge = df_merge.drop([col, 'code'], axis=1)
            print("After processing:", df.shape, df_merge.columns)
    
    return df_merge

In [5]:
def save_df_to_csv(df, input_filepath, output_folderpath, suffix=''):
    output_filename = input_filepath.stem.replace(' ', '_') + suffix + '.csv'
    output_filepath = Path(output_folderpath / output_filename)
    df.to_csv(output_filepath, index=False)
    print("Saved as:", output_filepath)

# Process matrices

In [6]:
for matrix_filepath in matrices_folderpath.iterdir():
    print(matrix_filepath)
    df = pd.read_csv(matrix_filepath, low_memory=False)
    
    # Create a new df where munis are converted to names, then save as csv
    df_merge = convert_munis_to_names(df, muni_names_crosswalk)
    save_df_to_csv(df_merge, matrix_filepath, output_folderpath)
    
    print('\n-')

C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\matrices\01_Domestic consumer spending matrix (predicted).csv
Initial state:
(7260624, 7) Index(['cust_muni', 'cust_sector', 'cust_cell', 'spend_muni', 'spend_sector',
       'spend_cell', 's_'],
      dtype='object')
After removing 'cell' variables:
(7260624, 5) Index(['cust_muni', 'cust_sector', 'spend_muni', 'spend_sector', 's_'], dtype='object')
cust_muni        string
cust_sector      object
spend_muni       string
spend_sector     object
s_              float64
dtype: object
Processing: cust_muni
Before merge: (7260624, 5) Index(['cust_muni', 'cust_sector', 'spend_muni', 'spend_sector', 's_'], dtype='object')
After merge: (7260624, 5) Index(['cust_muni', 'cust_sector', 'spend_muni', 'spend_sector', 's_', 'code',
       'name'],
      dtype='object')
After processing: (7260624, 5) Index(['cust_sector', 'spend_muni', 'spend_sector', 's_', 'cust_muni_name'], dtype='object')
Processing: spend_muni

# Process vectors

## Process foreign consumer spending vector

In [7]:
print(vectors_foreign_consumer_spending_filepath)
df = pd.read_csv(vectors_foreign_consumer_spending_filepath, low_memory=False)

# Create a new df where munis are converted to names, then save as csv
df_merge = convert_munis_to_names(df, muni_names_crosswalk)
save_df_to_csv(df_merge, vectors_foreign_consumer_spending_filepath, output_folderpath)

C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vector_foreign_consumer_spending\02_Foreign consumer spending (predicted).csv
Initial state:
(2744, 4) Index(['cust_muni', 'cust_sector', 'cust_cell', 's_foreign_'], dtype='object')
After removing 'cell' variables:
(2744, 3) Index(['cust_muni', 'cust_sector', 's_foreign_'], dtype='object')
cust_muni       string
cust_sector     object
s_foreign_     float64
dtype: object
Processing: cust_muni
Before merge: (2744, 3) Index(['cust_muni', 'cust_sector', 's_foreign_'], dtype='object')
After merge: (2744, 3) Index(['cust_muni', 'cust_sector', 's_foreign_', 'code', 'name'], dtype='object')
After processing: (2744, 3) Index(['cust_sector', 's_foreign_', 'cust_muni_name'], dtype='object')
Saved as: C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\output_csv\02_Foreign_consumer_spending_(predicted).csv


## Process consumer flows vectors

In [8]:
def process_vector(vector_filepath, player):
    # The csv files come in different types. Process each accordingly.
    # The goal is to get 3 columns per df: cust_muni, cust_sector, and the variable of interest.
    print(vector_filepath)
    df = pd.read_csv(vector_filepath, low_memory=False, header=None)
    var_name = vector_filepath.stem.replace(' ', '_').lower()

    if df.shape[0] < df.shape[1]:
        df = df.transpose()

    if df.shape[1] > 2:
        df = pd.read_csv(vector_filepath, low_memory=False)
        df = df.drop(f'{player}_cell', axis=1)
        var_name_in_table = df.columns[-1]
        df[f'{player}_muni'] = df[f'{player}_muni'].astype('str')
        df[f'{player}_sector'] = df[f'{player}_sector'].str.replace('s','').astype('str')
        df['var_name'] = var_name
        df = df.rename(columns={var_name_in_table:'value'})
    elif df.shape[1] == 2:
        df.columns = [f'{player}_cell', 'value'] 
        df['var_name'] = var_name 
        df[f'{player}_muni'] = df[f'{player}_cell'].str[1:4].astype('str')
        df[f'{player}_sector'] = df[f'{player}_cell'].str[-2:].astype('str')
        df = df.drop(f'{player}_cell', axis=1)
    
    print("Processed df_vector.shape:", df.shape)
    return df

In [9]:
def compile_vectors_into_df(vectors_consumer_flows_folderpath, player):
    df = pd.DataFrame()
    for vector_filepath in vectors_consumer_flows_folderpath.iterdir():
        if vector_filepath.suffix == '.csv':
            df_vector = process_vector(vector_filepath, player)
            if df.shape == (0,0):
                df = df_vector
                print("df.shape:", df.shape)
            else:
                df = pd.concat([df, df_vector], axis=0)
                print("df.shape:", df.shape)
    return df

In [12]:
df = compile_vectors_into_df(vectors_consumer_flows_folderpath, player='cust')
df_merge = convert_munis_to_names(df, muni_names_crosswalk)
save_df_to_csv(df_merge, Path(output_folderpath / 'consumer_flows.csv'), output_folderpath)
df_merge

C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vectors_consumer_flows\03_Consumer product taxes paid.csv
Processed df_vector.shape: (2744, 4)
df.shape: (2744, 4)
C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vectors_consumer_flows\04_Consumer non-product taxes paid.csv
Processed df_vector.shape: (2744, 4)
df.shape: (5488, 4)
C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vectors_consumer_flows\05_Consumer social contributions paid.csv
Processed df_vector.shape: (2744, 4)
df.shape: (8232, 4)
C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vectors_consumer_flows\06_Consumer interest paid.csv
Processed df_vector.shape: (2744, 4)
df.shape: (10976, 4)
C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vectors_consumer_flows\07_Consumer natural resource rents paid.csv
Processed df_vector.shape:

Unnamed: 0,cust_sector,value,var_name,cust_muni_name
0,01,568214382.869876,03_consumer_product_taxes_paid,København
1,02,563008343.185246,03_consumer_product_taxes_paid,København
2,03,569229424.670897,03_consumer_product_taxes_paid,København
3,04,662525987.57906,03_consumer_product_taxes_paid,København
4,05,574376013.30355,03_consumer_product_taxes_paid,København
...,...,...,...,...
35667,24,51034610.650797,19_consumer_other_transfers_received,Hjørring
35668,25,5081032.061075,19_consumer_other_transfers_received,Hjørring
35669,26,22437630.544748,19_consumer_other_transfers_received,Hjørring
35670,27,15588157.7833,19_consumer_other_transfers_received,Hjørring


## Process producer flows vectors

In [13]:
df = compile_vectors_into_df(vectors_producer_flows_folderpath, player='prod')
df_merge = convert_munis_to_names(df, muni_names_crosswalk)
save_df_to_csv(df_merge, Path(output_folderpath / 'producer_flows.csv'), output_folderpath)
df_merge

C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vectors_producer_flows\22_Producer product taxes paid.csv
Processed df_vector.shape: (2646, 4)
df.shape: (2646, 4)
C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vectors_producer_flows\25_Producer imports.csv
Processed df_vector.shape: (2646, 4)
df.shape: (5292, 4)
C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vectors_producer_flows\26_Producer sales to government.csv
Processed df_vector.shape: (2646, 4)
df.shape: (7938, 4)
C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vectors_producer_flows\27_Producer sales to capital accumulation cell.csv
Processed df_vector.shape: (2646, 4)
df.shape: (10584, 4)
C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\process_data_for_tableau\data\vectors_producer_flows\28_Producer exports.csv
Processed df_vector.shape: (2646, 4)
df.shape:

Unnamed: 0,value,var_name,prod_sector,prod_muni_name
0,415510881.97414243,22_producer_product_taxes_paid,01,København
1,580910726.5817533,22_producer_product_taxes_paid,02,København
2,40044104.83515471,22_producer_product_taxes_paid,03,København
3,597623593.2851751,22_producer_product_taxes_paid,04,København
4,496804281.59196955,22_producer_product_taxes_paid,05,København
...,...,...,...,...
13225,3379130.0,28_producer_exports,26,Hjørring
13226,9971863.0,28_producer_exports,27,Hjørring
13227,0.0,28_producer_exports,77,Hjørring
13228,0.0,28_producer_exports,88,Hjørring
