In [1]:
import pandas as pd
from pathlib import Path

In [2]:
root_folderpath = Path(r"C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\preprocess_data")
matrices_folderpath = Path(root_folderpath / r"data\matrices")
vectors_folderpath = Path(root_folderpath / r"data\vectors")
muni_names_filepath = Path(root_folderpath / r"data\crosswalk\dk_municipalities_code_to_name.xlsx")
output_folderpath = Path(root_folderpath / r"output")

In [3]:
muni_names_crosswalk = pd.read_excel(muni_names_filepath)
muni_names_crosswalk = muni_names_crosswalk.convert_dtypes()
muni_names_crosswalk.dtypes

code    string
name    string
dtype: object

# Process matrices

In [4]:
for matrix_filepath in matrices_folderpath.iterdir():
    print(matrix_filepath)
    df = pd.read_csv(matrix_filepath, low_memory=False)
    print("Initial state:")
    print(df.shape, df.columns)
    
    # Remove columns 'prod_cell' and 'cust_cell'
    for col in df.columns:
        if 'cell' in col:
            df = df.drop(col, axis=1)
    print("After removing 'cell' variables:")
    print(df.shape, df.columns)
    
    # Convert 'muni' columns into string
    for col in df.columns:
        if 'muni' in col:
            df[col] = df[col].astype('string')
    print(df.dtypes)
    
    # Convert values in sectors to integers, i.e. remove the 's' in 's01'
    for col in df.columns:
        if 'sector' in col:
            df[col] = df[col].str.replace('s','')
    
    # Convert munis into muni names, and drop the original munis.
    df_merge = df.copy()
    for col in df_merge.columns:
        if '_muni' in col:
            muni_name = col + '_name'
            print("Processing:", col)
            print("Before merge:", df.shape, df_merge.columns)
            df_merge = df_merge.merge(muni_names_crosswalk, left_on=col, right_on='code', how='inner', validate='m:1')
            assert(df_merge.shape[0] == df.shape[0])
            print("After merge:", df.shape, df_merge.columns)
            df_merge = df_merge.rename(columns={'name':muni_name})
            df_merge = df_merge.drop([col, 'code'], axis=1)
            print("After processing:", df.shape, df_merge.columns)
    
    # Save as csv
    outputfilename = matrix_filepath.stem.replace(' ', '_') + '.csv'
    output_filepath = Path(output_folderpath / outputfilename)
    df_merge.to_csv(output_filepath, index=False)
    print("Saved as:", output_filepath)
    print('\n-')

C:\Users\jasonjia\Dropbox\projects\disaggregated_accounts\preprocess_data\data\matrices\01_Domestic consumer spending matrix (predicted).csv
Initial state:
(7260624, 7) Index(['cust_muni', 'cust_sector', 'cust_cell', 'spend_muni', 'spend_sector',
       'spend_cell', 's_'],
      dtype='object')
After removing 'cell' variables:
(7260624, 5) Index(['cust_muni', 'cust_sector', 'spend_muni', 'spend_sector', 's_'], dtype='object')
cust_muni        string
cust_sector      object
spend_muni       string
spend_sector     object
s_              float64
dtype: object
Processing: cust_muni
Before merge: (7260624, 5) Index(['cust_muni', 'cust_sector', 'spend_muni', 'spend_sector', 's_'], dtype='object')
After merge: (7260624, 5) Index(['cust_muni', 'cust_sector', 'spend_muni', 'spend_sector', 's_', 'code',
       'name'],
      dtype='object')
After processing: (7260624, 5) Index(['cust_sector', 'spend_muni', 'spend_sector', 's_', 'cust_muni_name'], dtype='object')
Processing: spend_muni
Before m

# Process vectors