### HDR: Weighted Stochastic Block Model (WSBM) data concatenation

In [1]:
import numpy as np
import os
import pandas as pd

from pprint import pprint

#### Settings

In [2]:
# Years covered by the merge; both bounds are inclusive.
first_year, last_year = 2010, 2016
years = np.arange(first_year, last_year + 1)

#### Set input/output folder

In [3]:
# Settings sub-folder shared by the input and output paths below.
settings = 'sparse/exp_0.5'

# Folder whose files are listed and read by the concatenation step.
data_in = f'../data/HDR_4c_wsbm/00_raw/{settings}'
# pprint(sorted(list(os.listdir(data_in))))

# Folder that receives the merged CSV files.
data_out = f'../data/HDR_4c_wsbm/01_merged/{settings}'
# exist_ok=True makes the creation idempotent without a separate existence
# check, which could race with another process creating the same folder.
os.makedirs(data_out, exist_ok=True)

#### Set columns | <font style="color: #FF0000;">Some DataFrame(s) may have already been sorted!</font>

In [4]:
# Column labels in the order the concatenated columns are expected to arrive:
# for each offset 0..4, every fifth value below 20 starting at that offset,
# shifted to be 1-based (1, 6, 11, 16, 2, 7, ...).
columns = []

for offset in range(5):
    columns.extend(np.arange(offset, 20, 5) + 1)

columns

[1, 6, 11, 16, 2, 7, 12, 17, 3, 8, 13, 18, 4, 9, 14, 19, 5, 10, 15, 20]

#### Concatenate data

In [8]:
# Frames read from `data_in`, grouped first by the tag found in the file name
# and then by year. Each group is concatenated once at the end instead of
# re-concatenating the growing frame for every file.
frames_by_tag = {'labels': dict(), 'logevidence': dict()}

for file in sorted(os.listdir(data_in)):
    # File names are expected to look like '<year>_<tag>[...]'. Entries that do
    # not fit (e.g. hidden or system files dropped into the folder) are skipped
    # instead of aborting the whole run.
    info = file.split('_')
    if len(info) < 2:
        continue
    try:
        year = int(info[0])
    except ValueError:
        continue
    tag = info[1].split('.')[0]

    # Only the two known tags are merged; other tags are ignored.
    if tag not in frames_by_tag:
        continue

    frames_by_tag[tag].setdefault(year, []).append(
        pd.read_csv(f'{data_in}/{file}', header=None)
    )

# One DataFrame per year, keyed by the integer year; columns are placed side by
# side in sorted file-name order.
labels_dfs = {
    year: pd.concat(frames, axis=1)
    for year, frames in frames_by_tag['labels'].items()
}
logevd_dfs = {
    year: pd.concat(frames, axis=1)
    for year, frames in frames_by_tag['logevidence'].items()
}

#### Re-index data axes

In [6]:
# temp_df = pd.read_csv('../data/HDR_4a_graph_formation/mean/l1_2010.csv', index_col='Country')
# labels_dfs[2010].index = temp_df.index
# labels_dfs[2010].columns = np.arange(labels_dfs[2010].shape[1]) + 1

# logevd_dfs[2010].index = ['Log-Likelihood']
# logevd_dfs[2010].columns = np.arange(logevd_dfs[2010].shape[1]) + 1

In [9]:
# Ascending column labels, used to reorder the columns of every frame.
ordered_columns = sorted(columns)

for year in years:
    # Row labels for the labels table are taken from the 'Country' index of the
    # graph-formation file of the same year.
    country_index = pd.read_csv(
        f'../data/HDR_4a_graph_formation/mean/l1_{year}.csv', index_col='Country'
    ).index

    # Both tables get the same treatment: set the row labels, assign the
    # column labels, then reorder the columns ascending.
    for frames, row_labels in ((labels_dfs, country_index), (logevd_dfs, ['Log-Likelihood'])):
        frame = frames[year]
        frame.index = row_labels
        frame.columns = columns
        frames[year] = frame.reindex(ordered_columns, axis=1)

#### Output data to csv format

In [10]:
# Write one labels file and one log-evidence file per year into `data_out`.
for year in years:
    labels_path = f'{data_out}/{year}_labels.csv'
    logevd_path = f'{data_out}/{year}_logevd.csv'

    labels_dfs[year].to_csv(labels_path, index_label='Country')
    logevd_dfs[year].to_csv(logevd_path, index_label='Measure')