### HDR: Threshold determination

In [83]:
import networkx as nx
import numpy as np
import os
import pandas as pd

from queue import deque

#### Settings

In [71]:
# Years processed by every loop in this notebook: 2010 through 2016
# (the stop value passed to np.arange is exclusive).
years = np.arange(2010, 2017)

# Selects the input sub-folder and names the outputs.
mean_type = 'mean'  # one of: 'mean', 'median', 'wmean'

#### Set input/output folder

In [79]:
# Folder the per-year `l1_<year>.csv` matrices are read from.
data_in  = f'../data/HDR_4a_graph_formation/{mean_type}'
# Folder the per-year summary table (`<mean_type>.csv`) is written to.
data_out = f'../data/HDR_4b_sparsity_threshold/'
# Folder the thresholded graphs (CSV and GEXF) are written to.
graphs_out = f'../data/HDR_4b_sparsity_threshold/{mean_type}'

# exist_ok=True makes the cell safely re-runnable and avoids the
# check-then-create race of testing os.path.exists() first.
os.makedirs(data_out, exist_ok=True)
os.makedirs(graphs_out, exist_ok=True)

#### Functions

In [38]:
def one_component(adj_mtx):
    """Return True when every node can be reached from node 0.

    Parameters
    ----------
    adj_mtx : array-like, square (n x n)
        Adjacency matrix; a truthy entry ``adj_mtx[u, v]`` is an edge from
        ``u`` to ``v``. For a symmetric matrix, reachability from node 0 is
        the same as the graph consisting of a single connected component.

    Returns
    -------
    bool
        True if the traversal started at node 0 visits all ``n`` nodes,
        False otherwise. An empty (0 x 0) matrix yields False.
    """
    adj = np.asarray(adj_mtx, dtype=bool)
    n = adj.shape[0]  # number of nodes

    if n == 0:
        return False

    # Nodes are flagged as soon as they are pushed, so each one is stacked
    # at most once and no linear "already queued?" scan is needed.
    reached = np.zeros(n, dtype=bool)
    reached[0] = True
    stack = [0]

    while stack:
        u = stack.pop()
        # All not-yet-reached neighbours of u, found in one vectorised step.
        fresh = np.flatnonzero(adj[u, :n] & ~reached)
        reached[fresh] = True
        stack.extend(fresh.tolist())

    return bool(reached.all())

#### Output DataFrame

In [47]:
# Empty per-year summary table; the threshold search fills one row per year.
info_df = pd.DataFrame(
    index=years,
    columns=['Total edges', 'Removed edges', 'Remaining edges', 'Threshold'],
)
info_df

Unnamed: 0,Threshold,Removed edges,Remaining edges,Total edges
2010,,,,
2011,,,,
2012,,,,
2013,,,,
2014,,,,
2015,,,,
2016,,,,


#### Brute-force

In [None]:
# For each year, find the largest weight threshold at which the graph made of
# the edges with weight >= threshold still passes `one_component`, then record
# the threshold and the resulting edge counts in `info_df`.
for year in years:
    print(f'Year {year}...', end=' ')

    df = pd.read_csv(f'{data_in}/l1_{year}.csv', index_col='Country')
    weights = df.values
    n_nodes = len(df)

    # Candidate thresholds in ascending order. Repeated weights produce the
    # same mask, so only distinct values need to be tested.
    edge_w = np.unique(weights)

    # Raise the threshold until the graph falls apart. The bound on `i`
    # prevents an IndexError when the graph is still connected at the
    # largest weight.
    i = 0
    while i < len(edge_w) and one_component(weights >= edge_w[i]):
        i += 1

    if i == 0:
        # Without this guard edge_w[i-1] would silently wrap around to the
        # largest weight and report a meaningless threshold.
        raise ValueError(
            f'Graph for year {year} is not connected at any threshold')

    threshold = edge_w[i - 1]  # last threshold that kept the graph connected
    kept = weights >= threshold

    info_df.at[year, 'Threshold']       = threshold
    # Subtracting n_nodes discounts the diagonal; this assumes every diagonal
    # entry lies below the threshold -- TODO confirm against the input files.
    info_df.at[year, 'Removed edges']   = (~kept).sum() - n_nodes
    info_df.at[year, 'Remaining edges'] = kept.sum()
    info_df.at[year, 'Total edges']     = (n_nodes * n_nodes) - n_nodes

    print('Done!')

In [75]:
# Display the per-year summary table after the threshold search has filled it.
info_df

Unnamed: 0,Total edges,Removed edges,Remaining edges,Threshold
2010,37830,31808,6022,0.27716
2011,37830,32000,5830,0.282598
2012,37830,31230,6600,0.26807
2013,37830,32094,5736,0.279441
2014,37830,31616,6214,0.272869
2015,37830,31714,6116,0.264968
2016,37830,31136,6694,0.262653


In [74]:
# Persist the per-year summary, labelling the index column 'Year'.
info_df.to_csv(
    path_or_buf=f'{data_out}/{mean_type}.csv',
    index_label='Year',
)

#### <font style="color: #FF0000;">Binary search implementation</font>

In [1]:
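# TODO: not implemented yet. Replace the linear scan in the brute-force cell
# with a binary search over the sorted distinct weights; this works because
# raising the threshold can only remove edges, never add them.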

#### Calculating density

In [88]:
# Graph density per year, in percent: remaining edges over total edges.
info_df['Remaining edges'].div(info_df['Total edges']).mul(100)

2010    15.9186
2011     15.411
2012    17.4465
2013    15.1626
2014    16.4261
2015    16.1671
2016     17.695
dtype: object

#### Output sparse graphs

In [86]:
for year in years:
    df = pd.read_csv(f'{data_in}/l1_{year}.csv', index_col='Country')
    df = (df >= info_df['Threshold'][year]) * df
    df.to_csv(f'{graphs_out}/l1_{year}.csv')
    
    nx.write_gexf(nx.Graph(df), f'{graphs_out}/l1_{year}.gexf')