### HDR: Attributes max-min [0, 1] standardization

In [1]:
import os
import pandas as pd

from pprint import pprint

#### Set input/output folder

In [2]:
# Input folder and the two output folders (per-attribute and per-year results).
data_in       = '../data/HDR_1b_iso_standardization'
data_attr_out = '../data/HDR_2a_max_min_attr'
data_year_out = '../data/HDR_2a_max_min_year'

# makedirs(exist_ok=True) also creates missing parent folders and does not
# fail when the folder already exists, so the cell is safe to re-run.
for out_dir in (data_attr_out, data_year_out):
    os.makedirs(out_dir, exist_ok=True)

#### Read attributes data

In [3]:
# Map each attribute label to its DataFrame (indexed by the 'Country' column).
# CSV files inside a sub-folder of `data_in` are keyed '[<folder>] <file stem>';
# CSV files sitting directly in `data_in` are keyed by their file stem alone.
all_attrs = dict()

for dim in os.listdir(data_in):
    dim_path = f'{data_in}/{dim}'

    if os.path.isdir(dim_path):
        for attr_file in os.listdir(dim_path):
            attr_path = f'{dim_path}/{attr_file}'
            stem, ext = os.path.splitext(attr_file)

            # Check the full path (a bare file name would be resolved against
            # the current working directory) and skip anything that is not a
            # CSV file, e.g. nested folders or hidden system files.
            if os.path.isdir(attr_path) or ext.lower() != '.csv':
                continue

            attr = f'[{dim}] {stem}'
            all_attrs[attr] = pd.read_csv(attr_path, index_col='Country')

    else:
        stem, ext = os.path.splitext(dim)
        if ext.lower() == '.csv':
            attr = stem
            all_attrs[attr] = pd.read_csv(dim_path, index_col='Country')

#### Attributes max-min [0, 1] standardization & output to csv format

$x' = \cfrac{x - min}{max - min}$
- $max$: maximum over all of the attribute's observations (all countries and years)
- $min$: minimum over all of the attribute's observations (all countries and years)

In [4]:
# Scale every attribute to [0, 1] using its global minimum/maximum, record the
# (min, max) pair per attribute, and write each scaled table to `data_attr_out`.
#
# NOTE: this cell overwrites the entries of `all_attrs` with their scaled
# version, so it must be run exactly once after the data has been (re)loaded;
# running it again would record 0/1 as the minimum/maximum of every attribute.
max_min_df = pd.DataFrame(columns=['Minimum', 'Maximum'])

for attr in sorted(all_attrs):
    attr_df = all_attrs[attr]

    min_val = attr_df.min().min()  # min of all attribute's values
    max_val = attr_df.max().max()  # max of all attribute's values
    max_min_df.loc[attr] = [min_val, max_val]

    # NOTE(review): when max_val == min_val the division is 0/0, which for
    # numeric data presumably leaves every value as NaN — confirm this is the
    # intended outcome for constant attributes.
    all_attrs[attr] = (attr_df - min_val) / (max_val - min_val)

    # Keys look like '[<dimension>] <attribute>' for files that came from a
    # dimension folder. Split only on the first '] ' so that attribute names
    # which themselves contain ']' still end up in the right folder/file.
    if attr.startswith('[') and '] ' in attr:
        folder_name, file_name = attr[1:].split('] ', 1)
        os.makedirs(f'{data_attr_out}/{folder_name}', exist_ok=True)
        all_attrs[attr].to_csv(f'{data_attr_out}/{folder_name}/{file_name}.csv', index_label='Country')

    else:
        # Attribute without a dimension prefix: written directly to the output root
        all_attrs[attr].to_csv(f'{data_attr_out}/{attr}.csv', index_label='Country')

max_min_df.to_csv('../data/HDR_max_min_values.csv', index_label='[Dimension] Attribute')

In [5]:
# Alphabetically ordered attribute labels, plus a quick count of how many there are
attrs = list(all_attrs)
attrs.sort()
print(len(attrs))

141


#### Concatenate attributes by year | [Dimension] Attribute

In [6]:
# Build one table per column label (year): rows come from the attribute
# tables' index, and each attribute contributes a single column named after it.
df_dict = {}

for attr in attrs:
    attr_table = all_attrs[attr]

    for year in attr_table.columns:
        # Single-column frame holding this attribute's values for `year`
        year_column = attr_table[year].to_frame(name=attr)

        # Start from an empty frame the first time a column label is seen
        year_table = df_dict[year] if year in df_dict else pd.DataFrame()

        # Outer join keeps rows that are missing from either side
        df_dict[year] = year_table.join(year_column, how='outer')

#### Year (Number of countries, Number of attributes)

In [7]:
# Years of interest: column labels of at most four characters (plain years).
# Longer labels (e.g. ranges such as '2008-2013') are only reported together
# with the attributes that use them.
yoi = []

print('YEAR CNT ATR')
for year, year_table in df_dict.items():

    if len(year) <= 4:
        # Largest non-null count over the columns, and the number of attributes
        n_countries = year_table.count().max()
        n_attrs = year_table.shape[1]
        print(year, n_countries, n_attrs)
        yoi.append(year)
        continue

    print(f'\n{year}')
    pprint(year_table.columns)

YEAR CNT ATR
1990 195 71
1991 191 18
1992 191 8
1993 191 8
1994 191 8
1995 195 90
1996 191 9
1997 191 9
1998 191 8
1999 191 8
2000 195 103
2001 191 8
2002 191 8
2003 191 8
2004 191 8
2005 195 105
2006 191 8
2007 191 8
2008 191 8
2009 191 8
2010 195 115
2011 195 108
2012 195 107
2013 195 107
2014 195 108
2015 195 116
2016 195 102
2017 195 83

2008-2013
Index(['[Education] Proportion of schools with access to the Internet (%)'], dtype='object')

1990/2015
Index(['[Environmental Sustainability] Forest area, change (%)'], dtype='object')

2003-2017
Index(['[Gender] Child marriage, women married by age 18 (% of women ages 20–24 years who are married or in union)'], dtype='object')

2007-2017
Index(['[Gender] Female share of graduates in science, mathematics, engineering, manufacturing and construction at tertiary level (%)'], dtype='object')

2005-2018
Index(['[Gender] Violence against women ever experienced, intimate partner (% of female population ages 15 and older)', '[Gender] Violence a

In [8]:
# Display the years of interest collected in `yoi`, one per line
pprint(yoi)

['1990',
 '1991',
 '1992',
 '1993',
 '1994',
 '1995',
 '1996',
 '1997',
 '1998',
 '1999',
 '2000',
 '2001',
 '2002',
 '2003',
 '2004',
 '2005',
 '2006',
 '2007',
 '2008',
 '2009',
 '2010',
 '2011',
 '2012',
 '2013',
 '2014',
 '2015',
 '2016',
 '2017']


#### Output data to csv format

In [9]:
# Write one CSV per year of interest into the per-year output folder
for year in yoi:
    year_csv = f'{data_year_out}/{year}.csv'
    df_dict[year].to_csv(year_csv, index_label='Country')