### HDR: Number of mutual attributes between the countries by year

In [1]:
import numpy as np
import os
import pandas as pd

#### Set input and output folders

In [2]:
# Folder the per-year input files are read from, and folder the per-year
# result files are written to.
data_in  = '../data/HDR_1c_aggregate_attr_by_year'
data_out = '../data/HDR_attr_intersection_between_countries'

# makedirs(exist_ok=True) creates missing parent folders as well and is a
# no-op when the folder already exists, so the cell can be re-run safely.
os.makedirs(data_out, exist_ok=True)

#### Read data

In [3]:
# Maps a year (the part of the file name before the first '.') to the table
# read from that file, indexed by the 'Country' column.
data = dict()

# sorted() makes the load order deterministic across platforms
for file in sorted(os.listdir(data_in)):
    path = os.path.join(data_in, file)

    # Skip hidden entries and sub-folders (e.g. '.ipynb_checkpoints',
    # '.DS_Store'); they are not data files and would break read_csv.
    if file.startswith('.') or not os.path.isfile(path):
        continue

    year = file.split('.')[0]
    data[year] = pd.read_csv(path, index_col='Country')

#### Years of Interest

In [4]:
# All years for which a table was loaded, in ascending (string) order.
years = sorted(data.keys())

In [5]:
# Years of interest, 1990 through 2017 inclusive, as strings so that they
# can be used directly as keys of `data`.
yoi = list(map(str, range(1990, 2017 + 1)))
print(yoi)

['1990', '1991', '1992', '1993', '1994', '1995', '1996', '1997', '1998', '1999', '2000', '2001', '2002', '2003', '2004', '2005', '2006', '2007', '2008', '2009', '2010', '2011', '2012', '2013', '2014', '2015', '2016', '2017']


#### Mutual attributes

In [6]:
# Start from the columns of the first year of interest and keep only the
# column names that are also present in every other year of interest.
attributes = set(data[yoi[0]].columns)

for year in yoi[1:]:
    attributes &= set(data[year].columns)

len(attributes)

8

#### Mutual countries

In [7]:
# countries = set(data[yoi[0]].index)

# for year in yoi:
#     print(year, len(data[year].index))
#     countries.intersection_update(set(data[year].index))
    
# countries  = sorted(list(countries))
# print(len(countries))

#### Compute the number of mutual attributes for every pair of countries; write one CSV file per year
#### <font style="color: #FF0000;">Warning: Slow operation!!!</font> 

In [8]:
def count_shared_attributes(frame, attribute_names):
    """Count, for every pair of rows, the attributes both rows have a value for.

    Parameters
    ----------
    frame : pd.DataFrame
        Table with one row per country and one column per attribute.
    attribute_names : collection of str
        Only columns whose name is in this collection are counted.

    Returns
    -------
    pd.DataFrame
        Square integer table whose rows and columns are both labelled by
        `frame.index`; cell (a, b) is the number of selected columns that
        are non-null in both row a and row b.
    """
    selected = [col for col in frame.columns if col in attribute_names]

    # 1 where a value is present, 0 where it is missing
    present = frame[selected].notna().to_numpy(dtype=int)

    # (present @ present.T)[i, j] adds up the columns that are 1 in both
    # row i and row j, i.e. the size of the intersection. Building the whole
    # table at once also avoids cell-by-cell chained assignment
    # (frame[col][row] = value), which does not write through when pandas
    # Copy-on-Write is enabled.
    return pd.DataFrame(present @ present.T,
                        index=frame.index,
                        columns=frame.index)


# Maps each year of interest to its country-by-country table of counts.
intersect_dict = dict()

for year in yoi:
    print(f'Year {year} ...', end=' ')
    intersect_dict[year] = count_shared_attributes(data[year], attributes)
    intersect_dict[year].to_csv(f'{data_out}/{year}.csv',
                                index_label='Country')
    print('Done!')

Year 1990 ... Done!
Year 1991 ... Done!
Year 1992 ... Done!
Year 1993 ... Done!
Year 1994 ... Done!
Year 1995 ... Done!
Year 1996 ... Done!
Year 1997 ... Done!
Year 1998 ... Done!
Year 1999 ... Done!
Year 2000 ... Done!
Year 2001 ... Done!
Year 2002 ... Done!
Year 2003 ... Done!
Year 2004 ... Done!
Year 2005 ... Done!
Year 2006 ... Done!
Year 2007 ... Done!
Year 2008 ... Done!
Year 2009 ... Done!
Year 2010 ... Done!
Year 2011 ... Done!
Year 2012 ... Done!
Year 2013 ... Done!
Year 2014 ... Done!
Year 2015 ... Done!
Year 2016 ... Done!
Year 2017 ... Done!


#### Display results

In [9]:
# Print the shape of each year's table as a quick sanity check.
for year, table in intersect_dict.items():
    print(year, table.shape)
#     display(table.head())

1990 (195, 195)
1991 (194, 194)
1992 (194, 194)
1993 (194, 194)
1994 (194, 194)
1995 (195, 195)
1996 (195, 195)
1997 (195, 195)
1998 (194, 194)
1999 (194, 194)
2000 (195, 195)
2001 (194, 194)
2002 (194, 194)
2003 (194, 194)
2004 (194, 194)
2005 (195, 195)
2006 (194, 194)
2007 (194, 194)
2008 (194, 194)
2009 (194, 194)
2010 (195, 195)
2011 (195, 195)
2012 (195, 195)
2013 (195, 195)
2014 (195, 195)
2015 (195, 195)
2016 (195, 195)
2017 (195, 195)
