In [23]:
import numpy as np 
import pandas as pd 
import os 
import glob
import warnings
warnings.filterwarnings('ignore')    ## I don't like pandas setting with copy warnings 

In [24]:
# ISO 3166-1 alpha-3 codes used to keep only WHO Eastern Mediterranean Region
# (EMRO) countries in the country-level results.
# Libya is 'LBY'; the previous entry 'LBR' is Liberia, which is not an EMRO member.
emro = ['AFG', 'ARE', 'BHR', 'DJI', 'EGY', 'IRN', 'IRQ', 'JOR',
        'KWT', 'LBN', 'LBY', 'MAR', 'OMN', 'PAK', 'PSE', 'QAT', 'SAU',
        'SDN', 'SOM', 'SYR', 'YEM', 'TUN']
# Columns removed by emro_select once the rows have been filtered.
drop_list = ['superregion2', 'age', 'urban', 'edu']

def select(col, val, df):
    """Return the rows of ``df`` whose value in column ``col`` equals ``val``."""
    matches = df[col] == val
    return df[matches]

def emro_select(df:pd.DataFrame):
    """Filter a country-level estimates frame and split it by the ``female`` column.

    Keeps the rows where ``age``, ``edu`` and ``urban`` all equal 999, drops the
    columns listed in ``drop_list``, and returns three frames:
    ``female == 999``, ``female == 0`` and ``female == 1`` (in that order).
    NOTE(review): 999 presumably encodes the "all groups" aggregate - confirm
    against the data dictionary.
    """
    filtered = df
    for stratum in ('age', 'edu', 'urban'):
        filtered = select(stratum, 999, filtered)
    filtered = filtered.drop(drop_list, axis=1)

    combined, men, women = (
        select('female', code, filtered) for code in (999, 0, 1)
    )
    return combined, men, women

In [25]:
def phdi_(med, min_val, max_val):
    """Map an intake value onto an integer score using equal-width bins.

    ``min_val`` is the threshold that scores 0 and ``max_val`` the threshold that
    scores 10. When ``min_val > max_val`` the scale is inverted (larger ``med``
    gives a lower score).

    * If either threshold is 0, the range is cut into 10 equal intervals and only
      the non-zero end has a dedicated extreme bin.
    * Otherwise both ends are extreme bins (0 and 10) and the open interior is
      cut into 9 equal intervals scoring 1-9.
    """
    inverted = min_val > max_val
    lower, upper = (max_val, min_val) if inverted else (min_val, max_val)
    span = upper - lower

    # Score on the non-inverted scale first; flip once at the end if needed.
    if lower == 0 or upper == 0:
        if med >= upper:
            forward = 10
        else:
            forward = int((med - lower) / (span / 10))
    elif med <= lower:
        forward = 0
    elif med >= upper:
        forward = 10
    else:
        forward = int((med - lower) / (span / 9)) + 1

    return 10 - forward if inverted else forward

In [26]:
def phdi_score(df, min_val, max_val, weight):
    """Score every row of a country-level frame with ``phdi_``.

    Parameters
    ----------
    df : DataFrame with ``iso3``, ``year`` and ``median`` columns.
    min_val, max_val : thresholds forwarded unchanged to ``phdi_``.
    weight : multiplier applied to the integer score returned by ``phdi_``.

    Returns
    -------
    DataFrame with columns ``Country``, ``Year`` and ``PHDI_score``, one row per
    input row. An empty input yields an empty frame that still carries these
    three columns, so downstream pivoting does not fail on a missing column.
    """
    # Iterate column-wise instead of via iterrows(): iterrows builds one Series
    # per row, which does not preserve per-column dtypes and is much slower.
    records = [
        {
            'Country': country,
            'Year': year,
            'PHDI_score': phdi_(median, min_val, max_val) * weight,
        }
        for country, year, median in zip(df['iso3'], df['year'], df['median'])
    ]
    return pd.DataFrame(records, columns=['Country', 'Year', 'PHDI_score'])

def get_pivots(df):
    """Reshape long scores into a Country (rows) x Year (columns) table.

    Uses ``pivot_table`` with its default aggregation, so repeated
    (Year, Country) pairs are averaged.
    """
    year_by_country = df.pivot_table(values='PHDI_score', index='Year', columns='Country')
    return year_by_country.T

def aio (df, min, max, wt):
    """Filter, score and pivot a country-level frame in one call.

    Splits ``df`` with ``emro_select``, scores each part with ``phdi_score``
    (thresholds ``min``/``max``, weight ``wt``) and pivots each result with
    ``get_pivots``. Returns the three pivoted tables in the order
    (female == 999, female == 0, female == 1).
    """
    parts = emro_select(df)
    scored = [phdi_score(part, min, max, wt) for part in parts]
    combined, men, women = [get_pivots(table) for table in scored]
    return combined, men, women

def sum_csv_files(file_paths, sum_columns):
    """Read several CSV files and add up selected columns row by row.

    Parameters
    ----------
    file_paths : iterable of CSV paths; must contain at least one path.
    sum_columns : list of column names to add across the files.

    Returns
    -------
    DataFrame made of the non-summed columns of the FIRST file followed by the
    summed columns.

    Raises
    ------
    ValueError
        If ``file_paths`` is empty.

    Notes
    -----
    Rows are matched on the default row index produced by ``read_csv``, i.e. by
    position. NOTE(review): this assumes every file lists the same rows in the
    same order - confirm for the inputs used here. A row missing from one file
    is treated as 0 for that file (``fill_value=0``).
    """
    file_paths = list(file_paths)
    if not file_paths:
        # Without this guard pd.concat fails later with an unrelated message.
        raise ValueError('file_paths must contain at least one CSV path')

    summed = None
    passthrough = None

    for path in file_paths:
        frame = pd.read_csv(path)
        to_sum = frame[sum_columns]

        if summed is None:
            summed = to_sum
            # Keep the descriptive (non-summed) columns from the first file only.
            passthrough = frame.drop(columns=sum_columns)
        else:
            summed = summed.add(to_sum, fill_value=0)

    return pd.concat([passthrough, summed], axis=1)

def calc_for_item (v0_codes:list, name:str, min, max, wt):
    """Score one dietary component for EMRO countries and write the results.

    Parameters
    ----------
    v0_codes : list of CSV paths whose ``median`` columns are summed.
    name : component name, used as the prefix of the output file names.
    min, max : thresholds passed to the scoring (see ``phdi_``).
    wt : weight applied to each score.

    Returns
    -------
    Tuple of three Country x Year tables (all, males, females), restricted to
    the countries listed in ``emro``. The same tables are written to
    ``scores/phdi/<name>_{all,males,females}.csv``.
    """
    total = sum_csv_files(v0_codes, sum_columns=['median'])
    combined, men, women = aio(total, min=min, max=max, wt=wt)

    combined = combined[combined.index.isin(emro)]
    men = men[men.index.isin(emro)]
    women = women[women.index.isin(emro)]

    # Build paths with os.path.join: the old 'scores\phdi' literals relied on
    # invalid escape sequences and only resolved to a sub-directory on Windows,
    # while calc_total later reads from 'scores/phdi'.
    out_dir = os.path.join('scores', 'phdi')
    os.makedirs(out_dir, exist_ok=True)
    combined.to_csv(os.path.join(out_dir, f'{name}_all.csv'))
    men.to_csv(os.path.join(out_dir, f'{name}_males.csv'))
    women.to_csv(os.path.join(out_dir, f'{name}_females.csv'))
    return combined, men, women

In [27]:
# Folder with the country-level estimate CSVs. Built with os.path.join so the
# notebook also runs on non-Windows systems (the old literals used backslashes).
CNTY_DIR = os.path.join('..', 'raw_data', 'Country-level estimates')


def _cnty_csv(code):
    """Return the path of the country-level CSV for a variable code such as 'v01'."""
    return os.path.join(CNTY_DIR, f'{code}_cnty.csv')


# One entry per component:
#   (CSV files whose medians are summed, component name, min, max, weight)
# min scores 0 and max scores 10; min > max means the scale is inverted by phdi_.
# NOTE(review): 'non_soy_legumes' uses weight 0.5 here but weight 1 in the
# global table (naruto_global) - confirm which one is intended.
naruto = [
    ([_cnty_csv('v01')], 'fruit', 0, 200, 1),
    ([_cnty_csv('v02')], 'veg', 0, 300, 1),
    ([_cnty_csv('v06')], 'nuts', 0, 50, 1),
    ([_cnty_csv('v05')], 'non_soy_legumes', 0, 100, 0.5),
    ([_cnty_csv('v27')], 'saturated_fat', 10, 0, 1),
    ([_cnty_csv('v28'), _cnty_csv('v29')], 'unsaturated_fat', 3.5, 21, 1),
    ([_cnty_csv('v03'), _cnty_csv('v04')], 'tubers', 200, 50, 1),
    ([_cnty_csv('v35')], 'added_sugar', 25, 5, 1),
    ([_cnty_csv('v12')], 'eggs', 120, 13, 1),
    ([_cnty_csv('v57'), _cnty_csv('v14'), _cnty_csv('v13')], 'dairy', 1000, 250, 1),
    ([_cnty_csv('v10')], 'red_meat', 100, 14, 1),
    ([_cnty_csv('v11')], 'sea_food', 0, 28, 1),
]

# Per-component result tables, in the same order as `naruto`.
a = []
m = []
f = []

# i = source files, j = component name, k = min, l = max, n = weight
for i, j, k, l, n in naruto:
    a_temp, m_temp, f_temp = calc_for_item(v0_codes=i, name=j, min=k, max=l, wt=n)
    a.append(a_temp)
    m.append(m_temp)
    f.append(f_temp)

In [28]:
### Whole grain has different ranges for males and females; the 'all' group
### uses the average of the two upper thresholds (82.5 = (90 + 75) / 2).
# Whole grain calculation

wg_ = sum_csv_files(
    [os.path.join('..', 'raw_data', 'Country-level estimates', 'v08_cnty.csv')],
    sum_columns=['median'],
)
wg_all, wg_males, wg_females = emro_select(wg_)

wg_all = phdi_score(wg_all, 0, 82.5, 1).set_index('Country')
wg_males = phdi_score(wg_males, 0, 90, 1).set_index('Country')
wg_females = phdi_score(wg_females, 0, 75, 1).set_index('Country')

# Keep EMRO countries only.
wg_all = wg_all[wg_all.index.isin(emro)]
wg_males = wg_males[wg_males.index.isin(emro)]
wg_females = wg_females[wg_females.index.isin(emro)]

wg_all = get_pivots(wg_all)
wg_males = get_pivots(wg_males)
wg_females = get_pivots(wg_females)

# Create the output folder here as well so this cell does not depend on an
# earlier cell having done it, and build paths portably (the old
# 'scores\phdi\...' literals used invalid escape sequences).
wg_out_dir = os.path.join('scores', 'phdi')
os.makedirs(wg_out_dir, exist_ok=True)
wg_all.to_csv(os.path.join(wg_out_dir, 'whole_grain_all.csv'))
wg_males.to_csv(os.path.join(wg_out_dir, 'whole_grain_males.csv'))
wg_females.to_csv(os.path.join(wg_out_dir, 'whole_grain_females.csv'))

In [29]:
## a function to sum the scores for countries

def calc_total(path: str, suff: str):
    """Add up all per-component score tables in ``path`` for one group.

    Parameters
    ----------
    path : folder containing the ``<component>_<suff>.csv`` score files.
    suff : group suffix, e.g. ``'all'``, ``'males'`` or ``'females'``.

    Returns
    -------
    DataFrame whose first column is the key column of the score files and whose
    remaining columns hold the element-wise sum over all components, or
    ``None`` when no score file matches.

    Notes
    -----
    * ``total_<suff>.csv`` is skipped: it matches the glob pattern, so a total
      left over from a previous run would otherwise be added into the new one.
    * Tables are aligned on their first column before adding instead of by row
      position, so a file with missing or reordered rows cannot be summed
      against the wrong rows. A key or column absent from any file yields NaN.
    """
    previous_total = f'total_{suff}.csv'
    score_files = sorted(
        candidate
        for candidate in glob.glob(f'{path}/*_{suff}.csv')
        if os.path.basename(candidate) != previous_total
    )

    cumulative_df = None
    for score_file in score_files:
        table = pd.read_csv(score_file)
        table = table.set_index(table.columns[0])

        if cumulative_df is None:
            cumulative_df = table
        else:
            cumulative_df = cumulative_df.add(table)

    if cumulative_df is None:
        return None
    # Restore the original layout: key as the first regular column.
    return cumulative_df.reset_index()

# Write one total table per group into the EMRO score folder.
folder = 'scores/phdi'  
for i in ['all', 'males', 'females']:
    temp_df = calc_total(folder, i)
    if temp_df is None:
        # calc_total returns None when no score file matched; fail with a
        # clear message instead of an AttributeError on None.
        raise FileNotFoundError(f'no *_{i}.csv score files found in {folder}')
    temp_df.to_csv(f'{folder}/total_{i}.csv', index=False)

# Global calculations #

In [30]:
def select(col, val, df):
    """Return the rows of ``df`` whose value in column ``col`` equals ``val``.

    Re-defines the function of the same name from earlier in the notebook with
    the same behaviour.
    """
    matches = df[col] == val
    return df[matches]

def not_emro_select(df: pd.DataFrame):
    """Filter an estimates frame and split it by the ``female`` column.

    Keeps the rows where ``age``, ``edu`` and ``urban`` all equal 999 (no
    columns are dropped) and returns three frames: ``female == 999``,
    ``female == 0`` and ``female == 1`` (in that order).
    """
    filtered = df
    for stratum in ('age', 'edu', 'urban'):
        filtered = select(stratum, 999, filtered)

    combined, men, women = (
        select('female', code, filtered) for code in (999, 0, 1)
    )
    return combined, men, women

def phdi_score_global(df, min_val, max_val, weight):
    """Score every row of a global estimates frame with ``phdi_``.

    Parameters
    ----------
    df : DataFrame with ``year`` and ``median`` columns.
    min_val, max_val : thresholds forwarded unchanged to ``phdi_``.
    weight : multiplier applied to the integer score returned by ``phdi_``.

    Returns
    -------
    DataFrame with columns ``Country`` (always the literal ``'global'``),
    ``Year`` and ``PHDI_score``, one row per input row. An empty input yields
    an empty frame that still carries these three columns.
    """
    # Column-wise iteration avoids iterrows(), which does not preserve
    # per-column dtypes and is much slower.
    records = [
        {
            'Country': 'global',
            'Year': year,
            'PHDI_score': phdi_(median, min_val, max_val) * weight,
        }
        for year, median in zip(df['year'], df['median'])
    ]
    return pd.DataFrame(records, columns=['Country', 'Year', 'PHDI_score'])


def aio_global(df, min, max, wt):
    """Filter, score and pivot a global estimates frame in one call.

    Splits ``df`` with ``not_emro_select``, scores each part with
    ``phdi_score_global`` (thresholds ``min``/``max``, weight ``wt``) and pivots
    each result with ``get_pivots``. Returns the three pivoted tables in the
    order (female == 999, female == 0, female == 1).
    """
    parts = not_emro_select(df)
    scored = [phdi_score_global(part, min, max, wt) for part in parts]
    combined, men, women = [get_pivots(table) for table in scored]
    return combined, men, women

def calc_for_item_global(v0_codes: list, name: str, min, max, wt):
    """Score one dietary component on the global estimates and write the results.

    Sums the ``median`` column over the CSVs in ``v0_codes``, scores and pivots
    the result via ``aio_global`` and writes the three tables to
    ``scores/phdi_global/<name>_{all,males,females}.csv``.
    Returns the tuple (all, males, females).
    """
    summed = sum_csv_files(v0_codes, sum_columns=['median'])
    combined, men, women = aio_global(summed, min=min, max=max, wt=wt)

    os.makedirs('scores/phdi_global', exist_ok=True)
    outputs = (('all', combined), ('males', men), ('females', women))
    for suffix, table in outputs:
        table.to_csv(f'scores/phdi_global/{name}_{suffix}.csv')
    return combined, men, women

# 2. RUN GLOBAL PHDI CALCULATION

# Folder with the global estimate CSVs. Built with os.path.join so the notebook
# also runs on non-Windows systems (the old literals used backslashes).
GLOBAL_DIR = os.path.join('..', 'raw_data', 'Global estimates')


def _global_csv(code):
    """Return the path of the global CSV for a variable code such as 'v01'."""
    return os.path.join(GLOBAL_DIR, f'{code}_global.csv')


# One entry per component:
#   (CSV files whose medians are summed, component name, min, max, weight)
# NOTE(review): 'non_soy_legumes' uses weight 1 here but weight 0.5 in the
# country-level table (naruto) - confirm which one is intended.
naruto_global = [
    ([_global_csv('v01')], 'fruit', 0, 200, 1),
    ([_global_csv('v02')], 'veg', 0, 300, 1),
    ([_global_csv('v06')], 'nuts', 0, 50, 1),
    ([_global_csv('v05')], 'non_soy_legumes', 0, 100, 1),
    ([_global_csv('v27')], 'saturated_fat', 10, 0, 1),
    ([_global_csv('v28'), _global_csv('v29')], 'unsaturated_fat', 3.5, 21, 1),
    ([_global_csv('v03'), _global_csv('v04')], 'tubers', 200, 50, 1),
    ([_global_csv('v35')], 'added_sugar', 25, 5, 1),
    ([_global_csv('v12')], 'eggs', 120, 13, 1),
    ([_global_csv('v57'), _global_csv('v14'), _global_csv('v13')], 'dairy', 1000, 250, 1),
    ([_global_csv('v10')], 'red_meat', 100, 14, 1),
    ([_global_csv('v11')], 'sea_food', 0, 28, 1),
]

# Per-component result tables, in the same order as `naruto_global`.
a_g, m_g, f_g = [], [], []

# i = source files, j = component name, k = min, l = max, w = weight
for i, j, k, l, w in naruto_global:
    a_temp, m_temp, f_temp = calc_for_item_global(i, j, k, l, w)
    a_g.append(a_temp)
    m_g.append(m_temp)
    f_g.append(f_temp)

### Whole grain calculation for global data
# 3. SPECIAL CASE: WHOLE GRAINS

wg_df = sum_csv_files(
    [os.path.join('..', 'raw_data', 'Global estimates', 'v08_global.csv')],
    sum_columns=['median'],
)
wg_all, wg_males, wg_females = not_emro_select(wg_df)

wg_all = phdi_score_global(wg_all, 0, 82.5, 1)       # average of male/female
wg_males = phdi_score_global(wg_males, 0, 90, 1)     # male range
wg_females = phdi_score_global(wg_females, 0, 75, 1) # female range

wg_all = get_pivots(wg_all)
wg_males = get_pivots(wg_males)
wg_females = get_pivots(wg_females)

# Create the folder here too so this step does not depend on
# calc_for_item_global having run first.
os.makedirs('scores/phdi_global', exist_ok=True)
wg_all.to_csv('scores/phdi_global/whole_grain_all.csv')
wg_males.to_csv('scores/phdi_global/whole_grain_males.csv')
wg_females.to_csv('scores/phdi_global/whole_grain_females.csv')

# Write one total table per group into the global score folder.
folder = 'scores/phdi_global'  
for i in ['all', 'males', 'females']:
    temp_df = calc_total(folder, i)
    if temp_df is None:
        # calc_total returns None when no score file matched; fail with a
        # clear message instead of an AttributeError on None.
        raise FileNotFoundError(f'no *_{i}.csv score files found in {folder}')
    temp_df.to_csv(f'{folder}/total_{i}.csv', index=False)

* Double check *

In [38]:
# Spot check: show the filtered v10 rows for IRN (female == 999 group).
testsss = sum_csv_files([r'..\raw_data\Country-level estimates\v10_cnty.csv'], sum_columns=['median'])
ta, tm, tf = emro_select(testsss)
ta = ta.loc[ta['iso3'] == 'IRN']
ta

Unnamed: 0,iso3,female,year,v10_type,v10_type_desc,lowerci_95,upperci_95,serving,s_lowerci_95,s_upperci_95,median
833864,IRN,999,1990,1,1: Foods & Beverages,27.243746,34.546123,0.310821,0.276502,0.350615,30.62521
833865,IRN,999,1995,1,1: Foods & Beverages,26.214717,33.149892,0.298779,0.266058,0.336445,29.438681
833866,IRN,999,2000,1,1: Foods & Beverages,26.434012,33.291851,0.300412,0.268284,0.337885,29.599586
833867,IRN,999,2005,1,1: Foods & Beverages,29.383131,36.854063,0.332843,0.298215,0.374039,32.795025
833868,IRN,999,2010,1,1: Foods & Beverages,21.055279,26.181378,0.237959,0.213694,0.26572,23.446091
833869,IRN,999,2015,1,1: Foods & Beverages,14.908789,18.484071,0.168501,0.151312,0.187598,16.602358
833870,IRN,999,2018,1,1: Foods & Beverages,14.628016,18.088993,0.165087,0.148463,0.183589,16.266049
833871,IRN,999,2020,1,1: Foods & Beverages,14.753168,18.219526,0.166393,0.149733,0.184914,16.394716


In [34]:
def emro_select2(df:pd.DataFrame):
    """Variant of ``emro_select`` that keeps all columns.

    Keeps the rows where ``age``, ``edu`` and ``urban`` all equal 999 and
    returns three frames: ``female == 999``, ``female == 0`` and
    ``female == 1`` (in that order). Unlike ``emro_select`` it does not drop
    the ``drop_list`` columns.
    """
    filtered = df
    for stratum in ('age', 'edu', 'urban'):
        filtered = select(stratum, 999, filtered)

    combined, men, women = (
        select('female', code, filtered) for code in (999, 0, 1)
    )
    return combined, men, women

# Spot check: show the filtered global v12 rows (female == 999 group).
testsss = sum_csv_files([r'..\raw_data\Global estimates\v12_global.csv'], sum_columns=['median'])
ta, tm, tf = emro_select2(testsss)
ta

Unnamed: 0,age,female,urban,edu,year,varnum,v12_type,v12_type_desc,upperci_95,lowerci_95,serving,s_lowerci_95,s_upperci_95,median
827,999.0,999,999,999,1990,12,1,1: Foods & Beverages,13.703129,11.612341,0.230899,0.213423,0.251849,12.56324
1655,999.0,999,999,999,1995,12,1,1: Foods & Beverages,16.661278,13.170258,0.269799,0.241789,0.30588,14.695969
2483,999.0,999,999,999,2000,12,1,1: Foods & Beverages,19.312057,14.773788,0.307169,0.271029,0.354285,16.743795
3311,999.0,999,999,999,2005,12,1,1: Foods & Beverages,20.788176,15.867687,0.329944,0.29099,0.381225,17.991831
4139,999.0,999,999,999,2010,12,1,1: Foods & Beverages,23.259062,17.682497,0.367695,0.324212,0.426459,20.054089
4967,999.0,999,999,999,2015,12,1,1: Foods & Beverages,23.382547,17.863041,0.37171,0.327402,0.428566,20.2805
5795,999.0,999,999,999,2018,12,1,1: Foods & Beverages,24.279184,18.461086,0.384194,0.338301,0.444918,20.965443
