In [10]:
import numpy as np 
import pandas as pd 
import os 
import glob
import warnings
warnings.filterwarnings('ignore')    ## I don't like pandas setting with copy warnings 

In [11]:
### FYI: GDD has no data for Somalia ('SOM'). It is kept in the list anyway so
### the list mirrors the full EMRO membership.

# ISO3 codes of the WHO Eastern Mediterranean Region (EMRO) members.
# 'LBY' (Libya) replaces the earlier 'LBR' (Liberia): Liberia is not an EMRO
# member, and Libya was otherwise missing from the region.
emro = ['AFG', 'ARE', 'BHR', 'DJI', 'EGY', 'IRN', 'IRQ', 'JOR',
        'KWT', 'LBN', 'LBY', 'MAR', 'OMN', 'PAK', 'PSE', 'QAT', 'SAU',
        'SDN', 'SOM', 'SYR', 'YEM', 'TUN']

# Columns removed by emro_select once the rows have been filtered on them.
drop_list = ['superregion2', 'age', 'urban', 'edu']

def select(col, val, df):
    """Return the rows of ``df`` whose value in column ``col`` equals ``val``.

    The input frame is not modified; the original row index is preserved.
    """
    matches = df[col] == val
    return df[matches]

def emro_select(df: pd.DataFrame):
    """Split a frame into (both sexes, males, females) sub-frames.

    Rows are first restricted to ``age == 999``, ``edu == 999`` and
    ``urban == 999`` (999 presumably being the "all strata" code of the
    dataset -- confirm against the data dictionary), then the columns named in
    ``drop_list`` are removed.  The remaining rows are split on the ``female``
    column: 999, 0 and 1 respectively.

    Note: despite the name, no country filter is applied here.

    Returns:
        tuple of three DataFrames: (female == 999, female == 0, female == 1).
    """
    aggregated = df
    for stratum in ('age', 'edu', 'urban'):
        aggregated = select(stratum, 999, aggregated)
    aggregated = aggregated.drop(drop_list, axis=1)

    both_sexes = select('female', 999, aggregated)
    men = select('female', 0, aggregated)
    women = select('female', 1, aggregated)

    return both_sexes, men, women

In [12]:

def _quartile_scores(df, value_column, quantiles, labels):
    """Shared implementation behind pos_quartile_scores / neg_quartile_scores.

    Args:
        df: frame holding the values to score.
        value_column: name of the column whose values are binned.
        quantiles: list of quantile levels; only the first three are used as
            bin edges (four bins).
        labels: four scores, assigned to the bins from lowest to highest.

    Returns:
        A copy of ``df`` with an added (or overwritten) ``score`` column.
        ``df`` itself is left untouched.
    """
    cutoffs = df[value_column].quantile(quantiles)
    # Look the three bin edges up once instead of on every row.
    edges = [cutoffs[quantiles[i]] for i in range(3)]

    def get_score(value):
        for edge, label in zip(edges, labels):
            if value <= edge:
                return label
        # NOTE: NaN compares False against every edge, so a missing value
        # also ends up here and receives the last label.
        return labels[3]

    # Work on a copy: writing into the argument mutated the caller's frame
    # (and triggered SettingWithCopyWarning when it was a filtered slice).
    scored = df.copy()
    scored['score'] = scored[value_column].apply(get_score)
    return scored


def pos_quartile_scores(df, value_column, quantiles=[0.25, 0.5, 0.75]):
    """Score a "higher is better" column from 1 (lowest bin) to 4 (highest).

    Values up to the first quantile score 1, up to the second 2, up to the
    third 3, and everything above scores 4.

    Args:
        df: frame holding the values to score.
        value_column: name of the column to score.
        quantiles: three quantile levels used as bin edges.

    Returns:
        A copy of ``df`` with a ``score`` column; ``df`` is not modified.
    """
    return _quartile_scores(df, value_column, quantiles, (1, 2, 3, 4))


def neg_quartile_scores(df, value_column, quantiles=[0.25, 0.5, 0.75]):
    """Score a "lower is better" column from 4 (lowest bin) to 1 (highest).

    Values up to the first quantile score 4, up to the second 3, up to the
    third 2, and everything above scores 1.

    Args:
        df: frame holding the values to score.
        value_column: name of the column to score.
        quantiles: three quantile levels used as bin edges.

    Returns:
        A copy of ``df`` with a ``score`` column; ``df`` is not modified.
    """
    return _quartile_scores(df, value_column, quantiles, (4, 3, 2, 1))

def dash_score(data: pd.DataFrame, type: str) -> pd.DataFrame:
    """Assign quartile scores to the ``median`` column, separately per year.

    Args:
        data: frame with at least ``year`` and ``median`` columns.
        type: ``'pos'`` to score with ``pos_quartile_scores`` or ``'neg'`` to
            score with ``neg_quartile_scores``.

    Returns:
        The per-year scored frames concatenated row-wise (each carrying a
        ``score`` column).  For an unknown ``type`` a message is printed and an
        empty DataFrame is returned.  If ``data`` has no rows, an empty frame
        with the columns of ``data`` plus ``score`` is returned.
    """
    # Validate once up front; previously the check ran inside the loop, so an
    # input without any year skipped it and crashed in pd.concat instead.
    if type == 'pos':
        scorer = pos_quartile_scores
    elif type == 'neg':
        scorer = neg_quartile_scores
    else:
        print('Fix your type')
        return pd.DataFrame()  # Return an empty DataFrame if the type is invalid

    scores = [
        scorer(select('year', year, data), 'median')
        for year in data['year'].unique()
    ]

    if not scores:
        # pd.concat([]) raises ValueError; hand back an empty, correctly
        # shaped frame instead so downstream column lookups still work.
        return data.assign(score=pd.Series(dtype='int64'))

    # Concatenate all the per-year DataFrames
    return pd.concat(scores, axis=0)

def get_pivots(df):
    """Reshape long-format scores into a country-by-year table.

    Builds a pivot table of ``score`` with years as rows and ``iso3`` codes as
    columns (pandas' default aggregation), then flips it so each row is a
    country and each column a year.
    """
    year_by_country = df.pivot_table(values='score', index='year', columns='iso3')
    return year_by_country.T

def aio(path: str, type: str, pivot: bool = True):
    """Read a CSV and score it for both sexes, males and females.

    The file at ``path`` is loaded, split with ``emro_select``, and each of
    the three resulting frames is scored with ``dash_score``.

    Args:
        path: location of the CSV to read.
        type: scoring direction forwarded to ``dash_score``.
        pivot: when equal to ``True``, each scored frame is reshaped with
            ``get_pivots`` before being returned.

    Returns:
        tuple of three DataFrames: (both sexes, males, females).
    """
    groups = emro_select(pd.read_csv(path))

    # Score every group first, then (optionally) pivot every group.
    scored = [dash_score(group, type) for group in groups]
    if pivot == True:
        scored = [get_pivots(group) for group in scored]

    both_sexes, men, women = scored
    return both_sexes, men, women

In [13]:
## Same pipeline as `aio`, but it takes an already-loaded DataFrame instead of a file path, and keeps only the EMRO countries after scoring
def aio_v2(df: pd.DataFrame, type: str, pivot: bool = True):
    """Score an in-memory frame and keep only the EMRO countries.

    ``df`` is split with ``emro_select`` and each part is scored with
    ``dash_score``.  Scoring uses every country present in ``df``; only
    afterwards are the rows narrowed to the ISO3 codes listed in ``emro``.

    Args:
        df: frame to score (already loaded).
        type: scoring direction forwarded to ``dash_score``.
        pivot: when equal to ``True``, each result is reshaped with
            ``get_pivots`` before being returned.

    Returns:
        tuple of three DataFrames: (both sexes, males, females).
    """
    scored = [dash_score(group, type) for group in emro_select(df)]

    # The EMRO filter is ACTIVE and runs after scoring, so the quartiles are
    # computed over all countries in df and only the output is restricted.
    scored = [group[group['iso3'].isin(emro)] for group in scored]

    if pivot == True:
        scored = [get_pivots(group) for group in scored]

    both_sexes, men, women = scored
    return both_sexes, men, women

def get_dash_scores(name: str, save: bool, v0_codes: list, type: str):
    """Sum the ``median`` column of one or more CSVs and score the result.

    Args:
        name: prefix for the output files.
        save: when true, the three result tables are written to
            ``scores/dash_global/<name>_{all,males,females}.csv``.
        v0_codes: paths of the CSV files whose ``median`` columns are summed.
        type: scoring direction forwarded to ``aio_v2``.

    Returns:
        tuple of three pivoted DataFrames: (both sexes, males, females).

    Raises:
        ValueError: if ``v0_codes`` is empty.
    """
    def sum_csv_files(file_paths, sum_columns):
        """Add ``sum_columns`` across files; other columns come from the first file."""
        if not file_paths:
            raise ValueError('v0_codes must contain at least one CSV path')

        df_sum = None
        non_sum_columns = None

        for file in file_paths:
            df = pd.read_csv(file)

            # Separate the columns to sum and the other columns
            df_sum_cols = df[sum_columns]

            if df_sum is None:
                df_sum = df_sum_cols
                # Keep the non-summed columns from the first file
                non_sum_columns = df.drop(columns=sum_columns)
            else:
                # Sum the specified columns, aligned by row index.
                # NOTE(review): this assumes every file lists its rows in the
                # same order -- confirm, otherwise rows are mis-paired.
                # fill_value=0 treats a value missing in one file as zero.
                df_sum = df_sum.add(df_sum_cols, fill_value=0)

        # Concatenate the non-summed columns back with the summed columns
        return pd.concat([non_sum_columns, df_sum], axis=1)

    total = sum_csv_files(v0_codes, sum_columns=['median'])
    all_scores, male_scores, female_scores = aio_v2(total, type=type, pivot=True)

    if save:
        # Build the paths with os.path.join: the previous 'scores\dash_global\...'
        # f-strings relied on invalid escape sequences and only worked on Windows.
        out_dir = os.path.join('scores', 'dash_global')
        os.makedirs(out_dir, exist_ok=True)
        all_scores.to_csv(os.path.join(out_dir, f'{name}_all.csv'))
        male_scores.to_csv(os.path.join(out_dir, f'{name}_males.csv'))
        female_scores.to_csv(os.path.join(out_dir, f'{name}_females.csv'))
    return all_scores, male_scores, female_scores

In [14]:
# Folder holding the country-level CSVs.  Built with os.path.join so the cell
# runs on any OS; on Windows this yields the same '..\raw_data\...' strings
# that used to be hard-coded.
GDD_DIR = os.path.join('..', 'raw_data', 'Country-level estimates')

# One entry per score component:
#   (variable codes whose medians are summed, output name, scoring direction)
# The order of the codes is kept: the first file supplies the non-summed columns.
DASH_COMPONENTS = [
    (['v08'], 'whole_grain', 'pos'),
    (['v01'], 'fruit', 'pos'),
    (['v02', 'v04'], 'veg', 'pos'),
    (['v06'], 'nuts', 'pos'),
    (['v05'], 'legumes', 'pos'),
    (['v15'], 'SSB', 'neg'),
    (['v35'], 'sugar', 'neg'),
    (['v57', 'v14', 'v13'], 'dairy', 'pos'),
    (['v09', 'v10'], 'meats', 'neg'),
    (['v37'], 'sodium', 'neg'),
]

# Same structure as before: a list of (list of CSV paths, name, direction).
luffy = [
    ([os.path.join(GDD_DIR, f'{code}_cnty.csv') for code in codes], name, direction)
    for codes, name, direction in DASH_COMPONENTS
]

for paths, name, direction in luffy:
    get_dash_scores(name=name, save=True, v0_codes=paths, type=direction)

In [15]:
## a function to sum the scores for countries 

def calc_total(path: str, suff: str):
    """Sum the per-component score tables found in ``path``.

    Every ``<path>/*_<suff>.csv`` file is read with its first column as the
    row key, and the tables are added together cell by cell, matched on row
    key and column label.  A cell missing from any file is NaN in the total.

    Args:
        path: folder containing the score CSVs.
        suff: file-name suffix selecting the group (e.g. ``'all'``).

    Returns:
        DataFrame whose first column is the row key and whose remaining
        columns are the summed scores, or ``None`` if no file matched.
    """
    # The output of a previous run ('total_<suff>.csv') lives in the same
    # folder and matches the glob; skip it so re-running does not add the old
    # total on top of the components.  sorted() makes the order deterministic.
    previous_total = f'total_{suff}.csv'
    score_files = sorted(
        file for file in glob.glob(f'{path}/*_{suff}.csv')
        if os.path.basename(file) != previous_total
    )

    cumulative_df = None

    for file in score_files:
        # index_col=0 keys the rows by the first column, so the addition
        # pairs rows by key rather than by their position in the file.
        temp = pd.read_csv(file, index_col=0)

        if cumulative_df is None:
            cumulative_df = temp
        else:
            cumulative_df = cumulative_df.add(temp)

    if cumulative_df is None:
        return None

    # Restore the key as the first ordinary column, as callers expect.
    return cumulative_df.reset_index()

# Write one summed table per group next to the component score files.
folder = 'scores/dash_global'
for suffix in ('all', 'males', 'females'):
    calc_total(folder, suffix).to_csv(f'{folder}/total_{suffix}.csv', index=False)


So in **dash_emro** we have scores based on how countries scored in the EMRO region

and in **dash_global** we have their scores based on how they scored across the world