# Calculate statistics for an entire database and sectors within the database

## First Step : calculating indicators for each {activity|impact method}

In [1]:
import h5py
import numpy as np
from scipy import stats
import os

#MC_results_dict={act:{ic_name:[MC_results]}} as the output of MC_multi_impact_entire_DB()

#MC results are stored in the HDF5 file as 1-D numpy arrays whose size equals the number of iterations,
#under paths of the form: Uncertainty LCI 1 LCIA 1/ActKey/impact method name



def calculating_endpoint_sum(hdf5_file_MC_LCA_results,hdf5_file_MC_statistics):
    """Accumulate, per activity and endpoint, the MC results of every impact method.

    The results file is walked as uncertainty level / activity / impact method.
    For each impact method the stored array is added element-wise to the
    dataset '/<uncertainty level>/<activity>/<endpoint name>/endpoint_sum' of
    the statistics file; the dataset is created on first use.

    Parameters
    ----------
    hdf5_file_MC_LCA_results : open HDF5 file (or group-like mapping) read with
        .items(); top-level entries whose name contains
        'lci_iteration_name_list' are ignored.
    hdf5_file_MC_statistics : open, writable HDF5 file receiving the sums.

    Returns
    -------
    None
    """
    for level_name, level_group in hdf5_file_MC_LCA_results.items():

        #NOTE(review): these entries presumably hold iteration names rather than results -- confirm
        if 'lci_iteration_name_list' in level_name:
            continue

        for act_key, act_group in level_group.items():

            for impact_method_name, impact_method_dataset in act_group.items():

                #If endpoint names are the second name in impact method tuples (...,...,...)
                name_parts=impact_method_name.split(',', 3)
                endpoint_name='{},{})'.format(name_parts[0],name_parts[1])

                #If endpoint names are the first name in impact method tuples (...,...,...)
                #endpoint_name=impact_method_name.rsplit(',', 2)[0]+')'

                endpoint_sum_path='/{}/{}/{}/endpoint_sum'.format(level_name,act_key,endpoint_name)

                contribution_to_add=impact_method_dataset[()]

                #Explicit existence test instead of a bare except, so that a real
                #error while adding (e.g. arrays of different sizes) is not hidden
                if endpoint_sum_path in hdf5_file_MC_statistics:
                    endpoint_sum_dataset=hdf5_file_MC_statistics[endpoint_sum_path]
                    endpoint_sum_dataset[...]=endpoint_sum_dataset[()]+contribution_to_add
                else:
                    hdf5_file_MC_statistics.create_dataset(endpoint_sum_path,data=contribution_to_add)

    return



def sensivity_index_1st(Y,X,bin_size=50):
    """Estimate the first-order sensitivity index of Y with respect to X by binning.

    The (Y, X) pairs are sorted by ascending X and cut into consecutive bins of
    bin_size pairs; the index is the variance of the per-bin means of Y divided
    by the variance of Y.

    Parameters
    ----------
    Y : array-like, 1-D output sample.
    X : array-like, 1-D input sample, same length as Y.
    bin_size : number of pairs per bin; must be a strictly positive whole
        number (an integer-valued float such as 50.0 is accepted) and the size
        of Y must be a multiple of it.

    Returns
    -------
    The sensitivity index, or None (after printing a message) when bin_size is
    not usable.
    """
    #Accept any array-like (lists, HDF5 datasets, ...), not only numpy arrays
    Y=np.asarray(Y)
    X=np.asarray(X)

    #Validate bin_size BEFORE using it as a modulus: 0 or 0.5 would otherwise
    #raise ZeroDivisionError and a negative value would fail later in reshape
    if bin_size <= 0:
        print("bin_size should be strictly positive")
        return None

    if bin_size != int(bin_size):
        print("bin_size should be an integer")
        return None

    bin_size=int(bin_size)

    if Y.size%bin_size != 0:
        print("bin_size should be adjusted to be a multiple of Y size")
        return None

    bins=Y.size//bin_size

    #Gathering Y and X, then sorting the pairs by X ascending
    pairs=np.column_stack((Y,X))
    pairs=pairs[pairs[:, 1].argsort()]

    #Calculating the mean of Y for each bin (one row per bin)
    bin_means=np.mean(np.reshape(pairs[:,0],(bins,bin_size)), axis=1)

    #sensivity_index_1st
    si_1st=np.var(bin_means)/np.var(Y)

    return si_1st
    
    



#MC_results_dict={act_key:{ic_name:[MC_results]}} as the output of MC_multi_impact_entire_DB()

def _endpoint_name_of(impact_method_name):
    """Return the endpoint name: the first two comma-separated elements of the impact method name, closed by ')'."""
    #If endpoint names are the first name in impact method tuples (...,...,...), use instead:
    #impact_method_name.rsplit(',', 2)[0]+')'
    name_parts=impact_method_name.split(',', 3)
    return '{},{})'.format(name_parts[0],name_parts[1])


def _ratio_or_na(numerator,denominator):
    """Return numerator/denominator, or 'NA' when the division itself raises."""
    #With numpy scalars a zero denominator normally yields inf/nan (with a
    #warning) rather than an exception; 'NA' is only produced for plain Python
    #numbers or when numpy is configured to raise.
    try:
        return numerator/denominator
    except (ZeroDivisionError, FloatingPointError):
        return 'NA'


def _store_indicator(hdf5_file,dataset_path,value):
    """Write value at dataset_path, overwriting in place when the dataset already exists."""
    if dataset_path in hdf5_file:
        hdf5_file[dataset_path][...]=value
    else:
        hdf5_file.create_dataset(dataset_path,data=value)


def _regular_and_dispersion_stats(values):
    """Return the dict of descriptive and dispersion statistics of a 1-D sample."""
    stats_dict={}

    p2_5,p25,p50,p75,p97_5=np.percentile(values,[2.5,25,50,75,97.5])

    #Regular stats
    stats_dict['mean']=np.mean(values)
    stats_dict['variance']=np.var(values)
    stats_dict['std dev']=np.std(values)
    stats_dict['minimum']=np.min(values)
    stats_dict['maximum']=np.max(values)
    stats_dict['2.5th percentile']=p2_5
    stats_dict['25th percentile']=p25
    stats_dict['median']=p50
    stats_dict['75th percentile']=p75
    stats_dict['97.5th percentile']=p97_5
    stats_dict['number of iterations']=len(values)

    #Stats to measure the dispersion
    stats_dict['MADM']=np.percentile(np.abs(values-stats_dict['median']),50)
    stats_dict['IQR']=stats_dict['75th percentile']-stats_dict['25th percentile']
    stats_dict['Spread']=stats_dict['maximum']-stats_dict['minimum']
    stats_dict['CI95']=stats_dict['97.5th percentile']-stats_dict['2.5th percentile']

    stats_dict['Quartile coeff of dispersion']=_ratio_or_na(
        stats_dict['IQR'],
        stats_dict['75th percentile']+stats_dict['25th percentile'])
    stats_dict['CV']=_ratio_or_na(stats_dict['std dev'],stats_dict['mean'])
    stats_dict['CV modified']=_ratio_or_na(
        stats_dict['std dev'],
        np.sqrt((stats_dict['maximum']-stats_dict['mean'])*(stats_dict['mean']-stats_dict['minimum'])))
    stats_dict['CV robust']=_ratio_or_na(stats_dict['MADM'],stats_dict['median'])
    #The backslash is part of the stored indicator name; it is escaped so that
    #the literal is not an invalid escape sequence
    stats_dict['IQR\\spread']=_ratio_or_na(stats_dict['IQR'],stats_dict['Spread'])
    stats_dict['IQR\\CI95']=_ratio_or_na(stats_dict['IQR'],stats_dict['CI95'])

    return stats_dict


def calculating_endpoint_stats_indicators(hdf5_file_MC_LCA_results,hdf5_file_MC_statistics,bin_size):
    """Compute and store statistical indicators for every {activity|impact method}.

    The results file is walked as uncertainty level / activity / impact method
    (top-level entries whose name contains 'lci_iteration_name_list' are
    ignored). For each impact method the following are written under
    '/<uncertainty level>/<activity>/<impact method>/<indicator>' in the
    statistics file:

    - descriptive and dispersion statistics of the MC results;
    - the Spearman rank correlation (coefficient and p-value) between the
      impact method results and the 'endpoint_sum' dataset of its endpoint; a
      NaN coefficient is stored as 0;
    - the first-order sensitivity index of the endpoint sum with respect to the
      impact method;
    - the contribution to variance: squared Spearman coefficient divided by the
      sum of squared coefficients over the impact methods of the same endpoint
      and activity.

    Finally, for every uncertainty level other than the reference level
    'Uncertainty LCI 1 LCIA 1', the first-order sensitivity index of the
    reference endpoint sum with respect to that level's endpoint sum is written
    under '/<uncertainty level>/<activity>/<endpoint name>/<indicator>'.
    Activities that have no 'endpoint_sum' for a given endpoint are skipped in
    this last step.

    Parameters
    ----------
    hdf5_file_MC_LCA_results : open HDF5 file holding the MC results.
    hdf5_file_MC_statistics : open, writable HDF5 file that already contains
        the 'endpoint_sum' datasets (see calculating_endpoint_sum) and receives
        the indicators.
    bin_size : bin size forwarded to sensivity_index_1st; the number of
        iterations must be a multiple of it.

    Raises
    ------
    ValueError : when sensivity_index_1st rejects bin_size for a dataset.
    KeyError : when a required 'endpoint_sum' dataset (including the one of the
        reference level) is missing from the statistics file.
    """
    reference_level_name='Uncertainty LCI 1 LCIA 1'
    spearman_coefficient_key='Spearmann rank correlation - coefficient'

    endpoint_names=set()

    for level_name, level_group in hdf5_file_MC_LCA_results.items():

        if 'lci_iteration_name_list' in level_name:
            continue

        for act_key, act_group in level_group.items():

            #Sum of squared Spearman coefficients per endpoint, for this activity only
            sum_spear_corr_endpoint={}

            for impact_method_name, impact_method_dataset in act_group.items():

                endpoint_name=_endpoint_name_of(impact_method_name)
                endpoint_names.add(endpoint_name)

                endpoint_group_path='/{}/{}/{}'.format(level_name,act_key,endpoint_name)
                impact_method_group_path='/{}/{}/{}'.format(level_name,act_key,impact_method_name)

                #Read each dataset from disk once instead of once per statistic
                values=impact_method_dataset[()]
                endpoint_sum=hdf5_file_MC_statistics['{}/endpoint_sum'.format(endpoint_group_path)][()]

                stats_dict=_regular_and_dispersion_stats(values)

                #Statistics based on endpoint_sum (correlation computed once)
                spearman_result=stats.spearmanr(values,endpoint_sum)
                spearman_coefficient=spearman_result[0]
                stats_dict['Spearmann rank correlation - pvalue']=spearman_result[1]

                #The coefficient is NaN when one of the samples is constant
                if np.isnan(spearman_coefficient):
                    spearman_coefficient=0
                stats_dict[spearman_coefficient_key]=spearman_coefficient

                sum_spear_corr_endpoint[endpoint_name]=(
                    sum_spear_corr_endpoint.get(endpoint_name,0)+spearman_coefficient**2)

                sensitivity_index=sensivity_index_1st(Y=endpoint_sum,X=values,bin_size=bin_size)
                if sensitivity_index is None:
                    raise ValueError('bin_size={} cannot be used with the results stored in {}'.format(bin_size,impact_method_group_path))
                stats_dict['Sensitivity index 1st order - midpoint to endpoint']=sensitivity_index

                #Store values
                for indicator, value in stats_dict.items():
                    _store_indicator(hdf5_file_MC_statistics,'{}/{}'.format(impact_method_group_path,indicator),value)

            #Contribution To Variance needs the complete per-endpoint sums,
            #hence a second pass over the impact methods of the activity
            for impact_method_name in act_group.keys():

                endpoint_name=_endpoint_name_of(impact_method_name)
                impact_method_group_path='/{}/{}/{}'.format(level_name,act_key,impact_method_name)

                stored_coefficient=hdf5_file_MC_statistics['{}/{}'.format(impact_method_group_path,spearman_coefficient_key)][()]
                contribution_to_variance=stored_coefficient**2/sum_spear_corr_endpoint[endpoint_name]

                _store_indicator(
                    hdf5_file_MC_statistics,
                    '{}/{}'.format(impact_method_group_path,'Spearmann CTV midpoint to endpoint'),
                    contribution_to_variance)


    #Calculating Sensitivity index between uncertainty level for endpoint_sum
    for endpoint_name in sorted(endpoint_names):

        for level_name, level_group in hdf5_file_MC_LCA_results.items():

            #Skip non-result entries and the reference level itself (the original
            #"('a' or 'b') not in name" test only ever checked the first string)
            if 'lci_iteration_name_list' in level_name or 'LCI 1 LCIA 1' in level_name:
                continue

            for act_key in level_group.keys():

                endpoint_group_path='/{}/{}/{}'.format(level_name,act_key,endpoint_name)
                endpoint_sum_path='{}/endpoint_sum'.format(endpoint_group_path)

                #This activity has no impact method for this endpoint
                if endpoint_sum_path not in hdf5_file_MC_statistics:
                    continue

                endpoint_sum=hdf5_file_MC_statistics[endpoint_sum_path][()]

                #Reference endpoint sum, read from the reference uncertainty level
                endpoint_11_group_path='/{}/{}/{}'.format(reference_level_name,act_key,endpoint_name)
                endpoint_sum_11=hdf5_file_MC_statistics['{}/endpoint_sum'.format(endpoint_11_group_path)][()]

                sensitivity_index=sensivity_index_1st(Y=endpoint_sum_11,X=endpoint_sum,bin_size=bin_size)
                if sensitivity_index is None:
                    raise ValueError('bin_size={} cannot be used with the results stored in {}'.format(bin_size,endpoint_group_path))

                #Store values
                _store_indicator(
                    hdf5_file_MC_statistics,
                    '{}/{}'.format(endpoint_group_path,'Sensitivity index 1st order - endpoint between uncertainty level'),
                    sensitivity_index)

    return


def calculating_endpoint_stats_entire_database_aggregated_MC_results(hdf5_file_MC_LCA_results_path, dir_path_for_saving,bin_size):
    """Compute the endpoint sums and all statistical indicators for a MC results file.

    The statistics are written to 'MC_statistics_aggregated_results.hdf5'
    inside dir_path_for_saving. The file is opened in 'w-' mode, i.e. it is
    created and the call fails if it already exists.

    Parameters
    ----------
    hdf5_file_MC_LCA_results_path : path of the HDF5 file holding the MC LCA
        results (opened read-only).
    dir_path_for_saving : directory in which the statistics file is created.
    bin_size : bin size used for the first-order sensitivity indices.

    Returns
    -------
    None
    """
    statistics_file_path=os.path.join(dir_path_for_saving,'MC_statistics_aggregated_results.hdf5')

    #The results file is opened first so that a wrong input path does not leave
    #an empty statistics file behind (which would block the next 'w-' run).
    #The context managers close both files even if a computation raises.
    with h5py.File(hdf5_file_MC_LCA_results_path,'r') as hdf5_file_MC_LCA_results:
        with h5py.File(statistics_file_path,'w-') as hdf5_file_MC_statistics:

            #Calculate stats --> only make sense if impact categories in hdf5_file_MC_LCA_results are endpoint per midpoint categories
            calculating_endpoint_sum(hdf5_file_MC_LCA_results,hdf5_file_MC_statistics)
            calculating_endpoint_stats_indicators(hdf5_file_MC_LCA_results,hdf5_file_MC_statistics,bin_size)

    return
    


In [2]:
#Windows paths are written as raw strings so that their backslashes are never
#interpreted as escape sequences
hdf5_file_MC_LCA_results_path=r"D:\Dossiers professionnels\Logiciels\Brightway 2\Test Dependant LCA Monte Carlo - test 3\LCA_Dependant_Monte_Carlo_aggregated_results_ALL.hdf5"
dir_path_for_saving=r"D:\Dossiers professionnels\Logiciels\Brightway 2\Test Dependant LCA Monte Carlo - test 3"

#Number of MC iterations per bin for the first-order sensitivity indices
bin_size=100
calculating_endpoint_stats_entire_database_aggregated_MC_results(hdf5_file_MC_LCA_results_path, dir_path_for_saving,bin_size)

  c /= stddev[:, None]
  c /= stddev[None, :]
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [114]:
#import numpy as np
X=np.random.random(1000)
Y=np.arange(1000)

np.var(Y),np.var(X)

(83333.25, 0.07817271246938956)

In [120]:
sensivity_index_1st(Y,X,bin_size=50)

[ 489.06  470.76  530.52  584.16  467.08  479.86  475.88  497.58  511.6
  448.78  612.68  505.88  479.9   504.64  547.54  480.06  510.24  504.32
  472.92  416.54]


0.022473641673641659

In [46]:
pairs=np.column_stack((Y,X))
pairs

array([[ 0.05561784,  0.67129819],
       [ 0.57679289,  0.72835559],
       [ 0.31340933,  0.14037702],
       [ 0.74814291,  0.91584807],
       [ 0.88762585,  0.83594971],
       [ 0.68554716,  0.31560965],
       [ 0.13855739,  0.22897577],
       [ 0.90786571,  0.12950932],
       [ 0.12784339,  0.86145301],
       [ 0.86138512,  0.26797564],
       [ 0.59398654,  0.21868958],
       [ 0.74969962,  0.05711744],
       [ 0.58615788,  0.91062249],
       [ 0.55691427,  0.66998432],
       [ 0.90220203,  0.1326723 ],
       [ 0.39422025,  0.32362138],
       [ 0.14578997,  0.61006975],
       [ 0.90143821,  0.68433986],
       [ 0.95925203,  0.25026719],
       [ 0.10580314,  0.13249037],
       [ 0.02712601,  0.37122594],
       [ 0.65932062,  0.18573381],
       [ 0.65247316,  0.19692277],
       [ 0.07159735,  0.77451529],
       [ 0.34124012,  0.75488842],
       [ 0.94523225,  0.05757424],
       [ 0.10180174,  0.58926531],
       [ 0.90041107,  0.93174501],
       [ 0.08721703,

In [47]:
pairs=pairs[pairs[:, 1].argsort()]
pairs

array([[ 0.97554321,  0.0056588 ],
       [ 0.91683535,  0.02062548],
       [ 0.11753879,  0.02595412],
       [ 0.45717413,  0.04067403],
       [ 0.74969962,  0.05711744],
       [ 0.94523225,  0.05757424],
       [ 0.99555302,  0.05914512],
       [ 0.01933249,  0.06232794],
       [ 0.41113054,  0.07184622],
       [ 0.59465664,  0.08489839],
       [ 0.27591197,  0.09048777],
       [ 0.91304535,  0.09239419],
       [ 0.64690736,  0.09372956],
       [ 0.68787272,  0.09714076],
       [ 0.36680882,  0.1081725 ],
       [ 0.33992657,  0.11491249],
       [ 0.90786571,  0.12950932],
       [ 0.10580314,  0.13249037],
       [ 0.90220203,  0.1326723 ],
       [ 0.65804783,  0.13448154],
       [ 0.95897061,  0.13483776],
       [ 0.31340933,  0.14037702],
       [ 0.14756861,  0.14668482],
       [ 0.30416718,  0.15524791],
       [ 0.23546181,  0.18105396],
       [ 0.65932062,  0.18573381],
       [ 0.13957594,  0.18581665],
       [ 0.1443516 ,  0.19621696],
       [ 0.65247316,

In [48]:
data=pairs[:,0]

In [49]:
data

array([ 0.97554321,  0.91683535,  0.11753879,  0.45717413,  0.74969962,
        0.94523225,  0.99555302,  0.01933249,  0.41113054,  0.59465664,
        0.27591197,  0.91304535,  0.64690736,  0.68787272,  0.36680882,
        0.33992657,  0.90786571,  0.10580314,  0.90220203,  0.65804783,
        0.95897061,  0.31340933,  0.14756861,  0.30416718,  0.23546181,
        0.65932062,  0.13957594,  0.1443516 ,  0.65247316,  0.26953218,
        0.22617551,  0.23636302,  0.55005082,  0.59398654,  0.15303398,
        0.13855739,  0.24951643,  0.95925203,  0.09595789,  0.86138512,
        0.90554072,  0.46697896,  0.15658305,  0.87998658,  0.74232022,
        0.68554716,  0.39422025,  0.97286209,  0.83094588,  0.02712601,
        0.08593361,  0.94451753,  0.58640828,  0.37961668,  0.08721703,
        0.08914476,  0.47316729,  0.64711205,  0.01988686,  0.10180174,
        0.49691032,  0.14578997,  0.27326587,  0.74941262,  0.61375438,
        0.66350068,  0.55691427,  0.05561784,  0.90143821,  0.00

In [30]:

bin_means = (np.histogram(data, bins='fd', weights=data)[0] / np.histogram(data, bins='fd')[0])

TypeError: Automated estimation of the number of bins is not supported for weighted data

In [55]:
cluster_size=7
Number_of_bins=int(Y.size/cluster_size)
Number_of_bins

14

In [56]:
bin_means = (np.histogram(data, Number_of_bins, weights=data)[0] / np.histogram(data, Number_of_bins)[0])
bin_means

array([ 0.03565684,  0.11620999,  0.17995013,  0.25461722,  0.3246858 ,
        0.38728502,  0.47355768,  0.54677879,  0.59195943,  0.66522404,
        0.73769152,  0.82209195,  0.8965492 ,  0.96541166])

In [57]:
np.var(bin_means)

0.081552575927668444

In [58]:
np.var(Y)

0.098727772841276221

In [59]:
np.var(bin_means)/np.var(Y)

0.826034798321439

98

In [85]:
test="""('etyr (arfas)','eyt euyt','hfhafhds, fdsjfgsd, dgfdg')"""

In [86]:
'{},{})'.format(test.split(',', 3)[0],test.split(',', 3)[1])

"('etyr (arfas)','eyt euyt')"

In [84]:
test_2="""('etyr (arfas)','eyt euyt','hfhafhds fdsjfgsd')"""
'{},{})'.format(test_2.split(',', 3)[0],test_2.split(',', 3)[1])

"('etyr (arfas)','eyt euyt')"

In [92]:
res_spr=stats.spearmanr([3,3,3,3,3,3,3],[3,4,36,5643,36,3,36])[0]

  c /= stddev[:, None]
  c /= stddev[None, :]
  return (self.a < x) & (x < self.b)
  return (self.a < x) & (x < self.b)
  cond2 = cond0 & (x <= self.a)


In [95]:
res_spr+567

nan

In [104]:
set(list(zip(['jhgh','ehgeh'],['ehgeh'])))

{('jhgh', 'ehgeh')}

In [105]:
list(zip(['jhgh','ehgeh'],['ehgeh']))

[('jhgh', 'ehgeh')]

In [118]:
sets=[]
sets.append('fdgfd')
sets=list(set(sets))
sets

['fdgfd']

In [119]:
sets.append('hfj')
sets=list(set(sets))
sets

['fdgfd', 'hfj']

In [113]:
list(set(start))

['fdgfd', 'hfj']