In [1]:
import numpy as np
import pandas as pd
import pycountry
import glob

In [2]:
# Root folder; every other location below is derived from it.
path = '/data/users/kgruber/other-data/impacts_paper/'
# Folder holding the results_* spreadsheets and new_values.xlsx that are read in.
results_path = f'{path}individual-quality-check/'
# Folder holding SCOPUS_DOI2.csv.
review_path = f'{path}review_rounds/'
# Folder the per-reviewer CSV files are written to.
output_path = f'{path}after-automated-quality-check/'

In [3]:
# glob returns its matches in arbitrary, filesystem-dependent order; sort them so
# the row order of the merged table is identical on every run and machine.
# new_values.xlsx is always appended last.
files = sorted(glob.glob(results_path + 'results_*')) + [results_path + 'new_values.xlsx']

read and merge files

In [4]:
# Read every spreadsheet, drop the rows whose index is missing, and stack them once.
# Collecting the frames first keeps this cell idempotent: the previous
# `'responses' in globals()` test appended all files to the already loaded table
# again whenever the cell was re-run in the same kernel.
frames = []
for file in files:
    sheet = pd.read_excel(file,engine='openpyxl',index_col=0)
    frames.append(sheet[~sheet.index.isna()])
responses = pd.concat(frames,axis=0)

## entry has been individually checked

In [5]:
# Translate the 'Checked' flag into labels: 1 -> 'Passed', missing -> 'Failed'.
# Any other value is left as it is.
checked_flag = responses['Checked']
individual_check = checked_flag.replace(1,'Passed')
individual_check = individual_check.replace(np.nan,'Failed')
individual_check.name = 'individual_check'

## iso codes

In [6]:
# alpha-2 code of every country known to pycountry.
# Iterating the database directly avoids rebuilding list(pycountry.countries)
# once per element, which made the original comprehension quadratic.
iso_codes = [country.alpha_2 for country in pycountry.countries]

In [7]:
def check_iso(code):
    """Return 'Passed' if `code` is contained in `iso_codes`, otherwise 'Failed'."""
    return 'Passed' if code in iso_codes else 'Failed'

In [8]:
# One 'Passed'/'Failed' label per response, depending on whether 'Country:' is a known code.
country_codes = responses['Country:']
is_iso_code = pd.Series(country_codes.apply(check_iso),name='iso_code')

## numeric values

In [9]:
def check_float(number):
    """Tell whether `number` can be read as a (non-NaN) float.

    Parameters
    ----------
    number : any
        Value to test, e.g. a number or a string from the spreadsheet.

    Returns
    -------
    str
        'Passed' if ``float(number)`` succeeds and is not NaN, otherwise 'Failed'.
    """
    try:
        as_float = float(number)
    # Only the errors float() raises for unusable input are treated as 'Failed';
    # a bare except would also swallow KeyboardInterrupt and SystemExit.
    except (TypeError, ValueError, OverflowError):
        return 'Failed'
    return 'Failed' if np.isnan(as_float) else 'Passed'

In [10]:
# One 'Passed'/'Failed' label per response for the reported numeric value.
reported_value = responses['4a. Numeric value, e.g., 0.3, 1.5, 3']
value_is_numeric = pd.Series(reported_value.apply(check_float),name='numeric_value')

## self calculated value = reproduced value (leave this for the end)

## Consistency between question 3 (metric used) and question 10 (method of power calculation)

### categorise metrics and power related components

In [11]:
# Metric names grouped by category. The strings are used as index labels of
# metrics_classification and looked up with the raw '3. Metrics used:' values,
# so every spelling variant (blanks, hyphens, underscores) is listed as its own key.
power_related_metrics = ['power_density','installed_power_density','output_power_density','power per unit area',
                         'capacity_density']
energy_related_metrics = ['energy_density','surface_performance_ratio','energy_yields',
                          'aperture_specific_net_electrical_output']
land_related_metrics = ['land_use_efficiency','land_requirements','total_impact_area','direct_impact_area_permanent',
                        'direct_impact_area_temporary','direct_impact_area','land_transformation','land_use_footprints',
                        'land_use_requirements','area_requirements','direct_land_requirements','land_occupation',
                        'spatial_footprint','land-use intensity','land use intensity',' land use intensity',
                        'land_use_intensity','land_use','land_area','area_required_by_system']
# Metrics that belong to none of the three groups above.
other_metrics = ['land_use per vehicle mile traveled (based on land_use_intensity)','land-use per vehicle mile',
          'land-use impact (total habitat developed)']

In [12]:
# Answers to question 10 ('Power-related component of land-use requirements is
# represented by:'), grouped by category. The strings are index labels of
# component_classification and are matched literally against the responses,
# so they must not be edited (typos and special characters included).
power_related_components_general = ['nameplate (installed) capacity','nameplate (installed) capacity DC',
                                    'nameplate_capacity','nominal nameplate capacity','peak_rated_power',
                                    'peak_capacity','author assumes a typical power per unit area of 2.5\u2009W\u2009m−2',
                                    'commercial module output','module']
# Power-type answers that involve a capacity factor / insolation.
power_related_components_capacity_factor = ['nameplate capacity multiplied by capacity factor',
                                            'nameplate (installed) capacity multiplied by capacity factor',
                                            'wind-density multiplied by capacity factor',
                                            'wind density multiplied by capacity factor',
                                            'solar constant/insolation multiplied by capacity factor/efficiency',
                                            'typical solar insolation at average-insolation location',
                                            'typical solar insolation at high-insolation location']
# Energy-type answers.
energy_related_components = ['estimated energy generation (unsure what it means)','modelled energy generation',
                             'simulated using ﬂow-sheet computer program based softbeen simulated using ﬂow-sheet computer program based software Cycle-Tempo',
                             'reported energy generation','net energy generation (electricity generation after substracting energy needed for manufacturing/dismantling, construction/operation, and transportation)',
                             'experimentally measured','annual energy production','net output']
# Answers that give no usable power/energy component.
unclear_components = ['unclear','number of turbines of a particular type','no power component','no power']

In [13]:
# Lookup tables name -> class ('power', 'energy' or 'other').
# Land-related metrics count as 'other' for this comparison.
metric_groups = [('power', power_related_metrics),
                 ('energy', energy_related_metrics),
                 ('other', land_related_metrics),
                 ('other', other_metrics)]
metrics_classification = pd.Series([label for label, names in metric_groups for _ in names],
                                   index=[name for _, names in metric_groups for name in names])

component_groups = [('power', power_related_components_general),
                    ('power', power_related_components_capacity_factor),
                    ('energy', energy_related_components),
                    ('other', unclear_components)]
component_classification = pd.Series([label for label, names in component_groups for _ in names],
                                     index=[name for _, names in component_groups for name in names])

### check metrics

In [14]:
def check_metric(data):
    """Compare the class of the used metric with the class of the power component.

    Parameters
    ----------
    data : mapping / pandas.Series
        One response; must provide '3. Metrics used:' and
        '10. Power-related component of land-use requirements is represented by:'.

    Returns
    -------
    str
        'Unclear' if the metric or the component cannot be assigned to power or
        energy, 'Passed' if both classes agree, otherwise 'Failed'.

    Raises
    ------
    KeyError
        If the metric or the component is missing from the classification tables.
    """
    metric = metrics_classification[data['3. Metrics used:']]
    component = component_classification[data['10. Power-related component of land-use requirements is represented by:']]
    # metrics_classification is rebound further down in the notebook, where land
    # metrics are labelled 'footprint' instead of 'other'. Components are never
    # 'footprint', so accept both labels here; the result then no longer depends
    # on which version of the table is currently bound.
    if metric in ('other','footprint') or component == 'other':
        return 'Unclear'
    return 'Passed' if metric == component else 'Failed'

In [15]:
metric_fits_powercalculation = pd.Series(responses.apply(check_metric,axis=1),name='metric_fits_powercalculation')

## Unit match metrics

### prepare unit columns

#### merge columns

In [16]:
def merge_col(line):
    """Return the non-missing entries of `line` as an array, in their original order."""
    present = line.dropna()
    return present.values

In [17]:
# For every response, collect the filled-in entries of the two unit columns.
unit_columns = ['4c-1. Measurement unit:','4c-2. Measurement unit:']
measurement_unit = responses[unit_columns].apply(merge_col,axis=1)

There are two entries with values in both columns.

In [18]:
# number of responses that do not have exactly one unit entry
measurement_unit.apply(len).ne(1).sum()

2

In [19]:
# show the responses that do not have exactly one unit entry
measurement_unit[measurement_unit.apply(len).ne(1)]

0.0          [DC, MW/ha]
1.0    [DC, MWh/year/ha]
dtype: object

only use second value

In [20]:
# For entries without exactly one unit, keep only the second value (as a one-element list).
not_single_unit = measurement_unit.apply(len)!=1
measurement_unit[not_single_unit] = measurement_unit[not_single_unit].apply(lambda units: [units[1]])

In [21]:
# unwrap: take the first element of every entry
measurement_unit = measurement_unit.map(lambda units: units[0])

#### unify units - replace similar units with one writing

In [22]:
# Spelling table: every unit spelling in column 'replace' is rewritten to the unified
# spelling at the same position in column 'with'. The entries are matched literally
# (superscripts, underscores, Unicode minus signs and typos included), so do not edit them.
replace = pd.DataFrame({'replace':['acres/MW', 'm²/GWh', 'ha/GWh/y',    'W m-2', 'W/m²', 'WP/m2', 'W_p/m²', 'We m-2', 'We m−2', 'We/m²', 'W_e/m2', 'w/ft2', 'kW_e/m²', 'MWi km−2', 'GW_e/m2', 'kWh/year/m²', 'kWh/m²year',  'kWh/m2/year', 'kWh/m²/year', 'MWh/year/m²', 'GWh/yr/m2',   'GJ/m2/year', 'rho_e W_e/m2', 'm^2', 'm2/VPM (Vehicl mile traveled)'],
                        'with':   ['acre/MW',  'm2/GWh', 'ha/GWh/year', 'W/m2',  'W/m2', 'Wp/m2', 'Wp/m2',  'We/m2',  'We/m2',  'We/m2', 'We/m2',  'W/ft2', 'kWe/m2',  'MWi/km2', 'GWe/m2',   'kWh/year/m2', 'kWh/year/m2', 'kWh/year/m2', 'kWh/year/m2', 'MWh/year/m2', 'GWh/year/m2', 'GJ/year/m2', 'rhoe We/m2',   'm2',  'm2/VPM (Vehicle mile traveled)']})

In [23]:
# Apply the spelling table one pair after the other.
for old_spelling, unified_spelling in zip(replace['replace'], replace['with']):
    measurement_unit = measurement_unit.replace(old_spelling, unified_spelling)

### categorise units

In [24]:
# Unified unit spellings grouped by what they express. The strings are used as
# index labels of unit_classification, so they are matched literally.
# area per power
footprint_power_related = ['m2/W','m2/Wp','m2/kW','m2/kWp','m2/MW','ha/MW','ha/MWp','acre/MW','acre/MW-DC',
                           'acre/MW-AC','km2/MW']
# area per energy
footprint_energy_related = ['m2/MWh','m2/GWh','km2/GWh','km2/TWh']
# area per energy per year
footprint_annual_energy_related = ['m2/MWh/year','ha/MWh/year','ha/GWh/year','ha/TWh/year','km2/TWh/year']
# power per area
power_density = ['W/m2','Wp/m2','We/m2','W/ft2','kW/m2','kWp/m2','kWe/m2','kW/ha','kW/acre','MW/m2',
                 'MW/ha','MWp/ha','MW/km2','MWi/km2','MW/acre','GWe/m2']
# energy per time per area
energy_density = ['Wh/cm2/day','kWh/year/m2','kWh/year/acre','kWh/year/ft2','kWh/year/ha','MWh/year/acre',
                  'MWh/year/m2','MWh/year/ha','GWh/year/m2','GWh/year/km2','GJ/year/m2','TWh/year/km2']
# units that fit none of the groups above
unclear_units = ['MJ/m2','GJ/unit/year','km2 year/GWh','kW/ft','km/GWh','m2/VPM (Vehicle mile traveled)',
                 'm2','rhoe We/m2','MW/h/ha']

In [25]:
# Lookup tables name -> class for the metric/unit comparison.
# NOTE: this rebinds metrics_classification; unlike the earlier table, land-related
# metrics are labelled 'footprint' here instead of 'other'.
metric_groups = [('power', power_related_metrics),
                 ('energy', energy_related_metrics),
                 ('footprint', land_related_metrics),
                 ('other', other_metrics)]
metrics_classification = pd.Series([label for label, names in metric_groups for _ in names],
                                   index=[name for _, names in metric_groups for name in names])

unit_groups = [('power', power_density),
               ('energy', energy_density),
               ('footprint', footprint_power_related),
               ('footprint', footprint_energy_related),
               ('footprint', footprint_annual_energy_related),
               ('other', unclear_units)]
unit_classification = pd.Series([label for label, names in unit_groups for _ in names],
                                index=[name for _, names in unit_groups for name in names])

### check units

In [26]:
def check_unit(data):
    """Compare the class of data['Metric'] with the class of data['Unit'].

    Returns 'Unclear' if either class is 'other', 'Passed' if both classes are
    equal and 'Failed' otherwise. Both values are looked up in
    metrics_classification / unit_classification.
    """
    metric_class = metrics_classification[data['Metric']]
    unit_class = unit_classification[data['Unit']]
    if 'other' in (metric_class, unit_class):
        return 'Unclear'
    return 'Passed' if metric_class == unit_class else 'Failed'

In [27]:
# Pair every response's metric with its unified unit (by position), then check each pair.
metric_unit = pd.DataFrame(data={'Metric':responses['3. Metrics used:'].values,
                                 'Unit':measurement_unit.values},
                           index=responses.index)
unit_check_labels = metric_unit.apply(check_unit,axis=1)
metric_fits_unit = pd.Series(unit_check_labels,name='metric_fits_unit')

## Check range of values

### unify units

add combined measurement units as column

In [28]:
# Insert the merged unit column right after '4c-2. Measurement unit:'.
columns_up_to_units = responses.loc[:,:'4c-2. Measurement unit:']
merged_unit_column = pd.Series(measurement_unit,name='4c. Measurement unit:')
columns_from_4d = responses.loc[:,'4d. Type of value':]
responses2 = pd.concat([columns_up_to_units,merged_unit_column,columns_from_4d],axis=1)

### conversion

In [29]:
# Conversion rules: value in unit 'start' * 'factor' = value in unit 'target'.
m2peracre = 4046.86   # square metres per acre
# NOTE: despite its name this constant is the number of square FEET per square
# metre (1 m2 = 10.7639 ft2). Name and value are kept for backward compatibility;
# a "per ft2" quantity therefore has to be MULTIPLIED by it to get "per m2".
m2perft2 = 10.7639

# Fixed factors (the previous values were wrong):
#   W/ft2 -> W/m2:                10.7639  (was 1/10.7639)
#   MW/m2 -> W/m2:                10**6    (was 1/10**6)
#   GWe/m2 -> We/m2:              10**9    (was 10**6)
#   kWh/year/ft2 -> kWh/year/m2:  10.7639  (was 1/10.7639)
#   km2/TWh/year -> m2/kWh/year:  1/1000   (was 1; 10**6 m2 / 10**9 kWh, as for km2/TWh)
WattUnits = pd.DataFrame({'start':    ['W/ft2',   'kW/m2', 'kW/ha','kW/acre',      'MW/m2', 'MW/ha','MW/km2','MW/acre'],
                          'target':   'W/m2',
                          'factor':   [m2perft2,  1000,    0.1,    1000/m2peracre, 10**6,   100,    1,       10**6/m2peracre]})
OtherWattUnits = pd.DataFrame({'start':   ['kWp/m2', 'MWp/ha', 'kWe/m2', 'GWe/m2', 'MWi/km2'],
                               'target':  ['Wp/m2',  'Wp/m2',  'We/m2',  'We/m2',  'Wi/m2'],
                               'factor':  [1000,     100,      1000,     10**9,    1]})
WatthUnits = pd.DataFrame({'start':  ['Wh/cm2/day', 'kWh/year/acre', 'kWh/year/ft2','kWh/year/ha','MWh/year/acre','MWh/year/m2','MWh/year/ha','GWh/year/m2','GWh/year/km2','GJ/year/m2','TWh/year/km2'],
                           'target': 'kWh/year/m2',
                           'factor': [3650,         1/m2peracre,     m2perft2,      1/10**4,      1000/m2peracre, 1000,         0.1,          10**6,        1,             10**6/3600,  1000]})
AreaPowerUnits = pd.DataFrame({'start':  ['m2/W',  'm2/Wp',  'm2/MW',  'ha/MW', 'ha/MWp', 'acre/MW',      'acre/MW-DC',   'acre/MW-AC',   'km2/MW'],
                               'target': ['m2/kW', 'm2/kWp', 'm2/kW',  'm2/kW', 'm2/kWp', 'm2/kW',        'm2/kW-DC',     'm2/kW-AC',     'm2/kW'],
                               'factor': [1000,    1000,     1/1000,   10,      10,       m2peracre/1000, m2peracre/1000, m2peracre/1000, 1000]})
AreaEnergyUnits = pd.DataFrame({'start':  ['m2/MWh','m2/GWh','km2/GWh','km2/TWh','m2/MWh/year','ha/MWh/year','ha/GWh/year','ha/TWh/year','km2/TWh/year'],
                                'target': ['m2/kWh','m2/kWh','m2/kWh', 'm2/kWh', 'm2/kWh/year','m2/kWh/year','m2/kWh/year','m2/kWh/year','m2/kWh/year'],
                                'factor': [1/1000,  1/10**6, 1,        1/1000,   1/1000,       10,           0.01,         1/10**5,      1/1000]})

# one table with all conversion rules
unit_conversion = pd.concat([WattUnits,OtherWattUnits,WatthUnits,AreaPowerUnits,AreaEnergyUnits],axis=0)

In [30]:
# Look up target unit and factor for every response's unit.
conversion_table = unit_conversion.set_index('start')
original_unit = responses2['4c. Measurement unit:']
new_unit = original_unit.map(conversion_table.target)
factor = original_unit.map(conversion_table.factor)
# units without a conversion rule stay the same and get a factor of 1
no_rule = new_unit.isna()
new_unit[no_rule] = original_unit[no_rule]
factor = factor.fillna(1)
# convert only the values that passed the numeric check
numeric_rows = value_is_numeric=='Passed'
new_value = responses2['4a. Numeric value, e.g., 0.3, 1.5, 3'][numeric_rows].apply(float)*factor[numeric_rows]

In [31]:
# Insert the converted value after the reported value and the converted unit
# after the merged unit column.
columns_up_to_value = responses2.loc[:,:'4a. Numeric value, e.g., 0.3, 1.5, 3']
converted_value_column = pd.Series(responses2.index.map(new_value),name='4a-1. Converted value',index=responses2.index)
columns_4b_to_unit = responses2.loc[:,'4b. Is power-related component of the land-use requirement expressed as energy e.g., ha/GWh/year?':'4c. Measurement unit:']
converted_unit_column = pd.Series(new_unit,name='4c-3. Converted measurement unit')
columns_from_4d = responses2.loc[:,'4d. Type of value':]
responses3 = pd.concat([columns_up_to_value,
                        converted_value_column,
                        columns_4b_to_unit,
                        converted_unit_column,
                        columns_from_4d],axis=1)

### find outliers

Only look for outliers among units that occur more than 10 times.

In [32]:
# number of responses per converted unit
converted_units = responses3['4c-3. Converted measurement unit']
frequency_units = converted_units.groupby(converted_units).count()

In [33]:
def find_outliers(data, n_std=3):
    """Label every value of `data` by whether it lies close to the mean.

    Parameters
    ----------
    data : pandas.Series
        Values to test.
    n_std : float, optional
        Half-width of the accepted interval in standard deviations (default 3).
        The standard deviation is np.std's default, i.e. the population
        standard deviation (ddof=0).

    Returns
    -------
    pandas.Series
        Same index as `data`; 'Passed' for values strictly inside
        (mean - n_std*std, mean + n_std*std), 'Failed' otherwise
        (missing values therefore also yield 'Failed').
    """
    stdev = np.std(data)
    avg = np.mean(data)
    lower_limit = avg - stdev*n_std
    upper_limit = avg + stdev*n_std
    return data.apply(lambda value: in_range(lower_limit,upper_limit,value))

def in_range(lower,upper,value):
    """Return 'Passed' if lower < value < upper (both bounds exclusive), else 'Failed'."""
    if lower < value < upper:
        return 'Passed'
    return 'Failed'

In [34]:
# Run the outlier test separately for every converted unit with more than 10 entries.
frequent_units = frequency_units.index[frequency_units>10].values
unit_of_row = responses3['4c-3. Converted measurement unit']
outliers = pd.concat([find_outliers(responses3[unit_of_row==unit]['4a-1. Converted value'])
                      for unit in frequent_units],axis=0)

In [35]:
# Responses that were not part of any outlier test get an explanatory label instead.
range_labels = responses3.index.map(outliers).fillna('Too few values or not standardised')
value_within_range = pd.Series(range_labels,name='value_range',index=responses3.index)

## check if matches: DOI + first Author + scopus_id + year of publication

In [36]:
# Read the SCOPUS/DOI list as text, then drop incomplete and duplicated rows.
rev_file = pd.read_csv(review_path + 'SCOPUS_DOI2.csv',encoding = "utf-8",dtype=str)
rev_file = rev_file.dropna().drop_duplicates()

In [37]:
# Look up, for every response, the SCOPUS value that rev_file lists for its DOI
# (NaN where the DOI is not contained in rev_file).
mapped_SCOPUS = responses3['1b. DOI link'].map(rev_file.set_index('DOI').SCOPUS)
# Compare the looked-up value with the reported SCOPUS ID (as string), only for the
# rows where a value was found. The boolean result is mapped back onto the full
# index: rows without a looked-up value become 'Not available', True becomes
# 'Passed' and False becomes 'Failed'.
scopusID_fits_doi = pd.Series(responses3.index.map(mapped_SCOPUS.dropna()==responses3['1a. SCOPUS ID'].apply(str)[mapped_SCOPUS.notna()]),
                              index=responses3.index,name='scopusID_fits_doi'
                             ).fillna('Not available').replace(True,'Passed').replace(False,'Failed')

In [38]:
# One column per automated check, one row per response.
all_checks = [individual_check,
              is_iso_code,
              value_is_numeric,
              metric_fits_powercalculation,
              metric_fits_unit,
              value_within_range,
              scopusID_fits_doi]
results_quality_check = pd.concat(all_checks,axis=1)

In [39]:
# Write the complete check table to the root data folder.
results_quality_check.to_csv(f'{path}results_quality_check.csv')

In [40]:
# Display the combined check results (one row per response, one column per check).
results_quality_check

Unnamed: 0,individual_check,iso_code,numeric_value,metric_fits_powercalculation,metric_fits_unit,value_range,scopusID_fits_doi
357.0,Passed,Failed,Passed,Unclear,Passed,Passed,Passed
358.0,Passed,Failed,Passed,Unclear,Passed,Passed,Passed
359.0,Passed,Failed,Passed,Unclear,Passed,Passed,Passed
166.0,Passed,Failed,Passed,Unclear,Passed,Passed,Passed
167.0,Passed,Failed,Passed,Unclear,Passed,Passed,Passed
...,...,...,...,...,...,...,...
2256.0,Passed,Passed,Passed,Failed,Passed,Passed,Not available
2257.0,Passed,Passed,Passed,Failed,Passed,Failed,Not available
2258.0,Passed,Passed,Passed,Failed,Passed,Passed,Passed
2259.0,Passed,Passed,Passed,Passed,Passed,Passed,Not available


# split results per person and remove passed rows and columns

In [41]:
def clean_and_save(data,name):
    """Write the unresolved check results of one reviewer to a CSV file.

    A check result counts as resolved if it is 'Passed' or 'Not available'.
    Rows whose checks are all resolved are removed; of the remaining rows, check
    columns that are resolved everywhere are removed together with their
    '<check>_comment' column.

    Columns ending in '_comment' are treated as comment columns and are ignored
    when deciding what is resolved. Previously the (empty) comment columns were
    counted as failed checks, so no row was ever removed and the comment columns
    of dropped checks stayed in the file.

    Parameters
    ----------
    data : pandas.DataFrame
        Check results, optionally with '<check>_comment' columns.
    name : str
        Reviewer name; the file is written to
        output_path + name + '_result_automated_quality_check.csv'.
    """
    suffix = '_comment'
    is_comment = np.array([str(col).endswith(suffix) for col in data.columns], dtype=bool)
    checks = data.loc[:, ~is_comment]
    resolved = checks.isin(['Passed','Not available'])
    # rows with at least one unresolved check
    open_rows = ~resolved.all(axis=1)
    # checks that are unresolved in at least one of those rows
    unresolved_checks = ~resolved[open_rows.values].all(axis=0)
    open_checks = set(unresolved_checks.index[unresolved_checks.values])
    # keep the open checks and their comment columns, in the original column order
    keep = [col for col in data.columns
            if col in open_checks
            or (str(col).endswith(suffix) and str(col)[:-len(suffix)] in open_checks)]
    data.loc[open_rows.values, keep].to_csv(output_path + name + '_result_automated_quality_check.csv')
    return

In [42]:
# Empty (all-NaN) table with one '<check>_comment' column per check.
comment_columns = [column + '_comment' for column in results_quality_check.columns]
comments = pd.DataFrame(np.full(results_quality_check.shape, np.nan),
                        columns=comment_columns,
                        index=results_quality_check.index)

In [43]:
# check results followed by the empty comment columns
checks_and_comments = [results_quality_check,comments]
results_quality_check_with_comments = pd.concat(checks_and_comments,axis=1)

In [44]:
# Write one file with check results per reviewer.
reviewer_column = responses['Reviewed by']
for name in reviewer_column.unique():
    print(name)
    reviewer_checks = results_quality_check_with_comments[reviewer_column==name]
    clean_and_save(reviewer_checks,name)

Luis
Johannes
Peter
Olga
Claude
Michael
Sebastian
Katharina


also split results per person including new values

In [45]:
# Write one file with the merged responses per reviewer.
reviewer_column = responses['Reviewed by']
for name in reviewer_column.unique():
    print(name)
    reviewer_responses = responses[reviewer_column==name]
    reviewer_responses.to_csv(output_path + name + '_results.csv')

Luis
Johannes
Peter
Olga
Claude
Michael
Sebastian
Katharina


#### extract bibtex info

In [None]:
import urllib.request

In [None]:
# Request the URL with an 'Accept: application/x-bibtex' header and print the answer.
url = 'http://enviroinfo.eu/sites/default/files/pdfs/vol8514/0093.pdf'
req = urllib.request.Request(url)
req.add_header('Accept', 'application/x-bibtex')
# Reset first, so that a failed request cannot leave the text of an earlier run behind.
bibtex = None
try:
    with urllib.request.urlopen(req) as f:
        bibtex = f.read().decode()
    print(bibtex)
# OSError covers urllib's URLError/HTTPError and socket errors; ValueError covers
# malformed URLs and responses that cannot be decoded as text. Anything else
# (e.g. KeyboardInterrupt) is no longer swallowed.
except (OSError, ValueError) as err:
    print('no doi')
    print(err)

In [None]:
# Strip quotes and braces, split into the individual fields and skip the first piece.
stripped_bibtex = bibtex.replace("'","").replace("},","").replace("{","")
clean_bibtex = pd.Series(stripped_bibtex.split("\n\t"))[1:]

In [None]:
# the part after " = " of every field
clean_bibtex.apply(lambda entry: entry.split(" = ")[1]).values

In [None]:
# Split every field once at " = ": the first part becomes the index, the second the value.
bibtex_parts = clean_bibtex.apply(lambda entry: entry.split(" = "))
pd.Series(bibtex_parts.apply(lambda parts: parts[1]).values,
          index = bibtex_parts.apply(lambda parts: parts[0]))