In [1]:
import pandas as pd
import numpy as np
import pint
import pint_pandas
import iam_units

FORECAST_COLUMN = 'Forecast'
FORECAST_x = 'Forecast_x'
FORECAST_y = 'Forecast_y'
VALUE_COLUMN = 'Value'
VALUE_x = 'Value_x'
VALUE_y = 'Value_y'



In [9]:
df = pd.DataFrame({
    'A': pd.Series([0, 0, 4]),
    'B': pd.Series([0, 2, 4]),
    'Forecast': pd.Series([True, False, True]),
    'Value': pd.Series([3, 5, 7])}).set_index('A')
df = df.groupby('A')
df.sum()

Unnamed: 0_level_0,B,Forecast,Value
A,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,2,1,8
4,4,1,7


In [15]:
unit_registry = pint.UnitRegistry()
#unit_registry.define('toxic_equivalency_quantity = [TEQ] = g_teq')
unit_registry.load_definitions("health_impact_units.txt")

unit_registry.load_definitions("health_impact_data.txt")

qty = unit_registry('1 ng TCDD/kg')

#unit_registry.enable_contexts('teq_who2005')

#def toteq(metric, quantity, species):
#    return quantity.to(quantity.units, metric, _a=f"a_{species}")

#qty = qty.to('pg_teq/kg', 'teq_who2005', _a="PCB77") #+ qty.to('pg_teq', _a='_12378_PeCDF')

q = unit_registry('3 per_100000py')
q = (q * unit_registry('1 Mcap')).to('cases/a')
q.__dict__

q2 = unit_registry('1 ug/m**3')
q2.to('ug/kg/d', 'exposure_generic')

q3 = unit_registry('incidence').to('per_100000py', 'cvddeath')

q4 = unit_registry('erf_param_bq_air').to('m**3/pg', 'radon_lungcancer')
q4.to('1/(fg/kg/d)', 'exposure_generic')

q5 = unit_registry('1 rna').to('pg', 'microbes')
q5.to('fg/kg/d', 'exposure_generic')

q6 = unit_registry('erf_param_ug_air').to('m**3/ug', 'pm2_5_mortality')
q6 = q6.to('1/(ug/kg/d)', 'exposure_generic')
q6 = q6 * unit_registry('1 ug/m**3').to('(ug/kg/d)', 'exposure_generic')
np.exp(q6)


False

In [58]:

class Ovariable():
    
    def __init__(self, quantity, unit, input_nodes, content):
        self.quantity = quantity
        self.unit = unit
        self.input_nodes = input_nodes
        self.content = content
    
    def get_input(self, quantity, query=None, drop=None):
        count = 0
        for node in self.input_nodes:
            if node.quantity == quantity:
                out = node
                count += 1
        if count == 0:
            print(quantity)
        assert count == 1

        of = out.get_output()

        if query is not None:
        #    out = copy.copy(out)
            of = of.query(query)

        if drop is not None:
            of = of.droplevel(drop)

        return of

    def clean_computing(self, node):
        self[VALUE_COLUMN] = node.ensure_output_unit(self[VALUE_COLUMN])
        return self
    
class OvariableFrame(pd.DataFrame):

    def do_inner_join(self, other):
        assert VALUE_COLUMN in self.columns  # Cannot be in a merged format

        def add_temporary_index(self):
            tst = self.index.to_frame().assign(temporary=1)
            tst = pd.MultiIndex.from_frame(tst)
            return self.set_index(tst)

        if isinstance(other, pd.DataFrame):
            df2 = other
        else:
            df2 = pd.DataFrame([other], columns=[VALUE_COLUMN])

        df1 = add_temporary_index(self)
        df2 = add_temporary_index(df2)

        out = df1.merge(df2, left_index=True, right_index=True)
        out = OvariableFrame(out)
        out.index = out.index.droplevel(['temporary'])

        return out

    def aggregate_by_column(self, groupby, fun):
        self = self.groupby(groupby)
        if fun == 'sum':
            self = self.sum()
        else:
            self = self.mean()
        self[FORECAST_COLUMN] = self[FORECAST_COLUMN].mask(self[FORECAST_COLUMN] > 0, 1).astype('boolean')
        return self

    def clean(self):
        df = self.reset_index()
        if FORECAST_x in df.columns:
            df[FORECAST_COLUMN] = df[FORECAST_x] | df[FORECAST_y]
        keep = set(df.columns) - {0, 'index', VALUE_x, VALUE_y, FORECAST_x, FORECAST_y}
        df = df[list(keep)].set_index(list(keep - {VALUE_COLUMN, FORECAST_COLUMN}))
        return OvariableFrame(df)

    def print_pint_df(self):
        df = self
        pint_cols = [col for col in df.columns if hasattr(df[col], 'pint')]
        if not pint_cols:
            print(df)
            return

        out = df[pint_cols].pint.dequantify()
        for col in df.columns:
            if col in pint_cols:
                continue
            out[col] = df[col]
        print(out)

    def __add__(self, other):
        self = self.do_inner_join(other)
        self[VALUE_COLUMN] = self[VALUE_x] + self[VALUE_y]
        return self.clean()

    def __sub__(self, other):
        self = self.do_inner_join(other)
        self[VALUE_COLUMN] = self[VALUE_x] - self[VALUE_y]
        return self.clean()

    def __mul__(self, other):
        if isinstance(other, pd.DataFrame):
            self = self.do_inner_join(other)
            self[VALUE_COLUMN] = self[VALUE_x] * self[VALUE_y]
            return self.clean()
        else:
            self[VALUE_COLUMN] = self[VALUE_COLUMN] * other
            print(other)
            return self

    def __truediv__(self, other):
        self = self.do_inner_join(other)
        self[VALUE_COLUMN] = self[VALUE_x] / self[VALUE_y]
        return self.clean()

    def __mod__(self, other):
        self = self.do_inner_join(other)
        self[VALUE_COLUMN] = self[VALUE_x] % self[VALUE_y]
        return self.clean()

    def __pow__(self, other):
        self = self.do_inner_join(other)
        self[VALUE_COLUMN] = self[VALUE_x] ** self[VALUE_y]
        return self.clean()

    def __floordiv__(self, other):
        self = self.do_inner_join(other)
        self[VALUE_COLUMN] = self[VALUE_x] // self[VALUE_y]
        return self.clean()

    def __lt__(self, other):
        self = self.do_inner_join(other)
        self[VALUE_COLUMN] = self[VALUE_x] < self[VALUE_y]
        return self.clean()

    def __le__(self, other):
        self = self.do_inner_join(other)
        self[VALUE_COLUMN] = self[VALUE_x] <= self[VALUE_y]
        return self.clean()

    def __gt__(self, other):
        self = self.do_inner_join(other)
        self[VALUE_COLUMN] = self[VALUE_x] > self[VALUE_y]
        return self.clean()

    def __ge__(self, other):
        self = self.do_inner_join(other)
        self[VALUE_COLUMN] = self[VALUE_x] >= self[VALUE_y]
        return self.clean()

    def __eq__(self, other):
        self = self.do_inner_join(other)
        self[VALUE_COLUMN] = self[VALUE_x] == self[VALUE_y]
        return self.clean()

    def __ne__(self, other):
        self = self.do_inner_join(other)
        self[VALUE_COLUMN] = self[VALUE_x] != self[VALUE_y]
        return self.clean()

    def exp(self):
        s = self[VALUE_COLUMN]
        assert s.pint.units.dimensionless
        s = np.exp(s.pint.m)
        s = pd.Series(s, dtype='pint[dimensionless]')
        self[VALUE_COLUMN] = s
        return self

    def log10(self):
        s = self[VALUE_COLUMN]
        assert s.pint.units.dimensionless
        s = np.log10(s.pint.m)
        s = pd.Series(s, dtype='pint[dimensionless]')
        self[VALUE_COLUMN] = s
        return self

    def log(self):
        s = self[VALUE_COLUMN]
        assert s.pint.units.dimensionless
        s = np.log(s.pint.m)
        s = pd.Series(s, dtype='pint[dimensionless]')
        self[VALUE_COLUMN] = s
        return self


In [60]:
df = OvariableFrame(pd.DataFrame({
    'A': pd.Series([1,2]),
    'Value': pd.Series([3,4]),
    'Forecast': pd.Series([True,False])
}).set_index('A'))
df = df*df
df

Unnamed: 0_level_0,Forecast,Value
A,Unnamed: 1_level_1,Unnamed: 2_level_1
1,True,9
2,False,16


In [23]:
class Exposure(Ovariable):
    # Exposure is the intensity of contact with the environment by the target population.

    quantity = 'ingestion'
    scaled = False  # This is probably not needed any more

    def compute(self):
        consumption = self.get_input('ingestion') # This can be bolus (g) or rate (g/d)
        concentration = self.get_input('mass_concentration')

        exposure = concentration * consumption

        return self.clean_computing(exposure)


class PopulationAttributableFraction(Ovariable):
    
    def compute(self):
        erf = self.get_input('exposure-response')
        param2 = self.get_input(
            'exposure-response',
            query="observation == 'param2'", drop='observation')
        exposure = self.get_input('ingestion')
        frexposed = self.get_input('fraction')
        incidence = self.get_input('incidence')
        rr = self.get_input('ratio')
        p_illness = self.get_input('probability')

        of = erf.do_inner_join(exposure).reset_index()

#        er_function_list = list(sorted(set(of.reset_index()['er_function'])))

        out = pd.DataFrame()

        for row in of.index:
            # FIXME Tähän scale_exposure

            # FIXME Tähän yksikön tarkistus. ERF-dataan annetaan altistuksen yksikkö ilman log-muunnosta.

            if of['er_function'].loc[row] == 'UR':
                k = OvariableFrame(of.loc[row])
                k = k ** -1

                threshold = param2

                dose2 = (exposure - threshold)
                # FIXME clip removes the pint unit. Figure out something else.
                # dose2 = np.clip(dose2, 0, None)  # Smallest allowed value is 0
                out1 = (k * dose2 * frexposed / incidence)
                out = out.append(out1.reset_index())

            if of['er_function'].loc[row] == 'Step':
                upper = OvariableFrame(of.loc[row])

                lower = param2
                out2 = (exposure >= lower) * (exposure <= upper) * -1 + 1  # FIXME
                out2 = out2 * frexposed / incidence
                out = out.append(out2.reset_index())

            elif of['er_function'].loc[row] == 'RR' or of['er_function'].loc[row] == 'Relative Hill':
                r = frexposed * (rr - 1)

                out3 = (r / (r + 1))  # AF=r/(r+1) if r >= 0; AF=r if r<0. Therefore, if the result
                # is smaller than 0, we should use r instead. It can be converted from the result:
                # r/(r+1)=a <=> r=a/(1-a)
                out3[VALUE_COLUMN] = np.where(
                    out3[VALUE_COLUMN] < 0,
                    out3[VALUE_COLUMN] / (1 - out3[VALUE_COLUMN]),
                    out3[VALUE_COLUMN])

                out = out.append(out3.reset_index())

            elif of['er_function'].loc[row] == 'beta poisson approximation':
                out4 = ((exposure / param2 + 1) ** (erf * -1) * -1 + 1) * frexposed
                out4 = (out4 / incidence * p_illness)
                out = out.append(out4.reset_index())

            elif of['er_function'].loc[row] == 'exact beta poisson':
                out5 = ((erf / (erf + param2) * exposure * -1).exp() * -1 + 1) * frexposed
                out5 = out5 / incidence * p_illness
                out = out.append(out5.reset_index())

            elif of['er_function'].loc[row] == 'exponential':
                k = erf
                out6 = ((k * exposure * -1).exp() * -1 + 1) * frexposed
                out6 = out6 / incidence * p_illness
                out = out.append(out6.reset_index())

        keep = set(out.columns) - {'scaling', 'matrix', 'exposure', 'exposure_unit', 'er_function', 0}
        out = out[list(keep)].set_index(list(keep - {VALUE_COLUMN, FORECAST_COLUMN}))

        return self.clean_computing(out)



In [24]:
consumption = Ovariable(quantity = 'ingestion', unit = 'g', input_nodes = [], content = df3)

#print(consumption.content)

def aggregate_columns(self, groupby, fun):
    self = self.groupby(groupby)
    if fun=='sum':
        tmp = self.sum()
    else:
        tmp = self.mean()
    tmp['Forecast'] = np.minimum(1, tmp['Forecast']).astype('boolean')
    return tmp

#df3 = aggregate_columns(df3, groupby = 'B', fun = 'sum')



In [28]:
unit_registry = pint.UnitRegistry()

df3 = OvariableFrame(pd.DataFrame({
    'A': pd.Series([1, 2]),
    'B': pd.Series([3,3]),
    'Value_x': pd.Series([3., 4.]) * unit_registry.kg,
    'Value_y': pd.Series([2,5]) * unit_registry.d,
    'Forecast': pd.Series([False, False])
}).set_index(['A','B']))

print(df3.reset_index()['Value_y'])
cols = df3.index.names
df3 = df3.reset_index()
for i in df3.index:
    tmp = df3.xs(i, axis=0, drop_level=False)#.set_index(cols)
#    if tmp.query('A == 3').size == 1:
#        out = 3#df3['Value'].loc[i] * 3
    print(tmp)
type(df3)

0    2
1    5
Name: Value_y, dtype: int64
A               1
B               3
Value_x       3.0
Value_y         2
Forecast    False
Name: 0, dtype: object
A               2
B               3
Value_x       4.0
Value_y         5
Forecast    False
Name: 1, dtype: object


pandas.core.frame.DataFrame

In [20]:
df6 = OvariableFrame(pd.DataFrame({
    'A': pd.Series([2,4]),
    'Value': pd.Series([1.,3.], dtype='pint[mg/g]')
}).set_index(['A']))
df7 = (df6 + df6)
#df7 = df7.astype('pint[mg]')

assert df7.Value.pint.units.dimensionless

df7 = np.log(df7.Value.pint.m)

df7 = pd.Series(df7, dtype='pint[dimensionless]')
df6.Value = df7
df6

  return np.array(qtys, dtype="object", copy=copy)
  return np.array(qtys, dtype="object", copy=copy)


Unnamed: 0_level_0,Value
A,Unnamed: 1_level_1
2,0.6931471805599453
4,1.791759469228055


In [4]:
['a']*3

['a', 'a', 'a']

In [5]:
of = OvariableFrame(pd.DataFrame({
            'scaling': pd.Series(['None']*4),
            'Response': pd.Series(['CVD']*2 + ['Cancer']*2),
            'er_function': pd.Series(['RR']*2 + ['UR']*2),
            'observation': pd.Series(['param1', 'param2']*2),
            FORECAST_COLUMN: pd.Series([False]*4),
            VALUE_COLUMN: pd.Series([200., 0., 2000., 0.2], dtype='pint[mg/d]')
        }).set_index(['er_function', 'Response', 'observation', 'scaling']))
v = unit_registry.Quantity("1 mg/d")
print(v)
isinstance(of.query('observation == "param1"'), OvaribleFrame)

1.0 milligram / day


NameError: name 'OvaribleFrame' is not defined

In [6]:
#create some data with Names column
data = pd.DataFrame({
    'Names': ['Joe', 'John', 'Jasper', 'Jez'] *4,
    'Ob1' : pd.Series(np.random.rand(16), dtype='pint[kg]'),
    'Ob2' : np.random.rand(16)
})

#create unique list of names
UniqueNames = data.Names.unique()

#create a data frame dictionary to store your data frames
DataFrameDict = {elem : pd.DataFrame for elem in UniqueNames}

for key in DataFrameDict.keys():
    DataFrameDict[key] = data[:][data.Names == key]

DataFrameDict['Joe'].Ob1.values


<PintArray>
[ 0.9064527886820508, 0.06876062094136504, 0.07454569621577023,
  0.1929152734631414]
Length: 4, dtype: pint[kilogram]