In [2]:
 %matplotlib inline
import seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from gradutil import init_boreal

## The Boreal Forest Data

TODO: Describe here where the data is from and what does it really mean. Describe the variables and different "levels" (BAU, SA etc.)

In [3]:
# Load the four objective tables; each one is used below as a pandas DataFrame.
# NOTE(review): presumably one row per forest stand and one column per
# management scheme (BAU, SA, ...) -- confirm against gradutil.init_boreal.
revenue, carbon, deadwood, HA = init_boreal()

In [5]:
# Per-column summary statistics of the revenue table (describe() ignores NaNs).
revenue.describe()

## Cleaning the data

In the papers related to this data it says that:
- GTR30, EXT10 and EXT30 were not allowed for stands that didn't reach maturity
- No-thinning (NTLR) was not allowed for stands that are not thinned under the BAU scheme
- SA was not allowed for stands that have no thinning or harvest under BAU either

Now it looks like the NaN values are at the same places in all the files. It would be good to examine that more thoroughly:

In [3]:
# Compare the NaN masks of carbon and HA element-wise; .all() reduces per
# column, giving one boolean per column (True = NaNs sit in the same rows).
(carbon.isnull()==HA.isnull()).all()

In [4]:
# Compare the NaN masks of HA and deadwood element-wise; .all() reduces per
# column, giving one boolean per column (True = NaNs sit in the same rows).
(HA.isnull()==deadwood.isnull()).all()

In [5]:
# Compare the NaN masks of deadwood and revenue element-wise; .all() reduces
# per column, giving one boolean per column (True = NaNs sit in the same rows).
(deadwood.isnull()==revenue.isnull()).all()

We can now conclude that the NaN values are in exactly the same places in all the files. We can therefore remove all the rows (forest stands) that have any NaN values, and the ids of the rows (stands) will still match.

In [6]:
# Keep only the rows (stands) that have no NaN in any column, table by table.
carbon_clean, HA_clean, deadwood_clean, revenue_clean = (
    table.dropna(axis=0, how='any')
    for table in (carbon, HA, deadwood, revenue)
)

In [7]:
# Row count of the uncleaned carbon table, followed by its per-column summary.
print('Size of the original dataset: %r' % carbon.shape[0])
carbon.describe()

In [8]:
# Row count of the cleaned carbon table (rows with any NaN dropped), followed
# by its per-column summary. The format spec is '%r', matching the cell that
# reports the original size; the stray space flag ('% r') has been removed.
print('Size of cleaned dataset: %r' % len(carbon_clean))
carbon_clean.describe()

## Relationships between Nans

In [9]:
# Among the rows where BAU is NaN, count the non-null entries of every column.
carbon.loc[carbon['BAU'].isnull()].count()

In [10]:
# Among the rows where SA is NaN, count the non-null entries of every column.
carbon.loc[carbon['SA'].isnull()].count()

In [11]:
# Among the rows where EXT10 is NaN, count the non-null entries of every column.
carbon.loc[carbon['EXT10'].isnull()].count()

In [12]:
# Among the rows where EXT30 is NaN, count the non-null entries of every column.
carbon.loc[carbon['EXT30'].isnull()].count()

In [13]:
# Among the rows where GTR30 is NaN, count the non-null entries of every column.
carbon.loc[carbon['GTR30'].isnull()].count()

In [14]:
# Among the rows where NTSR is NaN, count the non-null entries of every column.
carbon.loc[carbon['NTSR'].isnull()].count()

In [15]:
# Among the rows where NTLR is NaN, count the non-null entries of every column.
carbon.loc[carbon['NTLR'].isnull()].count()

### Outcomes of comparisons

- BAU and SA are never Nan
- If EXT10 is NaN
 - then EXT30 and GTR30 are NaN
- If EXT30 is NaN
 - then EXT10 is NaN
- If GTR30 or NTLR is NaN
 - doesn't imply anything else


## Objective values

Objective function as described in the paper "Spatially dynamic forest management to sustain biodiversity and economic returns". So if the value is Nan, then apply the Business as Usual scheme.

In [16]:
def objective_value(obj, x):
    '''
    Total objective value when row i of the dataframe uses the
    column at position x[i].

    If the chosen cell of some row is Nan, then the first value
    of that row (column position 0) is used instead (according
    to the original source of the data).

    Parameters
    ----------
    obj : pandas.DataFrame
        Numeric table; only its first len(x) rows are summed.
    x : sequence of int
        Positional column index for each row.

    Returns
    -------
    float
        Sum of the selected values, 0.0 when x is empty. The
        result is Nan if a fallback value (column 0) is itself Nan.

    Raises
    ------
    IndexError
        If x is longer than obj or contains a position outside
        the columns of obj.
    '''
    n_rows = len(x)
    if n_rows == 0:
        # Nothing selected: same result as the empty loop would give.
        return .0
    values = np.asarray(obj.values, dtype=float)
    rows = np.arange(n_rows)
    # Integer-array indexing picks values[i, x[i]] for every row at once.
    chosen = values[rows, np.asarray(x)]
    fallback = values[rows, 0]
    # NumPy's sum may differ from a sequential Python loop in the last
    # floating-point digits, which is irrelevant for these totals.
    return np.where(np.isnan(chosen), fallback, chosen).sum()

In [17]:
# Total carbon when every row uses the same scheme (column positions 0..6);
# NaN cells fall back to column 0 inside objective_value.
carbon_values = pd.DataFrame(
    {
        scheme: objective_value(carbon, np.full(len(carbon), position, dtype=int))
        for position, scheme in enumerate(
            ['BAU', 'SA', 'EXT10', 'EXT30', 'GTR30', 'NTSR', 'NTLR'])
    },
    index=pd.Series([1]),
)

In [18]:
# Show the one-row table of per-scheme totals computed from the original
# (uncleaned) carbon data.
print('Maximum values for all the harvesting schemes using original data:')
carbon_values

In [19]:
# Same per-scheme totals as for the original data, but computed only from
# the rows that survived the NaN cleaning (column positions 0..6).
clean_carbon_values = pd.DataFrame(
    {
        scheme: objective_value(
            carbon_clean, np.full(len(carbon_clean), position, dtype=int))
        for position, scheme in enumerate(
            ['BAU', 'SA', 'EXT10', 'EXT30', 'GTR30', 'NTSR', 'NTLR'])
    },
    index=pd.Series([1]),
)

In [20]:
# Show the one-row table of per-scheme totals computed from the cleaned data
# (rows with any NaN dropped). The label used to say "original data", a
# copy-paste leftover from the cell that shows carbon_values.
print('Maximum values for all the harvesting schemes using cleaned data:')
clean_carbon_values

In [21]:
# Bar chart of the per-scheme totals from the original data; the object
# returned by .plot() is kept in carbon_pic.
carbon_pic = carbon_values.plot(kind='bar')

The resulting bar graph looks the same as the one in the paper.

In [22]:
# Bar chart of the per-scheme totals computed from the cleaned data.
clean_carbon_values.plot(kind='bar')

## Different way to evaluate data with Nan-values

In [23]:
# Per-row column choice: position 6 for every row of carbon (the 'NTLR'
# entry in the per-scheme tables built earlier).
x = np.full(len(carbon), 6, dtype=int)

In [24]:
%%time 
print(sum([carbon.iat[ind, col] if not np.isnan(carbon.iat[ind, col]) else carbon.iat[ind, 0] for ind,col in zip(range(len(x)), x)]))

## Print statistics about different objectives

Calculate the ratio of rows that contain no NaNs (the rows kept after cleaning) to all rows

In [25]:
# Share of rows left after dropping every row with a NaN, for carbon and revenue.
carbon_clean.shape[0]/carbon.shape[0], revenue_clean.shape[0]/revenue.shape[0]

Calculate the total ratio of Nan

In [26]:
# Share of NaN cells among all cells of carbon. np.prod replaces the
# deprecated alias np.product, which was removed in NumPy 2.0.
carbon.isnull().sum().sum()/np.prod(np.shape(carbon))

In [27]:
# Per-column summary statistics of the carbon table (describe() ignores NaNs).
carbon.describe()

In [28]:
# Per-column summary statistics of the revenue table (describe() ignores NaNs).
revenue.describe()

In [29]:
# Overall smallest and largest value of each objective table: the first
# min()/max() reduces per column, the second reduces across columns.
obs = [revenue, carbon, deadwood, HA]
for table in obs:
    print(table.min().min(), table.max().max())

The following actually computes the single objective optimums:

In [30]:
# For every objective table: take the best (largest non-NaN) value of each
# row, then add those row maxima up.
for table in obs:
    row_best = np.nanmax(table.values, axis=1)
    print(row_best.sum())

As expected, the values are exactly the same as when calculated using a "real" optimization procedure.