In [263]:
 %matplotlib inline
import seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

## The Boreal Forest Data

TODO: Describe where the data comes from and what it represents. Describe the variables and the different "levels" (BAU, SA, etc.).

In [264]:
# The CSV tables live in ../boreal_data relative to the current working directory.
data_dir = os.path.join(os.getcwd(), '../boreal_data')

# Read the four tables; file names are listed in the same order as the
# variables they are unpacked into.
carbon, HA, deadwood, revenue = (
    pd.read_csv(os.path.join(data_dir, csv_name))
    for csv_name in ('Carbon_storage.csv', 'Combined_HA.csv',
                     'Deadwood_volume.csv', 'Timber_revenues.csv')
)

## Cleaning the data

The papers related to this data state that:
- GTR30, EXT10 and EXT30 were not allowed for stands that didn't reach maturity
- No-thinning was not allowed for stands where there was no thinning according to the BAU scheme
- SA was not allowed for stands where there was no thinning or harvest in BAU either

Now it looks like the NaN values are at the same places in all the files. It would be good to examine that more thoroughly:

In [265]:
# Per column: True only if carbon and HA have NaN in exactly the same rows.
(carbon.isnull()==HA.isnull()).all()

In [266]:
# Per column: True only if HA and deadwood have NaN in exactly the same rows.
(HA.isnull()==deadwood.isnull()).all()

In [267]:
# Per column: True only if deadwood and revenue have NaN in exactly the same rows.
(deadwood.isnull()==revenue.isnull()).all()

We can now conclude that the NaN values are in exactly the same places in all the files. We can therefore remove all the rows (forest stands) that have any NaN values, and the ids of the rows (stands) will still match.

In [268]:
# Drop every row (stand) that has at least one NaN, table by table. The
# original row labels are kept, so rows can still be matched across tables.
carbon_clean, HA_clean, deadwood_clean, revenue_clean = (
    table.dropna(axis=0, how='any')
    for table in (carbon, HA, deadwood, revenue)
)

In [269]:
# Row count of the raw carbon table, followed by its per-column summary statistics.
print('Size of the original dataset: %r' % len(carbon))
carbon.describe()

In [270]:
# Row count of the carbon table after dropping rows with NaN, followed by its
# per-column summary statistics.
print('Size of cleaned dataset: % r' % len(carbon_clean))
carbon_clean.describe()

## Relationships between Nans

In [271]:
# Among the rows where BAU is NaN, count the non-NaN entries in every column.
carbon[carbon.BAU.isnull()].count()

In [272]:
# Among the rows where SA is NaN, count the non-NaN entries in every column.
carbon[carbon.SA.isnull()].count()

In [273]:
# Among the rows where EXT10 is NaN, count the non-NaN entries in every column.
carbon[carbon.EXT10.isnull()].count()

In [274]:
# Among the rows where EXT30 is NaN, count the non-NaN entries in every column.
carbon[carbon.EXT30.isnull()].count()

In [275]:
# Among the rows where GTR30 is NaN, count the non-NaN entries in every column.
carbon[carbon.GTR30.isnull()].count()

In [276]:
# Among the rows where NTSR is NaN, count the non-NaN entries in every column.
carbon[carbon.NTSR.isnull()].count()

In [277]:
# Among the rows where NTLR is NaN, count the non-NaN entries in every column.
carbon[carbon.NTLR.isnull()].count()

### Outcomes of comparisons

- BAU and SA are never NaN
- If EXT10 is NaN
  - then EXT30 and GTR30 are NaN
- If EXT30 is NaN
  - then EXT10 is NaN
- If GTR30 or NTLR is NaN
  - doesn't imply anything else


## Correlation of Nan within the data

In [278]:
# Among the rows where BAU is NaN, count the non-NaN entries in every column.
# NOTE(review): this is the same expression as the BAU cell in the
# "Relationships between Nans" section; no correlation is computed here yet.
carbon[carbon.BAU.isnull()].count()

## Objective values

The objective function is the one described in the paper "Spatially dynamic forest management to sustain biodiversity and economic returns". If the value for the chosen scheme is NaN, the Business as Usual (BAU) value is used instead.

In [279]:
def objective_value(obj, x):
    '''
    Total objective value of a per-row column selection.

    obj: dataframe holding one objective value per row and column.
    x:   sequence of integer column positions; x[k] is the column
         chosen for row k (rows are addressed by position).

    For every row the value in the chosen column is added to the
    total. When that value is Nan, the value in the first column
    (position 0) of the same row is added instead.

    Returns the accumulated sum (0.0 for an empty x).
    '''
    total = .0
    for row, chosen_col in enumerate(x):
        cell = obj.iat[row, chosen_col]
        # Nan in the chosen column: fall back to the first column of this row.
        total += obj.iat[row, 0] if np.isnan(cell) else cell
    return total

In [280]:
# One total per harvesting scheme: the same column position is selected for
# every row of `carbon`. The position of a name in the tuple is the column
# index handed to objective_value.
carbon_values = pd.DataFrame(
    {scheme: objective_value(carbon, np.full(len(carbon), position, dtype=int))
     for position, scheme in enumerate(
         ('BAU', 'SA', 'EXT10', 'EXT30', 'GTR30', 'NTSR', 'NTLR'))},
    index=pd.Series([1]))

In [281]:
# Show the per-scheme totals computed from the original carbon table.
# NOTE(review): each entry is a sum over all rows (see objective_value), not a
# maximum over rows; confirm the wording of the label.
print('Maximum values for all the harvesting schemes using original data:')
carbon_values

In [282]:
# One total per harvesting scheme, computed on the NaN-free table: the same
# column position is selected for every row of `carbon_clean`. The position of
# a name in the tuple is the column index handed to objective_value.
clean_carbon_values = pd.DataFrame(
    {scheme: objective_value(carbon_clean,
                             np.full(len(carbon_clean), position, dtype=int))
     for position, scheme in enumerate(
         ('BAU', 'SA', 'EXT10', 'EXT30', 'GTR30', 'NTSR', 'NTLR'))},
    index=pd.Series([1]))

In [283]:
# Show the per-scheme totals computed from the cleaned (NaN-free) carbon table.
# The label previously said "original data", copied from the cell that shows
# carbon_values; this cell displays clean_carbon_values.
print('Maximum values for all the harvesting schemes using cleaned data:')
clean_carbon_values

In [284]:
# Bar chart of the per-scheme totals from the original data; the object
# returned by .plot() is kept in carbon_pic.
carbon_pic = carbon_values.plot(kind='bar')

The resulting bar graph looks the same as the one in the paper.

In [285]:
# Bar chart of the per-scheme totals computed from the cleaned data.
clean_carbon_values.plot(kind='bar')

## Different way to evaluate data with Nan-values

In [286]:
# One column position per row of `carbon`; every row selects column 6.
# (Label-based variant that was tried: np.array(['NTLR']).repeat(len(carbon)))
x = np.full(len(carbon), 6, dtype=int)

In [287]:
%%time 
# Inline variant of the per-row selection sum: for each row take the value in
# column x[ind], fall back to column 0 when that value is NaN, then sum the list.
print(sum([carbon.iat[ind, col] if not np.isnan(carbon.iat[ind, col]) else carbon.iat[ind, 0] for ind,col in zip(range(len(x)), x)]))

## Concat data


In [288]:
# Stack the first five rows of three tables into one frame; `keys` adds an
# outer index level naming the table each row came from.
all_vars = pd.concat([carbon[:5], HA[:5], deadwood[:5]], keys=['Carbon', 'HA','Deadwood'])

In [289]:
# Check what kind of object pd.concat produced.
type(all_vars)

In [290]:
# Look up ('Carbon', 'BAU') through .loc and check the type of the result --
# presumably the BAU column of the 'Carbon' block; confirm from the output.
type(all_vars.loc['Carbon','BAU'])

This is not as easy as I thought. It is also important to think what is the most efficient structure for the future use.

# Optimization

In [291]:
%%time
# Binary assignment model: every row i of `data` gets exactly one column j,
# chosen so that the sum of the selected entries is as large as possible.
from __future__ import division
from pyomo.environ import *

model = ConcreteModel()


# The cleaned carbon table supplies the coefficients (it contains no NaN rows).
data = carbon_clean
# n = number of rows, m = number of columns of the coefficient table.
model.n = Param(within=NonNegativeIntegers, initialize=len(data))
model.m = Param(within=NonNegativeIntegers, initialize=len(list(data)))

# Zero-based index sets so that (i, j) can be used as positions in data.values.
model.I = RangeSet(0, model.n-1)
model.J = RangeSet(0, model.m-1)

# x[i, j] = 1 if row i is assigned column j, else 0.
model.x = Var(model.I, model.J, domain=Binary, initialize=0.0)
# Starting point: every row is assigned column 0.
for i in model.I:
    model.x[i,0].value = 1.0

def c_init(model, i, j):
    # Coefficient of x[i, j]: the table entry at row position i, column position j.
    return data.values[i,j]


model.c = Param(model.I, model.J, initialize=c_init)


def obj_fun(model):
    # Sum of the coefficients of all selected (i, j) pairs.
    return sum(sum(model.x[i,j]*model.c[i,j] for i in model.I) for j in model.J)


model.OBJ = Objective(rule=obj_fun, sense=maximize)

def constra(model, i):
    # Exactly one column is selected for row i.
    return sum(model.x[i,j] for j in model.J) == 1 

model.Constraint1 = Constraint(model.I, rule=constra)

# Solve with GLPK; tee=False keeps the solver log out of the cell output.
# NOTE(review): the status in `res` is not inspected here before model.x is read.
from pyomo.opt import SolverFactory
opt = SolverFactory('glpk')
res = opt.solve(model, tee=False)

In [292]:
# Tally, for every column index j, how many rows i have x[i, j] selected.
res_dict = dict()
for i in model.I:
    for j in model.J:
        # Round before casting: a solver may report a selected binary variable
        # as e.g. 0.9999999, which int() alone would truncate to 0.
        res_dict[j] = res_dict.get(j,0) + int(round(model.x[i,j].value))
print('Handling, # of stands')
# list(data) gives the column names, so `key` is translated back to its label.
for key in res_dict:
    print("{:8} {}".format(list(data)[key], res_dict[key]))

So it looks like we really are able to solve this problem using the cleaned data, i.e. the original data with all rows containing NaNs removed.

# Clustering

### Feature selection

In [293]:
# Pairwise correlation between the columns of the original carbon table.
carbon.corr()

### Clustering according to the features