# How to handle Nan-values so that the HA doesn't get marginalized?

It has been a problem this far, that the clustering doesn't work as desired, and the problem is now located in the procedure Nan-values have been handled in clustering. So we need a better way to do that.

In [1]:
 %matplotlib inline
import seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from gradutil import *
from pyomo.opt import SolverFactory

In [2]:
seedn = 2
opt = SolverFactory('glpk')
solutions = real_solutions()
revenue, carbon, deadwood, ha = init_boreal()
x = pd.concat((revenue, carbon, deadwood, ha), axis=1)
x_bau = pd.concat((nan_to_bau(revenue), nan_to_bau(carbon), nan_to_bau(deadwood), nan_to_bau(ha)), axis=1)

## Cluster Nan:s separately

In [56]:
x_nan = x.iloc[np.any(np.isnan(x), axis=1),:]
x_nan_bau = x_bau.iloc[np.any(np.isnan(x), axis=1),:]
type(x_nan), np.shape(x_nan)

In [5]:
x_num = x.iloc[np.all(~np.isnan(x), axis=1),:]
np.shape(x_num)

Let's cluster all the different "nan-scenarios" differently. Details on how the nan:s relate are found in the DataTesting notebook.

In [6]:
x_nany2y4y6 = x_nan.values[np.logical_and(np.logical_and(np.isnan(x_nan.iloc[:,2]),np.isnan(x_nan.iloc[:,4])), np.isnan(x_nan.iloc[:,6])),:]
x_nany2y4n6 = x_nan.values[np.logical_and(np.logical_and(np.isnan(x_nan.iloc[:,2]),np.isnan(x_nan.iloc[:,4])), ~np.isnan(x_nan.iloc[:,6])),:]
x_nany2n4y6 = x_nan.values[np.logical_and(np.logical_and(np.isnan(x_nan.iloc[:,2]),~np.isnan(x_nan.iloc[:,4])), np.isnan(x_nan.iloc[:,6])),:]
x_nany2n4n6 = x_nan.values[np.logical_and(np.logical_and(np.isnan(x_nan.iloc[:,2]),~np.isnan(x_nan.iloc[:,4])), ~np.isnan(x_nan.iloc[:,6])),:]
x_nann2y4y6 = x_nan.values[np.logical_and(np.logical_and(~np.isnan(x_nan.iloc[:,2]),np.isnan(x_nan.iloc[:,4])), np.isnan(x_nan.iloc[:,6])),:]
x_nann2y4n6 = x_nan.values[np.logical_and(np.logical_and(~np.isnan(x_nan.iloc[:,2]),np.isnan(x_nan.iloc[:,4])), ~np.isnan(x_nan.iloc[:,6])),:]
x_nann2n4y6 = x_nan.values[np.logical_and(np.logical_and(~np.isnan(x_nan.iloc[:,2]),~np.isnan(x_nan.iloc[:,4])), np.isnan(x_nan.iloc[:,6])),:]
x_nann2n4n6 = x_nan.values[np.logical_and(np.logical_and(~np.isnan(x_nan.iloc[:,2]),~np.isnan(x_nan.iloc[:,4])), ~np.isnan(x_nan.iloc[:,6])),:]

In [7]:
np.shape(x_nany2y4y6), np.shape(x_nany2y4n6), np.shape(x_nany2n4y6), np.shape(x_nany2n4n6)

In [8]:
np.shape(x_nann2y4y6), np.shape(x_nann2y4n6), np.shape(x_nann2n4y6), np.shape(x_nann2n4n6)

The combinations we need are then: x_nany2y4y6, x_nany2y4n6, x_nann2y4n6, x_nann2n4y6.

In [9]:
np.shape(x_nany2y4y6)[0]+np.shape(x_nany2y4n6)[0]+np.shape(x_nann2y4n6)[0]+np.shape(x_nann2n4y6)[0]

Actually last three of these nan-versions are so small, that there is probably no point to cluster them anymore. So we can assume them as single clusters for the optimization part. The biggest one should still be splitted a bit more.

In [10]:
x_nan1 = x_nany2y4y6
x_nan2 = x_nany2y4n6
x_nan3 = x_nann2y4n6
x_nan4 = x_nann2n4y6

x_bau1 = x_nan_bau.values[np.logical_and(np.logical_and( np.isnan(x_nan.iloc[:,2]), np.isnan(x_nan.iloc[:,4])), np.isnan(x_nan.iloc[:,6])),:]
x_bau2 = x_nan_bau.values[np.logical_and(np.logical_and( np.isnan(x_nan.iloc[:,2]), np.isnan(x_nan.iloc[:,4])),~np.isnan(x_nan.iloc[:,6])),:]
x_bau3 = x_nan_bau.values[np.logical_and(np.logical_and(~np.isnan(x_nan.iloc[:,2]), np.isnan(x_nan.iloc[:,4])),~np.isnan(x_nan.iloc[:,6])),:]
x_bau4 = x_nan_bau.values[np.logical_and(np.logical_and(~np.isnan(x_nan.iloc[:,2]),~np.isnan(x_nan.iloc[:,4])), np.isnan(x_nan.iloc[:,6])),:]

#### Remove all the columns containing (only) Nans, and normalize column wise to the 0-1 scale.

In [11]:
mask = np.ones(len(x_nan1[0]), dtype=bool)
mask[np.isnan(x_nan1[0])] = False
clust_x_nan1 = x_nan1[:,mask]
norm_clust_nan1 = normalize(clust_x_nan1)

In [68]:
%%time 
nclust1 = 100
c, xtoc, dist = cluster(norm_clust_nan1, nclust1, seedn, verbose=1)

w_nan1 = np.array([sum(xtoc == i) for i in range(len(c))])

c_nan1 = np.array([x_bau1[xtoc == i].mean(axis=0) for i in range(nclust)])

Now because we are using reduced data for the clustering, we need to change the centers to correspond to the dataset where Nans are replaced with Bau values.

In [69]:
c_nan2 = x_bau2.mean(axis=0)
c_nan3 = x_bau3.mean(axis=0)
c_nan4 = x_bau4.mean(axis=0)

w_nan2 = np.shape(x_nan2)[0]
w_nan3 = np.shape(x_nan3)[0]
w_nan4 = np.shape(x_nan4)[0]

combined_data = np.concatenate((c_nan1,np.array((c_nan2, c_nan3, c_nan4))), axis=0)
combined_weights = np.concatenate((w_nan1, np.array((w_nan2, w_nan3, w_nan4))), axis=0)

In [70]:
res_x = np.concatenate((x_nan1, x_nan2, x_nan3, x_nan4), axis=0)
res_xtoc = np.concatenate((xtoc, 
                           np.ones(np.shape(x_nan2)[0])*(nclust), 
                           np.ones(np.shape(x_nan3)[0])*(nclust+1), 
                           np.ones(np.shape(x_nan4)[0])*(nclust+2)))

In [71]:
opt = SolverFactory('glpk')

prob1, prob2, prob3, prob4 = optimize_all(normalize(combined_data), combined_weights, opt)

val1 = model_to_real_values(x_nan_bau.iloc[:, :7].values, res_xtoc, prob1[0].model)
val2 = model_to_real_values(x_nan_bau.iloc[:, 7:14].values, res_xtoc, prob2[0].model)
val3 = model_to_real_values(x_nan_bau.iloc[:, 14:21].values, res_xtoc, prob3[0].model)
val4 = model_to_real_values(x_nan_bau.iloc[:, 21:].values, res_xtoc, prob4[0].model)

In [43]:
norm_num_x = normalize(x_num.values)
norm_nan_x = normalize(x_nan.values)

Here we calculate the optimization result straight away using only lines with nan-values, so that we can compare clustering results to something.

In [18]:
%%time
opt = SolverFactory('glpk')
real_nan_revenue, real_nan_carbon, real_nan_deadwood, real_nan_ha = optimize_all(norm_nan_x, np.ones(len(norm_nan_x)), opt)

In [32]:
revenue_list = values_to_list(real_nan_revenue[0], x_nan_bau.iloc[:,:7].values)
carbon_list = values_to_list(real_nan_carbon[0], x_nan_bau.iloc[:,7:14].values)
deadwood_list = values_to_list(real_nan_deadwood[0], x_nan_bau.iloc[:,14:21].values)
ha_list = values_to_list(real_nan_ha[0], x_nan_bau.iloc[:,21:].values)

In [72]:
print('Relative differences to original values (calculated with Nan:s), 100 clusters')
print("(i)   Harvest revenues {:.3f}".format((val1-sum(revenue_list))/sum(revenue_list)))
print("(ii)  Carbon storage   {:.3f}".format((val2-sum(carbon_list))/sum(carbon_list)))
print("(iii) Deadwood index   {:.3f}".format((val3-sum(deadwood_list))/sum(deadwood_list)))
print("(iv)  Combined Habitat {:.3f}".format((val4-sum(ha_list))/sum(ha_list)))

Relative differences to original values (calculated with Nan:s), 50 clusters
- (i)   Harvest revenues -0.003
- (ii)  Carbon storage   -0.086
- (iii) Deadwood index   -0.405
- (iv)  Combined Habitat 0.036

Relative differences to original values (calculated with Nan:s), 25 clusters
- (i)   Harvest revenues 0.016
- (ii)  Carbon storage   -0.085
- (iii) Deadwood index   -0.437
- (iv)  Combined Habitat 0.081


This time the differences between objectives are not so great. Also now the biggest problem is in the Deadwood and no longer HA. Well, that is some kind of progress.

### Then try also clustering Nan-lines with just replacing Nan:s with BAUs

In [61]:
%%time
nclust2 = 500
n_nan_opt_revenue, n_nan_opt_carbon, n_nan_opt_deadwood, n_nan_opt_ha = cNopt(x_nan_bau.values, x_nan_bau.values, x_nan_bau.values, opt, nclust2, seedn)

In [62]:
print('Relative differences to original values (calculated with Nan:s), 50 clusters')
print("(i) Harvest revenues difference {:.3f}".format((n_nan_opt_revenue-sum(revenue_list))/sum(revenue_list)))
print("(ii) Carbon storage {:.3f}".format((n_nan_opt_carbon-sum(carbon_list))/sum(carbon_list)))
print("(iii) Deadwood index {:.3f}".format((n_nan_opt_deadwood-sum(deadwood_list))/sum(deadwood_list)))
print("(iv) Combined Habitat {:.3f}".format((n_nan_opt_ha-sum(ha_list))/sum(ha_list)))

Results are somehow similar to the previous ones, meaning that relative differences are not so great and the worst one ise Deadwood.

## Clustering and running all


Let's now cluster and optimize everything just by replacing Nan values with BAU values:

In [74]:
nclust3 = 50
all_revenue, all_carbon, all_deadwood, all_ha = cNopt(x_bau.values, x_bau.values, x_bau.values, opt, nclust3, seedn)

In [76]:
print('Relative differences to original values clustering all, 50 clusters')
print("(i) Harvest revenues difference {:.3f}".format((all_revenue-solutions['revenue'])/solutions['revenue']))
print("(ii) Carbon storage {:.3f}".format((all_carbon-solutions['carbon'])/solutions['carbon']))
print("(iii) Deadwood index {:.3f}".format((all_deadwood-solutions['deadwood'])/solutions['deadwood']))
print("(iv) Combined Habitat {:.3f}".format((all_ha-solutions['ha'])/solutions['ha']))

And that doesn't work.

## Cluster separately and optimize together

We could try clustering all the lines including Nan:s using only columns that are not Nan:s.

In [86]:
mask = np.ones(len(x_nan1[0]), dtype=bool)
mask[np.isnan(x_nan1[0])] = False
mask[np.isnan(x_nan2[0])] = False
mask[np.isnan(x_nan3[0])] = False
mask[np.isnan(x_nan4[0])] = False
clust_x_nan = x_nan.values[:,mask]
norm_clust_nan = normalize(clust_x_nan)
norm_clust_num = normalize(x_num.values)

In [None]:
%%time 
nclust4 = 500
c_nan, xtoc_nan, dist_nan = cluster(norm_clust_nan, nclust4, seedn, verbose=1)
c_num, xtoc_num, dist_num = cluster(norm_clust_num, nclust4, seedn, verbose=1)

w_nan = np.array([sum(xtoc_nan == i) for i in range(len(c_nan))])
c_nan = np.array([x_nan_bau.iloc[xtoc_nan == i].mean(axis=0) for i in range(nclust4)])

w_num = np.array([sum(xtoc_num == i) for i in range(len(c_num))])
c_num = np.array([x_num.iloc[xtoc_num == i].mean(axis=0) for i in range(nclust4)])

In [110]:
combined_x = np.concatenate((c_nan, c_num), axis=0)
combined_weights_all = np.concatenate((w_nan, w_num), axis=0)

In [111]:
res_xtoc_all = np.concatenate((xtoc_nan, xtoc_num+nclust4), axis=0)

In [112]:
opt = SolverFactory('glpk')

prob1_all, prob2_all, prob3_all, prob4_all = optimize_all(normalize(combined_x), combined_weights_all, opt)

val1_all = model_to_real_values(x_bau.iloc[:, :7].values, res_xtoc_all, prob1_all[0].model)
val2_all = model_to_real_values(x_bau.iloc[:, 7:14].values, res_xtoc_all, prob2_all[0].model)
val3_all = model_to_real_values(x_bau.iloc[:, 14:21].values, res_xtoc_all, prob3_all[0].model)
val4_all= model_to_real_values(x_bau.iloc[:, 21:].values, res_xtoc_all, prob4_all[0].model)

In [113]:
print('Relative differences to original values clustering all, 50 clusters')
print("(i) Harvest revenues difference {:.3f}".format((val1_all-solutions['revenue'])/solutions['revenue']))
print("(ii) Carbon storage {:.3f}".format((val2_all-solutions['carbon'])/solutions['carbon']))
print("(iii) Deadwood index {:.3f}".format((val3_all-solutions['deadwood'])/solutions['deadwood']))
print("(iv) Combined Habitat {:.3f}".format((val4_all-solutions['ha'])/solutions['ha']))

Final words from this testing are as follows: Just replacing NAN values with BAU values in clustering and optimization works better than previous approaches. Even better options is to cluster all the lines containing some Nan-values separately using just columns without any Nan:s. This far the best option is to cluster all the 'Nan-classes' separately, using spesific number of features for every class.