# Number of clusters

Let's determine a suitable number of clusters, so that the system used in the live session can be justified. We do this by clustering each objective separately and selecting an appropriate number of clusters for each of them.

In [1]:
 %matplotlib inline
import seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from ASF import ASF
from gradutil import *
from pyomo.opt import SolverFactory
from BorealWeights import BorealWeightedProblem
# Seed passed to cluster() by the top-level timing loop further down
# (kmeans_and_eval iterates over its own `seeds` argument instead).
seedn = 1

In [2]:
%%time
# Fetch the four objective data sets and derive a processed copy of each.
# NOTE(review): nan_to_bau presumably fills missing values with the
# business-as-usual alternative -- confirm against gradutil.
revenue, carbon, deadwood, ha = init_boreal()
n_revenue, n_carbon, n_deadwood, n_ha = (
    nan_to_bau(objective_data)
    for objective_data in (revenue, carbon, deadwood, ha)
)

# Ideal and nadir vectors; both are handed to ASF(...) in the timing cell.
ide, nad = ideal(False), nadir(False)

# GLPK solver instance used by the timing cell.
opt = SolverFactory('glpk')

In [3]:
# All four objectives side by side in one table (columns concatenated).
x = pd.concat([n_revenue, n_carbon, n_deadwood, n_ha], axis=1)
# The same four objectives stacked depth-wise along a third axis.
x_stack = np.dstack([n_revenue, n_carbon, n_deadwood, n_ha])

# Normalized counterparts of both layouts.
# NOTE(review): normalize comes from gradutil; its exact scaling is not
# visible in this notebook.
x_norm, x_norm_stack = normalize(x.values), normalize(x_stack)

Calculate the number of clusters that keeps the user waiting time less than a second.

In [65]:
%%time 
import time
# Grow the cluster count in steps of 10 (first tried value: 60) until building
# and solving the three scalarized problems takes at least MAX_WAIT_SECONDS.
# NOTE(review): the surrounding text talks about a one-second limit while this
# loop has always used 10 -- the value is kept as-is; confirm which is intended.
MAX_WAIT_SECONDS = 10

# The reference point does not depend on the clustering, so build it once and
# keep it out of the timed region.
ref = np.array((ide[0], 0, 0, 0))

dur = 0
nclust1 = 50
while dur < MAX_WAIT_SECONDS:
    nclust1 += 10
    c, xtoc, dist = cluster(x_norm, nclust1, seedn, verbose=0)
    # Number of members per cluster, used as the problem weights.
    w = np.array([np.count_nonzero(xtoc == i) for i in range(nclust1)])
    # Mean of the stacked, normalized data over the members of each cluster.
    c_mean = np.array([x_norm_stack[xtoc == i].mean(axis=0) for i in range(nclust1)])

    # perf_counter is monotonic, so the measured interval cannot be distorted
    # by system clock adjustments the way time.time() differences can.
    start = time.perf_counter()
    asf = ASF(ide, nad, ref, c_mean, weights=w)
    stom = ASF(ide, nad, ref, c_mean, weights=w, scalarization='stom')
    guess = ASF(ide, nad, ref, c_mean, weights=w, scalarization='guess')
    opt.solve(asf.model)
    opt.solve(stom.model)
    opt.solve(guess.model)
    dur = time.perf_counter() - start

# First tried cluster count whose build-and-solve time reached the limit.
print(nclust1)

So if possible, we try to keep the total number of clusters below that.

In [12]:
def kmeans_and_eval(x, x_opt, x_orig, rng, seeds=range(2,11)):
    """Cluster `x` for every cluster count and seed, then optimize on cluster representatives.

    For each ``nclust`` in `rng` and each seed in `seeds`:

    1. ``cluster(x, nclust, seed, verbose=0)`` gives the assignments ``xtoc``
       and the distances ``dist``.
    2. Every non-empty cluster is represented by the row of `x_opt` that
       belongs to the member with the smallest ``dist``; the weight of the
       cluster is its member count.
    3. A ``BorealWeightedProblem`` is built from the representatives and
       weights and solved with the 'cplex' solver.
    4. ``model_to_real_values(x_orig, prob.model, xtoc)`` is recorded.

    Assumes ``xtoc`` and ``dist`` are 1-D arrays aligned with the rows of
    `x` and `x_opt` (the masking ``dist[xtoc == i]`` relies on it) -- TODO
    confirm against gradutil.cluster.

    Parameters
    ----------
    x : array-like
        Data handed to ``cluster``.
    x_opt : array-like
        Row-aligned data from which the cluster representatives are taken.
    x_orig : array-like
        Data forwarded unchanged to ``model_to_real_values``.
    rng : iterable of int
        Cluster counts to evaluate.
    seeds : iterable of int, optional
        Seeds for the repeated clustering runs.

    Returns
    -------
    distsum : list of list
        ``np.nansum(dist)`` for every [cluster count][seed].
    optires : list of list
        Result of ``model_to_real_values`` for every [cluster count][seed].
    """
    distsum = []
    optires = []
    opt = SolverFactory('cplex')
    for nclust in rng:
        dists = []
        optis = []
        for seedn in seeds:
            c, xtoc, dist = cluster(x, nclust, seedn, verbose=0)
            # Row indices of the members of each cluster; empty clusters are dropped.
            members = [np.flatnonzero(xtoc == i) for i in range(nclust)]
            members = [rows for rows in members if rows.size > 0]
            w = np.array([rows.size for rows in members])
            # argmin is a position *within* the cluster's members, so it has
            # to be mapped back to a row index of the full data through
            # `rows`. Indexing x_opt with the raw argmin (as was done before)
            # picks an unrelated row, often from another cluster.
            c_close = np.array([x_opt[rows[np.argmin(dist[rows])]] for rows in members])
            prob = BorealWeightedProblem(c_close, weights=w)
            opt.solve(prob.model)
            # NOTE(review): if empty clusters were dropped above, the labels in
            # xtoc may no longer match the cluster numbering inside the model
            # -- verify how model_to_real_values maps them.
            optis.append(model_to_real_values(x_orig, prob.model, xtoc))
            dists.append(np.nansum(dist))
        optires.append(optis)
        distsum.append(dists)
    return distsum, optires

In [6]:
# Cluster counts to evaluate: 50, 70, ..., 250.
rng = range(50, 251, 20)
# Seeds for the repeated clustering runs: 2 through 10, the same values as
# the default `seeds` argument of kmeans_and_eval.
seeds = range(2, 11)

In [7]:
# Revenue: columns 0-6 of the concatenated table (assumes seven columns per
# objective -- TODO confirm). Seeds are left at the function default.
distsum_revenue, optires_revenue = kmeans_and_eval(
    x_norm[:, :7],
    x_norm[:, :7],
    x.values[:, :7],
    rng,
)

In [13]:
# Carbon: columns 7-13 of the concatenated table (assumes seven columns per
# objective -- TODO confirm). Seeds are passed explicitly here.
distsum_carbon, optires_carbon = kmeans_and_eval(
    x_norm[:, 7:14],
    x_norm[:, 7:14],
    x.values[:, 7:14],
    rng,
    seeds,
)


In [14]:
%%time
# Deadwood: columns 14-20 of the concatenated table (assumes seven columns per
# objective -- TODO confirm). Seeds are left at the function default.
distsum_deadwood, optires_deadwood = kmeans_and_eval(
    x_norm[:, 14:21],
    x_norm[:, 14:21],
    x.values[:, 14:21],
    rng,
)

In [32]:
%%time
# HA: every column from index 21 onwards in the concatenated table.
# Seeds are left at the function default.
distsum_ha, optires_ha = kmeans_and_eval(
    x_norm[:, 21:],
    x_norm[:, 21:],
    x.values[:, 21:],
    rng,
)

In [33]:
# Set the default figure size through the pyplot handle imported at the top.
# The %pylab magic was only needed to reach rcParams and, as a side effect,
# star-imported numpy and pylab names into the notebook namespace.
plt.rcParams['figure.figsize'] = (15, 12)

fig, ax = plt.subplots(2, 2)
fig.suptitle('Number of clusters and sum of intra cluster distances')

# One panel per objective, in row-major order of the 2x2 grid.
distance_panels = (
    ('Revenue', distsum_revenue),
    ('Carbon', distsum_carbon),
    ('Deadwood', distsum_deadwood),
    ('HA', distsum_ha),
)
for panel_ax, (panel_title, panel_dists) in zip(ax.flat, distance_panels):
    # Average the per-seed distance sums for every cluster count.
    panel_ax.plot(rng, np.mean(panel_dists, axis=1))
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel('Number of clusters')
    panel_ax.set_ylabel('Mean summed distance over seeds')

In [60]:
fig, ax = plt.subplots(2, 2)
# Title typo fixed ('minimu,' -> 'minimum,').
fig.suptitle('Number of clusters and minimum, average and maximum of optimization results')

# Results arranged to mirror the 2x2 grid of axes.
data = np.array([[optires_revenue, optires_carbon], [optires_deadwood, optires_ha]])
names = np.array([['Revenue', 'Carbon'], ['Deadwood', 'Habitat']])
for i in range(np.shape(ax)[0]):
    for j in range(np.shape(ax)[1]):
        # Per cluster count: (max, mean, min) over the seeds (axis 1), which
        # plot() draws as three separate lines.
        summary = list(zip(np.max(data[i, j], axis=1),
                           np.mean(data[i, j], axis=1),
                           np.min(data[i, j], axis=1)))
        stat_lines = ax[i, j].plot(rng, summary)
        # Label the lines so the three statistics can be told apart.
        ax[i, j].legend(stat_lines, ('max', 'mean', 'min'))
        ax[i, j].set_title(names[i, j])
        ax[i, j].set_xlabel('Number of clusters')

From the plots we can say that the more clusters we have, the "better" the clusters are. From the optimization perspective, however, the results do not actually get better with more clusters for all the objectives. For Revenue and Carbon they do, but for Deadwood and HA not really. This should be studied with even more clusters at some point.
Issues with the Deadwood and HA values have also arisen before, and the same issue appears here again.

The dispersion of the results decreases as the number of clusters increases, which is of course good news.

Nowadays we also have the map data, so we could use it here as well. We might get better results, but it would mean more data handling without contributing much to this thesis.

## We can conclude that we simply fix the number of clusters to be as large as we can afford, which is about 60 clusters in this case (keeping the calculation time under 1 second).