# Number of clusters

Let's determine a suitable number of clusters, so that the system used in the live session can be justified. We do this by clustering each objective separately and selecting an appropriate number of clusters for each of them.

In [1]:
# %pylab inline injects the `pylab` name used on the next line into the
# interactive namespace and renders matplotlib figures inside the notebook.
%pylab inline
# Default figure size for the plots drawn in this notebook.
pylab.rcParams['figure.figsize'] = (15,12)

import seaborn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os
from ASF import ASF
# Star import: the helpers called without a prefix in later cells
# (init_boreal, nan_to_bau, ideal, nadir, new_normalize, cluster,
# model_to_real_values) are not defined in this notebook, so they
# presumably come from here -- verify against gradutil.
from gradutil import *
from pyomo.opt import SolverFactory
from BorealWeights import BorealWeightedProblem
# Seed handed to cluster(); a later cell assigns the same value again.
seedn = 2

In [2]:
%%time
revenue, carbon, deadwood, ha = init_boreal()
n_revenue = nan_to_bau(revenue)
n_carbon= nan_to_bau(carbon)
n_deadwood = nan_to_bau(deadwood)
n_ha = nan_to_bau(ha)
ide = ideal(False)
nad = nadir(False)
opt = SolverFactory('cplex')

In [3]:
# The same four tables in two layouts: side by side as one wide frame (x)
# and stacked along a third axis (x_stack).
objective_tables = [n_revenue, n_carbon, n_deadwood, n_ha]
x = pd.concat(objective_tables, axis=1)
x_stack = np.dstack(objective_tables)

In [4]:
# Normalize every objective on its own with new_normalize.
# (An earlier variant normalized the combined data instead, via
# normalize(x.values) and normalize(x_stack).)
revenue_norm, carbon_norm, deadwood_norm, ha_norm = (
    new_normalize(table.values)
    for table in (n_revenue, n_carbon, n_deadwood, n_ha)
)

# Wide layout is used for clustering, stacked layout for the optimization models.
normalized_tables = (revenue_norm, carbon_norm, deadwood_norm, ha_norm)
x_norm = np.concatenate(normalized_tables, axis=1)
x_norm_stack = np.dstack(normalized_tables)

Let's try to find the maximum number of clusters for which the computation is still cheaper than with the original data. First, measure how long it takes to solve the three additional NIMBUS scalarizations using the original data.

In [5]:
%%time
ref = np.array((ide[0], 0, 0, 0))
data = x_norm_stack
weights = np.ones(len(data))/len(data)
asf = ASF(ide, nad, ref, data, weights=weights)
stom = ASF(ide, nad, ref, data, weights=weights, scalarization='stom')
guess = ASF(ide, nad, ref, data, weights=weights, scalarization='guess')
opt.solve(asf.model)
opt.solve(stom.model)
opt.solve(guess.model)

We could decide that we accept a clustering that takes at most as much time as this (8 min 17 s).
Let's pick some number of clusters and try it.

In [7]:
%%time
nclust = 8300
seedn = 2
c, xtoc, dist = cluster(x_norm, nclust, seedn, verbose=0)

In [8]:
%%time
from scipy.spatial.distance import euclidean 
x_opt = x_norm_stack
w = np.array([sum(xtoc == i) for i in range(nclust) if sum(xtoc == i) > 0])
            # Calculate the euclidian center of the cluster (mean)
            # and then the point closest to that center according to
            # euclidian distance, and then use the data format meant
            # for optimization
c_max = np.array([x_opt[min(np.array(range(len(xtoc)))[xtoc == i],
                              key=lambda index: euclidean(x_norm[index],np.mean(x_norm[xtoc == i],axis=0)))]
                    for i in range(nclust) if sum(xtoc == i) > 0])
ref = np.array((ide[0], 0, 0, 0))
asf = ASF(ide, nad, ref, c_max, weights=w)
stom = ASF(ide, nad, ref, c_max, weights=w, scalarization='stom')
guess = ASF(ide, nad, ref, c_max, weights=w, scalarization='guess')
opt.solve(asf.model)
opt.solve(stom.model)
opt.solve(guess.model)

It looks like about 8500 is the maximum number of clusters that we are willing to use.

In [28]:
%%time
w = np.array([sum(xtoc == i) for i in range(nclust) if sum(xtoc==i) > 0])
c_close = np.array([x_this[np.argmin(dist[xtoc == i])] for i in range(nclust) if len(dist[xtoc == i]) > 0])

Calculate the number of clusters that keeps the user waiting time less than 10 seconds.

In [29]:
%%time 
import time
dur = 0
nclust1 = 50
nclust2 = 8500
current_clust = nclust1
while nclust1 + 10 < nclust2:
    c, xtoc, dist = cluster(x_norm, current_clust, seedn, verbose=0)
    w = np.array([sum(xtoc == i) for i in range(current_clust) if sum(xtoc==i) > 0])
    indices = [min(np.array(range(len(self.xtoc)))[self.xtoc == i],
                   key=lambda index: euclidean(clustdata[index],
                                               np.mean(clustdata[self.xtoc == i], axis=0)))
                   for i in range(nclust) if sum(self.xtoc == i) > 0]
    c_mean = x_norm_stack[indices]
    #c_mean = np.array([x_norm_stack[np.argmin(dist[xtoc == i])] for i in range(current_clust) if len(dist[xtoc==i]) > 0])
    start = time.time()
    ref = np.array((ide[0], 0, 0, 0))
    asf = ASF(ide, nad, ref, c_mean, weights=w)
    stom = ASF(ide, nad, ref, c_mean, weights=w, scalarization='stom')
    guess = ASF(ide, nad, ref, c_mean, weights=w, scalarization='guess')
    opt.solve(asf.model)
    opt.solve(stom.model)
    opt.solve(guess.model)
    dur = time.time() - start
    if dur >= 10:
        print('Over 10: {}'.format(current_clust))
        nclust2 = current_clust
        current_clust = int((current_clust - nclust1)/2 + nclust1)
    else:
        print('Under 10: {}'.format(current_clust))
        nclust1 = current_clust
        current_clust = int((nclust2 - current_clust)/2 + current_clust)
print('Final: {}'.format(current_clust))

So if possible, we try to keep the total number of clusters below that.

In [7]:
def kmeans_and_eval(x, x_opt, x_orig, rng, seeds):
    """Cluster `x` for several cluster counts and seeds and optimize on the result.

    For every cluster count in `rng` and every seed in `seeds` the data is
    clustered, each non-empty cluster is replaced by the member with the
    smallest entry in the distances returned by `cluster`, a
    BorealWeightedProblem is solved on those representatives (weighted by
    cluster size), and the solution is evaluated with model_to_real_values.

    Parameters
    ----------
    x : data handed to `cluster`.
    x_opt : array indexed by row; source of the representative rows given to
        the optimization problem. Assumed to have the same row order as `x`.
    x_orig : passed through to model_to_real_values together with the solved
        model and the cluster assignment.
    rng : iterable of cluster counts.
    seeds : iterable of seeds, one clustering per seed and cluster count.

    Returns
    -------
    (distsum, optires) : two lists of lists indexed [cluster count][seed];
        `distsum` holds np.nansum of the distances returned by `cluster`,
        `optires` the values returned by model_to_real_values.
    """
    distsum = []
    optires = []
    opt = SolverFactory('cplex')
    for nclust in rng:
        dists = []
        optis = []
        for seedn in seeds:
            _, xtoc, dist = cluster(x, nclust, seedn, verbose=0)
            # Row indices of every non-empty cluster.
            members = [np.flatnonzero(xtoc == i) for i in range(nclust)]
            members = [rows for rows in members if rows.size > 0]
            w = np.array([rows.size for rows in members])
            # np.argmin(dist[rows]) is a position inside the cluster; map it
            # back through `rows` so that the representative is really a
            # member of that cluster rather than one of the first rows of
            # x_opt. Assumes `dist` holds one value per row of `x`.
            c_close = np.array([x_opt[rows[np.argmin(dist[rows])]] for rows in members])
            prob = BorealWeightedProblem(c_close, weights=w)
            opt.solve(prob.model)
            optis.append(model_to_real_values(x_orig, prob.model, xtoc))
            dists.append(np.nansum(dist))
        optires.append(optis)
        distsum.append(dists)
    return distsum, optires

### Calculating indices up to 600 clusters

In [8]:
# Cluster counts to evaluate: 50, 100, ..., 600.
rng600 = range(50, 600 + 1, 50)
# Seeds for the independent clustering runs: 2, 3, ..., 10.
seeds = range(2, 10 + 1)

In [9]:
%%time
distsum_revenue600, optires_revenue600 = kmeans_and_eval(x_norm[:,:7],x_norm[:,:7],x.values[:,:7], rng600, seeds)

In [10]:
%%time
distsum_carbon600, optires_carbon600 = kmeans_and_eval(x_norm[:,7:14],x_norm[:,7:14],x.values[:,7:14], rng600, seeds)


In [11]:
%%time
distsum_deadwood600, optires_deadwood600 = kmeans_and_eval(x_norm[:,14:21],x_norm[:,14:21],x.values[:,14:21], rng600, seeds)

In [12]:
%%time
distsum_ha600, optires_ha600 = kmeans_and_eval(x_norm[:,21:],x_norm[:,21:],x.values[:,21:], rng600, seeds)

In [8]:
# 2x2 grid, one panel per objective: mean (over the seeds) of the summed
# intra-cluster distances as a function of the number of clusters.
pylab.rcParams['figure.figsize'] = (15,12)
fig, ax = plt.subplots(2,2)
fig.suptitle('Number of clusters and sum of intra cluster distances. Values from {} independent runs'.format(len(seeds)))

values = np.array([[distsum_revenue600, distsum_carbon600],
                   [distsum_deadwood600, distsum_ha600]])
names = np.array([['Revenue', 'Carbon'],
                  ['Deadwood', 'Combined Habitat']])

for i, axes_row in enumerate(ax):
    for j, panel in enumerate(axes_row):
        seed_mean = np.mean(values[i, j], axis=1)
        panel.plot(rng600, seed_mean)
        panel.set_title(names[i, j])

In [14]:
# 2x2 grid, one panel per objective: maximum, mean and minimum (over the
# seeds) of the optimization results per cluster count, plus a horizontal
# line at the corresponding component of ideal(False).
pylab.rcParams['figure.figsize'] = (15,12)
fig, ax = plt.subplots(2,2)
fig.suptitle('Number of clusters and minimum, average and maximum of optimization results. Values from {} independent runs'.format(len(seeds)))

data = np.array([[optires_revenue600, optires_carbon600],
                 [optires_deadwood600, optires_ha600]])
names = np.array([['Revenue', 'Carbon'],
                  ['Deadwood', 'Habitat']])
optims = np.array([ideal(False)[:2], ideal(False)[2:]])
for i, axes_row in enumerate(ax):
    for j, panel in enumerate(axes_row):
        runs = data[i, j]
        spread = list(zip(np.max(runs, axis=1), np.mean(runs, axis=1), np.min(runs, axis=1)))
        panel.plot(rng600, spread)
        panel.plot((min(rng600), max(rng600)), (optims[i, j], optims[i, j]))
        panel.set_title(names[i, j])

### Calculating indices from 600 to 20 000 clusters

In [None]:
# Cluster counts to evaluate: 600, 1600, ..., 19600 (the stop value 20000 is exclusive).
rng20000 = range(600, 20000, 1000)

In [None]:
%%time
distsum_revenue20000, optires_revenue20000 = kmeans_and_eval(x_norm[:,:7],x_norm[:,:7],x.values[:,:7], rng20000, seeds)

In [None]:
%%time
distsum_carbon20000, optires_carbon20000 = kmeans_and_eval(x_norm[:,7:14],x_norm[:,7:14],x.values[:,7:14], rng20000, seeds)


In [None]:
%%time
distsum_deadwood20000, optires_deadwood20000 = kmeans_and_eval(x_norm[:,14:21],x_norm[:,14:21],x.values[:,14:21], rng20000, seeds)

In [None]:
%%time
distsum_ha20000, optires_ha20000 = kmeans_and_eval(x_norm[:,21:],x_norm[:,21:],x.values[:,21:], rng20000, seeds)

In [None]:
# 2x2 grid, one panel per objective: mean (over the seeds) of the summed
# intra-cluster distances for the large cluster-count range.
fig, ax = plt.subplots(2,2)
fig.suptitle('Number of clusters and sum of intra cluster distances. Values from {} independent runs'.format(len(seeds)))

values = np.array([[distsum_revenue20000, distsum_carbon20000],
                   [distsum_deadwood20000, distsum_ha20000]])
names = np.array([['Revenue', 'Carbon'],
                  ['Deadwood', 'Combined Habitat']])

for i, axes_row in enumerate(ax):
    for j, panel in enumerate(axes_row):
        seed_mean = np.mean(values[i, j], axis=1)
        panel.plot(rng20000, seed_mean)
        panel.set_title(names[i, j])

In [None]:
# 2x2 grid, one panel per objective: maximum, mean and minimum (over the
# seeds) of the optimization results for the large cluster-count range, plus
# a horizontal line at the corresponding component of ideal(False).
fig, ax = plt.subplots(2,2)
fig.suptitle('Number of clusters and minimum, average and maximum of optimization results. Values from {} independent runs'.format(len(seeds)))

data = np.array([[optires_revenue20000, optires_carbon20000],
                 [optires_deadwood20000, optires_ha20000]])
names = np.array([['Revenue', 'Carbon'],
                  ['Deadwood', 'Habitat']])
optims = np.array([ideal(False)[:2], ideal(False)[2:]])
for i, axes_row in enumerate(ax):
    for j, panel in enumerate(axes_row):
        runs = data[i, j]
        spread = list(zip(np.max(runs, axis=1), np.mean(runs, axis=1), np.min(runs, axis=1)))
        panel.plot(rng20000, spread)
        panel.plot((min(rng20000), max(rng20000)), (optims[i, j], optims[i, j]))
        panel.set_title(names[i, j])

## Some final thoughts

In [None]:
# Relative gap between the ideal revenue and the best revenue reached with
# clustered data.
# NOTE(review): the original read `optires_revenue`, which no cell in this
# notebook defines; the best value over both result sets computed above
# (the 600 and the 20000 ranges) is assumed here -- confirm.
id_rev = ideal()['revenue']
best_rev = max(np.max(optires_revenue600), np.max(optires_revenue20000))
(id_rev - best_rev) / id_rev

In [None]:
# Relative gap between the ideal carbon value and the best carbon value
# reached with clustered data.
# NOTE(review): the original read `optires_carbon`, which no cell in this
# notebook defines; the best value over both result sets computed above
# (the 600 and the 20000 ranges) is assumed here -- confirm.
id_carb = ideal()['carbon']
best_carb = max(np.max(optires_carbon600), np.max(optires_carbon20000))
(id_carb - best_carb) / id_carb

In [None]:
# Relative gap between the ideal deadwood value and the best deadwood value
# reached with clustered data.
# NOTE(review): the original read `optires_deadwood`, which no cell in this
# notebook defines; the best value over both result sets computed above
# (the 600 and the 20000 ranges) is assumed here -- confirm.
id_dead = ideal()['deadwood']
best_dead = max(np.max(optires_deadwood600), np.max(optires_deadwood20000))
(id_dead - best_dead) / id_dead

In [None]:
# Relative gap between the ideal habitat value and the best habitat value
# reached with clustered data.
# NOTE(review): the original read `optires_ha`, which no cell in this
# notebook defines; the best value over both result sets computed above
# (the 600 and the 20000 ranges) is assumed here -- confirm.
id_ha = ideal()['ha']
best_ha = max(np.max(optires_ha600), np.max(optires_ha20000))
(id_ha - best_ha) / id_ha

From the plots we can say that the more clusters we have, the "better" the clusters are. From the optimization perspective, however, the results do not actually improve with more clusters for all the objectives: for Revenue and Carbon they do, but for Deadwood and HA not really. This should be studied with even more clusters at some point.
Issues with the Deadwood and HA values have come up before, and the same issue appears here again.

The dispersion of the results decreases as the number of clusters increases, which is of course good news.

Nowadays we also have the map data, so we could use it here as well. We might get better results, but it would require more data handling and would not contribute much to this thesis.

## We can conclude this by saying that we just fix the number of clusters to be as big as we want, which is about 60 clusters in this case. (Keeping calculation time under 1 sec.)