In [1]:
%matplotlib inline 
import sys
sys.path.append("..")


import numpy as np
import random
import csv

from idtrees.utils import read_tifs #, load_data # Import data utils
from idtrees.utils.get_data import *
import matplotlib.pyplot as plt
from configs import *

import pandas as pd
from sklearn.model_selection import train_test_split




### Extracting ITCs as differently sized HSI images with a Tree Species Label

In [2]:
im_all_new, new_class_ids, class_id_val, n_px_val, sci_names, special_val_px = get_data()

Number of trees, labeled with species and bounding box:  1165
Classes to be used: [23. 26. 22.  2. 30. 18.]
Counts for these classes [ 53.  97. 103. 139. 169. 367.]
Number of trees, labeled with species and bounding box:  1165


In [3]:
spectra = [] # List of spectrum per pixel # TODO write as ndarray
class_id_new = [] # List of target per pixel 

for index in range(len(im_all_new)):
    # Append the spectra and class id of all pixels in bbox to a list
    n_px = np.prod(im_all_new[index].shape[1:])
    spectra.append(im_all_new[index].reshape(-1, n_px))
    class_id_new.append(new_class_ids[index] * np.ones(n_px))

# Convert list into ndarray
spectra = np.concatenate(spectra, axis=1)#.numpy())
class_id_new = np.concatenate(class_id_new, axis=0)

# Add class ids as zero'th row 
pixel_data = np.vstack((class_id_new[np.newaxis,:], spectra))


In [4]:
freq = pd.DataFrame({'class_ids': class_id_val, 'num_pix': n_px_val, 'sci_names': sci_names})
freq.loc[6, :] = ['34.', str(np.sum(special_val_px)), "ALL OTHER"]

In [5]:
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import svm, datasets

In [6]:
# Prepare data
n_train = int(.8 * pixel_data.shape[1])
xy = np.rollaxis(pixel_data, 1) # Format X into (n_samples, n_features)
np.random.shuffle(xy) # Shuffle randomly along axis of n_samples 
X = xy[:, 1:] 
Y = xy[:, 0]

In [7]:
# Do PCA
do_pca = True
if do_pca:
    pca = PCA(n_components='mle', whiten=False, svd_solver='full')
    X = pca.fit_transform(X)
print('Shape after [PCA]', X.shape)

Shape after [PCA] (35488, 360)


In [8]:
X_train = X[:n_train, :]
y_train = Y[:n_train]
X_test = X[n_train:, :]
y_test = Y[n_train:]

## 1. Initial SVM

In [9]:
linear = svm.SVC(kernel='linear', C=1, decision_function_shape='ovo').fit(X_train, y_train)
rbf = svm.SVC(kernel='rbf', gamma=1, C=1, decision_function_shape='ovo').fit(X_train, y_train)
poly = svm.SVC(kernel='poly', degree=3, C=1, decision_function_shape='ovo').fit(X_train, y_train)
sig = svm.SVC(kernel='sigmoid', C=1, decision_function_shape='ovo').fit(X_train, y_train)

In [10]:
linear_pred = linear.predict(X_test)
poly_pred = poly.predict(X_test)
rbf_pred = rbf.predict(X_test)
sig_pred = sig.predict(X_test)

# retrieve the accuracy and print it for all 4 kernel functions
accuracy_lin = linear.score(X_test, y_test)
accuracy_poly = poly.score(X_test, y_test)
accuracy_rbf = rbf.score(X_test, y_test)
accuracy_sig = sig.score(X_test, y_test)

print("Accuracy Linear Kernel:", accuracy_lin)
print("Accuracy Polynomial Kernel:", accuracy_poly)
print('Accuracy Radial Basis Kernel:', accuracy_rbf)
print('Accuracy Sigmoid Kernel:', accuracy_sig)
      
# creating a confusion matrix
cm_lin = confusion_matrix(y_test, linear_pred)
cm_poly = confusion_matrix(y_test, poly_pred)
cm_rbf = confusion_matrix(y_test, rbf_pred)
cm_sig = confusion_matrix(y_test, sig_pred)
      
print(cm_lin)
print(cm_poly)
print(cm_rbf)
print(cm_sig)

Accuracy Linear Kernel: 0.6138348830656523
Accuracy Polynomial Kernel: 0.5076077768385461
Accuracy Radial Basis Kernel: 0.6073541842772612
Accuracy Sigmoid Kernel: 0.2738799661876585
[[ 347    4   18    0    0  311  225]
 [   0 1694    0    0   39    0   16]
 [  82    0  304    0    0  334  175]
 [  42    0    3    0    0  251  118]
 [   0  169    0    0  258    0    6]
 [  73    0   28    0    0  980  175]
 [ 131   36   85    0   13  407  774]]
[[  63    2    4    0    0  508  328]
 [   3 1464    0    0   20  190   72]
 [  12    0   77    0    0  404  402]
 [   4    1    4    0    0  301  104]
 [   1  227    0    0  156   35   14]
 [  19    6    5    0    0 1030  196]
 [  29   42   22    0    1  539  813]]
[[ 360    6   20    0    0  288  231]
 [   0 1691    0    0   36    1   21]
 [  81    0  295    0    0  290  229]
 [  64    3   26    3    0  209  109]
 [   0  230    0    0  197    1    5]
 [  93    2   38    0    0  931  192]
 [ 138   47   88    0    3  336  834]]
[[ 240  241  126

## 2. Rebalanced, removing Other, normalize

In [23]:
## Lets normalize and center the data
X_std = X - X.mean(axis=1).reshape(-1,1)
X_std = X_std/X_std.std(axis=1).reshape(-1,1)

## Lets leave out 34
X_std = X_std[Y != 34, :]
Y_std = Y[Y!=34]

X_train, X_test, y_train, y_test = train_test_split(X_std, Y_std, test_size=0.2, random_state=42)


In [24]:
X_train.shape, y_train.shape

((22468, 360), (22468,))

In [34]:
def rebalance_data(X_ub, Y_ub, up_balance_scale=3.):
    # Get classes and counts
    vals, counts = np.unique(Y_ub, return_counts=True)
    # See how bad the inbalance is and choose n_choose according to up_balance_scale
    if counts.max()/counts.min() >up_balance_scale:
        n_choose = int(counts.min()*up_balance_scale)
    else:
        n_choose = int(counts.max())

    X_new = np.zeros((n_choose*vals.shape[0], X_ub.shape[1]))
    Y_new = np.zeros(n_choose*vals.shape[0])
    for i, val in enumerate(vals):
        bool_arr = (Y_ub == val)
        if n_choose-bool_arr.sum()<0:
            random_idxs = np.random.randint(counts[i], size=(n_choose))
            X_new[i*n_choose:(i+1)*n_choose,:] = X_ub[bool_arr, :][random_idxs,:]
        else:
            random_idxs = np.random.randint(counts[i], size=(n_choose-bool_arr.sum()))
            X_new[i*n_choose:(i+1)*n_choose,:] = np.concatenate((X_ub[bool_arr, :],X_ub[bool_arr, :][random_idxs,:]),
                                                                axis=0)
        Y_new[i*n_choose:(i+1)*n_choose] = val
    return X_new, Y_new
X_train, y_train = rebalance_data(X_train, y_train)


## 3. Re-run SVM

In [36]:
linear = svm.SVC(kernel='linear', C=1, decision_function_shape='ovo').fit(X_train, y_train)
rbf = svm.SVC(kernel='rbf', gamma=1, C=1, decision_function_shape='ovo').fit(X_train, y_train)
poly = svm.SVC(kernel='poly', degree=3, C=1, decision_function_shape='ovo').fit(X_train, y_train)
sig = svm.SVC(kernel='sigmoid', C=1, decision_function_shape='ovo').fit(X_train, y_train)

In [37]:
linear_pred = linear.predict(X_test)
poly_pred = poly.predict(X_test)
rbf_pred = rbf.predict(X_test)
sig_pred = sig.predict(X_test)

# retrieve the accuracy and print it for all 4 kernel functions
accuracy_lin = linear.score(X_test, y_test)
accuracy_poly = poly.score(X_test, y_test)
accuracy_rbf = rbf.score(X_test, y_test)
accuracy_sig = sig.score(X_test, y_test)

print("Accuracy Linear Kernel:", accuracy_lin)
print("Accuracy Polynomial Kernel:", accuracy_poly)
print('Accuracy Radial Basis Kernel:', accuracy_rbf)
print('Accuracy Sigmoid Kernel:', accuracy_sig)
      
# creating a confusion matrix
cm_lin = confusion_matrix(y_test, linear_pred)
cm_poly = confusion_matrix(y_test, poly_pred)
cm_rbf = confusion_matrix(y_test, rbf_pred)
cm_sig = confusion_matrix(y_test, sig_pred)
      
print(cm_lin)
print(cm_poly)
print(cm_rbf)
print(cm_sig)

Accuracy Linear Kernel: 0.7010859889620794
Accuracy Polynomial Kernel: 0.6368168061242656
Accuracy Radial Basis Kernel: 0.6599608331849742
Accuracy Sigmoid Kernel: 0.32419440982730996
[[ 494    0   93  160    0  167]
 [   1 1542    0    0  197    0]
 [ 106    0  524  123    0  122]
 [  67    0   25  315    0   38]
 [   0   50    0    0  353    0]
 [ 194    0  127  209    0  710]]
[[ 550    1   63  147    3  150]
 [   6 1378    0    7  349    0]
 [ 176    0  423  137    0  139]
 [ 113    0   38  249    0   45]
 [   0   46    0    0  357    0]
 [ 228    1  127  263    1  620]]
[[ 547   74   97   65    0  131]
 [   1 1608    0    0  131    0]
 [ 191   83  465   33    0  103]
 [ 137   61   34  170    0   43]
 [   0  104    1    0  298    0]
 [ 243  160  171   47    0  619]]
[[341  48 350  93  10  72]
 [ 26 808  52 103 727  24]
 [350  37 337  86  12  53]
 [147  43 163  45   9  38]
 [  2 212   2   7 173   7]
 [423 138 380 137  45 117]]


## 3. Running new models

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

'''
Lightweight script to test many models and find winners
:param X_train: training split
:param y_train: training target vector
:param X_test: test split
:param y_test: test target vector
:return: DataFrame of predictions
'''

dfs = []
models = [('LogReg', LogisticRegression()), 
          ('RF', RandomForestClassifier()),
          ('KNN', KNeighborsClassifier()),
          ('XGB', XGBClassifier())
        ]
results = []
names = []
scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']
target_names = sci_names
for name, model in models:
    print(name, model, "="*10)
    kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=90210)
    cv_results = model_selection.cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring)
    clf = model.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(name)
    print(classification_report(y_test, y_pred, target_names=target_names))
    print(confusion_matrix(y_test, y_pred))
    results.append(cv_results)
    names.append(name)
    this_df = pd.DataFrame(cv_results)
    this_df['model'] = name
    dfs.append(this_df)
final = pd.concat(dfs, ignore_index=True)



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

LogReg
                       precision    recall  f1-score   support

       Acer rubrum L.       0.55      0.54      0.54       914
Pinus palustris Mill.       0.96      0.86      0.91      1740
      Quercus alba L.       0.65      0.57      0.61       875
     Quercus coccinea       0.35      0.58      0.44       445
Quercus laevis Walter       0.60      0.87      0.71       403
     Quercus rubra L.       0.65      0.56      0.60      1240

             accuracy                           0.68      5617
            macro avg       0.63      0.66      0.63      5617
         weighted avg       0.70      0.68      0.68      5617

[[ 492    6  103  147    2  164]
 [   1 1504    0   10  223    2]
 [ 119    0  501  124    0  131]
 [  75    0   34  257    2   77]
 [   0   51    0    1  351    0]
 [ 212    1  132  197    6  692]]
RF
                       precision    recall  f1-score   support

       Acer rubrum L.       0.48      0.59      0.53       914
Pinus palustris Mill.       0.9

In [None]:
final