In [1]:
%matplotlib inline 
import sys
sys.path.append("..")


import numpy as np
import random
import csv

from idtrees.utils import read_tifs #, load_data # Import data utils
from idtrees.utils.get_data import *
import matplotlib.pyplot as plt
from configs import *

import pandas as pd
from sklearn.model_selection import train_test_split


### Extracting ITCs as differently sized HSI images with a Tree Species Label

In [2]:
im_all_new, new_class_ids, class_id_val, n_px_val, sci_names, special_val_px = get_data()

Number of trees, labeled with species and bounding box:  1165
Classes to be used: [23. 26. 22.  2. 30. 18.]
Counts for these classes [ 53.  97. 103. 139. 169. 367.]
Number of trees, labeled with species and bounding box:  1165


In [3]:
spectra = [] # List of spectrum per pixel # TODO write as ndarray
class_id_new = [] # List of target per pixel 

for index in range(len(im_all_new)):
    # Append the spectra and class id of all pixels in bbox to a list
    n_px = np.prod(im_all_new[index].shape[1:])
    spectra.append(im_all_new[index].reshape(-1, n_px))
    class_id_new.append(new_class_ids[index] * np.ones(n_px))

# Convert list into ndarray
spectra = np.concatenate(spectra, axis=1)#.numpy())
class_id_new = np.concatenate(class_id_new, axis=0)

# Add class ids as zero'th row 
pixel_data = np.vstack((class_id_new[np.newaxis,:], spectra))


In [4]:
freq = pd.DataFrame({'class_ids': class_id_val, 'num_pix': n_px_val, 'sci_names': sci_names})
freq.loc[6, :] = ['34.', str(np.sum(special_val_px)), "ALL OTHER"]

In [5]:
from sklearn.decomposition import PCA
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn import svm, datasets

In [6]:
# Prepare data
n_train = int(.8 * pixel_data.shape[1])
xy = np.rollaxis(pixel_data, 1) # Format X into (n_samples, n_features)
np.random.shuffle(xy) # Shuffle randomly along axis of n_samples 
X = xy[:, 1:] 
Y = xy[:, 0]

In [7]:
# Do PCA
do_pca = True
if do_pca:
    pca = PCA(n_components='mle', whiten=False, svd_solver='full')
    X = pca.fit_transform(X)
print('Shape after [PCA]', X.shape)

Shape after [PCA] (35488, 360)


In [8]:
X_train = X[:n_train, :]
y_train = Y[:n_train]
X_test = X[n_train:, :]
y_test = Y[n_train:]

## 1. Initial SVM

In [9]:
linear = svm.SVC(kernel='linear', C=1, decision_function_shape='ovo').fit(X_train, y_train)
rbf = svm.SVC(kernel='rbf', gamma=1, C=1, decision_function_shape='ovo').fit(X_train, y_train)
poly = svm.SVC(kernel='poly', degree=3, C=1, decision_function_shape='ovo').fit(X_train, y_train)
sig = svm.SVC(kernel='sigmoid', C=1, decision_function_shape='ovo').fit(X_train, y_train)

In [10]:
linear_pred = linear.predict(X_test)
poly_pred = poly.predict(X_test)
rbf_pred = rbf.predict(X_test)
sig_pred = sig.predict(X_test)

# retrieve the accuracy and print it for all 4 kernel functions
accuracy_lin = linear.score(X_test, y_test)
accuracy_poly = poly.score(X_test, y_test)
accuracy_rbf = rbf.score(X_test, y_test)
accuracy_sig = sig.score(X_test, y_test)

print("Accuracy Linear Kernel:", accuracy_lin)
print("Accuracy Polynomial Kernel:", accuracy_poly)
print('Accuracy Radial Basis Kernel:', accuracy_rbf)
print('Accuracy Sigmoid Kernel:', accuracy_sig)
      
# creating a confusion matrix
cm_lin = confusion_matrix(y_test, linear_pred)
cm_poly = confusion_matrix(y_test, poly_pred)
cm_rbf = confusion_matrix(y_test, rbf_pred)
cm_sig = confusion_matrix(y_test, sig_pred)
      
print(cm_lin)
print(cm_poly)
print(cm_rbf)
print(cm_sig)

Accuracy Linear Kernel: 0.6108763031839954
Accuracy Polynomial Kernel: 0.503803888419273
Accuracy Radial Basis Kernel: 0.6107354184277262
Accuracy Sigmoid Kernel: 0.2948717948717949
[[ 362    2   10    0    0  306  231]
 [   0 1732    0    0   45    0   21]
 [  79    0  278    0    0  319  175]
 [  42    0    3    0    0  270   96]
 [   0  167    0    0  211    0    3]
 [  51    0   29    0    0  970  186]
 [ 160   46   92    0    8  421  783]]
[[  77    0    4    0    0  608  222]
 [   1 1525    0    0   25  218   29]
 [  17    0   71    0    0  472  291]
 [   4    2    1    0    0  333   71]
 [   0  199    0    0  138   40    4]
 [  15    8    2    0    0 1033  178]
 [  28   52   18    0    1  679  732]]
[[ 339    2   18    0    0  279  273]
 [   1 1751    0    0   30    0   16]
 [  81    0  260    0    0  278  232]
 [  66    2   23    6    0  217   97]
 [   0  215    0    0  163    0    3]
 [  80    3   49    1    0  918  185]
 [ 162   45   80    0    1  324  898]]
[[ 186   30  305 

## 2. Rebalanced, removing Other, normalize

In [8]:
## Lets normalize and center the data
X_std = X - X.mean(axis=1).reshape(-1,1)
X_std = X_std/X_std.std(axis=1).reshape(-1,1)

## Lets leave out 34
X_std = X_std[Y != 34, :]
Y_std = Y[Y!=34]

X_train, X_test, y_train, y_test = train_test_split(X_std, Y_std, test_size=0.2, random_state=2)


In [9]:
X_train.shape, y_train.shape

((22468, 360), (22468,))

In [10]:
def rebalance_data(X_ub, Y_ub, up_balance_scale=3.):
    # Get classes and counts
    vals, counts = np.unique(Y_ub, return_counts=True)
    # See how bad the inbalance is and choose n_choose according to up_balance_scale
    if counts.max()/counts.min() >up_balance_scale:
        n_choose = int(counts.min()*up_balance_scale)
    else:
        n_choose = int(counts.max())

    X_new = np.zeros((n_choose*vals.shape[0], X_ub.shape[1]))
    Y_new = np.zeros(n_choose*vals.shape[0])
    for i, val in enumerate(vals):
        bool_arr = (Y_ub == val)
        if n_choose-bool_arr.sum()<0:
            random_idxs = np.random.randint(counts[i], size=(n_choose))
            X_new[i*n_choose:(i+1)*n_choose,:] = X_ub[bool_arr, :][random_idxs,:]
        else:
            random_idxs = np.random.randint(counts[i], size=(n_choose-bool_arr.sum()))
            X_new[i*n_choose:(i+1)*n_choose,:] = np.concatenate((X_ub[bool_arr, :],X_ub[bool_arr, :][random_idxs,:]),
                                                                axis=0)
        Y_new[i*n_choose:(i+1)*n_choose] = val
    return X_new, Y_new
X_train, y_train = rebalance_data(X_train, y_train)


## 3. Re-run SVM

In [12]:
linear = svm.SVC(kernel='linear', C=1, decision_function_shape='ovo').fit(X_train, y_train)
rbf = svm.SVC(kernel='rbf', gamma=1, C=1, decision_function_shape='ovo').fit(X_train, y_train)

linear_pred = linear.predict(X_test)
rbf_pred = rbf.predict(X_test)

# retrieve the accuracy and print it for all 4 kernel functions
accuracy_lin = linear.score(X_test, y_test)
accuracy_rbf = rbf.score(X_test, y_test)

print("Accuracy Linear Kernel:", accuracy_lin)
print('Accuracy Radial Basis Kernel:', accuracy_rbf)
      
# creating a confusion matrix
cm_lin = confusion_matrix(y_test, linear_pred)
cm_rbf = confusion_matrix(y_test, rbf_pred)
      
print(cm_lin)
print(cm_rbf)

Accuracy Linear Kernel: 0.7062488873063913
Accuracy Radial Basis Kernel: 0.6150970268826775
[[ 484    1   81  138    0  185]
 [   0 1544    0    0  195    0]
 [ 134    0  533  135    0  118]
 [  42    0   23  284    0   43]
 [   0   35    0    0  380    0]
 [ 161    3  118  238    0  742]]
[[ 425    1   75   78    0  310]
 [   1 1240    0    1  114  383]
 [ 150    0  416   51    0  303]
 [  92    0   30  162    0  108]
 [   0   72    0    0  306   37]
 [ 165    1  156   34    0  906]]


## 3. Running new models

In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn import model_selection
from sklearn.utils import class_weight
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

'''
Lightweight script to test many models and find winners
:param X_train: training split
:param y_train: training target vector
:param X_test: test split
:param y_test: test target vector
:return: DataFrame of predictions
'''

dfs = []
models = [('LogReg', LogisticRegression(max_iter=1e4)), 
          ('RF', RandomForestClassifier(n_jobs=4)),
          ('XGB', XGBClassifier()),
          ('KNN', KNeighborsClassifier(n_jobs=4))
        ]
results = []
names = []
scoring = ['accuracy', 'precision_weighted', 'recall_weighted', 'f1_weighted']
target_names = sci_names
for name, model in models:
    print(name, model, "="*10)
    kfold = model_selection.KFold(n_splits=5, shuffle=True, random_state=90210)
    cv_results = model_selection.cross_validate(model, X_train, y_train, cv=kfold, scoring=scoring)
    clf = model.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    print(name)
    print(classification_report(y_test, y_pred, target_names=target_names))
    print(confusion_matrix(y_test, y_pred))
    results.append(cv_results)
    names.append(name)
    this_df = pd.DataFrame(cv_results)
    this_df['model'] = name
    dfs.append(this_df)
final = pd.concat(dfs, ignore_index=True)

LogReg
                       precision    recall  f1-score   support

       Acer rubrum L.       0.59      0.55      0.57       889
Pinus palustris Mill.       0.98      0.88      0.93      1739
      Quercus alba L.       0.66      0.60      0.63       920
     Quercus coccinea       0.34      0.65      0.44       392
Quercus laevis Walter       0.65      0.93      0.77       415
     Quercus rubra L.       0.68      0.56      0.62      1262

             accuracy                           0.70      5617
            macro avg       0.65      0.70      0.66      5617
         weighted avg       0.73      0.70      0.71      5617

[[ 491    2   95  137    0  164]
 [   0 1535    0    1  203    0]
 [ 141    0  554  117    0  108]
 [  47    0   32  253    0   60]
 [   0   29    0    0  386    0]
 [ 149    2  157  241    2  711]]
RF
                       precision    recall  f1-score   support

       Acer rubrum L.       0.51      0.51      0.51       889
Pinus palustris Mill.       0.9























XGB
                       precision    recall  f1-score   support

       Acer rubrum L.       0.53      0.54      0.53       889
Pinus palustris Mill.       0.97      0.94      0.96      1739
      Quercus alba L.       0.63      0.58      0.60       920
     Quercus coccinea       0.53      0.53      0.53       392
Quercus laevis Walter       0.79      0.88      0.83       415
     Quercus rubra L.       0.66      0.69      0.67      1262

             accuracy                           0.73      5617
            macro avg       0.69      0.69      0.69      5617
         weighted avg       0.73      0.73      0.73      5617

[[ 478    3  106   87    0  215]
 [   2 1639    0    0   98    0]
 [ 155    0  532   51    0  182]
 [  86    0   37  209    0   60]
 [   0   49    0    0  366    0]
 [ 177    0  165   46    0  874]]
KNN
                       precision    recall  f1-score   support

       Acer rubrum L.       0.47      0.53      0.50       889
Pinus palustris Mill.       0.96 

In [None]:
final