In [2]:
import numpy as np
import random
import csv
import time

from idtrees.utils import read_tifs #, load_data # Import data utils
%matplotlib inline 
import matplotlib.pyplot as plt

In [3]:
data = read_tifs.get_hsi_pixels()

Number of trees, labeled with species and bounding box:  1165


In [4]:
print('Data shape: ', data.shape)
print('Number of pixels within bounding boxes: ', data.shape[1])
class_ids = np.unique(data[0,:]) # Class_ids should start with 1
print('Number of classes: ', class_ids.shape[0])

Data shape:  (370, 35488)
Number of pixels within bounding boxes:  35488
Number of classes:  33


# Testing baseline scikit-learn classifiers

In [5]:
# Prepare data
n_train = int(.8 * data.shape[1])
xy = np.rollaxis(data, 1) # Format X into (n_samples, n_features)
np.random.shuffle(xy) # Shuffle randomly along axis of n_samples 
X = xy[:, 1:] 
Y = xy[:, 0]

In [6]:
# Do PCA
do_pca = True
if do_pca:
    from sklearn.decomposition import PCA
    pca = PCA(n_components=40, whiten=False)
    X = pca.fit_transform(X)
print('Shape after pca', X.shape)

Shape after pca (35488, 40)


In [7]:
X_train = X[:n_train, :]
Y_train = Y[:n_train]
X_val = X[n_train:, :]
Y_val = Y[n_train:]

## Fit and test multiclass logistic regression
Insights:
- Choice of solver has no effect on clf accuracy (tested newton-cg, lbfgs, saga with c=10., max_iter=1e5)
- SAGA optimizer took significantly longer to train, but its the only optimizer that can do l1 or elasticnet penalty.
- Reg: Best $c=1e3$. Weaker regularization, c $\uparrow$, increases predictive accuracy. Acc. increases only by $1e{-}3$ for $c>1e3$. Oddly enough weaker regularization increases training time (c=1, 12.7s; c=1e3, 87s); maybe because optimization is more challenging. 
- max_iter > 1e5 doesnt impact clf accuracy for lbfgs solver, c=10
- introducing class weight to balance dataset significantly reduced acc.

In [8]:
# Fit and test multiclass logistic regression
from sklearn.linear_model import LogisticRegression
print('Input shape: X: {},  Y: {}'.format(X_train.shape, Y_train.shape))
for c in [1., 1e2, 1e3]:
    start = time.time()
    clf = LogisticRegression(random_state=0, C = c, multi_class='multinomial', max_iter=1e5).fit(X_train, Y_train)
    print('Log Reg mean acc. with c=%6.1f in %13.1f s: \t%14.7f'%(c, time.time() - start , clf.score(X_val, Y_val)))
    # clf.predict(X_test[:, :])
    # c=1e3 - 0.62454; c=1e4 - 0.

Input shape: X: (28390, 40),  Y: (28390,)
Log Reg mean acc. with c=1.0 in          12.7s:0.5635390
Log Reg mean acc. with c=100.0 in          86.9s:0.6222880
Log Reg mean acc. with c=1000.0 in         192.2s:0.6213018


In [None]:
# Check out Random Forest classifier
from sklearn.ensemble import RandomForestClassifier
clf = RandomForestClassifier(max_depth=20, random_state=0).fit(X_train, Y_train)
# clf.predict(X_val[:n_samples, :])
print('Random Forest score: ', clf.score(X_val, Y_val))