## <span style="color:#0B3B2E;float:right;font-family:Calibri">Jordan Graesser</span>

# MpGlue
### Preparing data and testing model parameters

---
### Ranking and optimizing
---

In [1]:
import mpglue as gl

CL = gl.classification()

In [2]:
# Print help documentation for the Chi-square 
#   test to rank feature importance.
print help(CL.rank_feas)

Help on method rank_feas in module mpglue.classification.classification:

rank_feas(self, rank_text=None, rank_method='chi2', top_feas=1.0, be_quiet=False) method of mpglue.classification.classification.classification instance
    Ranks image features by importance.
    
    Args:
        rank_text (Optional[str]): A text file to write ranked features to. Default is None.
        rank_method (Optional[str]): The method to use for feature ranking. Default is 'chi2' (Chi^2). Choices are 
            ['chi2', 'RF'].
        top_feas (Optional[float or int]): The percentage or total number of features to reduce to. 
            Default is 1., or no reduction.
        be_quiet (Optional[bool]): Whether to be quiet and do not print to screen. Default is False.
    
    Returns:
        None, writes to ``rank_text`` if given and prints results to screen.
    
    Examples:
        >>> # rank image features
        >>> cl.split_samples('/samples.txt', scale_data=True)
        >>> cl.rank_feas(

In [4]:
samples = '../testing/data/08N_points_merged.txt'

# Sample the data
CL.split_samples(samples)

# Rank the explanatory variables with a chi-square test
CL.rank_feas(rank_method='chi2', top_feas=.2)

In [6]:
# Rank the explanatory variables with a Random Forest 
#   model and compare results.

# First, construct a Random Forest model.
CL.construct_model(classifier_info={'classifier': 'RF'})

# Use the RF model to rank feature importance.
CL.rank_feas(rank_method='RF', top_feas=.2)

11:37:01:INFO:4340:classification._train_model:  Training a RF model with 4,783 samples and 41 variables ...


---
### Optimizing parameters
---

In [6]:
print help(CL.optimize_parameters)

Help on method optimize_parameters in module mpglue.classification.classification:

optimize_parameters(self, file_name, classifier_info={'classifier': 'RF'}, n_trees_list=[500, 1000, 1500, 2000], trials_list=[2, 5, 10], max_depth_list=[25, 30, 35, 40, 45, 50], min_samps_list=[2, 5, 10], criterion_list=['gini'], rand_vars_list=['sqrt'], cf_list=[0.25, 0.5, 0.75], committees_list=[1, 2, 5, 10], rules_list=[25, 50, 100, 500], extrapolation_list=[0, 1, 5, 10], class_weight_list=[None, 'balanced', 'balanced_subsample'], learn_rate_list=[0.1, 0.2, 0.4, 0.6, 0.8, 1.0], bool_list=[True, False], c_list=[1.0, 10.0, 20.0, 100.0], gamma_list=[0.001, 0.001, 0.01, 0.1, 1.0, 5.0], k_folds=3, perc_samp=0.5, ignore_feas=[], use_xy=False, classes2remove=[], method='overall', f1_class=0, stratified=False, spacing=1000.0, calibrate_proba=False, output_file=None) method of mpglue.classification.classification.classification instance
    Finds the optimal parameters for a classifier by training and testing

In [None]:
# Find the optimum parameters for a Random Forest,
#   using the default parameter list.
CL.optimize_parameters(samples, 
                       classifier_info={'classifier': 'RF'}, 
                       use_xy=True)


Finding the best paramaters for a RF model ...



---
### Testing
---

In [7]:
emat = gl.error_matrix()

print help(emat)

Help on error_matrix in module mpglue.classification.error_matrix object:

class error_matrix(__builtin__.object)
 |  Computes accuracy statistics
 |  
 |  Args:
 |      po_text (str): Predicted and observed labels as a text file, where (predicted, observed)
 |          are the last two columns.
 |      po_array (ndarray): Predicted and observed labels as an array, where (predicted, observed)
 |          are the last two columns.
 |      header (Optional[bool]): Whether ``file`` or ``predicted_observed`` contains a header. Default is False.
 |      class_list (Optional[list])
 |      discrete (Optional[bool])
 |      e_matrix (Optional[ndarray])
 |  
 |  Attributes:
 |      n_classes (int): Number of unique classes.
 |      class_list (list): List of unique classes.
 |      e_matrix (ndarray): Error matrix.
 |      accuracy (float): Overall accuracy.
 |      report
 |      f_scores (float)
 |      f_beta (float)
 |      hamming (float)
 |      kappa_score (float)
 |      mae (float)
 |

In [9]:
import numpy as np

# Create some random data
test_array = np.random.randn(100, 2).astype('uint8')

emat.get_stats(po_array=test_array)

In [10]:
print dir(emat)

['X', '__class__', '__delattr__', '__dict__', '__doc__', '__format__', '__getattribute__', '__hash__', '__init__', '__module__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', 'accuracy', 'class_list', 'discrete', 'e_matrix', 'error_matrix2xy', 'f_beta', 'f_scores', 'get_stats', 'hamming', 'kappa', 'kappa_score', 'merge_lists', 'n_classes', 'n_samples', 'n_samps', 'producers', 'producers_accuracy', 'report', 'sample_bias', 'time_stamp', 'users', 'users_accuracy', 'write_stats', 'y']


In [11]:
print emat.accuracy

49.0


In [12]:
print emat.e_matrix

[[48  7  1  0 19]
 [ 9  0  0  1  1]
 [ 0  1  0  0  1]
 [ 2  0  0  0  0]
 [ 7  1  1  0  1]]


In [13]:
print emat.kappa_score

-0.0793650793651


In [14]:
print emat.report

             precision    recall  f1-score   support

          0       0.64      0.73      0.68        66
          1       0.00      0.00      0.00         9
          2       0.00      0.00      0.00         2
        254       0.00      0.00      0.00         1
        255       0.10      0.05      0.06        22

avg / total       0.44      0.49      0.46       100



In [1]:
print emat.write_stats('datasets/my_report.txt')

---
### Models in MpGlue
---

In [7]:
# Check the available models.
print CL.model_options()


        Supported models

        Parameter name -- Long name
              {Classifier defaults}
              *Scikit-learn parameter names and defaults

        AB_DT -- AdaBoost with CART (classification problems)
              *Scikit-learn
        AB_EX_DT-- AdaBoost with extremely random trees (classification problems)
              *Scikit-learn
        AB_RF-- AdaBoost with Random Forest (classification problems)
              *Scikit-learn
        AB_EX_RF-- AdaBoost with Extremely Random Forest (classification problems)
              *Scikit-learn
        AB_DTR-- AdaBoost with CART (regression problems)
              *Scikit-learn
        AB_EX_DTR-- AdaBoost with extremely random trees (regression problems)
              *Scikit-learn
        Bag   -- Bagging (classification problems)
              *Scikit-learn
        BagR  -- Bagging (regression problems)
              *Scikit-learn
        Bag_EX_DT-- Bagging with extra trees (classification problems)
              *S

In [24]:
# Construct a Random Forest model with 100 trees.
CL.construct_model(classifier_info={'classifier': 'RF',
                                    'n_estimators': 100})

# The model is stored in `model`.
print CL.model


Training a RF model with 4,783 samples and 41 variables ...

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=25, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=100, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)


In [25]:
# Construct an extremely randomized Random Forest 
#   with 100 trees.
CL.construct_model(classifier_info={'classifier': 'EX_RF',
                                    'n_estimators': 100})

print CL.model


Training an EX_RF model with 4,783 samples and 41 variables ...

ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=25, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)


In [31]:
# Construct a boosted extremely randomized Random Forest with 
#   100 trees and 10 trials (boosts).
CL.construct_model(classifier_info={'classifier': 'AB_EX_RF',
                                    'n_estimators': 100,
                                    'trials': 10})

print CL.model


Training an AB_EX_RF model with 4,783 samples and 41 variables ...

AdaBoostClassifier(algorithm='SAMME.R',
          base_estimator=ExtraTreesClassifier(bootstrap=False, class_weight=None, criterion='gini',
           max_depth=25, max_features='auto', max_leaf_nodes=None,
           min_impurity_split=1e-07, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=-1, oob_score=False, random_state=None,
           verbose=0, warm_start=False),
          learning_rate=0.1, n_estimators=10, random_state=None)


* We trained one model in the examples above.
* MpGlue also supports training ensemble models through Scikit-learn's `VotingClassifier` module.
* To train a voting classifier, simply provide a list of classifiers instead of a string.

In [None]:
# Construct an ensemble voting model and 
#   save the model to file.
CL.construct_model(classifier_info={'classifier': ['RF', 'EX_RF', 'Bayes', 'QDA', 'NN'],
                                    'n_estimators': 100,
                                    'trials': 5},
                   output_model='/voting_model.model')

---
### Making predictions on an image
---

In [None]:
# Load the voting model.
CL.construct_model(input_model='/voting_model.model')

In [None]:
# Apply the model to an image.
CL.predict('/input_image.tif',
           '/output_image.tif')

# *Note that the model could also be loaded with `predict`.
# This syntax would not require the `construct_model` method.
CL.predict('/input_image.tif',
           '/output_image.tif',
           input_model='voting_model.model')

In [None]:
# Apply the model to an image, adjusting the block size.
CL.predict('/input_image.tif',
           '/output_image.tif',
           row_block_size=2048,
           col_block_size=2048)

In [None]:
# Apply the model to an image, adjusting the number of parallel jobs.
CL.predict('/input_image.tif',
           '/output_image.tif',
           n_jobs=4,       # model parallel jobs
           n_jobs_vars=4)  # image band reading parallel jobs

In [None]:
# Apply the model to an image, and then apply
#   posterior probability label relaxation.
CL.predict('/input_image.tif',
           '/output_image.tif',
           relax_probabilities=True)

* The default for `predict` is to apply predictions block by block, reading from one image and writing to one image.
* However, sometimes the input image might be very large, making block writes to the output slow.
* `predict` can write to individual blocks instead of to one image.
* In the example below, individual blocks will be written as `/output_image_00001.tif`, `/output_image_00002.tif`, etc.

In [None]:
# Apply the model to an image, writing to individual blocks.
CL.predict('/input_image.tif',
           '/output_image.tif',
           write2blocks=True)