In [None]:
%matplotlib inline


# Classifier comparison


A comparison of several classifiers in scikit-learn on synthetic datasets.
The point of this example is to illustrate the nature of decision boundaries
of different classifiers.
This should be taken with a grain of salt, as the intuition conveyed by
these examples does not necessarily carry over to real datasets.

Particularly in high-dimensional spaces, data can more easily be separated
linearly and the simplicity of classifiers such as naive Bayes and linear SVMs
might lead to better generalization than is achieved by other classifiers.

The plots show training points in solid colors and testing points
semi-transparent. The lower right shows the classification accuracy on the test
set.


In [None]:
# Modified from example developed by Gaël Varoquaux and Andreas Müller
# License: BSD 3 clause

import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_moons, make_circles, make_classification
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

In [None]:
# Configuration for the comparison: mesh resolution, display names and the
# estimators themselves.
h = .02  # step size in the mesh

# One seed shared by every estimator that accepts `random_state`, so that a
# fresh "Restart & Run All" reproduces the same scores and decision surfaces.
RANDOM_STATE = 42

# Display names, index-aligned with `classifiers` below: the plotting cell
# pairs the two lists positionally with zip().
names = [
    "Nearest Neighbors",
    "Gaussian Process",
    "Decision Tree",
    "Random Forest",
    "Neural Net",
    "Naive Bayes",
]

classifiers = [
    KNeighborsClassifier(3),
    GaussianProcessClassifier(1.0 * RBF(1.0), warm_start=True,
                              random_state=RANDOM_STATE),
    DecisionTreeClassifier(max_depth=5, random_state=RANDOM_STATE),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1,
                           random_state=RANDOM_STATE),
    MLPClassifier(alpha=1, random_state=RANDOM_STATE),
    GaussianNB(),
]

# zip() silently truncates to the shorter list, so a name/estimator mismatch
# would otherwise go unnoticed and mislabel or drop panels.
assert len(names) == len(classifiers), \
    "names and classifiers must have the same length"

In [None]:
# Load the Landsat 5 and Landsat 8 training samples and build the single
# (features, labels) pair that the plotting cell iterates over as `datasets`.
import pandas as pd

df1 = pd.read_csv('./data/landsat5_training.csv')
df2 = pd.read_csv('./data/landsat8_training.csv')
# ignore_index avoids duplicated row labels after stacking the two files.
df = pd.concat([df1, df2], ignore_index=True)
print(df.head())

# Only two features are used so the decision surface can be drawn in a plane.
# An earlier version of this cell also referenced the columns
# 'red', 'green', 'blue', 'nir' and 'bn'.
# `.values` replaces `.as_matrix()`, which was removed in pandas 1.0;
# `.values` is available on both old and new pandas versions.
dvars = df[['ndvi', 'bnn']].values

# Encode the textual class label as an integer code.
mapping = {'veg': 1, 'back': 2, 'norm_back': 3, 'water': 4}

# Fail loudly on labels the mapping does not cover instead of letting them
# flow into the estimators as strings/NaN mixed with the integer codes.
unknown = set(df["type"].unique()) - set(mapping)
if unknown:
    raise ValueError(
        "unmapped labels in 'type' column: %r" % sorted(map(str, unknown)))

dtype = df["type"].map(mapping).values
print(dtype)

et = (dvars, dtype)
print("shape", et[0].shape)

datasets = [et]

In [None]:
# Draw one row per dataset: the first panel shows the (standardized) input
# points, each following panel shows one classifier's decision surface with
# the training points (solid) and test points (semi-transparent) on top and
# the test accuracy in the lower-right corner.
figure = plt.figure(figsize=(27, 9))
n_rows, n_cols = len(datasets), len(classifiers) + 1
i = 0
# iterate over datasets
for ds_cnt, ds in enumerate(datasets):
    i += 1
    # preprocess dataset, split into training and test part
    X, y = ds

    # NOTE(review): the scaler is fitted on all samples before the split, so
    # test-set statistics influence the scaling. Kept as-is so the mesh and
    # the points share one coordinate system; consider fitting on X_train only.
    X = StandardScaler().fit_transform(X)
    X_train, X_test, y_train, y_test = \
        train_test_split(X, y, test_size=.4, random_state=42)

    x_min, x_max = X[:, 0].min(), X[:, 0].max()
    y_min, y_max = X[:, 1].min(), X[:, 1].max()
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                         np.arange(y_min, y_max, h))
    # Every mesh node as an (n_points, 2) sample matrix; built once per
    # dataset instead of once per classifier.
    mesh = np.c_[xx.ravel(), yy.ravel()]

    # Map the label values to consecutive indices 0..n_classes-1 so each
    # class gets exactly one color, whatever the label encoding is.
    classes = np.unique(y)
    n_classes = len(classes)
    is_binary = n_classes == 2
    c_train = np.searchsorted(classes, y_train)
    c_test = np.searchsorted(classes, y_test)

    cm = plt.cm.RdBu
    if is_binary:
        cm_bright = ListedColormap(['#FF0000', '#0000FF'])
    else:
        # One distinct color per class (the palette repeats past 10 classes).
        cm_bright = ListedColormap(plt.cm.tab10(np.arange(n_classes) % 10))
    # Fixed limits so class index k always lands on color k, even when a
    # split happens to miss one of the classes.
    color_kw = dict(cmap=cm_bright, vmin=-0.5, vmax=n_classes - 0.5)

    # just plot the dataset first
    ax = figure.add_subplot(n_rows, n_cols, i)
    if ds_cnt == 0:
        ax.set_title("Input data")
    ax.set_xlabel("ndvi")
    ax.set_ylabel("bnn")
    # Plot the training points
    ax.scatter(X_train[:, 0], X_train[:, 1], c=c_train, **color_kw)
    # and testing points
    ax.scatter(X_test[:, 0], X_test[:, 1], c=c_test, alpha=0.6, **color_kw)
    # Empty proxy artists: they draw nothing but give the legend one entry
    # per class in the same color as the points.
    for k, label in enumerate(classes):
        ax.scatter([], [], color=cm_bright(k), label=str(label))
    ax.legend(title="class", loc="best")
    ax.set_xlim(xx.min(), xx.max())
    ax.set_ylim(yy.min(), yy.max())

    for name, clf in zip(names, classifiers):
        i += 1

        ax = figure.add_subplot(n_rows, n_cols, i)
        clf.fit(X_train, y_train)
        score = clf.score(X_test, y_test)

        if ds_cnt == 0:
            ax.set_title(name)

        # Plot the decision boundary. For that, we will
        # assign a color to each point in the mesh
        # [x_min, x_max]x[y_min, y_max].
        if is_binary:
            # Two classes: a single continuous score describes the surface.
            if hasattr(clf, "decision_function"):
                Z = clf.decision_function(mesh)
            else:
                Z = clf.predict_proba(mesh)[:, 1]
            Z = Z.reshape(xx.shape)
            ax.contourf(xx, yy, Z, cmap=cm, alpha=.8)
        else:
            # More than two classes: a single probability column would show
            # only one class, so draw the predicted class regions instead,
            # using the same per-class colors as the points.
            Z = np.searchsorted(classes, clf.predict(mesh))
            Z = Z.reshape(xx.shape)
            ax.contourf(xx, yy, Z,
                        levels=np.arange(n_classes + 1) - 0.5,
                        cmap=cm_bright, alpha=.3)

        # Plot also the training points
        ax.scatter(X_train[:, 0], X_train[:, 1], c=c_train, **color_kw)
        # and testing points
        ax.scatter(X_test[:, 0], X_test[:, 1], c=c_test, alpha=0.6,
                   **color_kw)

        ax.set_xlim(xx.min(), xx.max())
        ax.set_ylim(yy.min(), yy.max())
        # Test accuracy, lower-right corner, without the leading zero.
        ax.text(xx.max() - .3, yy.min() + .3, ('%.2f' % score).lstrip('0'),
                size=15, horizontalalignment='right')
figure.tight_layout()
plt.show()