# Supervised Learning 

# Part 1

Part 1 will cover the ML algorithms from the morning lecture:
- knn
- linear regression 
- regularization

## import modules 

In [None]:
%matplotlib notebook
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.datasets import make_classification, make_blobs
from matplotlib.colors import ListedColormap
from sklearn.datasets import load_breast_cancer
from sklearn import neighbors
import matplotlib.patches as mpatches
%matplotlib inline

## A few functions for plotting (feel free to ignore)

In [None]:
def plot_class_regions_for_classifier(clf, X, y, X_test=None, y_test=None, title=None, target_names = None, plot_decision_regions = True):
    """Plot the decision regions of a fitted classifier over two features.

    The classifier is evaluated on a regular grid spanning the range of ``X``;
    the grid predictions are drawn as filled contours and the samples are
    scattered on top.

    Parameters
    ----------
    clf : fitted estimator providing ``predict`` and ``score``.
    X : 2-D array; columns 0 and 1 are used as the plot axes.
    y : integer class labels for ``X``; ``max(y) + 1`` colours are sliced
        from the four-entry colour lists defined below.
    X_test, y_test : optional second sample set, drawn as triangles.  When
        ``X_test`` is given, train/test scores are appended to the title.
    title : optional figure title.
    target_names : optional class names; adds a legend with one patch per name.
    plot_decision_regions : draw the filled prediction contours when true.
    """
    num_classes = np.amax(y) + 1
    color_list_light = ['#FFFFAA', '#EFEFEF', '#AAFFAA', '#AAAAFF']
    color_list_bold = ['#EEEE00', '#000000', '#00CC00', '#0000CC']
    cmap_light = ListedColormap(color_list_light[0:num_classes])
    cmap_bold = ListedColormap(color_list_bold[0:num_classes])

    mesh_step = 0.03        # spacing between grid points
    mesh_margin = 0.5       # how far the grid extends past the data range
    x_plot_adjust = 0.1     # axis padding around the data, per axis
    y_plot_adjust = 0.1
    plot_symbol_size = 50

    x_min, x_max = X[:, 0].min(), X[:, 0].max()
    y_min, y_max = X[:, 1].min(), X[:, 1].max()
    x2, y2 = np.meshgrid(np.arange(x_min - mesh_margin, x_max + mesh_margin, mesh_step),
                         np.arange(y_min - mesh_margin, y_max + mesh_margin, mesh_step))

    # Predict one class per grid point, then fold the flat result back into
    # the grid's shape so it can be contoured.
    P = clf.predict(np.c_[x2.ravel(), y2.ravel()]).reshape(x2.shape)

    plt.figure()
    if plot_decision_regions:
        plt.contourf(x2, y2, P, cmap=cmap_light, alpha=0.8)

    plt.scatter(X[:, 0], X[:, 1], c=y, cmap=cmap_bold, s=plot_symbol_size, edgecolor='black')
    plt.xlim(x_min - x_plot_adjust, x_max + x_plot_adjust)
    plt.ylim(y_min - y_plot_adjust, y_max + y_plot_adjust)

    if X_test is not None:
        plt.scatter(X_test[:, 0], X_test[:, 1], c=y_test, cmap=cmap_bold,
                    s=plot_symbol_size, marker='^', edgecolor='black')
        scores = "Train score = {:.2f}, Test score = {:.2f}".format(
            clf.score(X, y), clf.score(X_test, y_test))
        # title defaults to None, so it cannot be concatenated unconditionally.
        title = scores if title is None else title + "\n" + scores

    if target_names is not None:
        legend_handles = [mpatches.Patch(color=color_list_bold[i], label=name)
                          for i, name in enumerate(target_names)]
        plt.legend(loc=0, handles=legend_handles)

    if title is not None:
        plt.title(title)
    # Show the figure whether or not a title was supplied.
    plt.show()

def plot_two_class_knn(X, y, n_neighbors, weights, X_test, y_test):
    """Fit a k-NN classifier on two features and plot its decision surface.

    A ``KNeighborsClassifier`` with the given ``n_neighbors`` and ``weights``
    is fitted on ``X``/``y`` and evaluated on a dense grid that extends one
    unit beyond the data in each direction.  The training points are drawn
    on top.  When ``X_test`` is not ``None`` the train and test scores are
    added to the title.  The legend is fixed to two entries, 'class 0' and
    'class 1'.
    """
    grid_step = .01     # spacing of the evaluation grid
    marker_size = 50

    light_colors = ListedColormap(['#FFFFAA', '#AAFFAA', '#AAAAFF','#EFEFEF'])
    bold_colors = ListedColormap(['#FFFF00', '#00FF00', '#0000FF','#000000'])

    model = neighbors.KNeighborsClassifier(n_neighbors, weights=weights)
    model.fit(X, y)

    # Evaluation grid covering the data range padded by 1 on every side.
    feat0, feat1 = X[:, 0], X[:, 1]
    grid_x, grid_y = np.meshgrid(
        np.arange(feat0.min() - 1, feat0.max() + 1, grid_step),
        np.arange(feat1.min() - 1, feat1.max() + 1, grid_step))
    grid_pred = model.predict(np.c_[grid_x.ravel(), grid_y.ravel()])
    grid_pred = grid_pred.reshape(grid_x.shape)

    # Background: predicted class per grid cell.  Foreground: training points.
    plt.figure()
    plt.pcolormesh(grid_x, grid_y, grid_pred, cmap=light_colors, shading='auto')
    plt.scatter(feat0, feat1, s=marker_size, c=y, cmap=bold_colors, edgecolor='black')
    plt.xlim(grid_x.min(), grid_x.max())
    plt.ylim(grid_y.min(), grid_y.max())

    heading = "Neighbors = {}".format(n_neighbors)
    if X_test is not None:
        train_score = model.score(X, y)
        test_score = model.score(X_test, y_test)
        heading = heading + "\nTrain score = {:.2f}, Test score = {:.2f}".format(train_score, test_score)

    legend_entries = [mpatches.Patch(color='#FFFF00', label='class 0'),
                      mpatches.Patch(color='#000000', label='class 1')]
    plt.legend(handles=legend_entries)

    plt.xlabel('Feature 0')
    plt.ylabel('Feature 1')
    plt.title(heading)
    plt.show()

## K-Nearest Neighbour classification 

For this example, we use the iris data set. https://archive.ics.uci.edu/ml/datasets/iris

The Iris flower data set or Fisher's Iris data set is a multivariate data set introduced by the British statistician and biologist Ronald Fisher in his 1936 paper The use of multiple measurements in taxonomic problems as an example of linear discriminant analysis.

The data set consists of 50 samples from each of three species of Iris (Iris setosa, Iris virginica and Iris versicolor), for 150 samples in total. Four features were measured from each sample: the length and the width of the sepals and petals, in centimeters. Based on the combination of these four features, Fisher developed a linear discriminant model to distinguish the species from each other.

### Get data and print description

In [None]:
# Load the iris dataset and print a short summary of its features and targets.
iris = datasets.load_iris()

print('The iris.data has {} samples each of which has {} features. e.g. first two data: {}'
      .format(iris.data.shape[0], iris.data.shape[1], iris.data[:2]))
print('The iris.target has labels (one of {}) for {} samples. e.g. first two labels {}'
      .format(iris.target_names, iris.target.shape[0], iris.target[:2]))

# Full text description bundled with the dataset object.
print(iris.DESCR)

### Get selected features and targets from iris data set for training

X represents the features and y represents the targets. 

In [None]:
# Features: only the first two columns, so the data can be drawn in 2-D.
# Targets: the class value of every sample.
X, y = iris.data[:, :2], iris.target

### Plot data 

In [None]:
# Target value -> species name, used for the legend entries.
labels = {0: 'setosa', 1: 'versicolor', 2: 'virginica'}

fig, ax = plt.subplots()
for i, species in labels.items():
    in_class = (y == i)  # boolean mask selecting the samples of this class
    ax.scatter(X[in_class, 0], X[in_class, 1], label=species)
ax.set(xlabel='Sepal length', ylabel='Sepal width', title='Iris Dataset')
ax.legend();

### Build a KNN Classification Model 

In [None]:
from sklearn.neighbors import KNeighborsClassifier # import model 

#### Train/Test Split 

In [None]:
# Use 60% of the samples for training; random_state pins the shuffle so the
# split is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.6, random_state = 0)

#### Train model 

In [None]:
# k-NN classifier that consults the 10 nearest training samples, fitted on
# the training split only.
knn = KNeighborsClassifier(n_neighbors = 10)
knn.fit(X_train, y_train)

#### Evaluate Model 

In [None]:
# Score the model on the data it was fitted on and on the held-out data.
train_accuracy = knn.score(X_train, y_train)
test_accuracy = knn.score(X_test, y_test)

print('Accuracy of K-NN classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of K-NN classifier on test set: {:.2f}'.format(test_accuracy))

Below is sklearn's classification report.  This includes alternative classification metrics to accuracy. 

In [None]:
# classification_report bundles several metrics for evaluating classification models.
from sklearn.metrics import classification_report

# Predictions for the held-out samples; y_pred is reused by the
# confusion-matrix cell that follows.
y_pred = knn.predict(X_test)
report = classification_report(y_test, y_pred, target_names=iris.target_names)
print(report)

Below we visualize a confusion matrix.

In [None]:
# confusion_matrix computes the matrix; ConfusionMatrixDisplay draws it.
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Argument order is (y_true, y_pred): rows are true classes and columns are
# predicted classes, which is how ConfusionMatrixDisplay labels its axes.
cm = confusion_matrix(y_test, y_pred)
cm_display = ConfusionMatrixDisplay(cm, display_labels=['setosa','versicolor','virginica']).plot()

#### Predict with Model 

Finally, we may want to use our model to make predictions on unseen data points.  Below, we demo how this can be done. 

In [None]:
# Two new samples, each with the same two features the model was trained on.
examples = [[5.9,  3.], [4.4, 1.9]]
predictions = knn.predict(examples)

# Translate the predicted target values into species names.
predicted_names = [iris.target_names[label] for label in predictions]
print('Predicted iris type for ', examples, ' is ', predicted_names)

#### OPTIONAL EXERCISE

In the demo above we trained and evaluated a KNN model with k=10.  Using the Iris dataset as defined in variables X and y above, train a KNN model with k = 1 and another with k = 50, and compute the accuracy of each model with the testing data.  What is the best value of k: 1, 10 or 50?

In [None]:
# Fit a k-NN model that uses a single neighbour (k = 1).
knn1 = KNeighborsClassifier(n_neighbors=1).fit(X_train, y_train)

# Accuracy on the training split and on the held-out split.
train_accuracy = knn1.score(X_train, y_train)
test_accuracy = knn1.score(X_test, y_test)
print('Accuracy of K-NN classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of K-NN classifier on test set: {:.2f}'.format(test_accuracy))

In [None]:
# Fit a k-NN model that uses 50 neighbours (k = 50).
knn50 = KNeighborsClassifier(n_neighbors=50).fit(X_train, y_train)

# Accuracy on the training split and on the held-out split.
train_accuracy = knn50.score(X_train, y_train)
test_accuracy = knn50.score(X_test, y_test)
print('Accuracy of K-NN classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of K-NN classifier on test set: {:.2f}'.format(test_accuracy))

## Visualizing Decision Boundaries 

### Decision boundaries with synthetic data sets

In [None]:
# Synthetic binary classification data: 100 samples, two informative features.
X_C2, y_C2 = make_classification(
    n_samples=100, n_features=2,
    n_redundant=0, n_informative=2,
    n_clusters_per_class=1, flip_y=0.1,
    class_sep=0.5, random_state=0,
)

# Colour map applied to the class values when scattering the points.
cmap_bold = ListedColormap(['#FFFF00', '#00FF00', '#0000FF','#000000'])

plt.figure()
plt.title('Sample binary classification problem with two features')
plt.scatter(X_C2[:, 0], X_C2[:, 1], c=y_C2, marker='o', s=50, cmap=cmap_bold)
plt.show()

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_C2, y_C2, random_state=0)

# One decision-boundary figure per neighbourhood size, all on the same split.
for neighbor_count in (1, 5, 10):
    plot_two_class_knn(X_train, y_train, neighbor_count, 'uniform', X_test, y_test)

#### OPTIONAL EXERCISE

By visually inspecting the data above, what is the best value of k-- 1, 5 or 10? 

#### Answer

The model where k=5 is the best model.  The model where k=1 appears to overfit the data, as islands appear around rare data points.  The model where k=10 seems to underfit the data, particularly on the right side of the plot. 

## Linear Regression 

### linear regression with synthetic data

#### import modules

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression

#### make and plot synthetic dataset

In [None]:
# Synthetic regression data: one input feature, Gaussian noise, offset by `bias`.
X_R1, y_R1 = make_regression(
    n_samples=100, n_features=1,
    n_informative=1, bias=150.0,
    noise=30, random_state=0,
)

# Scatter the single feature against the target.
plt.figure()
plt.scatter(X_R1, y_R1, marker='o', s=50)
plt.title('Sample regression problem with one input variable')
plt.show()

#### train test split and model training 

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X_R1, y_R1, random_state=0)

# Least-squares fit on the training split only.
linreg = LinearRegression().fit(
    X_train,
    y_train,
)

#### Evaluate model 

In [None]:
# R-squared on the fitting data and on the held-out data.
train_r2 = linreg.score(X_train, y_train)
test_r2 = linreg.score(X_test, y_test)

print('linear model coeff (w): {}'.format(linreg.coef_))
print('linear model intercept (b): {:.3f}'.format(linreg.intercept_))
print('R-squared score (training): {:.3f}'.format(train_r2))
print('R-squared score (test): {:.3f}'.format(test_r2))

#### Plot Model 

In [None]:
# Line described by the fitted slope and intercept, evaluated at every sample.
fitted_line = linreg.coef_ * X_R1 + linreg.intercept_

plt.figure(figsize=(5, 4))
plt.scatter(X_R1, y_R1, marker='o', s=50, alpha=0.8, label='training data')
plt.plot(X_R1, fitted_line, 'r-', label='model')
plt.title('Least-squares linear regression')
plt.xlabel('Feature value (x)')
plt.ylabel('Target value (y)')
plt.legend()
plt.show()

### linear regression with the diabetes dataset

In [None]:
# Regression with the diabetes dataset
from sklearn.datasets import load_diabetes
diabetes = load_diabetes()
print('The diabetes.data has {} samples each of which has {} features. e.g. first two data: {}'
      .format(diabetes.data.shape[0], diabetes.data.shape[1], diabetes.data[:2]))
# Show two target values so the output matches the "first two" in the message.
print('The diabetes.target has values for {} samples. e.g. first two labels {}'
      .format(diabetes.target.shape[0], diabetes.target[:2]))

In [None]:
# Full text description bundled with the dataset object.
print(diabetes.DESCR)

#### Exploring our data 

In linear regression, it is good to explore the relationships between your features and targets to see if linear relationships exist or can be engineered. Below we explore this via a few data visualizations. 

In [None]:
# Distribution of the regression target across all samples, in 25 bins.
target_label = "quantitative measure of disease progression one year after baseline"

plt.figure()
plt.hist(diabetes.target, bins=25)
plt.xlabel(target_label)
plt.show()

In [None]:
# Feature column 0 (labelled age on the x-axis) against the target.
plt.figure()
plt.scatter(diabetes.data[:, 0:1] , diabetes.target, marker='o')
plt.title("Disease progression vs. age")
plt.xlabel("standardized age")
plt.ylabel("quantitative measure of disease progression one year after baseline")
plt.show()

In [None]:
# Feature column 2 (labelled bmi on the x-axis) against the target.
plt.figure()
plt.scatter(diabetes.data[:, 2:3] , diabetes.target, marker='o')
plt.title("Disease progression vs. BMI")
plt.xlabel("standardized bmi")
plt.ylabel("quantitative measure of disease progression one year after baseline")
plt.show()

#### train and evaluate model with all features

In [None]:
# Hold out 20% of the samples; random_state pins the split.
X_train, X_test, y_train, y_test = train_test_split(diabetes.data, diabetes.target, test_size=0.2,
                                                    random_state = 33)

# Multiple linear regression on every feature column.
linreg = LinearRegression().fit(X_train, y_train)

print('Diabetes dataset')
print('linear model intercept: {}'
     .format(linreg.intercept_))
print('linear model coeff:\n{}'
     .format(linreg.coef_))
print('R-squared score (training): {:.3f}'
     .format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linreg.score(X_test, y_test)))

#### OPTIONAL EXERCISE

How does simple linear regression compare to multiple linear regression?  In the model above we used all features available.  However, we see in our EDA that there wasn't necessarily a strong linear relationship between all of our features and our target.  

Try building a simple linear regression model where our only feature is body mass index.  Compute the R^2 coefficient for this model. How do these results compare to the multiple linear regression model we built in the previous cell?


To help get you started, the test/train split is already set up for you below. 

In [None]:
# Column 2 only, kept two-dimensional (n_samples, 1) by slicing with 2:3.
single_feature = diabetes.data[:, 2:3]

X_train, X_test, y_train, y_test = train_test_split(
    single_feature, diabetes.target, test_size=0.2, random_state=33
)

In [None]:
# Simple linear regression on whatever single-feature split the previous cell produced.
linreg = LinearRegression().fit(X_train, y_train)

print('Diabetes dataset')
print('linear model intercept: {}'
     .format(linreg.intercept_))
print('linear model coeff:\n{}'
     .format(linreg.coef_))
print('R-squared score (training): {:.3f}'
     .format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linreg.score(X_test, y_test)))

# Regularized Regression

Below we use Ridge Regression to train a model with the diabetes dataset.  It is important to standardize your data when using regularization to ensure that the beta coefficients you find are all of similar magnitude.  
Below we choose to standardize our data with the MinMaxScaler.  Please note that there are two functions used, fit_transform and transform.  By performing the fit, we are computing the min and max values of the data provided and storing these values so that they can be used to transform data.  Never fit the MinMaxScaler with testing data -- only use training data. 

In [None]:
from sklearn.linear_model import Ridge

In [None]:
X_train, X_test, y_train, y_test = train_test_split(diabetes.data, diabetes.target, test_size=0.2,
                                                    random_state = 33)

# Regularized models typically need standardized features.  This data comes
# pre-standardized, so the raw features are used directly.  For data that is
# not pre-scaled, one option is the min-max scaler:
#
#     from sklearn.preprocessing import MinMaxScaler
#     scaler = MinMaxScaler()
#     X_train_scaled = scaler.fit_transform(X_train)
#     X_test_scaled = scaler.transform(X_test)

# alpha sets the strength of the ridge penalty.
linreg = Ridge(alpha=1.0).fit(X_train, y_train)

print('Diabetes dataset')
print('linear model intercept: {}'
     .format(linreg.intercept_))
print('linear model coeff:\n{}'
     .format(linreg.coef_))
print('R-squared score (training): {:.3f}'
     .format(linreg.score(X_train, y_train)))
print('R-squared score (test): {:.3f}'
     .format(linreg.score(X_test, y_test)))

#### what is the best value of alpha to use? 

An important hyperparameter when using Ridge regression is alpha.  Below we consider several values of alpha so that we can find optimal performance. 

In [None]:
# Sweep alpha over several orders of magnitude and record the test R-squared.
# NOTE(review): alpha is compared on the test split here; a separate
# validation split or cross-validation would keep the test score unbiased.
alphas = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
r2s = []
for alpha in alphas:
    linreg = Ridge(alpha=alpha).fit(X_train, y_train)
    test_r2 = linreg.score(X_test, y_test)
    r2s.append(test_r2)
    print('Alpha: {} R-squared score (test): {:.3f}'.format(alpha, test_r2))

# R-squared against alpha; log x-axis because alpha spans seven decades.
fig, ax = plt.subplots()
ax.plot(alphas, r2s)
ax.scatter(alphas, r2s)
ax.set(xscale='log', xlabel='alpha', ylabel='R^2');

### Optional Exercise 

Copy and modify the code above to see how the performance changes when we use Lasso Regression instead of Ridge for various values of alpha.

https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html 

In [None]:
from sklearn.linear_model import Lasso

# Same alpha sweep as for Ridge, with the Lasso model instead.
# NOTE(review): alpha is compared on the test split here; a separate
# validation split or cross-validation would keep the test score unbiased.
alphas = [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000]
r2s = []
for alpha in alphas:
    linreg = Lasso(alpha=alpha).fit(X_train, y_train)
    test_r2 = linreg.score(X_test, y_test)
    r2s.append(test_r2)
    print('Alpha: {} R-squared score (test): {:.3f}'.format(alpha, test_r2))

# R-squared against alpha; log x-axis because alpha spans seven decades.
fig, ax = plt.subplots()
ax.plot(alphas, r2s)
ax.scatter(alphas, r2s)
ax.set(xscale='log', xlabel='alpha', ylabel='R^2');

# Part 2 

The remainder of the machine learning methods will be covered in the afternoon session:

- logistic regression 
- support vector machines
- naive bayes
- decision tree based models

# Logistic Regression

### Cancer Dataset

In [None]:
# Classification with the breast cancer dataset
cancer = load_breast_cancer()
print('The cancer.data has {} samples each of which has {} features. e.g. first two data: {}'
      .format(cancer.data.shape[0], cancer.data.shape[1], cancer.data[:2]))
print('The cancer.target has labels (one of {}) for {} samples. e.g. first two labels {}'
      .format(cancer.target_names, cancer.target.shape[0], cancer.target[:2]))

In [None]:
# Full text description bundled with the dataset object.
print(cancer.DESCR)

## Train and evaluate logistic regression model

Note the parameter C when training the Logistic Regression is similar to alpha used in ridge regression.  However, C and alpha have an inverse relationship.  The smaller the value of C, the stronger the regularization is.  The regularization term is why standardizing the data is needed when training this model. 

In [None]:
# Logistic regression for binary classification on the breast cancer data.
from sklearn.linear_model import LogisticRegression

X_cancer, y_cancer = load_breast_cancer(return_X_y=True)

# Only feature columns 20-29 are used for this model.
X_train, X_test, y_train, y_test = train_test_split(
    X_cancer[:, 20:30], y_cancer, random_state=0
)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = LogisticRegression(C=100)
clf.fit(X_train_scaled, y_train)

train_accuracy = clf.score(X_train_scaled, y_train)
test_accuracy = clf.score(X_test_scaled, y_test)
print('Accuracy of Logistic regression classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of Logistic regression classifier on test set: {:.2f}'.format(test_accuracy))

## Support Vector Machine (SVMs)

In [None]:
# Linear support vector machine on the raw (unscaled) breast cancer features.
from sklearn.svm import SVC
from sklearn.svm import LinearSVC

X_cancer, y_cancer = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state=0)

# The features are deliberately left unscaled in this cell.
clf = LinearSVC()
clf.fit(X_train, y_train)

train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print('Breast cancer dataset')
print('Accuracy of Linear SVC classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of Linear SVC classifier on test set: {:.2f}'.format(test_accuracy))

Note: you likely received a ConvergenceWarning from the cell above.  We included this cell to highlight the fact that this is another example of where standardizing data can help.  In general, standardized data makes the training process easier, specifically when your training process involves using a numerical optimizer to optimize a loss function.  Below we train the model again, but with standardized data.  

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Fit the scaler on the training split only, then apply it to both splits.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Same LinearSVC as before, now trained on the scaled features.
clf = LinearSVC().fit(X_train_scaled, y_train)
print('Breast cancer dataset (normalized with MinMax scaling)')
print('Linear SVC (with MinMax scaling) training set accuracy: {:.2f}'
     .format(clf.score(X_train_scaled, y_train)))
print('Linear SVC (with MinMax scaling) test set accuracy: {:.2f}'
     .format(clf.score(X_test_scaled, y_test)))

In [None]:
# Confusion matrix of the current classifier on the scaled test split;
# normalize=None keeps raw counts in the cells.
cm_display = ConfusionMatrixDisplay.from_estimator(clf, X_test_scaled, y_test,
                                                   display_labels=cancer.target_names,
                                                   cmap=plt.cm.Blues,normalize=None)

### Multi-class classification with linear models and SVMs

In [None]:
# Multi-class classification with linear models.
# LinearSVC with M classes generates M one-vs-rest classifiers.
from sklearn.svm import LinearSVC

iris = datasets.load_iris()
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = LinearSVC(C=5, random_state=67)
clf.fit(X_train_scaled, y_train)

train_accuracy = clf.score(X_train_scaled, y_train)
test_accuracy = clf.score(X_test_scaled, y_test)
print('Coefficients:\n', clf.coef_)
print('Intercepts:\n', clf.intercept_)
print('Accuracy of Linear SVC classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of Linear SVC classifier on test set: {:.2f}'.format(test_accuracy))

In [None]:
# Confusion matrix of the current classifier on the scaled test split;
# normalize=None keeps raw counts in the cells.
cm_display = ConfusionMatrixDisplay.from_estimator(clf, X_test_scaled, y_test,
                                                   display_labels=iris.target_names,
                                                   cmap=plt.cm.Blues,normalize=None)

### SVMs and the kernel trick

In [None]:
# RBF-kernel SVC on the breast cancer data, normalized with MinMax scaling.
from sklearn.preprocessing import MinMaxScaler

# Re-split the breast cancer data explicitly: the X_train/X_test left over
# from the multi-class section hold iris samples, which would contradict the
# dataset named in the output below.
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state = 0)

# Fit the scaler on the training split only, then apply it to both splits.
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

clf = SVC(kernel='rbf',C=10).fit(X_train_scaled, y_train)
print('Breast cancer dataset (normalized with MinMax scaling)')
print('RBF-kernel SVC (with MinMax scaling) training set accuracy: {:.2f}'
     .format(clf.score(X_train_scaled, y_train)))
print('RBF-kernel SVC (with MinMax scaling) test set accuracy: {:.2f}'
     .format(clf.score(X_test_scaled, y_test)))

#### OPTIONAL EXERCISE

Try building another SVC model except instead of using the rbf kernel, use the polynomial kernel of degree 5:

https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html

How do the results compare? 

In [None]:
# Degree-5 polynomial kernel, fitted on the scaled splits already in scope.
clf = SVC(kernel='poly',degree=5,C=10).fit(X_train_scaled, y_train)
print('Breast cancer dataset (normalized with MinMax scaling)')
print('Polynomial-kernel (degree=5) SVC (with MinMax scaling) training set accuracy: {:.2f}'
     .format(clf.score(X_train_scaled, y_train)))
print('Polynomial-kernel (degree=5) SVC (with MinMax scaling) test set accuracy: {:.2f}'
     .format(clf.score(X_test_scaled, y_test)))

## Naive Bayes Classification 

In [None]:
# Gaussian naive Bayes on synthetic two-feature blobs.
from sklearn.naive_bayes import GaussianNB

X_D2, y_D2 = make_blobs(
    n_samples=100, n_features=2,
    centers=8, cluster_std=1.3,
    random_state=4,
)
X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)

nbclf = GaussianNB()
nbclf.fit(X_train, y_train)

# Draw the decision regions with training and test points on top.
plot_class_regions_for_classifier(
    nbclf, X_train, y_train, X_test, y_test,
    'Gaussian Naive Bayes classifier: Dataset 1',
)

In [None]:
# Gaussian naive Bayes on the first ten feature columns of the breast cancer data.
X_cancer, y_cancer = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(
    X_cancer[:, 0:10], y_cancer, random_state=0
)

nbclf = GaussianNB()
nbclf.fit(X_train, y_train)

train_accuracy = nbclf.score(X_train, y_train)
test_accuracy = nbclf.score(X_test, y_test)
print('Breast cancer dataset')
print('Accuracy of GaussianNB classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of GaussianNB classifier on test set: {:.2f}'.format(test_accuracy))

## Decision tree


### Iris dataset

In [None]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Decision tree on all four iris features, with no limit on tree depth.
iris = load_iris()
X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target, random_state=3)

clf = DecisionTreeClassifier()
clf.fit(X_train, y_train)

train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(test_accuracy))

### Pruning decision trees

In [None]:
# Limit the tree to a maximum depth of 3; kept as clf2 so the unrestricted
# tree in clf stays available.
clf2 = DecisionTreeClassifier(max_depth=3)
clf2.fit(X_train, y_train)

train_accuracy = clf2.score(X_train, y_train)
test_accuracy = clf2.score(X_test, y_test)
print('Accuracy of Decision Tree classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(test_accuracy))

### Visualizing decision trees

In [None]:
import graphviz
from sklearn import tree
import importlib
import matplotlib as mpl
import pprint
# Re-import matplotlib, pyplot and seaborn in place.
# NOTE(review): presumably this resets plotting state before drawing the
# trees -- confirm it is still needed.
importlib.reload(mpl); importlib.reload(plt); importlib.reload(sn)

In [None]:
# Feature names passed to export_text so the split conditions are readable.
fn = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']

# export_text returns the fitted tree as plain-text rules, so no matplotlib
# figure is created here.
output = tree.export_text(clf, feature_names=fn)
print(output)

# Print the class-value -> species mapping alongside the rules.
print('Note labels:', labels)

In [None]:
# Large canvas for the tree diagram; the figure is also written to tree.png.
fig = plt.figure(figsize=(25, 20))
_ = tree.plot_tree(
    clf,
    feature_names=iris.feature_names,
    class_names=iris.target_names,
    filled=True,
)
plt.savefig('tree.png')

## Random Forest


### Random blobs

In [None]:
# Random forest on synthetic two-feature blobs.
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

X_D2, y_D2 = make_blobs(
    n_samples=100, n_features=2,
    centers=8, cluster_std=1.3,
    random_state=4,
)
X_train, X_test, y_train, y_test = train_test_split(X_D2, y_D2, random_state=0)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print('Accuracy of Random Forest classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of Random Forest classifier on test set: {:.2f}'.format(test_accuracy))


### Cancer dataset

In [None]:
# Random forest on all feature columns of the breast cancer data.
X_cancer, y_cancer = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer, random_state=0)

clf = RandomForestClassifier()
clf.fit(X_train, y_train)

train_accuracy = clf.score(X_train, y_train)
test_accuracy = clf.score(X_test, y_test)
print('Accuracy of RandomForest classifier on training set: {:.2f}'.format(train_accuracy))
print('Accuracy of RandomForest classifier on test set: {:.2f}'.format(test_accuracy))


### Note :  If you want to visualize your trees in the random forest you can as demoed below! 

In [None]:
# Load the dataset object to get feature and class names for labelling the plot.
cancer = load_breast_cancer()

# Draw on a fresh, large figure rather than on whichever figure is current.
plt.figure(figsize=(25, 20))
# estimators_[1] is one individual tree of the fitted forest.
_ = tree.plot_tree(clf.estimators_[1],
                   feature_names=list(cancer.feature_names),
                   class_names=list(cancer.target_names),
                   filled=True)
plt.savefig('rf_tree.png')