<a href="https://colab.research.google.com/github/jdh4/furr/blob/master/getting_started_machine_linearning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Seeded generator shared by the cells below so their random draws are repeatable
rng = np.random.RandomState(seed=1)

# Four-colour qualitative palette reused across the plots below
qualitiative_colors = [
    '#1b9e77',
    '#d95f02',
    '#7570b3',
    '#e7298a',
]

# *Regression*

## Load Diabetes Data Set

In [None]:
from sklearn.datasets import load_diabetes

In [None]:
# Fetch the diabetes data set and pull out its feature matrix and targets
diabetes_bunch = load_diabetes()
diabetes_X, diabetes_y = diabetes_bunch.data, diabetes_bunch.target

# Keep a single feature (column 2), shaped as an (n_samples, 1) column
diabetes_X = diabetes_X[:, 2:3]

# Hold out the last 20 samples for testing; everything before them is training data
diabetes_X_train, diabetes_X_test = np.split(diabetes_X, [-20])
diabetes_y_train, diabetes_y_test = np.split(diabetes_y, [-20])

In [None]:
# Show the description text bundled with the data set (its DESCR attribute)
print(diabetes_bunch.DESCR)

In [None]:
# Scatter the training split: the single kept feature on x, the target on y
fig, axs = plt.subplots(
    nrows=1, ncols=1, figsize=(4., 4.), dpi=200, facecolor='white',
)
axs.scatter(
    diabetes_X_train,
    diabetes_y_train,
    s=10,
    color=qualitiative_colors[0],
)
axs.set_xlabel('BMI (scaled)')
axs.set_ylabel('quantitative measure of diabetes progression')

## Linear Regression



In [None]:
from sklearn.linear_model import LinearRegression

### Create linear regression object

In [None]:
# Instantiate the estimator with no arguments, i.e. its default settings
regr = LinearRegression()

### Train the model using the training set



In [None]:
# Fit on the training split only; the last 20 samples stay held out for testing
regr.fit(diabetes_X_train, diabetes_y_train)

### Output the best fit values

In [None]:
# Report the fitted line: intercept and the coefficient(s) of the single input feature
print("Intercept: \n", regr.intercept_)
print("Coefficients: \n", regr.coef_)

In [None]:
fig, axs = plt.subplots(
    nrows=1, ncols=1, figsize=(4., 4.), dpi=200, facecolor='white',
)

# Every training sample (tiny markers) with the fitted line drawn through them
axs.scatter(diabetes_X_train, diabetes_y_train, color=qualitiative_colors[0], s=1)
axs.plot(
    diabetes_X_train,
    regr.predict(diabetes_X_train),
    linewidth=2,
    color=qualitiative_colors[1],
)

# Emphasise the first four samples and draw a vertical segment from the fitted
# line to each of them, i.e. their residuals
highlight_X, highlight_y = diabetes_X_train[:4], diabetes_y_train[:4]
axs.scatter(highlight_X, highlight_y, color=qualitiative_colors[0])
axs.vlines(
    highlight_X,
    regr.intercept_ + regr.coef_[0]*highlight_X,
    highlight_y,
    lw=2,
)
axs.set_xlabel('BMI (scaled)')
axs.set_ylabel('quantitative measure of diabetes progression')

### Make predictions using the test set

In [None]:
# Model predictions for the held-out test samples
diabetes_y_pred_linear = regr.predict(diabetes_X_test)

In [None]:
fig, axs = plt.subplots(
    nrows=2, ncols=1, sharex=True, figsize=(4., 4.), dpi=200, facecolor='white',
)
fit_panel, residual_panel = axs

# Top panel: training samples, the model's predictions on the test inputs,
# and the true test targets
fit_panel.scatter(diabetes_X_train, diabetes_y_train, color=qualitiative_colors[0], s=1)
fit_panel.plot(
    diabetes_X_test, diabetes_y_pred_linear,
    linewidth=2, color=qualitiative_colors[1],
)
fit_panel.scatter(diabetes_X_test, diabetes_y_test, s=8, color=qualitiative_colors[2])

# Bottom panel: zero reference line and the test residuals (truth minus prediction)
residual_panel.hlines(0, -0.1, 0.15, linewidth=2, color=qualitiative_colors[1])
residual_panel.scatter(
    diabetes_X_test,
    diabetes_y_test-diabetes_y_pred_linear,
    s=10,
    color=qualitiative_colors[2],
)
axs[1].set_xlabel('BMI (scaled)')
axs[0].set_ylabel('diabetes progression')
axs[1].set_ylabel('truth - model')

## Create Periodic Data Set

In [None]:
# 1000 evenly spaced inputs covering one period of the sine, as a column vector
X = np.linspace(0, 2.*np.pi, 1000)[:, np.newaxis]
# Noise-free targets, flattened to 1-D
y_base = np.sin(X).ravel()

# Observations: the sine plus zero-mean Gaussian noise of standard deviation noise_std
noise_std = 0.5
y = y_base + rng.normal(0.0, noise_std, y_base.shape)

# Ten distinct sample indices drawn at random form the training set
training_indices = rng.choice(y.size, size=10, replace=False)
X_train = X[training_indices]
y_train = y[training_indices]

In [None]:
fig, axs = plt.subplots(
    nrows=1, ncols=1, sharex=True, figsize=(4., 4.), dpi=200, facecolor='white',
)

# Noise-free sine curve for reference
axs.plot(X, y_base, label=r"$y = \sin(x)$", color="gray")

# Training points, each drawn with a +/- noise_std error bar
axs.errorbar(
    X_train,
    y_train,
    yerr=noise_std,
    marker=".",
    markersize=10,
    linestyle="None",
    color=qualitiative_colors[0],
)
axs.legend()
axs.set_ylabel('y')
axs.set_xlabel('x')

## Gaussian Process Regression

In [None]:
from sklearn.gaussian_process import GaussianProcessRegressor

### Create kernel object

In [None]:
from sklearn.gaussian_process.kernels import RBF
# RBF kernel multiplied by a constant factor of 1; the length scale starts at 1.0
# with bounds (1e-4, 1e2)
kernel = 1 * RBF(length_scale=1.0, length_scale_bounds=(1e-4, 1e2))

### Create Gaussian Process object

In [None]:
# alpha is set to the variance of the noise that was added to y (noise_std squared);
# n_restarts_optimizer=9 requests nine restarts of the hyperparameter optimizer
gaussian_process = GaussianProcessRegressor(kernel=kernel, alpha=noise_std**2, n_restarts_optimizer=9)

### Train the model using the training set

In [None]:
# Fit on the randomly selected noisy training points only
gaussian_process.fit(X_train, y_train)

### Make predictions

In [None]:
# Predict over the full input grid X; return_std=True additionally returns the
# standard deviation of the prediction at each point
mean_prediction, std_prediction = gaussian_process.predict(X, return_std=True)

In [None]:
fig, axs = plt.subplots(
    nrows=2, ncols=1, sharex=True, figsize=(4., 4.), dpi=200, facecolor='white',
)

# Half-width of the shaded band: 1.96 predictive standard deviations
band_half_width = 1.96 * std_prediction
# Difference between the noise-free truth and the model's mean prediction
model_error = y_base-mean_prediction

# Top panel: truth, training points with error bars, mean prediction and its band
axs[0].plot(X, y_base, color="gray")
axs[0].errorbar(
    X_train,
    y_train,
    yerr=noise_std,
    marker=".",
    markersize=10,
    linestyle="None",
    color=qualitiative_colors[0],
)
axs[0].plot(X, mean_prediction, ls='--')
axs[0].fill_between(
    X.ravel(),
    mean_prediction - band_half_width,
    mean_prediction + band_half_width,
    alpha=0.25,
)
axs[0].set_ylabel('y')
axs[1].set_xlabel('x')

# Bottom panel: zero reference, truth-minus-model curve and the same band around it
axs[1].plot(X, np.zeros_like(X), color="gray")
axs[1].plot(X, model_error, ls='--')
axs[1].fill_between(
    X.ravel(),
    model_error - band_half_width,
    model_error + band_half_width,
    alpha=0.25,
)
axs[1].set_ylabel('truth - model')

# Classification

## Make Classification Data Set

In [None]:
from sklearn.datasets import make_classification

In [None]:
# Two-feature, two-class synthetic data set with one cluster per class.
# (A harder variant that was tried: random_state=100 with class_sep=0.75.)
X, y = make_classification(
    n_samples=200,
    n_features=2,
    n_informative=2,
    n_redundant=0,
    n_clusters_per_class=1,
    class_sep=1.5,
    random_state=1,
)

n_sample = len(X)

# Shuffle samples and labels with the same permutation; labels become floats
order = rng.permutation(n_sample)
X, y = X[order], y[order].astype(float)

# First 80% of the shuffled samples train the model, the remainder tests it
n_train = int(0.8 * n_sample)
X_train, X_test = X[:n_train], X[n_train:]
y_train, y_test = y[:n_train], y[n_train:]

In [None]:
# Training samples in feature space, coloured by their class label
fig, axs = plt.subplots(
    nrows=1, ncols=1, sharex=True, figsize=(4., 4.), dpi=200, facecolor='white',
)
axs.scatter(
    X_train[:, 0],
    X_train[:, 1],
    c=y_train,
    cmap=plt.cm.PRGn,
    edgecolor="k",
    s=50,
    zorder=10,
)
axs.set_ylabel(r'$x_2$')
axs.set_xlabel(r'$x_1$')

## Support Vector Classification

In [None]:
from sklearn.svm import SVC

### Create Support Vector Classification object

In [None]:
# Support vector classifier with a linear kernel; all other settings left at defaults
clf = SVC(kernel="linear")

### Train the model using the training set

In [None]:
# Fit on the training portion (first 80%) of the shuffled classification data
clf.fit(X_train, y_train)

### Output the best fit values

In [None]:
# Report the fitted intercept and coefficients of the linear-kernel classifier
print("Intercept: \n", clf.intercept_)
print("Coefficients: \n", clf.coef_)

### Create a grid over the feature space

In [None]:
# Bounding box of all samples in both features, widened by 0.1 on every side
x_min, y_min = X.min(axis=0) - 0.1
x_max, y_max = X.max(axis=0) + 0.1

# 200 x 200 grid of points covering that box (complex step = number of points)
XX, YY = np.mgrid[x_min:x_max:200j, y_min:y_max:200j]

### Make predictions at each point in the feature space

In [None]:
# Evaluate the decision function at every grid point, then restore the grid shape
grid_points = np.column_stack([XX.ravel(), YY.ravel()])
Z = clf.decision_function(grid_points).reshape(XX.shape)

# Largest magnitude on either side of zero; used as a symmetric colour range
Z_lim = max(-Z.min(), Z.max())

### Make predictions using the test set

In [None]:
# Predicted class for each held-out test sample
predicted_class = clf.predict(X_test)

In [None]:
fig, axs = plt.subplots(
    nrows=1, ncols=1, sharex=True, figsize=(4., 4.), dpi=200, facecolor='white',
)

# Test samples: marker fill shows the true class, marker edge the predicted class
edgecolors = plt.cm.PRGn(predicted_class)
axs.scatter(
    X_test[:, 0],
    X_test[:, 1],
    c=y_test,
    cmap=plt.cm.PRGn,
    edgecolor=edgecolors,
    s=80,
    alpha=0.5,
    zorder=10,
)

# Background: decision-function value on the grid, colour range symmetric about zero
axs.pcolormesh(
    XX, YY, Z,
    cmap=plt.cm.PRGn,
    vmin=-Z_lim,
    vmax=Z_lim,
    shading='auto',
    edgecolors=None,
    alpha=0.5,
    snap=True,
)

# Solid contour where the decision function is 0, dashed contours at -0.5 and 0.5
axs.contour(
    XX, YY, Z,
    levels=[-0.5, 0, 0.5],
    colors=["k", "k", "k"],
    linestyles=["--", "-", "--"],
)
axs.set_ylabel(r'$x_2$')
axs.set_xlabel(r'$x_1$')

### Create Support Vector Classification object **with poly kernel**

In [None]:
# Replace the classifier with a polynomial-kernel SVC; no other arguments are
# passed, so the remaining settings are the library defaults
clf = SVC(kernel="poly")

### Train the model using the training set

In [None]:
# Fit the polynomial-kernel classifier on the same training split
clf.fit(X_train, y_train)

### Make predictions at each point in the feature space

In [None]:
# Re-evaluate the decision function on the existing grid for the newly fitted classifier
grid_points = np.column_stack([XX.ravel(), YY.ravel()])
Z = clf.decision_function(grid_points).reshape(XX.shape)

# Symmetric colour range: the larger of |min| and max of the decision values
Z_lim = max(-Z.min(), Z.max())

In [None]:
fig, axs = plt.subplots(
    nrows=1, ncols=1, sharex=True, figsize=(4., 4.), dpi=200, facecolor='white',
)

# Training samples coloured by class, with black marker edges
axs.scatter(
    X_train[:, 0],
    X_train[:, 1],
    c=y_train,
    cmap=plt.cm.PRGn,
    edgecolor='k',
    s=40,
    alpha=0.5,
    zorder=10,
)

# Background: decision-function value on the grid, colour range symmetric about zero
axs.pcolormesh(
    XX, YY, Z,
    cmap=plt.cm.PRGn,
    vmin=-Z_lim,
    vmax=Z_lim,
    shading='auto',
    edgecolors=None,
    alpha=0.5,
    snap=True,
)

# Solid contour where the decision function is 0, dashed contours at -0.5 and 0.5
axs.contour(
    XX, YY, Z,
    levels=[-0.5, 0, 0.5],
    colors=["k", "k", "k"],
    linestyles=["--", "-", "--"],
)
axs.set_ylabel(r'$x_2$')
axs.set_xlabel(r'$x_1$')

### Make predictions using the test set

In [None]:
# Predicted class for each held-out test sample, now from the polynomial-kernel model
predicted_class = clf.predict(X_test)

In [None]:
fig, axs = plt.subplots(
    nrows=1, ncols=1, sharex=True, figsize=(4., 4.), dpi=200, facecolor='white',
)

# Test samples: marker fill shows the true class, marker edge the predicted class
edgecolors = plt.cm.PRGn(predicted_class)
axs.scatter(
    X_test[:, 0],
    X_test[:, 1],
    c=y_test,
    cmap=plt.cm.PRGn,
    edgecolor=edgecolors,
    s=60,
    alpha=0.5,
    zorder=10,
)

# Background: decision-function value on the grid, colour range symmetric about zero
axs.pcolormesh(
    XX, YY, Z,
    cmap=plt.cm.PRGn,
    vmin=-Z_lim,
    vmax=Z_lim,
    shading='auto',
    edgecolors=None,
    alpha=0.5,
    snap=True,
)

# Solid contour where the decision function is 0, dashed contours at -0.5 and 0.5
axs.contour(
    XX, YY, Z,
    levels=[-0.5, 0, 0.5],
    colors=["k", "k", "k"],
    linestyles=["--", "-", "--"],
)
axs.set_ylabel(r'$x_2$')
axs.set_xlabel(r'$x_1$')

## k-Nearest Neighbors Classifier

In [None]:
from sklearn.neighbors import KNeighborsClassifier

### Create k-Nearest Neighbors Classifier object

In [None]:
# How many nearest training samples the classifier consults per prediction
number_of_neighbors = 5
clf = KNeighborsClassifier(n_neighbors=number_of_neighbors)

### Train the model using the training set

In [None]:
# Fit the k-nearest-neighbours classifier on the training split
clf.fit(X_train, y_train)

In [None]:
# Training samples in feature space, coloured by class
fig, axs = plt.subplots(
    nrows=1, ncols=1, sharex=True, figsize=(4., 4.), dpi=200, facecolor='white',
)
axs.scatter(
    X_train[:, 0],
    X_train[:, 1],
    c=y_train,
    cmap=plt.cm.Dark2,
    edgecolor="k",
    s=20,
    zorder=10,
)
axs.set_ylabel(r'$x_2$')
axs.set_xlabel(r'$x_1$')

### Make predictions using the test set

In [None]:
fig, axs = plt.subplots(figsize=(4.,4.), nrows=1, ncols=1, facecolor='white', dpi=200, sharex=True)

# Training samples, coloured by class
axs.scatter(X_train[:, 0], X_train[:, 1], c=y_train, zorder=10, cmap=plt.cm.Dark2, edgecolor="k", s=20)

# For every test sample: colour it by its predicted class and connect it to the
# training samples that were consulted for that prediction
for test_coords in X_test:
    query = test_coords[np.newaxis, :]  # estimator methods expect a 2-D (1, n_features) input
    # n_neighbors is deliberately omitted so the query uses the neighbour count the
    # classifier was constructed with; the drawn lines then always match the vote
    neighbors = clf.kneighbors(query, return_distance=False)[0]
    predicted_class = clf.predict(query)
    axs.scatter(test_coords[0], test_coords[1], s=40, color=plt.cm.Dark2(predicted_class), zorder=10, edgecolor='None')
    for neighbor in neighbors:
        axs.plot([X_train[neighbor, 0], test_coords[0]], [X_train[neighbor, 1], test_coords[1]], '-', color=qualitiative_colors[1])

# Fixed view window for this figure
axs.set_xlim(-1,4)
axs.set_ylim(-2.1,2.15)
axs.set_ylabel(r'$x_2$')
axs.set_xlabel(r'$x_1$')

# Clustering

## Make Clustering Data Set

In [None]:
from sklearn.datasets import make_circles, make_moons, make_blobs

n_samples = 500
random_state = 170

# Every generator below is seeded so the six data sets are identical from run to run
noisy_circles = make_circles(n_samples=n_samples, factor=0.5, noise=0.05, random_state=random_state)
noisy_moons = make_moons(n_samples=n_samples, noise=0.05, random_state=random_state)
blobs = make_blobs(n_samples=n_samples, random_state=8)

# Anisotropic blobs: isotropic blobs pushed through a fixed linear transformation
X, y = make_blobs(n_samples=n_samples, random_state=random_state)
transformation = [[0.6, -0.6], [-0.4, 0.8]]
X_aniso = np.dot(X, transformation)
aniso = (X_aniso, y)

# Blobs whose three clusters have different standard deviations
varied = make_blobs(n_samples=n_samples, cluster_std=[1.0, 2.5, 0.5], random_state=random_state)

# Uniform points with no cluster structure (all labels zero). A dedicated seeded
# generator is used so the draw is repeatable without advancing the shared `rng`.
no_structure = np.random.RandomState(random_state).rand(n_samples, 2), np.zeros(n_samples)

# Each entry pairs an (X, y) data set with the number of clusters to request for it
datasets = [(varied,       {"n_clusters": 3}),
            (aniso,        {"n_clusters": 3}),
            (blobs,        {"n_clusters": 3}),
            (noisy_circles,{"n_clusters": 2}),
            (noisy_moons,  {"n_clusters": 2}),
            (no_structure, {"n_clusters": 3}),
           ]

In [None]:
# One panel per data set, all points in a single colour (labels are not shown)
fig, axs = plt.subplots(
    nrows=1, ncols=6, figsize=(24., 4.), dpi=200, facecolor='white',
)
for i_dataset, (dataset, algo_params) in enumerate(datasets):
    X, y = dataset
    panel = axs[i_dataset]
    panel.scatter(X[:, 0], X[:, 1], c=qualitiative_colors[0])
    panel.set_xlabel(r'$x_1$')
# Only the left-most panel carries the y-axis label
axs[0].set_ylabel(r'$x_2$')

## K-Means clustering

In [None]:
from sklearn.cluster import KMeans

In [None]:
fig, axs = plt.subplots(figsize=(24.,4.), nrows=1, ncols=6, facecolor='white', dpi=200)
for i_dataset, (dataset, algo_params) in enumerate(datasets):
    X, y = dataset

    # Create K-Means clustering object. A fixed random_state makes the centroid
    # initialisation, and with it the label-to-colour assignment, repeatable.
    k_means = KMeans(init="k-means++", n_clusters=algo_params['n_clusters'], n_init=10, random_state=random_state)

    # Fit to the whole data set; the labels y are not used
    k_means.fit(X)

    # Return the cluster centroids
    k_means_cluster_centers = k_means.cluster_centers_

    # Return the cluster assignments
    y_prediction = k_means.labels_

    # Points coloured by assigned cluster, centroids overlaid as hollow diamonds
    axs[i_dataset].scatter(X[:,0], X[:,1], c=y_prediction, cmap=plt.cm.Accent)
    axs[i_dataset].scatter(k_means_cluster_centers[:,0], k_means_cluster_centers[:,1], c='None', marker="D", edgecolors='k', s=75)
    axs[i_dataset].set_ylabel(r'$x_2$')
    axs[i_dataset].set_xlabel(r'$x_1$')

# Dimensionality Reduction

## Make Dimensionality Reduction Data Set

In [None]:
# 500 draws from a zero-mean 2-D Gaussian whose off-diagonal covariance terms
# are non-zero, so the two features are correlated
n_samples = 500
cov = [
    [3, 3],
    [3, 4],
]
X = rng.multivariate_normal([0, 0], cov, n_samples)

In [None]:
# Scatter of the two features against each other
fig, axs = plt.subplots(
    nrows=1, ncols=1, figsize=(4., 4.), dpi=200, facecolor='white',
)
axs.scatter(
    X[:, 0],
    X[:, 1],
    s=10,
    color=qualitiative_colors[0],
    edgecolor='None',
)
axs.set_ylabel(r'$x_2$')
axs.set_xlabel(r'$x_1$')

## Principal Component Analysis

In [None]:
from sklearn.decomposition import PCA

### Create Principal Component Analysis object

In [None]:
# Keep two components; X here has two features, so no dimension is dropped
pca = PCA(n_components=2)

### Train the model using the training set

In [None]:
# Fit on the full data set X (there is no train/test split in this section)
pca.fit(X)

### Apply coordinate transform

In [None]:
# Express every sample in the coordinates of the fitted components
reduced_data = pca.transform(X)

In [None]:
fig, axs = plt.subplots(
    nrows=1, ncols=2, figsize=(8., 4.), dpi=200, facecolor='white',
)

# Left panel: the raw samples with one arrow per fitted component
axs[0].scatter(X[:, 0], X[:, 1], s=10, color='k', edgecolor='None')
for i, (comp, var) in enumerate(zip(pca.components_, pca.explained_variance_)):
    comp = comp * var  # scale component by its variance explanation power
    axs[0].arrow(x=0, y=0, dx=comp[0], dy=comp[1], width=0.1, color=qualitiative_colors[i])
axs[0].set_ylabel(r'$x_2$')
axs[0].set_xlabel(r'$x_1$')

# Right panel: histograms of the transformed coordinates along each component,
# sharing one set of bin edges that spans all transformed values
bins = np.linspace(reduced_data.min() - 0.01, reduced_data.max() + 0.01, n_samples // 10)
hist_style = dict(histtype='step', bins=bins, lw=3)
hist_labels = [
    'Data projected onto first component',
    'Data projected onto second component',
]
for column, hist_label in enumerate(hist_labels):
    axs[1].hist(reduced_data[:, column], color=qualitiative_colors[column], label=hist_label, **hist_style)
axs[1].legend(loc=0, fontsize=8)
axs[1].set_xlabel(r'$\hat{x}$')

## Load Digits Data Set

In [None]:
from sklearn.datasets import load_digits

In [None]:
# Load the digits data set directly as a (features, labels) pair
data, labels = load_digits(return_X_y=True)

# Sizes: rows are samples, columns are features; digits = distinct label values
n_samples, n_features = data.shape
n_digits = np.unique(labels).size

print(f"number of digits: {n_digits}; number of samples: {n_samples}; number of features {n_features}")

In [None]:
# Show the first ten samples as 8x8 grayscale images, titled with their labels
fig, axs = plt.subplots(
    nrows=1, ncols=10, figsize=(30., 3.), dpi=200, facecolor='white',
)
for ax, image, label in zip(axs, data, labels):
    ax.set_axis_off()
    pixels = image.reshape(8, 8)  # each sample is a flat vector that reshapes to 8x8
    ax.imshow(pixels, interpolation="nearest", cmap=plt.cm.gray_r)
    ax.set_title("Training: %i" % label)

## Principal Component Analysis

### Create Principal Component Analysis object

In [None]:
# New PCA object that keeps only two components of the digits features
pca = PCA(n_components=2)

### Train the model using the training set

In [None]:
# Fit on all digit samples (labels are not used)
pca.fit(data)

### Reduce dimensions by applying coordinate transform

In [None]:
# Project every digit sample onto the two fitted components
reduced_data = pca.transform(data)

In [None]:
# Compare array shapes before and after the transform
print("Data: \n", data.shape)
print("Reduced Data: \n", reduced_data.shape)

In [None]:
# All digit samples in the two-component space, in a single colour
fig, axs = plt.subplots(
    nrows=1, ncols=1, figsize=(4., 4.), dpi=200, facecolor='white',
)
axs.scatter(
    reduced_data[:, 0],
    reduced_data[:, 1],
    s=5,
    c=qualitiative_colors[1],
)
axs.set_ylabel(r'$\hat{x}_2$')
axs.set_xlabel(r'$\hat{x}_1$')

In [None]:
# One colour and one numeral-shaped marker per digit class 0-9
colors = [
    '#a6cee3', '#1f78b4', '#b2df8a', '#33a02c', '#fb9a99',
    '#e31a1c', '#fdbf6f', '#ff7f00', '#cab2d6', '#6a3d9a',
]
markers = [
    "$0$", "$1$", "$2$", "$3$", "$4$",
    "$5$", "$6$", "$7$", "$8$", "$9$",
]
fig, axs = plt.subplots(
    nrows=1, ncols=1, figsize=(4., 4.), dpi=400, facecolor='white',
)

# Plot each digit class separately so it gets its own colour and marker
for i, (digit_color, digit_marker) in enumerate(zip(colors, markers)):
    to_plot = labels == i
    axs.scatter(
        reduced_data[to_plot, 0],
        reduced_data[to_plot, 1],
        s=20,
        c=digit_color,
        marker=digit_marker,
    )
axs.set_ylabel(r'$\hat{x}_2$')
axs.set_xlabel(r'$\hat{x}_1$')